new-words
diff new-words.sh @ 22:46e987f4636d
part.pl script + german normalization support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun May 16 18:20:18 2010 +0300 (2010-05-16) |
parents | 190d4ac6b07c |
children | 4b9d13c78de2 |
line diff
```
1.1 --- a/new-words.sh Thu May 06 21:05:20 2010 +0300
1.2 +++ b/new-words.sh Sun May 16 18:20:18 2010 +0300
1.3 @@ -44,10 +44,13 @@
1.4
1.5 LANGUAGE=en
1.6 my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
1.7 -if echo "$1" | grep -q http://...wikipedia.org/wiki/
1.8 -then
1.9 -LANGUAGE="`echo $1 | sed s@http://@@ | sed s@.wikipedia.*@@`"
1.10 -fi
1.11 +for arg
1.12 +do
1.13 + if echo "$arg" | grep -q http://...wikipedia.org/wiki/
1.14 + then
1.15 + LANGUAGE="`echo $arg | sed s@http://@@ | sed s@.wikipedia.*@@`"
1.16 + fi
1.17 +done
1.18 [ "${my_name}" = "new" ] || LANGUAGE="$my_name"
1.19 if [ "$1" = "-l" ]
1.20 then
1.21 @@ -179,11 +182,11 @@
1.22
1.23 group_words()
1.24 {
1.25 - if [ "$LANGUAGE" != "en" ]
1.26 - then
1.27 - cat
1.28 - return
1.29 - fi
1.30 + #if [ "$LANGUAGE" != "en" ]
1.31 + #then
1.32 + # cat
1.33 + # return
1.34 + #fi
1.35 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
1.36 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
1.37 #!/usr/bin/perl
1.38 @@ -211,9 +214,43 @@
1.39
1.40 sub normalize($)
1.41 {
1.42 + if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
1.43 + elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
1.44 + else { return shift ; }
1.45 +}
1.46 +
1.47 +sub normalize_german($)
1.48 +{
1.49 + $_=lc(shift);
1.50 +
1.51 + s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
1.52 + s/en$//; s/er$//;
1.53 +
1.54 + s/lich$//; s/ig$//;
1.55 + s/al$//; s/isch$//;
1.56 + s/ell$//; s/haft$//;
1.57 +
1.58 + s/bar$//; s/sam$//; s/lich$//;
1.59 +
1.60 + @prefixes=qw(
1.61 + ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
1.62 + be emp ent er ge miss ver zer durch über um unter wieder);
1.63 + for $pref (@prefixes) {
1.64 + s/^$pref//;
1.65 + }
1.66 +
1.67 +
1.68 + return $_;
1.69 +}
1.70 +
1.71 +sub normalize_english($)
1.72 +{
1.73 $_=lc(shift);
1.74
1.75 s/s$//;
1.76 +
1.77 + s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;
1.78 +
1.79 s/ed$//;
1.80 s/en$//;
1.81 s/er$//;
1.82 @@ -221,8 +258,7 @@
1.83 s/ing$//;
1.84
1.85 s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
1.86 - s/ation$//; s/ness$//; s/ship$//; s/ally$//;
1.87 - s/ment$//; s/ify$//; s/ity$//; s/fy$//; s/ly$//;
1.88 + s/ify$//; s/fy$//; s/ly$//;
1.89 s/ise$//; s/ize$//;
1.90
1.91 s/e$//;
1.92 @@ -286,6 +322,7 @@
1.93 PERL_SCRIPT
1.94 export VOCABULARY
1.95 export NEED_TO_USE_VOCABULARY_WHEN_SORT
1.96 + export LANGUAGE
1.97 perl $PERL_SCRIPT_TEMP_NAME
1.98 rm $PERL_SCRIPT_TEMP_NAME
1.99 }
```