new-words
changeset 31:48ca8248e9cc
+esperanto normalization
author | Igor Chubin <igor@chub.in> |
---|---|
date | Tue Aug 17 21:35:57 2010 +0200 (2010-08-17) |
parents | 07d89c2505e7 |
children | 753fb84437aa |
files | en.sh grep-sentences.pl new-words.sh |
line diff
1.1 --- a/en.sh Sun Jul 04 12:24:01 2010 +0200
1.2 +++ b/en.sh Tue Aug 17 21:35:57 2010 +0200
1.3 @@ -22,6 +22,9 @@
1.4 "uk")
1.5 slovnyk "$1"
1.6 ;;
1.7 + "io")
1.8 + vortaro "$1"
1.9 + ;;
1.10 esac
1.11 }
1.12
2.1 --- a/grep-sentences.pl Sun Jul 04 12:24:01 2010 +0200
2.2 +++ b/grep-sentences.pl Tue Aug 17 21:35:57 2010 +0200
2.3 @@ -9,6 +9,8 @@
2.4 $regexp=decode("utf8",$regexp);
2.5
2.6 $page=$ARGV[1];
2.7 +$number_of_printed = 0;
2.8 +$number_of_printed_max = 10;
2.9 #if (open(PAGE, "lynx -dump '$page'|")) {
2.10 if (open(PAGE, "$page")) {
2.11 binmode PAGE,":utf8";
2.12 @@ -28,6 +30,9 @@
2.13 s/\s*$//;
2.14 s/\[[0-9]+\]//g;
2.15 s/\s+/ /g;
2.16 - print "$_.\n\n" if /\b$regexp\b/;
2.17 + if (/\b$regexp\b/ and $number_of_printed < $number_of_printed_max ) {
2.18 + print "$_.\n\n";
2.19 + $number_of_printed++;
2.20 + }
2.21 }
2.22 }
3.1 --- a/new-words.sh Sun Jul 04 12:24:01 2010 +0200
3.2 +++ b/new-words.sh Tue Aug 17 21:35:57 2010 +0200
3.3 @@ -113,7 +113,7 @@
3.4 {
3.5 tr ' ' '\n' | sed 's/--/ /g' \
3.6 | sed "s/'/__APOSTROPHE__/g" \
3.7 -| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/'\
3.8 +| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
3.9 | sed "s/__APOSTROPHE__/'/g" \
3.10 | tr ' ' '\n' \
3.11 | tee "$1" \
3.12 @@ -284,6 +284,7 @@
3.13 if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
3.14 elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
3.15 elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
3.16 + elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
3.17 else { return shift ; }
3.18 }
3.19
3.20 @@ -294,6 +295,18 @@
3.21 return $_;
3.22 }
3.23
3.24 +sub normalize_esperanto($)
3.25 +{
3.26 + $_=lc(shift);
3.27 +# verbs
3.28 + s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;
3.29 +
3.30 +# nouns
3.31 + s/j?n?$//;
3.32 +
3.33 + return $_;
3.34 +}
3.35 +
3.36 sub normalize_german($)
3.37 {
3.38 $_=lc(shift);