new-words
changeset 29:c631833fa2be
Minor fixes related to Unicode support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Mon Jun 21 19:46:58 2010 +0300 (2010-06-21) |
parents | 7db7bbf96fad |
children | 07d89c2505e7 |
files | en.sh grep-sentences.pl new-words.sh |
line diff
1.1 --- a/en.sh Tue Jun 15 05:46:50 2010 +0300 1.2 +++ b/en.sh Mon Jun 21 19:46:58 2010 +0300 1.3 @@ -5,18 +5,32 @@ 1.4 DICT_NAME=mueller24 1.5 1.6 1.7 -myname="`echo $0 | sed s@.*/@@`" 1.8 +_dict() 1.9 +{ 1.10 case $myname in 1.11 - "de") DICT_NAME=deu-eng ; DICTD_SERVER_ARGS='' ;; 1.12 - "pl") DICT_NAME=slovnyk_pl-ru ;; 1.13 - "ru") DICT_NAME=ozhshv ;; 1.14 + "de") DICT_NAME=deu-eng ; 1.15 + DICTD_SERVER_ARGS='' 1.16 + dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" 1.17 + ;; 1.18 + "pl") DICT_NAME=slovnyk_pl-ru 1.19 + dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" 1.20 + ;; 1.21 + "ru") DICT_NAME=ozhshv 1.22 + dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" 1.23 + ;; 1.24 + "uk") 1.25 + slovnyk "$1" 1.26 + ;; 1.27 esac 1.28 +} 1.29 1.30 ( 1.31 - dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" 1.32 + myname="`echo $0 | sed s@.*/@@`" 1.33 + _dict "$@" 1.34 if [ -e "$ORIGINAL_TEXT" ] 1.35 then 1.36 echo . . . . . . . . . . . . . . . . . . . . . . . . . ; echo 1.37 grep-sentences "$*" "$ORIGINAL_TEXT" 1.38 fi 1.39 -) | less -p "$*" 1.40 +) 1.41 +# | less -p "$*"
2.1 --- a/grep-sentences.pl Tue Jun 15 05:46:50 2010 +0300 2.2 +++ b/grep-sentences.pl Mon Jun 21 19:46:58 2010 +0300 2.3 @@ -1,10 +1,17 @@ 2.4 #!/usr/bin/perl 2.5 2.6 +use utf8; 2.7 +use Encode; 2.8 +binmode STDIN,":utf8"; 2.9 +binmode STDOUT,":utf8"; 2.10 2.11 $regexp=$ARGV[0]; 2.12 +$regexp=decode("utf8",$regexp); 2.13 + 2.14 $page=$ARGV[1]; 2.15 #if (open(PAGE, "lynx -dump '$page'|")) { 2.16 if (open(PAGE, "$page")) { 2.17 + binmode PAGE,":utf8"; 2.18 local $/; 2.19 $text=<PAGE>; 2.20 $text =~ s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g;
3.1 --- a/new-words.sh Tue Jun 15 05:46:50 2010 +0300 3.2 +++ b/new-words.sh Mon Jun 21 19:46:58 2010 +0300 3.3 @@ -255,6 +255,9 @@ 3.4 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 3.5 #!/usr/bin/perl 3.6 3.7 +use Encode; 3.8 +use utf8; 3.9 + 3.10 eval { 3.11 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it 3.12 require String::Similarity; 3.13 @@ -280,9 +283,17 @@ 3.14 { 3.15 if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); } 3.16 elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); } 3.17 + elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); } 3.18 else { return shift ; } 3.19 } 3.20 3.21 +sub normalize_ukrainian($) 3.22 +{ 3.23 + $_=lc(shift); 3.24 + s/[юіоеуаи]$//g; 3.25 + return $_; 3.26 +} 3.27 + 3.28 sub normalize_german($) 3.29 { 3.30 $_=lc(shift); 3.31 @@ -365,9 +376,11 @@ 3.32 } 3.33 close(VOC); 3.34 3.35 +binmode STDIN,":utf8"; 3.36 @lines=<STDIN>; 3.37 for $L (@lines) { 3.38 chomp($L); 3.39 + #$L = decode( "utf8", $L); 3.40 $l=$L; 3.41 $l =~ s/^\s*//; 3.42 my ($a, $b)=split(/\s+/,$l,2); 3.43 @@ -381,6 +394,7 @@ 3.44 } 3.45 } 3.46 @lines2 = sort { compare($b,$a) } @lines; 3.47 +binmode STDOUT, ":utf8"; 3.48 for $l (@lines2) { 3.49 print "$l\n"; 3.50 } 3.51 @@ -401,10 +415,13 @@ 3.52 { 3.53 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` 3.54 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 3.55 +use Encode; 3.56 + 3.57 $file = $ARGV[0]; 3.58 our $dict; 3.59 if (open(NOTES, $ENV{NOTES_FILE})) { 3.60 while(<NOTES>) { 3.61 + $_ = decode( "utf8", $_); 3.62 chomp; 3.63 s/^\s+//; 3.64 my ($a,$b)=split /\s+/,$_,2; 3.65 @@ -414,8 +431,10 @@ 3.66 if (open(F, $file)) { 3.67 @lines=<F>; 3.68 close(F); 3.69 + for (@lines) {$_ = decode( "utf8", $_);}; 3.70 3.71 if (open(F, ">$file")) { 3.72 + binmode F, ":utf8"; 3.73 for (@lines) { 3.74 m/\s+\S+\s+(\S+)/; 3.75 $name=$1;