new-words
diff new-words.sh @ 29:c631833fa2be
minifixes related to unicode support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Mon Jun 21 19:46:58 2010 +0300 (2010-06-21) |
parents | 7db7bbf96fad |
children | 48ca8248e9cc |
line diff
1.1 --- a/new-words.sh Tue Jun 15 05:46:50 2010 +0300
1.2 +++ b/new-words.sh Mon Jun 21 19:46:58 2010 +0300
1.3 @@ -255,6 +255,9 @@
1.4 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
1.5 #!/usr/bin/perl
1.6
1.7 +use Encode;
1.8 +use utf8;
1.9 +
1.10 eval {
1.11 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
1.12 require String::Similarity;
1.13 @@ -280,9 +283,17 @@
1.14 {
1.15 if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
1.16 elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
1.17 + elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
1.18 else { return shift ; }
1.19 }
1.20
1.21 +sub normalize_ukrainian($)
1.22 +{
1.23 + $_=lc(shift);
1.24 + s/[юіоеуаи]$//g;
1.25 + return $_;
1.26 +}
1.27 +
1.28 sub normalize_german($)
1.29 {
1.30 $_=lc(shift);
1.31 @@ -365,9 +376,11 @@
1.32 }
1.33 close(VOC);
1.34
1.35 +binmode STDIN,":utf8";
1.36 @lines=<STDIN>;
1.37 for $L (@lines) {
1.38 chomp($L);
1.39 + #$L = decode( "utf8", $L);
1.40 $l=$L;
1.41 $l =~ s/^\s*//;
1.42 my ($a, $b)=split(/\s+/,$l,2);
1.43 @@ -381,6 +394,7 @@
1.44 }
1.45 }
1.46 @lines2 = sort { compare($b,$a) } @lines;
1.47 +binmode STDOUT, ":utf8";
1.48 for $l (@lines2) {
1.49 print "$l\n";
1.50 }
1.51 @@ -401,10 +415,13 @@
1.52 {
1.53 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
1.54 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
1.55 +use Encode;
1.56 +
1.57 $file = $ARGV[0];
1.58 our $dict;
1.59 if (open(NOTES, $ENV{NOTES_FILE})) {
1.60 while(<NOTES>) {
1.61 + $_ = decode( "utf8", $_);
1.62 chomp;
1.63 s/^\s+//;
1.64 my ($a,$b)=split /\s+/,$_,2;
1.65 @@ -414,8 +431,10 @@
1.66 if (open(F, $file)) {
1.67 @lines=<F>;
1.68 close(F);
1.69 + for (@lines) {$_ = decode( "utf8", $_);};
1.70
1.71 if (open(F, ">$file")) {
1.72 + binmode F, ":utf8";
1.73 for (@lines) {
1.74 m/\s+\S+\s+(\S+)/;
1.75 $name=$1;