# HG changeset patch # User Igor Chubin # Date 1277138818 -10800 # Node ID c631833fa2bec0f2e8a1c85e50524696d5ac6cc0 # Parent 7db7bbf96fade86ddec8d21e7cafb0d6465197b7 minifixes related to unicode support diff -r 7db7bbf96fad -r c631833fa2be en.sh --- a/en.sh Tue Jun 15 05:46:50 2010 +0300 +++ b/en.sh Mon Jun 21 19:46:58 2010 +0300 @@ -5,18 +5,32 @@ DICT_NAME=mueller24 -myname="`echo $0 | sed s@.*/@@`" +_dict() +{ case $myname in - "de") DICT_NAME=deu-eng ; DICTD_SERVER_ARGS='' ;; - "pl") DICT_NAME=slovnyk_pl-ru ;; - "ru") DICT_NAME=ozhshv ;; + "de") DICT_NAME=deu-eng ; + DICTD_SERVER_ARGS='' + dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" + ;; + "pl") DICT_NAME=slovnyk_pl-ru + dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" + ;; + "ru") DICT_NAME=ozhshv + dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" + ;; + "uk") + slovnyk "$1" + ;; esac +} ( - dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@" + myname="`echo $0 | sed s@.*/@@`" + _dict "$@" if [ -e "$ORIGINAL_TEXT" ] then echo . . . . . . . . . . . . . . . . . . . . . . . . . ; echo grep-sentences "$*" "$ORIGINAL_TEXT" fi -) | less -p "$*" +) +# | less -p "$*" diff -r 7db7bbf96fad -r c631833fa2be grep-sentences.pl --- a/grep-sentences.pl Tue Jun 15 05:46:50 2010 +0300 +++ b/grep-sentences.pl Mon Jun 21 19:46:58 2010 +0300 @@ -1,10 +1,17 @@ #!/usr/bin/perl +use utf8; +use Encode; +binmode STDIN,":utf8"; +binmode STDOUT,":utf8"; $regexp=$ARGV[0]; +$regexp=decode("utf8",$regexp); + $page=$ARGV[1]; #if (open(PAGE, "lynx -dump '$page'|")) { if (open(PAGE, "$page")) { + binmode PAGE,":utf8"; local $/; $text=; $text =~ s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; diff -r 7db7bbf96fad -r c631833fa2be new-words.sh --- a/new-words.sh Tue Jun 15 05:46:50 2010 +0300 +++ b/new-words.sh Mon Jun 21 19:46:58 2010 +0300 @@ -255,6 +255,9 @@ cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME #!/usr/bin/perl +use Encode; +use utf8; + eval { # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it require String::Similarity; @@ -280,9 +283,17 @@ { if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); } elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); } + elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); } else { return shift ; } } +sub normalize_ukrainian($) +{ + $_=lc(shift); + s/[юіоеуаи]$//g; + return $_; +} + sub normalize_german($) { $_=lc(shift); @@ -365,9 +376,11 @@ } close(VOC); +binmode STDIN,":utf8"; @lines=; for $L (@lines) { chomp($L); + #$L = decode( "utf8", $L); $l=$L; $l =~ s/^\s*//; my ($a, $b)=split(/\s+/,$l,2); @@ -381,6 +394,7 @@ } } @lines2 = sort { compare($b,$a) } @lines; +binmode STDOUT, ":utf8"; for $l (@lines2) { print "$l\n"; } @@ -401,10 +415,13 @@ { PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME +use Encode; + $file = $ARGV[0]; our $dict; if (open(NOTES, $ENV{NOTES_FILE})) { while() { + $_ = decode( "utf8", $_); chomp; s/^\s+//; my ($a,$b)=split /\s+/,$_,2; @@ -414,8 +431,10 @@ if (open(F, $file)) { @lines=; close(F); + for (@lines) {$_ = decode( "utf8", $_);}; if (open(F, ">$file")) { + binmode F, ":utf8"; for (@lines) { m/\s+\S+\s+(\S+)/; $name=$1;