new-words

diff new-words.sh @ 30:07d89c2505e7

lingvo en->ru by Andrii Grytsenko
author Igor Chubin <igor@chub.in>
date Sun Jul 04 12:24:01 2010 +0200 (2010-07-04)
parents 7db7bbf96fad
children 48ca8248e9cc
line diff
     1.1 --- a/new-words.sh	Tue Jun 15 05:46:50 2010 +0300
     1.2 +++ b/new-words.sh	Sun Jul 04 12:24:01 2010 +0200
     1.3 @@ -255,6 +255,9 @@
     1.4      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
     1.5  #!/usr/bin/perl
     1.6  
     1.7 +use Encode;
     1.8 +use utf8;
     1.9 +
    1.10  eval {
    1.11  # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
    1.12      require String::Similarity;
    1.13 @@ -280,9 +283,17 @@
    1.14  {
    1.15      if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    1.16      elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    1.17 +    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
    1.18      else { return shift ; }
    1.19  }
    1.20  
    1.21 +sub normalize_ukrainian($)
    1.22 +{
    1.23 +    $_=lc(shift);
    1.24 +    s/[юіоеуаи]$//g;
    1.25 +    return $_;
    1.26 +}
    1.27 +
    1.28  sub normalize_german($)
    1.29  {
    1.30      $_=lc(shift);
    1.31 @@ -365,9 +376,11 @@
    1.32  }
    1.33  close(VOC);
    1.34  
    1.35 +binmode STDIN,":utf8";
    1.36  @lines=<STDIN>;
    1.37  for $L (@lines) {
    1.38      chomp($L);
    1.39 +    #$L = decode( "utf8", $L);
    1.40      $l=$L;
    1.41      $l =~ s/^\s*//;
    1.42      my ($a, $b)=split(/\s+/,$l,2);
    1.43 @@ -381,6 +394,7 @@
    1.44      }
    1.45  }
    1.46  @lines2 = sort { compare($b,$a) } @lines;
    1.47 +binmode STDOUT, ":utf8";
    1.48  for $l (@lines2) {
    1.49      print "$l\n";
    1.50  }
    1.51 @@ -401,10 +415,13 @@
    1.52  {
    1.53      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
    1.54      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.55 +use Encode;
    1.56 +
    1.57  $file = $ARGV[0];
    1.58  our $dict;
    1.59  if (open(NOTES, $ENV{NOTES_FILE})) {
    1.60      while(<NOTES>) {
    1.61 +        $_ = decode( "utf8", $_);
    1.62          chomp;
    1.63          s/^\s+//;
    1.64          my ($a,$b)=split /\s+/,$_,2;
    1.65 @@ -414,8 +431,10 @@
    1.66  if (open(F, $file)) {
    1.67      @lines=<F>;
    1.68      close(F);
    1.69 +    for (@lines) {$_ = decode( "utf8", $_);};
    1.70  
    1.71      if (open(F, ">$file")) {
    1.72 +        binmode F, ":utf8";
    1.73          for (@lines) {
    1.74              m/\s+\S+\s+(\S+)/;
    1.75              $name=$1;