new-words

changeset 29:c631833fa2be

Minor fixes related to Unicode support
author Igor Chubin <igor@chub.in>
date Mon Jun 21 19:46:58 2010 +0300 (2010-06-21)
parents 7db7bbf96fad
children 07d89c2505e7
files en.sh grep-sentences.pl new-words.sh
line diff
     1.1 --- a/en.sh	Tue Jun 15 05:46:50 2010 +0300
     1.2 +++ b/en.sh	Mon Jun 21 19:46:58 2010 +0300
     1.3 @@ -5,18 +5,32 @@
     1.4  DICT_NAME=mueller24
     1.5  
     1.6  
     1.7 -myname="`echo $0 | sed s@.*/@@`"
     1.8 +_dict()
     1.9 +{
    1.10  case $myname in
    1.11 -  "de") DICT_NAME=deu-eng ; DICTD_SERVER_ARGS='' ;;
    1.12 -  "pl") DICT_NAME=slovnyk_pl-ru ;;
    1.13 -  "ru") DICT_NAME=ozhshv ;;
    1.14 +  "de") DICT_NAME=deu-eng ; 
    1.15 +        DICTD_SERVER_ARGS='' 
    1.16 +        dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@"
    1.17 +        ;;
    1.18 +  "pl") DICT_NAME=slovnyk_pl-ru 
    1.19 +        dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@"
    1.20 +        ;;
    1.21 +  "ru") DICT_NAME=ozhshv 
    1.22 +        dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@"
    1.23 +        ;;
    1.24 +  "uk")
    1.25 +        slovnyk "$1"
    1.26 +        ;;
    1.27  esac
    1.28 +}
    1.29  
    1.30  (
    1.31 -    dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@"
    1.32 +    myname="`echo $0 | sed s@.*/@@`"
    1.33 +    _dict "$@"
    1.34      if [ -e "$ORIGINAL_TEXT" ]
    1.35      then
    1.36          echo . . . . . . . . . . . . . . . . . . . . . . . . . ; echo
    1.37          grep-sentences "$*" "$ORIGINAL_TEXT"
    1.38      fi
    1.39 -) | less -p "$*"
    1.40 +)
    1.41 +# | less -p "$*"
     2.1 --- a/grep-sentences.pl	Tue Jun 15 05:46:50 2010 +0300
     2.2 +++ b/grep-sentences.pl	Mon Jun 21 19:46:58 2010 +0300
     2.3 @@ -1,10 +1,17 @@
     2.4  #!/usr/bin/perl
     2.5  
     2.6 +use utf8;
     2.7 +use Encode;
     2.8 +binmode STDIN,":utf8";
     2.9 +binmode STDOUT,":utf8";
    2.10  
    2.11  $regexp=$ARGV[0];
    2.12 +$regexp=decode("utf8",$regexp);
    2.13 +
    2.14  $page=$ARGV[1];
    2.15  #if (open(PAGE, "lynx -dump '$page'|")) {
    2.16  if (open(PAGE, "$page")) {
    2.17 +    binmode PAGE,":utf8";
    2.18      local $/;
    2.19      $text=<PAGE>;
    2.20      $text =~ s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g;
     3.1 --- a/new-words.sh	Tue Jun 15 05:46:50 2010 +0300
     3.2 +++ b/new-words.sh	Mon Jun 21 19:46:58 2010 +0300
     3.3 @@ -255,6 +255,9 @@
     3.4      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
     3.5  #!/usr/bin/perl
     3.6  
     3.7 +use Encode;
     3.8 +use utf8;
     3.9 +
    3.10  eval {
    3.11  # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
    3.12      require String::Similarity;
    3.13 @@ -280,9 +283,17 @@
    3.14  {
    3.15      if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    3.16      elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    3.17 +    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
    3.18      else { return shift ; }
    3.19  }
    3.20  
    3.21 +sub normalize_ukrainian($)
    3.22 +{
    3.23 +    $_=lc(shift);
    3.24 +    s/[юіоеуаи]$//g;
    3.25 +    return $_;
    3.26 +}
    3.27 +
    3.28  sub normalize_german($)
    3.29  {
    3.30      $_=lc(shift);
    3.31 @@ -365,9 +376,11 @@
    3.32  }
    3.33  close(VOC);
    3.34  
    3.35 +binmode STDIN,":utf8";
    3.36  @lines=<STDIN>;
    3.37  for $L (@lines) {
    3.38      chomp($L);
    3.39 +    #$L = decode( "utf8", $L);
    3.40      $l=$L;
    3.41      $l =~ s/^\s*//;
    3.42      my ($a, $b)=split(/\s+/,$l,2);
    3.43 @@ -381,6 +394,7 @@
    3.44      }
    3.45  }
    3.46  @lines2 = sort { compare($b,$a) } @lines;
    3.47 +binmode STDOUT, ":utf8";
    3.48  for $l (@lines2) {
    3.49      print "$l\n";
    3.50  }
    3.51 @@ -401,10 +415,13 @@
    3.52  {
    3.53      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
    3.54      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    3.55 +use Encode;
    3.56 +
    3.57  $file = $ARGV[0];
    3.58  our $dict;
    3.59  if (open(NOTES, $ENV{NOTES_FILE})) {
    3.60      while(<NOTES>) {
    3.61 +        $_ = decode( "utf8", $_);
    3.62          chomp;
    3.63          s/^\s+//;
    3.64          my ($a,$b)=split /\s+/,$_,2;
    3.65 @@ -414,8 +431,10 @@
    3.66  if (open(F, $file)) {
    3.67      @lines=<F>;
    3.68      close(F);
    3.69 +    for (@lines) {$_ = decode( "utf8", $_);};
    3.70  
    3.71      if (open(F, ">$file")) {
    3.72 +        binmode F, ":utf8";
    3.73          for (@lines) {
    3.74              m/\s+\S+\s+(\S+)/;
    3.75              $name=$1;