new-words

changeset 31:48ca8248e9cc

+Esperanto normalization
author Igor Chubin <igor@chub.in>
date Tue Aug 17 21:35:57 2010 +0200 (2010-08-17)
parents 07d89c2505e7
children 753fb84437aa
files en.sh grep-sentences.pl new-words.sh
line diff
     1.1 --- a/en.sh	Sun Jul 04 12:24:01 2010 +0200
     1.2 +++ b/en.sh	Tue Aug 17 21:35:57 2010 +0200
     1.3 @@ -22,6 +22,9 @@
     1.4    "uk")
     1.5          slovnyk "$1"
     1.6          ;;
     1.7 +  "io")
     1.8 +        vortaro "$1"
     1.9 +        ;;
    1.10  esac
    1.11  }
    1.12  
     2.1 --- a/grep-sentences.pl	Sun Jul 04 12:24:01 2010 +0200
     2.2 +++ b/grep-sentences.pl	Tue Aug 17 21:35:57 2010 +0200
     2.3 @@ -9,6 +9,8 @@
     2.4  $regexp=decode("utf8",$regexp);
     2.5  
     2.6  $page=$ARGV[1];
     2.7 +$number_of_printed = 0;
     2.8 +$number_of_printed_max = 10;
     2.9  #if (open(PAGE, "lynx -dump '$page'|")) {
    2.10  if (open(PAGE, "$page")) {
    2.11      binmode PAGE,":utf8";
    2.12 @@ -28,6 +30,9 @@
    2.13          s/\s*$//;
    2.14          s/\[[0-9]+\]//g;
    2.15          s/\s+/ /g;
    2.16 -        print "$_.\n\n" if /\b$regexp\b/;
    2.17 +        if (/\b$regexp\b/ and $number_of_printed < $number_of_printed_max ) {
    2.18 +            print "$_.\n\n";
    2.19 +            $number_of_printed++;
    2.20 +        }
    2.21      }
    2.22  }
     3.1 --- a/new-words.sh	Sun Jul 04 12:24:01 2010 +0200
     3.2 +++ b/new-words.sh	Tue Aug 17 21:35:57 2010 +0200
     3.3 @@ -113,7 +113,7 @@
     3.4  {
     3.5  tr ' ' '\n' | sed 's/--/ /g' \
     3.6  | sed "s/'/__APOSTROPHE__/g" \
     3.7 -| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/                        /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/'\
     3.8 +| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
     3.9  | sed "s/__APOSTROPHE__/'/g" \
    3.10  | tr ' ' '\n' \
    3.11  | tee "$1" \
    3.12 @@ -284,6 +284,7 @@
    3.13      if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    3.14      elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    3.15      elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
    3.16 +    elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
    3.17      else { return shift ; }
    3.18  }
    3.19  
    3.20 @@ -294,6 +295,18 @@
    3.21      return $_;
    3.22  }
    3.23  
    3.24 +sub normalize_esperanto($)
    3.25 +{
    3.26 +    $_=lc(shift);
    3.27 +# verbs
    3.28 +    s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;
    3.29 +
    3.30 +# nouns
    3.31 +    s/j?n?$//;
    3.32 +
    3.33 +    return $_;
    3.34 +}
    3.35 +
    3.36  sub normalize_german($)
    3.37  {
    3.38      $_=lc(shift);