new-words

changeset 22:46e987f4636d

part.pl script + German normalization support
author Igor Chubin <igor@chub.in>
date Sun May 16 18:20:18 2010 +0300 (2010-05-16)
parents 190d4ac6b07c
children 4b9d13c78de2
files en.sh new-words.sh part.pl
line diff
     1.1 --- a/en.sh	Thu May 06 21:05:20 2010 +0300
     1.2 +++ b/en.sh	Sun May 16 18:20:18 2010 +0300
     1.3 @@ -1,7 +1,19 @@
     1.4  #!/bin/sh
     1.5  
     1.6 +DICTD_SERVER="dictd.xdsl.by"
     1.7 +DICTD_SERVER_ARGS="-h $DICTD_SERVER"
     1.8 +DICT_NAME=mueller24
     1.9 +
    1.10 +
    1.11 +myname="`echo $0 | sed s@.*/@@`"
    1.12 +case $myname in
    1.13 +  "de") DICT_NAME=deu-eng ; DICTD_SERVER_ARGS='' ;;
    1.14 +  "pl") DICT_NAME=slovnyk_pl-ru ;;
    1.15 +  "ru") DICT_NAME=ozhshv ;;
    1.16 +esac
    1.17 +
    1.18  (
    1.19 -    dict -h dictd.xdsl.by -d mueller24 "$@"
    1.20 +    dict $DICTD_SERVER_ARGS -d "$DICT_NAME" "$@"
    1.21      if [ -e "$ORIGINAL_TEXT" ]
    1.22      then
    1.23          echo . . . . . . . . . . . . . . . . . . . . . . . . . ; echo
     2.1 --- a/new-words.sh	Thu May 06 21:05:20 2010 +0300
     2.2 +++ b/new-words.sh	Sun May 16 18:20:18 2010 +0300
     2.3 @@ -44,10 +44,13 @@
     2.4  
     2.5  LANGUAGE=en
     2.6  my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
     2.7 -if echo "$1" | grep -q http://...wikipedia.org/wiki/
     2.8 -then
     2.9 -LANGUAGE="`echo $1 | sed s@http://@@ | sed s@.wikipedia.*@@`"
    2.10 -fi
    2.11 +for arg
    2.12 +do
    2.13 +    if echo "$arg" | grep -q http://...wikipedia.org/wiki/
    2.14 +    then
    2.15 +    LANGUAGE="`echo $arg | sed s@http://@@ | sed s@.wikipedia.*@@`"
    2.16 +    fi
    2.17 +done
    2.18  [ "${my_name}" = "new" ] || LANGUAGE="$my_name"
    2.19  if [ "$1" = "-l" ]
    2.20  then
    2.21 @@ -179,11 +182,11 @@
    2.22  
    2.23  group_words()
    2.24  {
    2.25 -    if [ "$LANGUAGE" != "en" ]
    2.26 -    then
    2.27 -        cat 
    2.28 -        return
    2.29 -    fi
    2.30 +    #if [ "$LANGUAGE" != "en" ]
    2.31 +    #then
    2.32 +    #    cat 
    2.33 +    #    return
    2.34 +    #fi
    2.35      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
    2.36      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    2.37  #!/usr/bin/perl
    2.38 @@ -211,9 +214,43 @@
    2.39  
    2.40  sub normalize($)
    2.41  {
    2.42 +    if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    2.43 +    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    2.44 +    else { return shift ; }
    2.45 +}
    2.46 +
    2.47 +sub normalize_german($)
    2.48 +{
    2.49 +    $_=lc(shift);
    2.50 +
    2.51 +    s/heit$//;  s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//; 
    2.52 +    s/en$//; s/er$//;
    2.53 +
    2.54 +    s/lich$//; s/ig$//;
    2.55 +    s/al$//; s/isch$//;
    2.56 +    s/ell$//; s/haft$//;
    2.57 +
    2.58 +    s/bar$//; s/sam$//; s/lich$//;
    2.59 +
    2.60 +    @prefixes=qw(
    2.61 +        ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
    2.62 +        be emp ent er ge miss ver zer durch über um unter wieder);
    2.63 +    for $pref (@prefixes) {
    2.64 +        s/^$pref//;
    2.65 +    }
    2.66 +
    2.67 +
    2.68 +    return $_;
    2.69 +}
    2.70 +
    2.71 +sub normalize_english($)
    2.72 +{
    2.73      $_=lc(shift);
    2.74  
    2.75      s/s$//;
    2.76 +
    2.77 +    s/ation$//;  s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//; 
    2.78 +
    2.79      s/ed$//;
    2.80      s/en$//;
    2.81      s/er$//;
    2.82 @@ -221,8 +258,7 @@
    2.83      s/ing$//;
    2.84  
    2.85      s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
    2.86 -    s/ation$//;  s/ness$//; s/ship$//; s/ally$//;
    2.87 -    s/ment$//; s/ify$//; s/ity$//; s/fy$//; s/ly$//;
    2.88 +    s/ify$//; s/fy$//; s/ly$//;
    2.89      s/ise$//; s/ize$//;
    2.90  
    2.91      s/e$//;
    2.92 @@ -286,6 +322,7 @@
    2.93  PERL_SCRIPT
    2.94      export VOCABULARY
    2.95      export NEED_TO_USE_VOCABULARY_WHEN_SORT
    2.96 +    export LANGUAGE
    2.97      perl $PERL_SCRIPT_TEMP_NAME
    2.98      rm $PERL_SCRIPT_TEMP_NAME
    2.99  }
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/part.pl	Sun May 16 18:20:18 2010 +0300
     3.3 @@ -0,0 +1,39 @@
     3.4 +#!/usr/bin/perl
     3.5 +
     3.6 +my @lines=<STDIN>;
     3.7 +my $lines=$#lines;
     3.8 +my $interval=$ARGV[0];
     3.9 +my ($start,$stop,$total);
    3.10 +if ($interval =~ m@(.*)/(.*)@) {
    3.11 +    $start = $1;
    3.12 +    $total = $2;
    3.13 +}
    3.14 +else {
    3.15 +    $start=$interval;
    3.16 +    $total=0;
    3.17 +}
    3.18 +if ($start =~ m@(.*)-(.*)@) {
    3.19 +    $start = $1;
    3.20 +    $stop = $2;
    3.21 +}
    3.22 +if ($start =~ m@(.*)\+(.*)@) {
    3.23 +    $start = $1;
    3.24 +    #if ($2 =~ /%$/) {
    3.25 +    #}
    3.26 +    #else {
    3.27 +    $stop = $start+$2;
    3.28 +    #}
    3.29 +}
    3.30 +#print "start = $start\n";
    3.31 +#print "stop = $stop\n";
    3.32 +#print "total = $total\n";
    3.33 +
    3.34 +$start=int($lines/$total*$start);
    3.35 +$stop=int($lines/$total*$stop);
    3.36 +
    3.37 +#print "start = $start\n";
    3.38 +#print "stop = $stop\n";
    3.39 +#print "total = $total\n";
    3.40 +for($i=$start;$i<$stop;$i++){
    3.41 +    print $lines[$i];
    3.42 +}