new-words

diff new-words.sh @ 22:46e987f4636d

part.pl script + german normalization support
author Igor Chubin <igor@chub.in>
date Sun May 16 18:20:18 2010 +0300 (2010-05-16)
parents 190d4ac6b07c
children 4b9d13c78de2
line diff
     1.1 --- a/new-words.sh	Thu May 06 21:05:20 2010 +0300
     1.2 +++ b/new-words.sh	Sun May 16 18:20:18 2010 +0300
     1.3 @@ -44,10 +44,13 @@
     1.4  
     1.5  LANGUAGE=en
     1.6  my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
     1.7 -if echo "$1" | grep -q http://...wikipedia.org/wiki/
     1.8 -then
     1.9 -LANGUAGE="`echo $1 | sed s@http://@@ | sed s@.wikipedia.*@@`"
    1.10 -fi
    1.11 +for arg
    1.12 +do
    1.13 +    if echo "$arg" | grep -q http://...wikipedia.org/wiki/
    1.14 +    then
    1.15 +    LANGUAGE="`echo $arg | sed s@http://@@ | sed s@.wikipedia.*@@`"
    1.16 +    fi
    1.17 +done
    1.18  [ "${my_name}" = "new" ] || LANGUAGE="$my_name"
    1.19  if [ "$1" = "-l" ]
    1.20  then
    1.21 @@ -179,11 +182,11 @@
    1.22  
    1.23  group_words()
    1.24  {
    1.25 -    if [ "$LANGUAGE" != "en" ]
    1.26 -    then
    1.27 -        cat 
    1.28 -        return
    1.29 -    fi
    1.30 +    #if [ "$LANGUAGE" != "en" ]
    1.31 +    #then
    1.32 +    #    cat 
    1.33 +    #    return
    1.34 +    #fi
    1.35      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
    1.36      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.37  #!/usr/bin/perl
    1.38 @@ -211,9 +214,43 @@
    1.39  
    1.40  sub normalize($)
    1.41  {
    1.42 +    if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    1.43 +    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    1.44 +    else { return shift ; }
    1.45 +}
    1.46 +
    1.47 +sub normalize_german($)
    1.48 +{
    1.49 +    $_=lc(shift);
    1.50 +
    1.51 +    s/heit$//;  s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//; 
    1.52 +    s/en$//; s/er$//;
    1.53 +
    1.54 +    s/lich$//; s/ig$//;
    1.55 +    s/al$//; s/isch$//;
    1.56 +    s/ell$//; s/haft$//;
    1.57 +
    1.58 +    s/bar$//; s/sam$//; s/lich$//;
    1.59 +
    1.60 +    @prefixes=qw(
    1.61 +        ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
    1.62 +        be emp ent er ge miss ver zer durch über um unter wieder);
    1.63 +    for $pref (@prefixes) {
    1.64 +        s/^$pref//;
    1.65 +    }
    1.66 +
    1.67 +
    1.68 +    return $_;
    1.69 +}
    1.70 +
    1.71 +sub normalize_english($)
    1.72 +{
    1.73      $_=lc(shift);
    1.74  
    1.75      s/s$//;
    1.76 +
    1.77 +    s/ation$//;  s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//; 
    1.78 +
    1.79      s/ed$//;
    1.80      s/en$//;
    1.81      s/er$//;
    1.82 @@ -221,8 +258,7 @@
    1.83      s/ing$//;
    1.84  
    1.85      s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
    1.86 -    s/ation$//;  s/ness$//; s/ship$//; s/ally$//;
    1.87 -    s/ment$//; s/ify$//; s/ity$//; s/fy$//; s/ly$//;
    1.88 +    s/ify$//; s/fy$//; s/ly$//;
    1.89      s/ise$//; s/ize$//;
    1.90  
    1.91      s/e$//;
    1.92 @@ -286,6 +322,7 @@
    1.93  PERL_SCRIPT
    1.94      export VOCABULARY
    1.95      export NEED_TO_USE_VOCABULARY_WHEN_SORT
    1.96 +    export LANGUAGE
    1.97      perl $PERL_SCRIPT_TEMP_NAME
    1.98      rm $PERL_SCRIPT_TEMP_NAME
    1.99  }