new-words

diff new-words.sh @ 12:4bbe553c1ee2

Удаление суффиксов в английских словах.

+ функция similar для поиска похожих слов
(на основе модуля String::Similarity);
она уже присутствует в коде, но пока не используется.
author Igor Chubin <igor@chub.in>
date Sat Apr 03 00:27:00 2010 +0300 (2010-04-03)
parents 34d0332f238c
children 975b549364f2
line diff
     1.1 --- a/new-words.sh	Fri Apr 02 19:46:44 2010 +0300
     1.2 +++ b/new-words.sh	Sat Apr 03 00:27:00 2010 +0300
     1.3 @@ -103,16 +103,48 @@
     1.4      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
     1.5  #!/usr/bin/perl
     1.6  
     1.7 +eval {
     1.8 +# http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
     1.9 +    require String::Similarity;
    1.10 +    String::Similarity->import();
    1.11 +};
    1.12 +unless($@)
    1.13 +{
    1.14 +    our $HAVE_String_Similarity=1;
    1.15 +}
    1.16 +
    1.17 +sub similar($$){
    1.18 +    my $a=shift;
    1.19 +    my $b=shift;
    1.20 +    if ($HAVE_String_Similarity) {
    1.21 +        return $Similarity{"$a $b"};
    1.22 +    } 
    1.23 +    else {
    1.24 +        return 0;
    1.25 +    }
    1.26 +}
    1.27 +
    1.28  sub normalize($)
    1.29  {
    1.30      $_=lc(shift);
    1.31 -    s///;
    1.32 +
    1.33      s/s$//;
    1.34      s/ed$//;
    1.35 +    s/en$//;
    1.36 +    s/er$//;
    1.37 +    s/est$//;
    1.38      s/ing$//;
    1.39 +
    1.40 +    s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
    1.41 +    s/ation$//;  s/ness$//; s/ally$//;
    1.42 +    s/ment$//; s/ify$//; s/ity$//; s/fy$//; s/ly$//;
    1.43 +    s/ise$//; s/ize$//;
    1.44 +
    1.45 +    s/e$//;
    1.46      return $_;
    1.47  }
    1.48  
    1.49 +
    1.50  sub compare($$)
    1.51  {
    1.52      my $a=shift;
    1.53 @@ -123,6 +155,7 @@
    1.54      my ($b1, $b2)= split /\s+/,$b,2;
    1.55  
    1.56      my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
    1.57 +
    1.58      if ($cmp) {
    1.59          return $cmp;
    1.60      }