new-words

changeset 32:753fb84437aa

Support for links between words; a link is created using @
author Igor Chubin <igor@chub.in>
date Fri Nov 05 20:07:46 2010 +0100 (2010-11-05)
parents 48ca8248e9cc
children 720a701b2ba9
files new-words.sh
line diff
     1.1 --- a/new-words.sh	Tue Aug 17 21:35:57 2010 +0200
     1.2 +++ b/new-words.sh	Fri Nov 05 20:07:46 2010 +0100
     1.3 @@ -257,6 +257,7 @@
     1.4  
     1.5  use Encode;
     1.6  use utf8;
     1.7 +use Lingua::Stem::Snowball qw(stem);
     1.8  
     1.9  eval {
    1.10  # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
    1.11 @@ -268,6 +269,22 @@
    1.12      our $HAVE_String_Similarity=1;
    1.13  }
    1.14  
    1.15 +
    # Load the notes file into a hash mapping word => note text.
    #
    # Reads the file named by $ENV{NOTES_FILE}. Each line is decoded as UTF-8,
    # stripped of its trailing newline and leading whitespace, and split at the
    # first run of whitespace into a word (the key) and the rest of the line
    # (the note, undef when the line holds only a word). Blank lines are
    # skipped. A later line for the same word overwrites an earlier one.
    #
    # Returns the hash as a flat key/value list. The list is empty when
    # NOTES_FILE is unset or the file cannot be opened: notes are optional,
    # so failure to read them is deliberately not fatal.
    sub load_notes_dict()
    {
        my %dict;
        my $path = $ENV{NOTES_FILE};
        return %dict unless defined $path && length $path;

        # Three-argument open with a lexical handle: the file name can never be
        # interpreted as an open mode, and the handle cannot collide with a
        # bareword handle used elsewhere in the script.
        open(my $notes, '<', $path) or return %dict;

        # A named line variable keeps the caller's $_ intact.
        while (my $line = <$notes>) {
            $line = decode("utf8", $line);
            chomp $line;
            $line =~ s/^\s+//;

            # An empty line would otherwise produce an entry with an undefined
            # key (stored as '') and "uninitialized value" warnings.
            next if $line eq '';

            my ($word, $note) = split /\s+/, $line, 2;
            $dict{$word} = $note;
        }
        close($notes);

        return %dict;
    }
    1.30 +
    1.31  sub similar($$){
    1.32      my $a=shift;
    1.33      my $b=shift;
    1.34 @@ -279,7 +296,8 @@
    1.35      }
    1.36  }
    1.37  
    1.38 -sub normalize($)
    1.39 +
    1.40 +sub normalize_without_linked($)
    1.41  {
    1.42      if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    1.43      elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    1.44 @@ -288,6 +306,23 @@
    1.45      else { return shift ; }
    1.46  }
    1.47  
    # Normalize a word and, when that normalized form has an entry in the
    # file-level %linked_words table, return the linked word instead.
    #
    # Takes one word. The word is first reduced by normalize_without_linked();
    # the result is then looked up in %linked_words. A true value found there
    # replaces the normalized form; otherwise the normalized form is returned.
    sub normalize_with_linked($)
    {
        my ($raw) = @_;
        my $normalized = normalize_without_linked($raw);
        my $target = $linked_words{$normalized};
        return $target ? $target : $normalized;
    }
    1.59 +
    # Public normalization entry point: takes one word and returns whatever
    # normalize_with_linked() produces for it.
    sub normalize($)
    {
        my ($word) = @_;
        return normalize_with_linked($word);
    }
    1.64 +
    1.65  sub normalize_ukrainian($)
    1.66  {
    1.67      $_=lc(shift);
    1.68 @@ -309,6 +344,12 @@
    1.69  
    # Reduce a German word to its stem using the Snowball stemmer
    # (stem() is imported from Lingua::Stem::Snowball at the top of the script).
    #
    # Takes one word and returns the first (only) stem produced for it.
    sub normalize_german($)
    {
        # Lexical array: assigning to an undeclared @stems created a package
        # global that outlived the call and would not compile under "use strict".
        my @stems = stem('de', \@_);
        return $stems[0];
    }
    1.75 +
    1.76 +sub normalize_german_($)
    1.77 +{
    1.78      $_=lc(shift);
    1.79  
    1.80      s/heit$//;  s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//; 
    1.81 @@ -379,6 +420,35 @@
    1.82      }
    1.83  }
    1.84  
    # Append a debug message to a log file, if debug logging is switched on.
    #
    # Logging is off by default: the sub is a no-op unless the environment
    # variable NEW_WORDS_LOG_FILE names the file to append to. (Previously the
    # sub returned unconditionally and the code writing to /tmp/log1 was
    # unreachable, so enabling it meant editing the script.)
    #
    # Takes the text to write (no newline is added). Returns nothing. Failure
    # to open the log file is ignored on purpose: debugging output must never
    # break word processing.
    sub log_($)
    {
        my ($message) = @_;

        my $path = $ENV{NEW_WORDS_LOG_FILE};
        return unless defined $path && length $path;

        # Messages may contain decoded (wide) characters, so encode on output.
        open(my $log, '>>:encoding(UTF-8)', $path) or return;
        print {$log} $message;
        close($log);

        return;
    }
    1.92 +
    # Build the table of word links declared in the notes dictionary.
    #
    # Takes a reference to a hash of word => note text. A note declares a link
    # by containing "@" immediately followed by the target word, e.g. a note
    # "@house" on the word "houses". For every such note the normalized form of
    # the word is mapped to the normalized form of the target.
    #
    # Returns the link table as a flat key/value list
    # (normalized word => normalized target).
    sub find_linked_words($)
    {
        my ($dict) = @_;
        my %linked_words;

        log_("1");
        log_(join(" ", keys(%$dict)));

        # Lexical loop variables: the bare $key/$val used before were package
        # globals shared with the rest of the script.
        for my $key (keys(%$dict)) {
            my $val = $dict->{$key};
            log_($key."\n");

            # Words without a note have an undefined value.
            next unless defined $val;

            # Require at least one letter after "@" and accept any Unicode
            # letter. The former [a-z]* matched an empty target for a lone "@"
            # and stopped at the first uppercase or non-ASCII letter, which
            # truncated or emptied link targets.
            next unless $val =~ /\@(\p{L}+)/;

            # Copy the capture at once; $1 should not be relied on across calls.
            my $target = $1;

            my $from = normalize($key);
            my $to   = normalize($target);
            $linked_words{$from} = $to;
            log_($from." = ".$to."\n");
        }

        return %linked_words;
    }
   1.110 +
   1.111 +our %dict = load_notes_dict();
   1.112 +our %linked_words = find_linked_words(\%dict);
   1.113 +
   1.114  our %Vocabulary;
   1.115  open(VOC, $ENV{VOCABULARY})
   1.116   or die "Can't open VOCABULARY";
   1.117 @@ -415,6 +485,8 @@
   1.118      export VOCABULARY
   1.119      export NEED_TO_USE_VOCABULARY_WHEN_SORT
   1.120      export LANGUAGE
   1.121 +    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
   1.122 +    export NOTES_FILE
   1.123      perl $PERL_SCRIPT_TEMP_NAME
   1.124      rm $PERL_SCRIPT_TEMP_NAME
   1.125  }
   1.126 @@ -430,17 +502,24 @@
   1.127      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   1.128  use Encode;
   1.129  
   # Load the notes file into a hash mapping word => note text.
   #
   # Reads the file named by $ENV{NOTES_FILE}. Each line is decoded as UTF-8,
   # stripped of its trailing newline and leading whitespace, and split at the
   # first run of whitespace into a word (the key) and the rest of the line
   # (the note, undef when the line holds only a word). Blank lines are
   # skipped. A later line for the same word overwrites an earlier one.
   #
   # Returns the hash as a flat key/value list. The list is empty when
   # NOTES_FILE is unset or the file cannot be opened: notes are optional,
   # so failure to read them is deliberately not fatal.
   sub load_notes_dict()
   {
       my %dict;
       my $path = $ENV{NOTES_FILE};
       return %dict unless defined $path && length $path;

       # Three-argument open with a lexical handle: the file name can never be
       # interpreted as an open mode, and the handle cannot collide with a
       # bareword handle used elsewhere in the script.
       open(my $notes, '<', $path) or return %dict;

       # A named line variable keeps the caller's $_ intact.
       while (my $line = <$notes>) {
           $line = decode("utf8", $line);
           chomp $line;
           $line =~ s/^\s+//;

           # An empty line would otherwise produce an entry with an undefined
           # key (stored as '') and "uninitialized value" warnings.
           next if $line eq '';

           my ($word, $note) = split /\s+/, $line, 2;
           $dict{$word} = $note;
       }
       close($notes);

       return %dict;
   }
   1.144 +
   1.145 +%dict = load_notes_dict();
   1.146 +
   1.147  $file = $ARGV[0];
   1.148 -our $dict;
   1.149 -if (open(NOTES, $ENV{NOTES_FILE})) {
   1.150 -    while(<NOTES>) {
   1.151 -        $_ = decode( "utf8", $_);
   1.152 -        chomp;
   1.153 -        s/^\s+//;
   1.154 -        my ($a,$b)=split /\s+/,$_,2;
   1.155 -        $dict{$a}=$b;
   1.156 -    }
   1.157 -}
   1.158  if (open(F, $file)) {
   1.159      @lines=<F>;
   1.160      close(F);