new-words
diff new-words.sh @ 32:753fb84437aa
Add support for links between words; a link is created using @
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri Nov 05 20:07:46 2010 +0100 (2010-11-05) |
parents | 48ca8248e9cc |
children | 720a701b2ba9 |
line diff
1.1 --- a/new-words.sh Tue Aug 17 21:35:57 2010 +0200 1.2 +++ b/new-words.sh Fri Nov 05 20:07:46 2010 +0100 1.3 @@ -257,6 +257,7 @@ 1.4 1.5 use Encode; 1.6 use utf8; 1.7 +use Lingua::Stem::Snowball qw(stem); 1.8 1.9 eval { 1.10 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it 1.11 @@ -268,6 +269,22 @@ 1.12 our $HAVE_String_Similarity=1; 1.13 } 1.14 1.15 + 1.16 +sub load_notes_dict() 1.17 +{ 1.18 + my %dict; 1.19 + if (open(NOTES, $ENV{NOTES_FILE})) { 1.20 + while(<NOTES>) { 1.21 + $_ = decode( "utf8", $_); 1.22 + chomp; 1.23 + s/^\s+//; 1.24 + my ($a,$b)=split /\s+/,$_,2; 1.25 + $dict{$a}=$b; 1.26 + } 1.27 + } 1.28 + return %dict; 1.29 +} 1.30 + 1.31 sub similar($$){ 1.32 my $a=shift; 1.33 my $b=shift; 1.34 @@ -279,7 +296,8 @@ 1.35 } 1.36 } 1.37 1.38 -sub normalize($) 1.39 + 1.40 +sub normalize_without_linked($) 1.41 { 1.42 if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); } 1.43 elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); } 1.44 @@ -288,6 +306,23 @@ 1.45 else { return shift ; } 1.46 } 1.47 1.48 +sub normalize_with_linked($) 1.49 +{ 1.50 + my $word = normalize_without_linked(shift); 1.51 + #return $word; 1.52 + if ($linked_words{$word}) { 1.53 + return $linked_words{$word}; 1.54 + } 1.55 + else { 1.56 + return $word; 1.57 + } 1.58 +} 1.59 + 1.60 +sub normalize($) 1.61 +{ 1.62 + return normalize_with_linked(shift); 1.63 +} 1.64 + 1.65 sub normalize_ukrainian($) 1.66 { 1.67 $_=lc(shift); 1.68 @@ -309,6 +344,12 @@ 1.69 1.70 sub normalize_german($) 1.71 { 1.72 + @stems = stem('de', \@_); 1.73 + return $stems[0]; 1.74 +} 1.75 + 1.76 +sub normalize_german_($) 1.77 +{ 1.78 $_=lc(shift); 1.79 1.80 s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//; 1.81 @@ -379,6 +420,35 @@ 1.82 } 1.83 } 1.84 1.85 +sub log_($) 1.86 +{ 1.87 + return; 1.88 + open(LOG, ">>", "/tmp/log1"); 1.89 + print LOG $_[0]; 1.90 + close(LOG); 1.91 +} 1.92 + 1.93 +sub find_linked_words($) 1.94 +{ 
1.95 + my %linked_words; 1.96 + my $dict = shift; 1.97 + log_("1"); 1.98 + log_(join(" ", keys(%$dict))); 1.99 + 1.100 + for $key (keys(%$dict)) { 1.101 + $val = $dict->{$key}; 1.102 + log_($key."\n"); 1.103 + if ($val =~ /\@([a-z]*)/) { 1.104 + $linked_words{normalize($key)} = normalize($1); 1.105 + log_(normalize($key)." = ".normalize($1)."\n"); 1.106 + } 1.107 + } 1.108 + return %linked_words; 1.109 +} 1.110 + 1.111 +our %dict = load_notes_dict(); 1.112 +our %linked_words = find_linked_words(\%dict); 1.113 + 1.114 our %Vocabulary; 1.115 open(VOC, $ENV{VOCABULARY}) 1.116 or die "Can't open VOCABULARY"; 1.117 @@ -415,6 +485,8 @@ 1.118 export VOCABULARY 1.119 export NEED_TO_USE_VOCABULARY_WHEN_SORT 1.120 export LANGUAGE 1.121 + [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE" 1.122 + export NOTES_FILE 1.123 perl $PERL_SCRIPT_TEMP_NAME 1.124 rm $PERL_SCRIPT_TEMP_NAME 1.125 } 1.126 @@ -430,17 +502,24 @@ 1.127 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 1.128 use Encode; 1.129 1.130 +sub load_notes_dict() 1.131 +{ 1.132 + my %dict; 1.133 + if (open(NOTES, $ENV{NOTES_FILE})) { 1.134 + while(<NOTES>) { 1.135 + $_ = decode( "utf8", $_); 1.136 + chomp; 1.137 + s/^\s+//; 1.138 + my ($a,$b)=split /\s+/,$_,2; 1.139 + $dict{$a}=$b; 1.140 + } 1.141 + } 1.142 + return %dict; 1.143 +} 1.144 + 1.145 +%dict = load_notes_dict(); 1.146 + 1.147 $file = $ARGV[0]; 1.148 -our $dict; 1.149 -if (open(NOTES, $ENV{NOTES_FILE})) { 1.150 - while(<NOTES>) { 1.151 - $_ = decode( "utf8", $_); 1.152 - chomp; 1.153 - s/^\s+//; 1.154 - my ($a,$b)=split /\s+/,$_,2; 1.155 - $dict{$a}=$b; 1.156 - } 1.157 -} 1.158 if (open(F, $file)) { 1.159 @lines=<F>; 1.160 close(F);