# HG changeset patch
# User Igor Chubin
# Date 1288984066 -3600
# Node ID 753fb84437aa0b09fee9831149d0148161982b50
# Parent  48ca8248e9cc157b89a9757ca8333c25a94bfe41
link between words support; link is created using @

diff -r 48ca8248e9cc -r 753fb84437aa new-words.sh
--- a/new-words.sh	Tue Aug 17 21:35:57 2010 +0200
+++ b/new-words.sh	Fri Nov 05 20:07:46 2010 +0100
@@ -257,6 +257,7 @@
 
 use Encode;
 use utf8;
+use Lingua::Stem::Snowball qw(stem);
 
 eval {
     # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
@@ -268,6 +269,22 @@
     our $HAVE_String_Similarity=1;
 }
 
+# Read NOTES_FILE into a hash: first whitespace-delimited word => rest of the line.
+sub load_notes_dict()
+{
+    my %dict;
+    if (open(NOTES, '<', $ENV{NOTES_FILE})) {
+        while(<NOTES>) {
+            $_ = decode( "utf8", $_);
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            $dict{$a}=$b;
+        }
+    }
+    return %dict;
+}
+
 sub similar($$){
     my $a=shift;
     my $b=shift;
@@ -279,7 +296,8 @@
     }
 }
 
-sub normalize($)
+# Language-specific normalization only; @-links from the notes are not applied here.
+sub normalize_without_linked($)
 {
     if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
     elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
@@ -288,6 +306,23 @@
     else { return shift ; }
 }
 
+sub normalize_with_linked($)
+{
+    my $word = normalize_without_linked(shift);
+    # If the notes link this (normalized) word to another one, return the link target.
+    if ($linked_words{$word}) {
+        return $linked_words{$word};
+    }
+    else {
+        return $word;
+    }
+}
+
+sub normalize($)    # entry point: normalization that also follows @-links
+{
+    return normalize_with_linked(shift);
+}
+
 sub normalize_ukrainian($)
 {
     $_=lc(shift);
@@ -309,6 +344,12 @@
 
 sub normalize_german($)
 {
+    my @stems = stem('de', \@_);    # Snowball stemmer, German rules
+    return $stems[0];
+}
+
+sub normalize_german_($)    # former hand-written suffix stripper, now under a new name
+{
     $_=lc(shift);
 
     s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
@@ -379,6 +420,35 @@
     }
 }
 
+sub log_($)
+{
+    return;    # logging is switched off; without this line messages are appended to /tmp/log1
+    open(LOG, ">>", "/tmp/log1");
+    print LOG $_[0];
+    close(LOG);
+}
+
+sub find_linked_words($)
+{
+    my %linked_words;
+    my $dict = shift;
+    log_("1");
+    log_(join(" ", keys(%$dict)));
+    # A note containing "@word" links its key to that word; both sides are stored normalized.
+    for my $key (keys(%$dict)) {
+        my $val = $dict->{$key};
+        log_($key."\n");
+        if (my ($target) = $val =~ /\@([a-z]*)/) {
+            $linked_words{normalize($key)} = normalize($target);
+            log_(normalize($key)." = ".normalize($target)."\n");
+        }
+    }
+    return %linked_words;
+}
+
+our %dict = load_notes_dict();
+our %linked_words = find_linked_words(\%dict);
+
 our %Vocabulary;
 
 open(VOC, $ENV{VOCABULARY}) or die "Can't open VOCABULARY";
@@ -415,6 +485,8 @@
     export VOCABULARY
     export NEED_TO_USE_VOCABULARY_WHEN_SORT
     export LANGUAGE
+    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
+    export NOTES_FILE
     perl $PERL_SCRIPT_TEMP_NAME
     rm $PERL_SCRIPT_TEMP_NAME
 }
@@ -430,17 +502,24 @@
     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
 use Encode;
 
+sub load_notes_dict()    # NOTES_FILE => hash of first word => rest of the line
+{
+    my %dict;
+    if (open(NOTES, '<', $ENV{NOTES_FILE})) {
+        while(<NOTES>) {
+            $_ = decode( "utf8", $_);
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            $dict{$a}=$b;
+        }
+    }
+    return %dict;
+}
+
+%dict = load_notes_dict();
+
 $file = $ARGV[0];
-our $dict;
-if (open(NOTES, $ENV{NOTES_FILE})) {
-    while(<NOTES>) {
-        $_ = decode( "utf8", $_);
-        chomp;
-        s/^\s+//;
-        my ($a,$b)=split /\s+/,$_,2;
-        $dict{$a}=$b;
-    }
-}
 if (open(F, $file)) {
     @lines=<F>;
     close(F);