# HG changeset patch
# User Igor Chubin
# Date 1295798984 -3600
# Node ID c3a50c0d2400e1a8afe7f6e54d64fdfdf53874da
# Parent  a598e0d25784559ddc8d5b33532712a9bb9a6ed2
Functions for adding/removing notes + statistics now implemented in Python.

Option -O (old-style) is not supported anymore.
If you need old-style new-words, use new-words.sh.

diff -r a598e0d25784 -r c3a50c0d2400 new-words-py.sh
--- a/new-words-py.sh	Sun Jan 23 14:25:52 2011 +0100
+++ b/new-words-py.sh	Sun Jan 23 17:09:44 2011 +0100
@@ -82,11 +82,9 @@
 FILTER_WORDS=YES
 SHOW_VOC_STAT=NO
 COMPRESSED_WORDLIST=NO
-OLD_STYLE="NO"
-while getopts Ocl:sSkanNp:t:Tm:Mr:23 opt
+while getopts cl:sSkanNp:t:Tm:Mr:23 opt
 do
   case "$opt" in
-  O) OLD_STYLE=YES;;
   c) COMPRESSED_WORDLIST=YES;;
   s) STAT_ONLY=YES;;
   S) SHOW_VOC_STAT=YES;;
@@ -125,87 +123,6 @@
     exit 0
 fi
 
-#----------------------------------------------------
-
-get_words()
-{
-    if [ "$OLD_STYLE" = NO ]
-    then
-        $NEW_WORDS_PY -l "$LANGUAGE" -f get_words "$1"
-    else
-        get_words_OLD "$@"
-    fi
-}
-
-get_words_OLD()
-{
-    export FILTER_WORDS
-tr ' ' '\n' | sed 's/--/ /g' \
-| sed "s/'/__APOSTROPHE__/g" \
-| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
-| sed "s/__APOSTROPHE__/'/g" \
-| tr ' ' '\n' \
-| tee "$1" \
-| grep_v_english_perl \
-| sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
-}
-
-add_stat()
-{
-    if [ "$DONT_ADD_MARKLINES" = "YES" ]
-    then
-        cat
-        return
-    fi
-    before="$1"
-    after=${before}2
-    cat > "$after"
-    total="`wc -w $1 | awk '{print $1}'`"
-    total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
-    total_known="`echo $total-$total_unknown|bc`"
-    percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
-    #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
-    sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
-
-
-    if [ "$STAT_ONLY" = "YES" ]
-    then
-        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
-        echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "
-        rm $after
-        return 0
-    else
-        groups="`echo $(grep '# groups' $after | awk '{print $3}')`"
-        words="`echo $(grep -v '^#' $after | wc -l)`"
-        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
-    fi
-
-    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
-    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
-my $total=shift(@ARGV);
-my $total_known=shift(@ARGV);
-my $s=0;
-my $mark_line=int($total_known*100/$total/5)*5;
-if ($mark_line>=90) {
-    $mark_line=int($total_known*100/$total)+1;
-} else { $mark_line +=5; };
-while(<>)
-{
-    next if /^#\s*groups\s*/;
-    print;
-    /^\s*([0-9]*)\s*/;
-    $s+=$1;
-    if (($total_known+$s)*100/$total>=$mark_line) {
-        print "# $mark_line\n";
-        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
-    }
-}
-PERL_SCRIPT
-    perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
-    rm $PERL_SCRIPT_TEMP_NAME
-    rm $after
-}
-
 two_and_three_words()
 {
     if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
     then
         cat
         return
     fi
 }
@@ -239,338 +156,6 @@
     fi
 }
 
-grep_v_english()
-{
-[ -e "$VOCABULARY" ] || touch "$VOCABULARY"
-eval $(cat $VOCABULARY | tr -d "'" | xargs -n10 echo | tr ' ' '|' | sed 's/^/egrep -xv "RRRRRRR|/' | sed 's/$/"/' | tr '\n' '|')cat
-}
-
-grep_v_english_perl()
-{
-    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
-    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
-    if ($ENV{FILTER_WORDS} eq "NO") {
-        while(<>) { print; }
-        exit(0);
-    }
-$voc_files=$ENV{VOC_FILES};
-$voc_files=~s@^ @@;
-for $voc_file (split /\s+/,$voc_files) {
-    if (open(VOC, $voc_file)) {
-        while (<VOC>){
-            chomp;
-            #s/'//g;
-            $voc{$_}="1";
-        }
-    }
-}
-while(<>) {
-    chomp;
-    if (not defined($voc{$_})) { print "$_\n"; }
-}
-PERL_SCRIPT
-    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
-    export VOCABULARY VOC_FILES
-    VOC_FILES=$VOCABULARY
-    for i in $TAG_NAME
-    do
-        VOC_FILES="${VOC_FILES} `tag_file_name $i`"
-    done
-    perl $PERL_SCRIPT_TEMP_NAME
-    rm $PERL_SCRIPT_TEMP_NAME
-}
-
-group_words()
-{
-    if [ "$OLD_STYLE" = NO ]
-    then
-        $NEW_WORDS_PY -l "$LANGUAGE" -f group_words "$1"
-    else
-        group_words_OLD "$@"
-    fi
-}
-
-group_words_OLD()
-{
-    #if [ "$LANGUAGE" != "en" ]
-    #then
-    #    cat
-    #    return
-    #fi
-    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
-    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
-#!/usr/bin/perl
-
-use Encode;
-use utf8;
-use Lingua::Stem::Snowball qw(stem);
-
-eval {
-# http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
-    require String::Similarity;
-    String::Similarity->import();
-};
-unless($@)
-{
-    our $HAVE_String_Similarity=1;
-}
-
-
-sub load_notes_dict()
-{
-    my %dict;
-    if (open(NOTES, $ENV{NOTES_FILE})) {
-        while(<NOTES>) {
-            $_ = decode( "utf8", $_);
-            chomp;
-            s/^\s+//;
-            my ($a,$b)=split /\s+/,$_,2;
-            $dict{$a}=$b;
-        }
-    }
-    return %dict;
-}
-
-sub similar($$){
-    my $a=shift;
-    my $b=shift;
-    if ($HAVE_String_Similarity) {
-        return $Similarity{"$a $b"};
-    }
-    else {
-        return 0;
-    }
-}
-
-
-sub normalize_without_linked($)
-{
-    if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
-    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
-    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
-    elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
-    else { return shift ; }
-}
-
-sub normalize_with_linked($)
-{
-    my $word = normalize_without_linked(shift);
-    #return $word;
-    if ($linked_words{$word}) {
-        return $linked_words{$word};
-    }
-    else {
-        return $word;
-    }
-}
-
-sub normalize($)
-{
-    return normalize_with_linked(shift);
-}
-
-sub normalize_ukrainian($)
-{
-    $_=lc(shift);
-    s/[юіоеуаи]$//g;
-    return $_;
-}
-
-sub normalize_esperanto($)
-{
-    $_=lc(shift);
-# verbs
-    s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;
-
-# nouns
-    s/j?n?$//;
-
-    return $_;
-}
-
-sub normalize_german($)
-{
-    @stems = stem('de', \@_);
-    return $stems[0];
-}
-
-sub normalize_german_($)
-{
-    $_=lc(shift);
-
-    s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
-    s/en$//; s/er$//;
-
-    s/lich$//; s/ig$//;
-    s/al$//; s/isch$//;
-    s/ell$//; s/haft$//;
-
-    s/bar$//; s/sam$//; s/lich$//;
-
-    @prefixes=qw(
-    ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
-    be emp ent er ge miss ver zer durch über um unter wieder);
-    @prefixes=();
-    for $pref (@prefixes) {
-        s/^$pref//;
-    }
-
-
-    return $_;
-}
-
-sub normalize_english($)
-{
-    $_=lc(shift);
-
-    s/s$//;
-
-    s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;
-
-    s/ed$//;
-    s/en$//;
-    s/er$//;
-    s/est$//;
-    s/ing$//;
-
-    s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
-    s/ify$//; s/fy$//; s/ly$//;
-    s/ise$//; s/ize$//;
-
-    s/e$//;
-    return $_;
-}
-
-
-sub compare($$)
-{
-    my $a=shift;
-    my $b=shift;
-    $a =~ s/^\s*//;
-    $b =~ s/^\s*//;
-    my ($a1, $a2)= split /\s+/,$a,2;
-    my ($b1, $b2)= split /\s+/,$b,2;
-
-    my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
-
-    if ($cmp) {
-        return $cmp;
-    }
-    else {
-        if (normalize($a2) ne normalize($b2)) {
-            return normalize($a2) cmp normalize($b2);
-        }
-        else {
-            return $a1 <=> $b1;
-        }
-    }
-}
-
-sub log_($)
-{
-    return;
-    open(LOG, ">>", "/tmp/log1");
-    print LOG $_[0];
-    close(LOG);
-}
-
-sub find_linked_words($)
-{
-    my %linked_words;
-    my $dict = shift;
-    log_("1");
-    log_(join(" ", keys(%$dict)));
-
-    for $key (keys(%$dict)) {
-        $val = $dict->{$key};
-        log_($key."\n");
-        if ($val =~ /\@([a-z]*)/) {
-            $linked_words{normalize($key)} = normalize($1);
-            log_(normalize($key)." = ".normalize($1)."\n");
-        }
-    }
-    return %linked_words;
-}
-
-sub lc_length($)
-{
-    my $a= shift;
-    $a =~ s/[a-z]//g;
-    return length($a);
-}
-
-our %dict = load_notes_dict();
-our %linked_words = find_linked_words(\%dict);
-
-our %Vocabulary;
-open(VOC, $ENV{VOCABULARY})
-    or die "Can't open VOCABULARY";
-while (<VOC>){
-    chomp;
-    #s/'//g;
-    $Vocabulary{normalize($_)}="1";
-}
-close(VOC);
-
-binmode STDIN,":utf8";
-@lines=<STDIN>;
-for $L (@lines) {
-    chomp($L);
-    #$L = decode( "utf8", $L);
-    $l=$L;
-    $l =~ s/^\s*//;
-    my ($a, $b)=split(/\s+/,$l,2);
-    $group_weight{normalize($b)}+=$a;
-}
-if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
-    for $k (keys %group_weight) {
-        if (defined($Vocabulary{$k})) {
-            $group_weight{$k} *= 2;
-        }
-    }
-}
-@lines2 = sort { compare($b,$a) } @lines;
-binmode STDOUT, ":utf8";
-print "# groups ".scalar(keys(%group_weight))."\n";
-if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
-    my $sum = 0;
-    my $min = 9999;
-    for $L (@lines2) {
-        chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
-        $group_name = normalize($b);
-        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
-            #print (" "x(7-length($sum))),"$sum $main_word\n";
-            print +(" "x(7-length($sum))),"$sum $main_word\n";
-            $sum = $a;
-            $min = length($b) + 2*lc_length($b);
-            $main_word = $b;
-        }
-        else {
-            $sum += $a;
-            if ($min > length($b) + 2*lc_length($b)) {
-                $min = length($b) + 2*lc_length($b);
-                $main_word = $b;
-            }
-        }
-        $prev_group_name = $group_name;
-    }
-}
-else {
-    for $l (@lines2) {
-        print "$l\n";
-    }
-}
-PERL_SCRIPT
-    export VOCABULARY
-    export NEED_TO_USE_VOCABULARY_WHEN_SORT
-    export LANGUAGE
-    export COMPRESSED_WORDLIST
-    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
-    export NOTES_FILE
-    perl $PERL_SCRIPT_TEMP_NAME
-    rm $PERL_SCRIPT_TEMP_NAME
-}
-
 text_from_url()
 {
     lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
@@ -578,130 +163,15 @@
 
 add_marks()
 {
-    if [ "$OLD_STYLE" = NO ]
-    then
-        $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1"
-    else
-        group_words_OLD "$@"
-    fi
+    $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1"
 }
 
-add_marks_OLD()
+remove_marks()
 {
-    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
-    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
-use Encode;
-
-sub load_notes_dict()
+    $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
+}
+get_words_group_words_add_stat()
 {
-    my %dict;
-    if (open(NOTES, $ENV{NOTES_FILE})) {
-        while(<NOTES>) {
-            $_ = decode( "utf8", $_);
-            chomp;
-            s/^\s+//;
-            my ($a,$b)=split /\s+/,$_,2;
-            $dict{$a}=$b;
-        }
-    }
-    return %dict;
-}
-
-%dict = load_notes_dict();
-
-$file = $ARGV[0];
-if (open(F, $file)) {
-    @lines=<F>;
-    close(F);
-    for (@lines) {$_ = decode( "utf8", $_);};
-
-    if (open(F, ">$file")) {
-        binmode F, ":utf8";
-        for (@lines) {
-            m/\s+\S+\s+(\S+)/;
-            $name=$1;
-            if (not /^#/ and defined($dict{$name})) {
-                chomp;
-                $mark=$dict{$name};
-                $space=" "x(30-length($_));
-                print F "$_$space$mark\n";
-            }
-            else {
-                print F "$_";
-            }
-        }
-        close(F);
-    }
-}
-PERL_SCRIPT
-    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
-    export NOTES_FILE
-    perl $PERL_SCRIPT_TEMP_NAME "$1"
-    rm $PERL_SCRIPT_TEMP_NAME
-}
-
-remove_marks()
-{
-    if [ "$OLD_STYLE" = NO ]
-    then
-        $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
-    else
-        group_words_OLD "$@"
-    fi
-}
-
-remove_marks_OLD()
-{
-    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
-    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
-$file = $ARGV[0];
-our %dict;
-if (open(F, $file)) {
-    @lines=<F>;
-    close(F);
-
-    if (open(F, ">$file")) {
-        for (@lines) {
-            chomp;
-            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
-                my $name=$4;
-                my $comment=$6;
-                $dict{$name}=$comment;
-                print F "$1$2$3$4\n";
-            }
-            else {
-                print F "$_\n";
-            }
-        }
-    }
-}
-if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
-    @lines=<NOTES>;
-    close(NOTES);
-
-    if (open(NOTES, ">".$ENV{NOTES_FILE})) {
-        for (@lines) {
-            chomp;
-            s/^\s+//;
-            my ($a,$b)=split /\s+/,$_,2;
-            if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
-                print NOTES "$_\n";
-                if (defined($dict{$a})) { unset($dict{$a}); }
-            }
-        }
-        for (keys %dict) {
-            $mark=$dict{$_};
-            $space=" "x(30-length($_));
-            print NOTES "$_$space$mark\n";
-        }
-    }
-}
-PERL_SCRIPT
-    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
-    export NOTES_FILE
-    export DONT_ADD_MARKS
-    perl $PERL_SCRIPT_TEMP_NAME "$1"
-    rm $PERL_SCRIPT_TEMP_NAME
+    $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
 }
 
 part()
@@ -813,9 +283,7 @@
 | part $PART_TO_PROCESS \
 | tee $ORIGINAL_TEXT \
 | two_and_three_words \
-| get_words ${TEMP1}-full \
-| group_words \
-| add_stat ${TEMP1}-full \
+| get_words_group_words_add_stat \
 | tee "$TEMP1" > "$TEMP2"
 
 if [ "$STAT_ONLY" = "YES" ]
diff -r a598e0d25784 -r c3a50c0d2400 new-words.py
--- a/new-words.py	Sun Jan 23 14:25:52 2011 +0100
+++ b/new-words.py	Sun Jan 23 17:09:44 2011 +0100
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from __future__ import with_statement
 import codecs
 import logging
 import os
@@ -253,10 +254,6 @@
         f.write(line)
 
-def print_words_sorted(words_freq):
-    for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
-        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
-
 def substract_dictionary(dict1, dict2):
     """
     returns dict1 - dict2
@@ -275,14 +272,12 @@
 def error_message(text):
     print text
 
-def find_wordgroups_weights(lines, normalizator):
+def find_wordgroups_weights(word_pairs, normalizator):
     weight = {}
-    for line in lines:
-        line = re.sub('^\s*', '', line.rstrip('\n'))
-        (num, word) = re.split('\s+', line, maxsplit=1)
+    for (num, word) in word_pairs:
         normalized = normalizator.normalize(word)
         weight.setdefault(normalized, 0)
-        weight[normalized] += int(num)
+        weight[normalized] += num
     return weight
 
 def find_linked_words(notes):
@@ -297,12 +292,9 @@
             linked_words[word] = main_word
     return linked_words
 
-
-def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
-    line1 = re.sub('^\s*', '', line1.rstrip('\n'))
-    (num1, word1) = re.split('\s+', line1, 1)
-    line2 = re.sub('^\s*', '', line2.rstrip('\n'))
-    (num2, word2) = re.split('\s+', line2, 1)
+def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
+    (num1, word1) = pair1
+    (num2, word2) = pair2
     normalized_word1 = normalizator.normalize(word1)
     normalized_word2 = normalizator.normalize(word2)
 
@@ -317,25 +309,28 @@
     else:
         return cmp(int(num1), int(num2))
 
-def filter_get_words(args):
-    vocabulary = load_vocabulary()
-    words = get_words(readlines_from_stdin())
-    dump_words(words, args[0])
-    words = substract_dictionary(words, vocabulary)
-    print_words_sorted(words)
+def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
+    if stats_only:
+        codecs.getwriter("utf-8")(sys.stdout).write("stat_only")
+        return
 
-def filter_group_words(args):
-    lines = readlines_from_stdin()
-    notes = load_notes(notes_filenames())
-    linked_words = find_linked_words(notes)
-    normalizator = Normalizator(config['language'], linked_words)
+    if print_stats:
+        codecs.getwriter("utf-8")(sys.stdout).write(
+            "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
 
-    wgw = find_wordgroups_weights(lines, normalizator)
-    for line in sorted(
-        lines,
-        cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
-        reverse=True):
-        codecs.getwriter("utf-8")(sys.stdout).write(line)
+    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
+    known = int(stats['total_known'])
+    total = int(stats['total'])
+    current_level = 0
+    for word_pair in word_pairs:
+        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+        known += word_pair[0]
+        if 100.0*known/total >= level_lines[0]:
+            current_level = level_lines[0]
+            while 100.0*known/total > level_lines[0]:
+                current_level = level_lines[0]
+                level_lines = level_lines[1:]
+            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
 
 def filter_add_notes(args):
     lines = readlines_from_file(args[0])
@@ -353,16 +348,48 @@
     for line in lines:
         f.write(line)
 
+def filter_get_words_group_words_add_stat(args):
+    vocabulary = load_vocabulary()
+    notes = load_notes(notes_filenames())
+    lines = readlines_from_stdin()
+    words = get_words(lines)
+
+    stats = {}
+    stats['total'] = sum(words[x] for x in words.keys())
+    words = substract_dictionary(words, vocabulary)
+
+    stats['total_unknown'] = sum(words[x] for x in words.keys())
+    stats['total_known'] = stats['total'] - stats['total_unknown']
+    stats['percentage'] = "%7.2f"%(100.0*stats['total_known']/stats['total'])
+    stats['groups'] = 0
+    stats['words'] = len(words)
+    stats['sentences'] = 0 #FIXME
+    stats['language'] = config['language']
+
+    linked_words = find_linked_words(notes)
+    normalizator = Normalizator(config['language'], linked_words)
+
+    word_pairs = []
+    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
+        word_pairs.append((words[k], k))
+
+    wgw = find_wordgroups_weights(word_pairs, normalizator)
+    word_pairs = sorted(
+        word_pairs,
+        cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
+        reverse=True)
+
+    print_words_sorted(word_pairs, stats)
+
 (options, args) = parser.parse_args()
 
 if options.language:
     config['language'] = options.language
 
 if options.function:
     function_names = {
-        'get_words' : filter_get_words,
-        'group_words' : filter_group_words,
         'add_notes' : filter_add_notes,
         'remove_notes': filter_remove_notes,
+        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
     }
     if options.function in function_names:
        function_names[options.function](args)
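
Editor's note: the new print_words_sorted() above takes over what the removed add_stat heredoc did in shell and Perl: it prints the sorted word list and emits a "# N" marker line each time the running share of known words crosses the next 5% step (1% steps above 90%). The following standalone sketch mirrors that marker logic so it can be read in isolation; it is illustrative only and not part of the changeset. It assumes Python 2, like the patched new-words.py (print statement, cmp= sorting, list-returning range), and demo_level_lines together with its sample word list are invented for this note.

# demo_level_lines.py -- hypothetical sketch, not part of the patch.
import sys

def demo_level_lines(word_pairs, total_known, total):
    # Thresholds as constructed in the patch: 5%-steps up to 90%,
    # then 1%-steps, starting just above the already-known share.
    level_lines = range(int(100.0 * total_known / total) / 5 * 5 + 5, 95, 5) \
                + range(90, 102)
    known = total_known
    for (count, word) in word_pairs:
        sys.stdout.write("%10s %s\n" % (count, word))
        known += count
        # Emit a "# N" marker once the cumulative known percentage
        # reaches the next threshold, skipping any passed in one jump.
        if 100.0 * known / total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0 * known / total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            sys.stdout.write("# %s\n" % current_level)

# Toy data: 60 of 100 running words already known; three unknown words follow.
demo_level_lines([(20, "beispiel"), (15, "wort"), (5, "satz")], 60, 100)

In the output of new-words-py.sh these markers show how far down the frequency-sorted list one has to learn words to reach a given coverage of the text.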