new-words
changeset 40:c3a50c0d2400
Functions for adding/removing notes + statistics now implemented in Python.
Option -O (old-style) is not supported anymore. If you need old-style new-words use new-words.sh
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Jan 23 17:09:44 2011 +0100 (2011-01-23) |
parents | a598e0d25784 |
children | 4629e08b0d87 |
files | new-words-py.sh new-words.py |
line diff
1.1 --- a/new-words-py.sh Sun Jan 23 14:25:52 2011 +0100 1.2 +++ b/new-words-py.sh Sun Jan 23 17:09:44 2011 +0100 1.3 @@ -82,11 +82,9 @@ 1.4 FILTER_WORDS=YES 1.5 SHOW_VOC_STAT=NO 1.6 COMPRESSED_WORDLIST=NO 1.7 -OLD_STYLE="NO" 1.8 -while getopts Ocl:sSkanNp:t:Tm:Mr:23 opt 1.9 +while getopts cl:sSkanNp:t:Tm:Mr:23 opt 1.10 do 1.11 case "$opt" in 1.12 - O) OLD_STYLE=YES;; 1.13 c) COMPRESSED_WORDLIST=YES;; 1.14 s) STAT_ONLY=YES;; 1.15 S) SHOW_VOC_STAT=YES;; 1.16 @@ -125,87 +123,6 @@ 1.17 exit 0 1.18 fi 1.19 1.20 -#---------------------------------------------------- 1.21 - 1.22 -get_words() 1.23 -{ 1.24 - if [ "$OLD_STYLE" = NO ] 1.25 - then 1.26 - $NEW_WORDS_PY -l "$LANGUAGE" -f get_words "$1" 1.27 - else 1.28 - get_words_OLD "$@" 1.29 - fi 1.30 -} 1.31 - 1.32 -get_words_OLD() 1.33 -{ 1.34 - export FILTER_WORDS 1.35 -tr ' ' '\n' | sed 's/--/ /g' \ 1.36 -| sed "s/'/__APOSTROPHE__/g" \ 1.37 -| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\ 1.38 -| sed "s/__APOSTROPHE__/'/g" \ 1.39 -| tr ' ' '\n' \ 1.40 -| tee "$1" \ 1.41 -| grep_v_english_perl \ 1.42 -| sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn 1.43 -} 1.44 - 1.45 -add_stat() 1.46 -{ 1.47 - if [ "$DONT_ADD_MARKLINES" = "YES" ] 1.48 - then 1.49 - cat 1.50 - return 1.51 - fi 1.52 - before="$1" 1.53 - after=${before}2 1.54 - cat > "$after" 1.55 - total="`wc -w $1 | awk '{print $1}'`" 1.56 - total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`" 1.57 - total_known="`echo $total-$total_unknown|bc`" 1.58 - percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`" 1.59 - #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`" 1.60 - sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`" 1.61 - 1.62 - 1.63 - if [ "$STAT_ONLY" 
= "YES" ] 1.64 - then 1.65 - echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10" 1.66 - echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` " 1.67 - rm $after 1.68 - return 0 1.69 - else 1.70 - groups="`echo $(grep '# groups' $after | awk '{print $3}')`" 1.71 - words="`echo $(grep -v '^#' $after | wc -l)`" 1.72 - echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>" 1.73 - fi 1.74 - 1.75 - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` 1.76 - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 1.77 -my $total=shift(@ARGV); 1.78 -my $total_known=shift(@ARGV); 1.79 -my $s=0; 1.80 -my $mark_line=int($total_known*100/$total/5)*5; 1.81 -if ($mark_line>=90) { 1.82 - $mark_line=int($total_known*100/$total)+1; 1.83 -} else { $mark_line +=5; }; 1.84 -while(<>) 1.85 -{ 1.86 - next if /^#\s*groups\s*/; 1.87 - print; 1.88 - /^\s*([0-9]*)\s*/; 1.89 - $s+=$1; 1.90 - if (($total_known+$s)*100/$total>=$mark_line) { 1.91 - print "# $mark_line\n"; 1.92 - if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; }; 1.93 - } 1.94 -} 1.95 -PERL_SCRIPT 1.96 - perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after" 1.97 - rm $PERL_SCRIPT_TEMP_NAME 1.98 - rm $after 1.99 -} 1.100 - 1.101 two_and_three_words() 1.102 { 1.103 if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ] 1.104 @@ -239,338 +156,6 @@ 1.105 fi 1.106 } 1.107 1.108 -grep_v_english() 1.109 -{ 1.110 -[ -e "$VOCABULARY" ] || touch "$VOCABULARY" 1.111 -eval $(cat $VOCABULARY | tr -d "'" | xargs -n10 echo | tr ' ' '|' | sed 's/^/egrep -xv "RRRRRRR|/' | sed 's/$/"/' | tr '\n' '|')cat 1.112 -} 1.113 - 1.114 -grep_v_english_perl() 1.115 -{ 1.116 - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` 1.117 - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 1.118 - if ($ENV{FILTER_WORDS} eq "NO") { 1.119 - while(<>) { print; } 1.120 - exit(0); 1.121 - } 1.122 
-$voc_files=$ENV{VOC_FILES}; 1.123 -$voc_files=~s@^ @@; 1.124 -for $voc_file (split /\s+/,$voc_files) { 1.125 - if (open(VOC, $voc_file)) { 1.126 - while (<VOC>){ 1.127 - chomp; 1.128 - #s/'//g; 1.129 - $voc{$_}="1"; 1.130 - } 1.131 - } 1.132 -} 1.133 -while(<>) { 1.134 - chomp; 1.135 - if (not defined($voc{$_})) { print "$_\n"; } 1.136 -} 1.137 -PERL_SCRIPT 1.138 - [ -e "$VOCABULARY" ] || touch "$VOCABULARY" 1.139 - export VOCABULARY VOC_FILES 1.140 - VOC_FILES=$VOCABULARY 1.141 - for i in $TAG_NAME 1.142 - do 1.143 - VOC_FILES="${VOC_FILES} `tag_file_name $i`" 1.144 - done 1.145 - perl $PERL_SCRIPT_TEMP_NAME 1.146 - rm $PERL_SCRIPT_TEMP_NAME 1.147 -} 1.148 - 1.149 -group_words() 1.150 -{ 1.151 - if [ "$OLD_STYLE" = NO ] 1.152 - then 1.153 - $NEW_WORDS_PY -l "$LANGUAGE" -f group_words "$1" 1.154 - else 1.155 - group_words_OLD "$@" 1.156 - fi 1.157 -} 1.158 - 1.159 -group_words_OLD() 1.160 -{ 1.161 - #if [ "$LANGUAGE" != "en" ] 1.162 - #then 1.163 - # cat 1.164 - # return 1.165 - #fi 1.166 - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX` 1.167 - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 1.168 -#!/usr/bin/perl 1.169 - 1.170 -use Encode; 1.171 -use utf8; 1.172 -use Lingua::Stem::Snowball qw(stem); 1.173 - 1.174 -eval { 1.175 -# http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it 1.176 - require String::Similarity; 1.177 - String::Similarity->import(); 1.178 -}; 1.179 -unless($@) 1.180 -{ 1.181 - our $HAVE_String_Similarity=1; 1.182 -} 1.183 - 1.184 - 1.185 -sub load_notes_dict() 1.186 -{ 1.187 - my %dict; 1.188 - if (open(NOTES, $ENV{NOTES_FILE})) { 1.189 - while(<NOTES>) { 1.190 - $_ = decode( "utf8", $_); 1.191 - chomp; 1.192 - s/^\s+//; 1.193 - my ($a,$b)=split /\s+/,$_,2; 1.194 - $dict{$a}=$b; 1.195 - } 1.196 - } 1.197 - return %dict; 1.198 -} 1.199 - 1.200 -sub similar($$){ 1.201 - my $a=shift; 1.202 - my $b=shift; 1.203 - if ($HAVE_String_Similarity) { 1.204 - return $Similarity{"$a $b"}; 1.205 - } 
1.206 - else { 1.207 - return 0; 1.208 - } 1.209 -} 1.210 - 1.211 - 1.212 -sub normalize_without_linked($) 1.213 -{ 1.214 - if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); } 1.215 - elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); } 1.216 - elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); } 1.217 - elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); } 1.218 - else { return shift ; } 1.219 -} 1.220 - 1.221 -sub normalize_with_linked($) 1.222 -{ 1.223 - my $word = normalize_without_linked(shift); 1.224 - #return $word; 1.225 - if ($linked_words{$word}) { 1.226 - return $linked_words{$word}; 1.227 - } 1.228 - else { 1.229 - return $word; 1.230 - } 1.231 -} 1.232 - 1.233 -sub normalize($) 1.234 -{ 1.235 - return normalize_with_linked(shift); 1.236 -} 1.237 - 1.238 -sub normalize_ukrainian($) 1.239 -{ 1.240 - $_=lc(shift); 1.241 - s/[юіоеуаи]$//g; 1.242 - return $_; 1.243 -} 1.244 - 1.245 -sub normalize_esperanto($) 1.246 -{ 1.247 - $_=lc(shift); 1.248 -# verbs 1.249 - s/i$//; s/is$//; s/os$//; s/as$//; s/us$//; 1.250 - 1.251 -# nouns 1.252 - s/j?n?$//; 1.253 - 1.254 - return $_; 1.255 -} 1.256 - 1.257 -sub normalize_german($) 1.258 -{ 1.259 - @stems = stem('de', \@_); 1.260 - return $stems[0]; 1.261 -} 1.262 - 1.263 -sub normalize_german_($) 1.264 -{ 1.265 - $_=lc(shift); 1.266 - 1.267 - s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//; 1.268 - s/en$//; s/er$//; 1.269 - 1.270 - s/lich$//; s/ig$//; 1.271 - s/al$//; s/isch$//; 1.272 - s/ell$//; s/haft$//; 1.273 - 1.274 - s/bar$//; s/sam$//; s/lich$//; 1.275 - 1.276 - @prefixes=qw( 1.277 - ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu 1.278 - be emp ent er ge miss ver zer durch über um unter wieder); 1.279 - @prefixes=(); 1.280 - for $pref (@prefixes) { 1.281 - s/^$pref//; 1.282 - } 1.283 - 1.284 - 1.285 - return $_; 1.286 -} 1.287 - 1.288 -sub 
normalize_english($) 1.289 -{ 1.290 - $_=lc(shift); 1.291 - 1.292 - s/s$//; 1.293 - 1.294 - s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//; 1.295 - 1.296 - s/ed$//; 1.297 - s/en$//; 1.298 - s/er$//; 1.299 - s/est$//; 1.300 - s/ing$//; 1.301 - 1.302 - s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//; 1.303 - s/ify$//; s/fy$//; s/ly$//; 1.304 - s/ise$//; s/ize$//; 1.305 - 1.306 - s/e$//; 1.307 - return $_; 1.308 -} 1.309 - 1.310 - 1.311 -sub compare($$) 1.312 -{ 1.313 - my $a=shift; 1.314 - my $b=shift; 1.315 - $a =~ s/^\s*//; 1.316 - $b =~ s/^\s*//; 1.317 - my ($a1, $a2)= split /\s+/,$a,2; 1.318 - my ($b1, $b2)= split /\s+/,$b,2; 1.319 - 1.320 - my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)}; 1.321 - 1.322 - if ($cmp) { 1.323 - return $cmp; 1.324 - } 1.325 - else { 1.326 - if (normalize($a2) ne normalize($b2)) { 1.327 - return normalize($a2) cmp normalize($b2); 1.328 - } 1.329 - else { 1.330 - return $a1 <=> $b1; 1.331 - } 1.332 - } 1.333 -} 1.334 - 1.335 -sub log_($) 1.336 -{ 1.337 - return; 1.338 - open(LOG, ">>", "/tmp/log1"); 1.339 - print LOG $_[0]; 1.340 - close(LOG); 1.341 -} 1.342 - 1.343 -sub find_linked_words($) 1.344 -{ 1.345 - my %linked_words; 1.346 - my $dict = shift; 1.347 - log_("1"); 1.348 - log_(join(" ", keys(%$dict))); 1.349 - 1.350 - for $key (keys(%$dict)) { 1.351 - $val = $dict->{$key}; 1.352 - log_($key."\n"); 1.353 - if ($val =~ /\@([a-z]*)/) { 1.354 - $linked_words{normalize($key)} = normalize($1); 1.355 - log_(normalize($key)." 
= ".normalize($1)."\n"); 1.356 - } 1.357 - } 1.358 - return %linked_words; 1.359 -} 1.360 - 1.361 -sub lc_length($) 1.362 -{ 1.363 - my $a= shift; 1.364 - $a =~ s/[a-z]//g; 1.365 - return length($a); 1.366 -} 1.367 - 1.368 -our %dict = load_notes_dict(); 1.369 -our %linked_words = find_linked_words(\%dict); 1.370 - 1.371 -our %Vocabulary; 1.372 -open(VOC, $ENV{VOCABULARY}) 1.373 - or die "Can't open VOCABULARY"; 1.374 -while (<VOC>){ 1.375 - chomp; 1.376 - #s/'//g; 1.377 - $Vocabulary{normalize($_)}="1"; 1.378 -} 1.379 -close(VOC); 1.380 - 1.381 -binmode STDIN,":utf8"; 1.382 -@lines=<STDIN>; 1.383 -for $L (@lines) { 1.384 - chomp($L); 1.385 - #$L = decode( "utf8", $L); 1.386 - $l=$L; 1.387 - $l =~ s/^\s*//; 1.388 - my ($a, $b)=split(/\s+/,$l,2); 1.389 - $group_weight{normalize($b)}+=$a; 1.390 -} 1.391 -if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") { 1.392 - for $k (keys %group_weight) { 1.393 - if (defined($Vocabulary{$k})) { 1.394 - $group_weight{$k} *= 2; 1.395 - } 1.396 - } 1.397 -} 1.398 -@lines2 = sort { compare($b,$a) } @lines; 1.399 -binmode STDOUT, ":utf8"; 1.400 -print "# groups ".scalar(keys(%group_weight))."\n"; 1.401 -if ($ENV{COMPRESSED_WORDLIST} eq "YES") { 1.402 - my $sum = 0; 1.403 - my $min = 9999; 1.404 - for $L (@lines2) { 1.405 - chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2); 1.406 - $group_name = normalize($b); 1.407 - if ($group_name ne $prev_group_name and $prev_group_name ne '' ) { 1.408 - #print (" "x(7-length($sum))),"$sum $main_word\n"; 1.409 - print +(" "x(7-length($sum))),"$sum $main_word\n"; 1.410 - $sum = $a; 1.411 - $min = length($b) + 2*lc_length($b); 1.412 - $main_word = $b; 1.413 - } 1.414 - else { 1.415 - $sum += $a; 1.416 - if ($min > length($b) + 2*lc_length($b)) { 1.417 - $min = length($b) + 2*lc_length($b); 1.418 - $main_word = $b; 1.419 - } 1.420 - } 1.421 - $prev_group_name = $group_name; 1.422 - } 1.423 -} 1.424 -else { 1.425 - for $l (@lines2) { 1.426 - print "$l\n"; 1.427 - } 
1.428 -} 1.429 -PERL_SCRIPT 1.430 - export VOCABULARY 1.431 - export NEED_TO_USE_VOCABULARY_WHEN_SORT 1.432 - export LANGUAGE 1.433 - export COMPRESSED_WORDLIST 1.434 - [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE" 1.435 - export NOTES_FILE 1.436 - perl $PERL_SCRIPT_TEMP_NAME 1.437 - rm $PERL_SCRIPT_TEMP_NAME 1.438 -} 1.439 - 1.440 text_from_url() 1.441 { 1.442 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@' 1.443 @@ -578,130 +163,15 @@ 1.444 1.445 add_marks() 1.446 { 1.447 - if [ "$OLD_STYLE" = NO ] 1.448 - then 1.449 - $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1" 1.450 - else 1.451 - group_words_OLD "$@" 1.452 - fi 1.453 + $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1" 1.454 } 1.455 - 1.456 -add_marks_OLD() 1.457 +remove_marks() 1.458 { 1.459 - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` 1.460 - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 1.461 -use Encode; 1.462 - 1.463 -sub load_notes_dict() 1.464 + $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1" 1.465 +} 1.466 +get_words_group_words_add_stat() 1.467 { 1.468 - my %dict; 1.469 - if (open(NOTES, $ENV{NOTES_FILE})) { 1.470 - while(<NOTES>) { 1.471 - $_ = decode( "utf8", $_); 1.472 - chomp; 1.473 - s/^\s+//; 1.474 - my ($a,$b)=split /\s+/,$_,2; 1.475 - $dict{$a}=$b; 1.476 - } 1.477 - } 1.478 - return %dict; 1.479 -} 1.480 - 1.481 -%dict = load_notes_dict(); 1.482 - 1.483 -$file = $ARGV[0]; 1.484 -if (open(F, $file)) { 1.485 - @lines=<F>; 1.486 - close(F); 1.487 - for (@lines) {$_ = decode( "utf8", $_);}; 1.488 - 1.489 - if (open(F, ">$file")) { 1.490 - binmode F, ":utf8"; 1.491 - for (@lines) { 1.492 - m/\s+\S+\s+(\S+)/; 1.493 - $name=$1; 1.494 - if (not /^#/ and defined($dict{$name})) { 1.495 - chomp; 1.496 - $mark=$dict{$name}; 1.497 - $space=" "x(30-length($_)); 1.498 - print F "$_$space$mark\n"; 1.499 - } 1.500 - else { 1.501 - print F "$_"; 1.502 - } 1.503 - } 1.504 - close(F); 1.505 - } 1.506 -} 1.507 -PERL_SCRIPT 1.508 - [ -e "$NOTES_FILE" ] || touch 
"$NOTES_FILE" 1.509 - export NOTES_FILE 1.510 - perl $PERL_SCRIPT_TEMP_NAME "$1" 1.511 - rm $PERL_SCRIPT_TEMP_NAME 1.512 -} 1.513 - 1.514 -remove_marks() 1.515 -{ 1.516 - if [ "$OLD_STYLE" = NO ] 1.517 - then 1.518 - $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1" 1.519 - else 1.520 - group_words_OLD "$@" 1.521 - fi 1.522 -} 1.523 - 1.524 -remove_marks_OLD() 1.525 -{ 1.526 - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` 1.527 - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 1.528 -$file = $ARGV[0]; 1.529 -our %dict; 1.530 -if (open(F, $file)) { 1.531 - @lines=<F>; 1.532 - close(F); 1.533 - 1.534 - if (open(F, ">$file")) { 1.535 - for (@lines) { 1.536 - chomp; 1.537 - if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) { 1.538 - my $name=$4; 1.539 - my $comment=$6; 1.540 - $dict{$name}=$comment; 1.541 - print F "$1$2$3$4\n"; 1.542 - } 1.543 - else { 1.544 - print F "$_\n"; 1.545 - } 1.546 - } 1.547 - } 1.548 -} 1.549 -if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) { 1.550 - @lines=<NOTES>; 1.551 - close(NOTES); 1.552 - 1.553 - if (open(NOTES, ">".$ENV{NOTES_FILE})) { 1.554 - for (@lines) { 1.555 - chomp; 1.556 - s/^\s+//; 1.557 - my ($a,$b)=split /\s+/,$_,2; 1.558 - if (not defined($dict{$a}) || ($dict{$a} eq $b)) { 1.559 - print NOTES "$_\n"; 1.560 - if (defined($dict{$a})) { unset($dict{$a}); } 1.561 - } 1.562 - } 1.563 - for (keys %dict) { 1.564 - $mark=$dict{$_}; 1.565 - $space=" "x(30-length($_)); 1.566 - print NOTES "$_$space$mark\n"; 1.567 - } 1.568 - } 1.569 -} 1.570 -PERL_SCRIPT 1.571 - [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE" 1.572 - export NOTES_FILE 1.573 - export DONT_ADD_MARKS 1.574 - perl $PERL_SCRIPT_TEMP_NAME "$1" 1.575 - rm $PERL_SCRIPT_TEMP_NAME 1.576 + $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1" 1.577 } 1.578 1.579 part() 1.580 @@ -813,9 +283,7 @@ 1.581 | part $PART_TO_PROCESS \ 1.582 | tee $ORIGINAL_TEXT \ 1.583 | two_and_three_words \ 1.584 - | get_words ${TEMP1}-full \ 1.585 
- | group_words \ 1.586 - | add_stat ${TEMP1}-full \ 1.587 + | get_words_group_words_add_stat \ 1.588 | tee "$TEMP1" > "$TEMP2" 1.589 1.590 if [ "$STAT_ONLY" = "YES" ]
2.1 --- a/new-words.py Sun Jan 23 14:25:52 2011 +0100 2.2 +++ b/new-words.py Sun Jan 23 17:09:44 2011 +0100 2.3 @@ -1,6 +1,7 @@ 2.4 #!/usr/bin/env python 2.5 # -*- coding: utf-8 -*- 2.6 2.7 +from __future__ import with_statement 2.8 import codecs 2.9 import logging 2.10 import os 2.11 @@ -253,10 +254,6 @@ 2.12 f.write(line) 2.13 2.14 2.15 -def print_words_sorted(words_freq): 2.16 - for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True): 2.17 - codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k)) 2.18 - 2.19 def substract_dictionary(dict1, dict2): 2.20 """ 2.21 returns dict1 - dict2 2.22 @@ -275,14 +272,12 @@ 2.23 def error_message(text): 2.24 print text 2.25 2.26 -def find_wordgroups_weights(lines, normalizator): 2.27 +def find_wordgroups_weights(word_pairs, normalizator): 2.28 weight = {} 2.29 - for line in lines: 2.30 - line = re.sub('^\s*', '', line.rstrip('\n')) 2.31 - (num, word) = re.split('\s+', line, maxsplit=1) 2.32 + for (num, word) in word_pairs: 2.33 normalized = normalizator.normalize(word) 2.34 weight.setdefault(normalized, 0) 2.35 - weight[normalized] += int(num) 2.36 + weight[normalized] += num 2.37 return weight 2.38 2.39 def find_linked_words(notes): 2.40 @@ -297,12 +292,9 @@ 2.41 linked_words[word] = main_word 2.42 return linked_words 2.43 2.44 - 2.45 -def compare_word_lines(line1, line2, wgw, normalizator, linked_words): 2.46 - line1 = re.sub('^\s*', '', line1.rstrip('\n')) 2.47 - (num1, word1) = re.split('\s+', line1, 1) 2.48 - line2 = re.sub('^\s*', '', line2.rstrip('\n')) 2.49 - (num2, word2) = re.split('\s+', line2, 1) 2.50 +def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words): 2.51 + (num1, word1) = pair1 2.52 + (num2, word2) = pair2 2.53 2.54 normalized_word1 = normalizator.normalize(word1) 2.55 normalized_word2 = normalizator.normalize(word2) 2.56 @@ -317,25 +309,28 @@ 2.57 else: 2.58 return cmp(int(num1), int(num2)) 2.59 2.60 -def filter_get_words(args): 2.61 - vocabulary = 
load_vocabulary() 2.62 - words = get_words(readlines_from_stdin()) 2.63 - dump_words(words, args[0]) 2.64 - words = substract_dictionary(words, vocabulary) 2.65 - print_words_sorted(words) 2.66 +def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False): 2.67 + if stats_only: 2.68 + codecs.getwriter("utf-8")(sys.stdout).write("stat_only") 2.69 + return 2.70 2.71 -def filter_group_words(args): 2.72 - lines = readlines_from_stdin() 2.73 - notes = load_notes(notes_filenames()) 2.74 - linked_words = find_linked_words(notes) 2.75 - normalizator = Normalizator(config['language'], linked_words) 2.76 + if print_stats: 2.77 + codecs.getwriter("utf-8")(sys.stdout).write( 2.78 + "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats) 2.79 2.80 - wgw = find_wordgroups_weights(lines, normalizator) 2.81 - for line in sorted( 2.82 - lines, 2.83 - cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words), 2.84 - reverse=True): 2.85 - codecs.getwriter("utf-8")(sys.stdout).write(line) 2.86 + level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102) 2.87 + known = int(stats['total_known']) 2.88 + total = int(stats['total']) 2.89 + current_level = 0 2.90 + for word_pair in word_pairs: 2.91 + codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair) 2.92 + known += word_pair[0] 2.93 + if 100.0*known/total >= level_lines[0]: 2.94 + current_level = level_lines[0] 2.95 + while 100.0*known/total > level_lines[0]: 2.96 + current_level = level_lines[0] 2.97 + level_lines = level_lines[1:] 2.98 + codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level) 2.99 2.100 def filter_add_notes(args): 2.101 lines = readlines_from_file(args[0]) 2.102 @@ -353,16 +348,48 @@ 2.103 for line in lines: 2.104 f.write(line) 2.105 2.106 +def filter_get_words_group_words_add_stat(args): 2.107 + vocabulary = load_vocabulary() 2.108 + notes = load_notes(notes_filenames()) 2.109 + lines = 
readlines_from_stdin() 2.110 + words = get_words(lines) 2.111 + 2.112 + stats = {} 2.113 + stats['total'] = sum(words[x] for x in words.keys()) 2.114 + words = substract_dictionary(words, vocabulary) 2.115 + 2.116 + stats['total_unknown'] = sum(words[x] for x in words.keys()) 2.117 + stats['total_known'] = stats['total'] - stats['total_unknown'] 2.118 + stats['percentage'] = "%7.2f"%(100.0*stats['total_known']/stats['total']) 2.119 + stats['groups'] = 0 2.120 + stats['words'] = len(words) 2.121 + stats['sentences'] = 0 #FIXME 2.122 + stats['language'] = config['language'] 2.123 + 2.124 + linked_words = find_linked_words(notes) 2.125 + normalizator = Normalizator(config['language'], linked_words) 2.126 + 2.127 + word_pairs = [] 2.128 + for k in sorted(words.keys(), key=lambda k: words[k], reverse=True): 2.129 + word_pairs.append((words[k], k)) 2.130 + 2.131 + wgw = find_wordgroups_weights(word_pairs, normalizator) 2.132 + word_pairs = sorted( 2.133 + word_pairs, 2.134 + cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words), 2.135 + reverse=True) 2.136 + 2.137 + print_words_sorted(word_pairs, stats) 2.138 + 2.139 (options, args) = parser.parse_args() 2.140 if options.language: 2.141 config['language'] = options.language 2.142 2.143 if options.function: 2.144 function_names = { 2.145 - 'get_words' : filter_get_words, 2.146 - 'group_words' : filter_group_words, 2.147 'add_notes' : filter_add_notes, 2.148 'remove_notes': filter_remove_notes, 2.149 + 'get_words_group_words_add_stat': filter_get_words_group_words_add_stat, 2.150 } 2.151 if options.function in function_names: 2.152 function_names[options.function](args)