new-words
changeset 38:adbc809d3924
Transition to Python started
new-words-py.sh is a wrapper around the
new-words.py version, which is not finished yet.
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sat Jan 22 23:42:31 2011 +0100 (2011-01-22) |
parents | be6336e98b3c |
children | a598e0d25784 |
files | new-words-py.sh new-words.py |
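
The wrapper keeps the old shell pipeline (part | two_and_three_words | get_words | group_words | add_stat), but unless -O is given it hands the get_words and group_words stages over to the Python rewrite through its new `-f` switch. A minimal sketch of that calling convention — the sample text is invented, the path is the one hard-coded in the wrapper, and the output shape follows print_words_sorted() in the diff below:

```python
# -*- coding: utf-8 -*-
# Drive `new-words.py -f get_words` the way new-words-py.sh does: raw text on
# stdin, a dump-file name as the positional argument, "count word" lines out.
import subprocess

NEW_WORDS_PY = "/home/igor/hg/new-words/new-words.py"  # path used in the wrapper

text = u"ein Hund, zwei Hunde, drei Hunde\n"
p = subprocess.Popen(
    [NEW_WORDS_PY, "-l", "de", "-f", "get_words", "/tmp/words-full.txt"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = p.communicate(text.encode("utf-8"))
print(out)  # e.g. "         2 Hunde\n         1 drei\n..." minus known words
```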
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/new-words-py.sh	Sat Jan 22 23:42:31 2011 +0100
@@ -0,0 +1,825 @@
+#!/bin/bash
+
+show_usage()
+{
+cat <<HELP > /dev/stderr
+
+USAGE:
+
+    new-words [ -l lang ] [ -s ] [ ARG ]
+
+SWITCHES:
+
+    -h          print this screen
+    -c          show compressed wordlist: one word per group
+    -k          rank words similar to the known words higher (English only)
+    -l lang     override language settings
+    -n          non-interactive mode (don't run vi)
+    -N          turn off known words filtering
+    -a          don't add marks (and don't save marks added by user)
+    -p pages    work with specified pages only (pages = start-stop/total)
+    -s          show the text statistics (percentage of known words and so on) and exit
+    -S          show your vocabulary statistics (number of words and word groups)
+    -t tag      tag known words with tag
+    -T          show list of active tags
+    -m tag      merge the words tagged with "tag" into the main vocabulary
+    -M          merge the words tagged with any tag into the main vocabulary
+    -r tag      remove subvocabulary for the "tag"
+    -2 -3       find sequences of 2 and 3 words
+
+The language of the text can also be specified
+by the name of the program new-words (a corresponding link must be created first).
+For example, these calls are equivalent:
+
+    de-words URL
+    new-words -l de URL
+
+HELP
+}
+
+if [ "$1" = "-h" ]
+then
+    show_usage
+    exit 0
+fi
+
+NEW_WORDS_PY=/home/igor/hg/new-words/new-words.py
+WORK_DIR=~/.new-words/
+TEMP1=`mktemp /tmp/new-words-temp1.XXXXXXXXXX`
+TEMP2=`mktemp /tmp/new-words-temp2.XXXXXXXXXX`
+export ORIGINAL_TEXT=`mktemp /tmp/new-words-orig.XXXXXXXXXX`
+editor=${EDITOR:-vim}
+
+# language detection
+
+LANGUAGE=en
+my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
+for arg
+do
+    if echo "$arg" | grep -q http://...wikipedia.org/wiki/
+    then
+        LANGUAGE="`echo $arg | sed s@http://@@ | sed s@.wikipedia.*@@`"
+    fi
+done
+[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
+
+#----------------------------------------------------
+# command line options processing
+
+STAT_ONLY=NO
+NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
+DONT_ADD_MARKS=NO
+NON_INTERACTIVE_MODE=NO
+PART_TO_PROCESS=''
+GROUP_WORDS_BY_THREE=NO
+GROUP_WORDS_BY_TWO=NO
+TAG_NAME=''
+MERGE_THIS_TAGS=''
+TAGS_LIST_ONLY=NO
+MERGE_TAGGED_WORDS=NO
+MERGE_ALL_TAGGED=NO
+DONT_ADD_MARKLINES=NO
+FILTER_WORDS=YES
+SHOW_VOC_STAT=NO
+COMPRESSED_WORDLIST=NO
+OLD_STYLE="NO"
+while getopts Ocl:sSkanNp:t:Tm:Mr:23 opt
+do
+    case "$opt" in
+        O) OLD_STYLE=YES;;
+        c) COMPRESSED_WORDLIST=YES;;
+        s) STAT_ONLY=YES;;
+        S) SHOW_VOC_STAT=YES;;
+        k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
+        l) LANGUAGE="$OPTARG";;
+        a) DONT_ADD_MARKS=YES;;
+        n) NON_INTERACTIVE_MODE=YES;;
+        N) FILTER_WORDS=NO;;
+        p) PART_TO_PROCESS="$OPTARG";;
+        t) TAG_NAME="$OPTARG";;
+        T) TAGS_LIST_ONLY="YES";;
+        m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
+        M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
+        r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
+        2) GROUP_WORDS_BY_TWO=YES;;
+        3) GROUP_WORDS_BY_THREE=YES;;
+        \?) # unknown flag
+            show_usage
+            exit 1;;
+    esac
+done
+shift `expr $OPTIND - 1`
+
+if [ "$1" = "-l" ]
+then
+    LANGUAGE="$2"
+    shift 2
+fi
+
+VOCABULARY=${LANGUAGE}.txt
+NOTES_FILE=notes-${LANGUAGE}.txt
+
+if [ "${SHOW_VOC_STAT}" = "YES" ]
+then
+    $0 -l "${LANGUAGE}" -N -n ${WORK_DIR}/${VOCABULARY} | head -1 | awk '{print $5}' | tr -d "<>"
+    exit 0
+fi
+
+#----------------------------------------------------
+
+get_words()
+{
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f get_words "$1"
+    else
+        get_words_OLD "$@"
+    fi
+}
+
+get_words_OLD()
+{
+    export FILTER_WORDS
+tr ' ' '\n' | sed 's/--/ /g' \
+| sed "s/'/__APOSTROPHE__/g" \
+| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
+| sed "s/__APOSTROPHE__/'/g" \
+| tr ' ' '\n' \
+| tee "$1" \
+| grep_v_english_perl \
+| sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
+}
+
+add_stat()
+{
+    if [ "$DONT_ADD_MARKLINES" = "YES" ]
+    then
+        cat
+        return
+    fi
+    before="$1"
+    after=${before}2
+    cat > "$after"
+    total="`wc -w $1 | awk '{print $1}'`"
+    total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
+    total_known="`echo $total-$total_unknown|bc`"
+    percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
+    #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
+    sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
+
+
+    if [ "$STAT_ONLY" = "YES" ]
+    then
+        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
+        echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "
+        rm $after
+        return 0
+    else
+        groups="`echo $(grep '# groups' $after | awk '{print $3}')`"
+        words="`echo $(grep -v '^#' $after | wc -l)`"
+        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
+    fi
+
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+my $total=shift(@ARGV);
+my $total_known=shift(@ARGV);
+my $s=0;
+my $mark_line=int($total_known*100/$total/5)*5;
+if ($mark_line>=90) {
+    $mark_line=int($total_known*100/$total)+1;
+} else { $mark_line +=5; };
+while(<>)
+{
+    next if /^#\s*groups\s*/;
+    print;
+    /^\s*([0-9]*)\s*/;
+    $s+=$1;
+    if (($total_known+$s)*100/$total>=$mark_line) {
+        print "# $mark_line\n";
+        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
+    }
+}
+PERL_SCRIPT
+    perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
+    rm $PERL_SCRIPT_TEMP_NAME
+    rm $after
+}
+
+two_and_three_words()
+{
+    if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
+    then
+        cat
+    else
+        cat
+
+        export GROUP_WORDS_BY_THREE
+        export GROUP_WORDS_BY_TWO
+        PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
+        cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+#!/usr/bin/perl
+local $/;
+$words=<>;
+$words=~ s@[!?;,:#1-9".]@ @g;
+$words =~ s@\s+@ @g;
+@words = split /\s+/, $words;
+for ($i=0; $i<$#words-3;$i++) {
+    my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
+    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
+        print "${a}_${b}_${c}\n";
+    };
+    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
+        print "${a}_${b}\n";
+    };
+}
+PERL_SCRIPT
+        perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
+        rm $PERL_SCRIPT_TEMP_NAME
+    fi
+}
+
+grep_v_english()
+{
+[ -e "$VOCABULARY" ] || touch "$VOCABULARY"
+eval $(cat $VOCABULARY | tr -d "'" | xargs -n10 echo | tr ' ' '|' | sed 's/^/egrep -xv "RRRRRRR|/' | sed 's/$/"/' | tr '\n' '|')cat
+}
+
+grep_v_english_perl()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+    if ($ENV{FILTER_WORDS} eq "NO") {
+        while(<>) { print; }
+        exit(0);
+    }
+$voc_files=$ENV{VOC_FILES};
+$voc_files=~s@^ @@;
+for $voc_file (split /\s+/,$voc_files) {
+    if (open(VOC, $voc_file)) {
+        while (<VOC>){
+            chomp;
+            #s/'//g;
+            $voc{$_}="1";
+        }
+    }
+}
+while(<>) {
+    chomp;
+    if (not defined($voc{$_})) { print "$_\n"; }
+}
+PERL_SCRIPT
+    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
+    export VOCABULARY VOC_FILES
+    VOC_FILES=$VOCABULARY
+    for i in $TAG_NAME
+    do
+        VOC_FILES="${VOC_FILES} `tag_file_name $i`"
+    done
+    perl $PERL_SCRIPT_TEMP_NAME
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+group_words()
+{
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f group_words "$1"
+    else
+        group_words_OLD "$@"
+    fi
+}
+
+group_words_OLD()
+{
+    #if [ "$LANGUAGE" != "en" ]
+    #then
+    #    cat
+    #    return
+    #fi
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+#!/usr/bin/perl
+
+use Encode;
+use utf8;
+use Lingua::Stem::Snowball qw(stem);
+
+eval {
+# http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
+    require String::Similarity;
+    String::Similarity->import();
+};
+unless($@)
+{
+    our $HAVE_String_Similarity=1;
+}
+
+
+sub load_notes_dict()
+{
+    my %dict;
+    if (open(NOTES, $ENV{NOTES_FILE})) {
+        while(<NOTES>) {
+            $_ = decode( "utf8", $_);
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            $dict{$a}=$b;
+        }
+    }
+    return %dict;
+}
+
+sub similar($$){
+    my $a=shift;
+    my $b=shift;
+    if ($HAVE_String_Similarity) {
+        return similarity($a, $b);
+    }
+    else {
+        return 0;
+    }
+}
+
+
+sub normalize_without_linked($)
+{
+    if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
+    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
+    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
+    elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
+    else { return shift ; }
+}
+
+sub normalize_with_linked($)
+{
+    my $word = normalize_without_linked(shift);
+    #return $word;
+    if ($linked_words{$word}) {
+        return $linked_words{$word};
+    }
+    else {
+        return $word;
+    }
+}
+
+sub normalize($)
+{
+    return normalize_with_linked(shift);
+}
+
+sub normalize_ukrainian($)
+{
+    $_=lc(shift);
+    s/[юіоеуаи]$//g;
+    return $_;
+}
+
+sub normalize_esperanto($)
+{
+    $_=lc(shift);
+# verbs
+    s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;
+
+# nouns
+    s/j?n?$//;
+
+    return $_;
+}
+
+sub normalize_german($)
+{
+    @stems = stem('de', \@_);
+    return $stems[0];
+}
+
+sub normalize_german_($)
+{
+    $_=lc(shift);
+
+    s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
+    s/en$//; s/er$//;
+
+    s/lich$//; s/ig$//;
+    s/al$//; s/isch$//;
+    s/ell$//; s/haft$//;
+
+    s/bar$//; s/sam$//; s/lich$//;
+
+    @prefixes=qw(
+        ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
+        be emp ent er ge miss ver zer durch über um unter wieder);
+    @prefixes=();
+    for $pref (@prefixes) {
+        s/^$pref//;
+    }
+
+
+    return $_;
+}
+
+sub normalize_english($)
+{
+    $_=lc(shift);
+
+    s/s$//;
+
+    s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;
+
+    s/ed$//;
+    s/en$//;
+    s/er$//;
+    s/est$//;
+    s/ing$//;
+
+    s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
+    s/ify$//; s/fy$//; s/ly$//;
+    s/ise$//; s/ize$//;
+
+    s/e$//;
+    return $_;
+}
+
+
+sub compare($$)
+{
+    my $a=shift;
+    my $b=shift;
+    $a =~ s/^\s*//;
+    $b =~ s/^\s*//;
+    my ($a1, $a2)= split /\s+/,$a,2;
+    my ($b1, $b2)= split /\s+/,$b,2;
+
+    my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
+
+    if ($cmp) {
+        return $cmp;
+    }
+    else {
+        if (normalize($a2) ne normalize($b2)) {
+            return normalize($a2) cmp normalize($b2);
+        }
+        else {
+            return $a1 <=> $b1;
+        }
+    }
+}
+
+sub log_($)
+{
+    return;
+    open(LOG, ">>", "/tmp/log1");
+    print LOG $_[0];
+    close(LOG);
+}
+
+sub find_linked_words($)
+{
+    my %linked_words;
+    my $dict = shift;
+    log_("1");
+    log_(join(" ", keys(%$dict)));
+
+    for $key (keys(%$dict)) {
+        $val = $dict->{$key};
+        log_($key."\n");
+        if ($val =~ /\@([a-z]*)/) {
+            $linked_words{normalize($key)} = normalize($1);
+            log_(normalize($key)." = ".normalize($1)."\n");
+        }
+    }
+    return %linked_words;
+}
+
+sub lc_length($)
+{
+    my $a= shift;
+    $a =~ s/[a-z]//g;
+    return length($a);
+}
+
+our %dict = load_notes_dict();
+our %linked_words = find_linked_words(\%dict);
+
+our %Vocabulary;
+open(VOC, $ENV{VOCABULARY})
+    or die "Can't open VOCABULARY";
+while (<VOC>){
+    chomp;
+    #s/'//g;
+    $Vocabulary{normalize($_)}="1";
+}
+close(VOC);
+
+binmode STDIN,":utf8";
+@lines=<STDIN>;
+for $L (@lines) {
+    chomp($L);
+    #$L = decode( "utf8", $L);
+    $l=$L;
+    $l =~ s/^\s*//;
+    my ($a, $b)=split(/\s+/,$l,2);
+    $group_weight{normalize($b)}+=$a;
+}
+if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
+    for $k (keys %group_weight) {
+        if (defined($Vocabulary{$k})) {
+            $group_weight{$k} *= 2;
+        }
+    }
+}
+@lines2 = sort { compare($b,$a) } @lines;
+binmode STDOUT, ":utf8";
+print "# groups ".scalar(keys(%group_weight))."\n";
+if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
+    my $sum = 0;
+    my $min = 9999;
+    for $L (@lines2) {
+        chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
+        $group_name = normalize($b);
+        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
+            #print (" "x(7-length($sum))),"$sum $main_word\n";
+            print +(" "x(7-length($sum))),"$sum $main_word\n";
+            $sum = $a;
+            $min = length($b) + 2*lc_length($b);
+            $main_word = $b;
+        }
+        else {
+            $sum += $a;
+            if ($min > length($b) + 2*lc_length($b)) {
+                $min = length($b) + 2*lc_length($b);
+                $main_word = $b;
+            }
+        }
+        $prev_group_name = $group_name;
+    }
+}
+else {
+    for $l (@lines2) {
+        print "$l\n";
+    }
+}
+PERL_SCRIPT
+    export VOCABULARY
+    export NEED_TO_USE_VOCABULARY_WHEN_SORT
+    export LANGUAGE
+    export COMPRESSED_WORDLIST
+    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
+    export NOTES_FILE
+    perl $PERL_SCRIPT_TEMP_NAME
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+text_from_url()
+{
+lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
+}
+
+add_marks()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+use Encode;
+
+sub load_notes_dict()
+{
+    my %dict;
+    if (open(NOTES, $ENV{NOTES_FILE})) {
+        while(<NOTES>) {
+            $_ = decode( "utf8", $_);
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            $dict{$a}=$b;
+        }
+    }
+    return %dict;
+}
+
+%dict = load_notes_dict();
+
+$file = $ARGV[0];
+if (open(F, $file)) {
+    @lines=<F>;
+    close(F);
+    for (@lines) {$_ = decode( "utf8", $_);};
+
+    if (open(F, ">$file")) {
+        binmode F, ":utf8";
+        for (@lines) {
+            m/\s+\S+\s+(\S+)/;
+            $name=$1;
+            if (not /^#/ and defined($dict{$name})) {
+                chomp;
+                $mark=$dict{$name};
+                $space=" "x(30-length($_));
+                print F "$_$space$mark\n";
+            }
+            else {
+                print F "$_";
+            }
+        }
+        close(F);
+    }
+}
+PERL_SCRIPT
+    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
+    export NOTES_FILE
+    perl $PERL_SCRIPT_TEMP_NAME "$1"
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+remove_marks()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+$file = $ARGV[0];
+our %dict;
+if (open(F, $file)) {
+    @lines=<F>;
+    close(F);
+
+    if (open(F, ">$file")) {
+        for (@lines) {
+            chomp;
+            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
+                my $name=$4;
+                my $comment=$6;
+                $dict{$name}=$comment;
+                print F "$1$2$3$4\n";
+            }
+            else {
+                print F "$_\n";
+            }
+        }
+    }
+}
+if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
+    @lines=<NOTES>;
+    close(NOTES);
+
+    if (open(NOTES, ">".$ENV{NOTES_FILE})) {
+        for (@lines) {
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
+                print NOTES "$_\n";
+                if (defined($dict{$a})) { delete($dict{$a}); }
+            }
+        }
+        for (keys %dict) {
+            $mark=$dict{$_};
+            $space=" "x(30-length($_));
+            print NOTES "$_$space$mark\n";
+        }
+    }
+}
+PERL_SCRIPT
+    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
+    export NOTES_FILE
+    export DONT_ADD_MARKS
+    perl $PERL_SCRIPT_TEMP_NAME "$1"
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+part()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+#!/usr/bin/perl
+
+my @lines=<STDIN>;
+my $lines=$#lines;
+my $interval=$ARGV[0];
+if (not $interval) {
+    print @lines;
+}
+else {
+    my ($start,$stop,$total);
+    if ($interval =~ m@(.*)/(.*)@) {
+        $start = $1;
+        $total = $2;
+    }
+    else {
+        $start=$interval;
+        $total=0;
+    }
+    if ($start =~ m@(.*)-(.*)@) {
+        $start = $1;
+        $stop = $2;
+    }
+    if ($start =~ m@(.*)\+(.*)@) {
+        $start = $1;
+        $stop = $start+$2;
+    }
+
+    $start=int($lines/$total*$start);
+    $stop=int($lines/$total*$stop);
+
+    for($i=$start;$i<$stop;$i++){
+        print $lines[$i];
+    }
+}
+PERL_SCRIPT
+    perl $PERL_SCRIPT_TEMP_NAME "$1"
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+if [ "$TAGS_LIST_ONLY" = "YES" ]
+then
+    cd "${WORK_DIR}"
+    echo ${LANGUAGE}_*.txt | tr ' ' '\n' | grep -v '*' | sed 's/[^_]*_//;s/.txt$//'
+    exit 0
+fi
+
+tag_file_name()
+{
+    echo "${LANGUAGE}_${1}.txt"
+}
+
+if [ "$REMOVE_TAG" = "YES" ]
+then
+    cd "${WORK_DIR}"
+    for i in $TAG_NAME
+    do
+        echo "$i" | grep -q '[/*?]' && continue
+        f="`tag_file_name $i`"
+        if [ -e "$f" ]
+        then
+            rm -f "$f" && echo Tag "'$i'" removed
+        else
+            echo Unknown tag "'$i'"
+        fi
+    done
+    exit 0
+fi
+
+mkdir -p $WORK_DIR
+oldpwd="$PWD"
+cd $WORK_DIR
+if [ "$MERGE_TAGGED_WORDS" = "YES" ]
+then
+    VOC_FILES=''
+    for i in $MERGE_THIS_TAGS
+    do
+        f=`tag_file_name $i`
+        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
+    done
+    if [ -z "$VOC_FILES" ]
+    then
+        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
+    else
+        cat $VOC_FILES
+    fi
+elif [ "$MERGE_ALL_TAGGED" = "YES" ]
+then
+    cat ${LANGUAGE}_*.txt
+elif echo "$1" | grep -q http:
+then
+    text_from_url "$1"
+elif [ "$#" != 0 ]
+then
+    if echo $1 | grep -q ^/
+    then
+        cat "$1"
+    else
+        cat "$oldpwd/$1"
+    fi
+else
+    cat
+fi \
+    | part $PART_TO_PROCESS \
+    | tee $ORIGINAL_TEXT \
+    | two_and_three_words \
+    | get_words ${TEMP1}-full \
+    | group_words \
+    | add_stat ${TEMP1}-full \
+    | tee "$TEMP1" > "$TEMP2"
+
+if [ "$STAT_ONLY" = "YES" ]
+then
+    cat "$TEMP1"
+elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
+then
+    cat "$TEMP1"
+else
+    if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ]
+    then
+        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
+        if [ "$editor" = vim ]
+        then
+            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
+        else
+            $editor "$TEMP2"
+        fi
+        remove_marks "$TEMP2"
+
+        vocabulary="$VOCABULARY"
+        [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`"
+        diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
+    fi
+fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
--- a/new-words.py	Fri Jan 21 15:59:45 2011 +0200
+++ b/new-words.py	Sat Jan 22 23:42:31 2011 +0100
@@ -1,6 +1,39 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
+import codecs
+import logging
+import os
 import optparse
+import re
+import subprocess
+import sys
+import Stemmer
+
+config = {
+    'config_directory': os.environ['HOME'] + '/.new-words',
+    'language': 'en',
+}
+
+logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
+
+class Normalizator:
+    def __init__(self, language, linked_words={}):
+        stemmer_algorithm = {
+            'de' : 'german',
+            'en' : 'english',
+            'ru' : 'russian',
+            'uk' : 'ukrainian',
+        }
+        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
+        self.linked_words = linked_words
+
+    def normalize(self, word):
+        word_chain = []
+        while word in self.linked_words and not word in word_chain:
+            word_chain.append(word)
+            word = self.linked_words[word]
+        return self.stemmer.stemWord(word.lower())
 
 parser = optparse.OptionParser()
 
@@ -29,6 +62,12 @@
     dest="language")
 
 parser.add_option(
+    "-f", "--function",
+    help="filter through subsystem [INTERNAL]",
+    action="store",
+    dest="function")
+
+parser.add_option(
     "-m", "--merge-tag",
     help="merge words tagged with specified tag into the main vocabulary",
     action="store",
@@ -100,31 +139,156 @@
     action="store_true",
     dest="three_words")
 
+def readlines_from_file(filename):
+    res = []
+    with codecs.open(filename, "r", "utf-8") as f:
+        for line in f.readlines():
+            res += [line]
+    return res
+
+def readlines_from_stdin():
+    return codecs.getreader("utf-8")(sys.stdin).readlines()
+
+def words_from_line(line):
+    line = line.rstrip('\n')
+    #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
+    #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
+    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
+
+def get_words(lines):
+    """
+    Returns hash of words in a file
+    word => number
+    """
+    result = {}
+    for line in lines:
+        words = words_from_line(line)
+        for word in words:
+            result.setdefault(word, 0)
+            result[word] += 1
+    return result
+
+def load_vocabulary():
+    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
+
+def notes_filenames():
+    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
+
+def load_notes(files):
+    notes = {}
+    for filename in files:
+        with open(filename) as f:
+            for line in f.readlines():
+                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
+                notes.setdefault(word, {})
+                notes[word][filename] = note
+    return notes
+
+def print_words_sorted(words_freq):
+    for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
+
+def substract_dictionary(dict1, dict2):
+    """
+    returns dict1 - dict2
+    """
+    result = {}
+    for (k,v) in dict1.items():
+        if not k in dict2:
+            result[k] = v
+    return result
+
+def dump_words(words, filename):
+    with codecs.open(filename, "w+", "utf-8") as f:
+        for word in words.keys():
+            f.write(("%s\n"%word)*words[word])
+
+def error_message(text):
+    print text
+
+def find_wordgroups_weights(lines, normalizator):
+    weight = {}
+    for line in lines:
+        line = re.sub('^\s*', '', line.rstrip('\n'))
+        (num, word) = re.split('\s+', line, maxsplit=1)
+        normalized = normalizator.normalize(word)
+        weight.setdefault(normalized, 0)
+        weight[normalized] += int(num)
+    return weight
+
+def find_linked_words(notes):
+    linked_words = {}
+    for word in notes.keys():
+        for note in notes[word].values():
+            if "@" in note:
+                logging.debug("%s %s" % (word, note))
+                result = re.search(r'\@(\S*)', note)
+                if result:
+                    main_word = result.group(1)
+                    logging.debug("%s %s" % (word, main_word))
+                    if main_word:
+                        linked_words[word] = main_word
+    return linked_words
+
+
+def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
+    line1 = re.sub('^\s*', '', line1.rstrip('\n'))
+    (num1, word1) = re.split('\s+', line1, 1)
+    line2 = re.sub('^\s*', '', line2.rstrip('\n'))
+    (num2, word2) = re.split('\s+', line2, 1)
+
+    normalized_word1 = normalizator.normalize(word1)
+    normalized_word2 = normalizator.normalize(word2)
+
+    cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
+    if cmp_res != 0:
+        return cmp_res
+    else:
+        cmp_res = cmp(normalized_word1, normalized_word2)
+        if cmp_res != 0:
+            return cmp_res
+        else:
+            return cmp(int(num1), int(num2))
+
+def filter_get_words(args):
+    vocabulary = load_vocabulary()
+    words = get_words(readlines_from_stdin())
+    dump_words(words, args[0])
+    words = substract_dictionary(words, vocabulary)
+    print_words_sorted(words)
+
+def filter_group_words(args):
+    lines = readlines_from_stdin()
+    notes = load_notes(notes_filenames())
+    linked_words = find_linked_words(notes)
+    logging.debug(linked_words)
+    normalizator = Normalizator(config['language'], linked_words)
+
+    wgw = find_wordgroups_weights(lines, normalizator)
+    for line in sorted(
+            lines,
+            cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
+            reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write(line)
+
 (options, args) = parser.parse_args()
+if options.language:
+    config['language'] = options.language
 
-def get_words():
-    pass
+if options.function:
+    function_names = {
+        'get_words' : filter_get_words,
+        'group_words' : filter_group_words,
+    }
+    if options.function in function_names:
+        function_names[options.function](args)
+    else:
+        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
+            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
+        sys.exit(1)
 
-def add_stat():
-    pass
 
-def two_and_three_words():
-    pass
 
-def grep_v_english():
-    pass
 
-def group_words():
-    pass
+#os.system("vim")
 
-def add_marks():
-    pass
-
-def remove_marks():
-    pass
-
-def text_from_url():
-    pass
-
-def part():
-    pass
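
On the Python side, filter_get_words() is the whole get_words stage: count the words, dump them, subtract the known vocabulary, and print what is left sorted by frequency. Traced on invented sample lines (the known dict stands in for load_vocabulary(); order among equal counts is arbitrary):

```python
lines = [u"the cat sat on the mat\n", u"the dog sat too\n"]
words = get_words(lines)                  # {u'the': 3, u'sat': 2, u'cat': 1, ...}
known = {u'the': 1, u'on': 1, u'too': 1}  # stand-in for load_vocabulary()
print_words_sorted(substract_dictionary(words, known))
# prints, most frequent first:
#          2 sat
#          1 cat
#          1 dog
#          1 mat
```

And Normalizator is the heart of the group_words stage: normalize() first follows the @-links that find_linked_words() harvests from notes-*.txt (the word_chain list guards against cyclic links), then reduces the word to a Snowball stem via PyStemmer, which the script imports as Stemmer. A small usage sketch; the linked words here are made up:

```python
norm = Normalizator('en', linked_words={u'better': u'good', u'best': u'good'})

print(norm.normalize(u'words'))    # u'word'  -- plain stemming
print(norm.normalize(u'wording'))  # u'word'  -- lands in the same group
print(norm.normalize(u'best'))     # u'good'  -- the @-link is applied first
```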
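Grouped lines then get weighted by find_wordgroups_weights() and ordered by compare_word_lines(), so all inflected forms of one stem sort together under the group's total frequency.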