new-words

view new-words.sh @ 26:4a10c0f4510c

apostrophe support, some improvements in speed, two and three words combination support
author Igor Chubin <igor@chub.in>
date Fri May 21 01:02:21 2010 +0300 (2010-05-21)
parents d1eb7dc37feb
children 0a80b2fa3ed8
line source
1 #!/bin/bash
#######################################
# Print the command-line help.
# Arguments: none
# Outputs:   the help text on stderr (nothing on stdout)
#######################################
show_usage() {
  # The quoted delimiter keeps the text literal, and >&2 also works where
  # /dev/stderr cannot be reopened.
  cat >&2 <<'HELP'

USAGE:

    new-words [ -l lang ] [ -s ] [ -k ] [ -m ] [ -n ] [ -p pages ] [ -2 ] [ -3 ] [ ARG ]

SWITCHES:

    -h          print this screen
    -k          put higher words that are similar to the known words (only for English)
    -l lang     override language settings
    -n          non-interactive mode (don't run vi)
    -m          don't add marks (and don't save marks added by user)
    -p pages    work with specified pages only (pages = start-stop/total )
    -s          show the text statistics (percentage of known words and so on) and exit
    -2 -3       find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

    de-words URL
    new-words -l de URL

HELP
}
# Handle -h up front, before any scratch file is created.
if [ "$1" = "-h" ]; then
  show_usage
  exit 0
fi

WORK_DIR=~/.new-words/
TEMP1=$(mktemp /tmp/new-words-XXXXXXXXXX-temp1) || exit 1
TEMP2=$(mktemp /tmp/new-words-XXXXXXXXXX-temp2) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-XXXXXXXXXX-orig) || exit 1
export ORIGINAL_TEXT
# Remove the scratch files on every exit path (usage errors, Ctrl-C, ...),
# including the "-full" word list and the "-full2" copy made from it.
trap 'rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "${TEMP1}-full2" "$ORIGINAL_TEXT"' EXIT
editor=${EDITOR:-vim}

# language detection
#
# Precedence (lowest to highest): default "en", language code taken from a
# Wikipedia URL argument, name the program was called by (de-words -> de),
# and finally the -l switch handled below.
LANGUAGE=en
my_name=${0##*/}        # strip the directory part
my_name=${my_name%%-*}  # keep what precedes the first '-'
wikipedia_re='https?://([a-z]{2,3})\.wikipedia\.org/wiki/'
for arg; do
  if [[ "$arg" =~ $wikipedia_re ]]; then
    LANGUAGE=${BASH_REMATCH[1]}
  fi
done
[ "$my_name" = "new" ] || LANGUAGE="$my_name"

#----------------------------------------------------
# command line options processing

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
while getopts l:skmnp:23 opt; do
  case "$opt" in
    s) STAT_ONLY=YES ;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
    l) LANGUAGE="$OPTARG" ;;
    m) DONT_ADD_MARKS=YES ;;
    n) NON_INTERACTIVE_MODE=YES ;;
    p) PART_TO_PROCESS="$OPTARG" ;;
    2) GROUP_WORDS_BY_TWO=YES ;;
    3) GROUP_WORDS_BY_THREE=YES ;;
    \?) # unknown flag
      show_usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

# A "-l lang" pair that getopts did not consume (e.g. after "--") is still
# honoured.
if [ "$1" = "-l" ]; then
  LANGUAGE="$2"
  shift 2
fi

# Both files are looked up relative to the working directory in use when
# they are opened.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
94 #----------------------------------------------------
#######################################
# Turn the text on stdin into a frequency list of unknown words.
# Arguments: $1 - file that receives every accepted word (one per line),
#                 before known words are filtered out
# Outputs:   "COUNT WORD" lines on stdout, highest count first
#######################################
get_words() {
  local full_list=$1

  # Apostrophes travel through the Perl filter as the placeholder
  # __APOSTROPHE__ and are restored right after it.
  # NOTE(review): the Perl step replaces punctuation with blanks and then
  # prints only lines made of letters, apostrophes, '_' and '-', so a token
  # that carried punctuation appears to be dropped rather than split --
  # confirm this is intended before touching the expression.
  tr ' ' '\n' \
    | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
    | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/' \
    | sed "s/__APOSTROPHE__/'/g" \
    | tr ' ' '\n' \
    | tee "$full_list" \
    | grep_v_english_perl \
    | sort \
    | uniq -c \
    | awk '$2 != ""' \
    | sort -rn
}
108 add_stat()
109 {
110 before="$1"
111 after=${before}2
112 cat > "$after"
113 total="`wc -w $1 | awk '{print $1}'`"
114 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
115 total_known="`echo $total-$total_unknown|bc`"
116 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
117 #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
118 sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
121 if [ "$STAT_ONLY" = "YES" ]
122 then
123 echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
124 echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "
125 rm $after
126 return 0
127 else
128 echo "# $LANGUAGE, $percentage, <$total_known/$total>"
129 fi
131 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
132 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
133 my $total=shift(@ARGV);
134 my $total_known=shift(@ARGV);
135 my $s=0;
136 my $mark_line=int($total_known*100/$total/5)*5;
137 if ($mark_line>=90) {
138 $mark_line=int($total_known*100/$total)+1;
139 } else { $mark_line +=5; };
140 while(<>)
141 {
142 print;
143 /^\s*([0-9]*)\s*/;
144 $s+=$1;
145 if (($total_known+$s)*100/$total>=$mark_line) {
146 print "# $mark_line\n";
147 if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
148 }
149 }
150 PERL_SCRIPT
151 perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
152 rm $PERL_SCRIPT_TEMP_NAME
153 rm $after
154 }
156 two_and_three_words()
157 {
158 if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
159 then
160 cat
161 else
162 cat
164 export GROUP_WORDS_BY_THREE
165 export GROUP_WORDS_BY_TWO
166 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
167 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
168 #!/usr/bin/perl
169 local $/;
170 $words=<>;
171 $words=~ s@[!?;,:#1-9".]@ @g;
172 $words =~ s@\s+@ @g;
173 @words = split /\s+/, $words;
174 for ($i=0; $i<$#words-3;$i++) {
175 my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
176 if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
177 print "${a}_${b}_${c}\n";
178 };
179 if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
180 print "${a}_${b}\n";
181 };
182 }
183 PERL_SCRIPT
184 perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
185 rm $PERL_SCRIPT_TEMP_NAME
186 fi
187 }
#######################################
# Drop every stdin line that equals a word of the vocabulary file.
# Globals:   VOCABULARY (read; the file is created when missing)
# Inputs:    one word per line on stdin
# Outputs:   the lines that are not vocabulary words
# Notes:     Apostrophes are removed from the vocabulary words before the
#            comparison. The words are handed to grep as fixed strings, so
#            nothing read from the file is ever evaluated by the shell.
#######################################
grep_v_english() {
  [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
  # One word per pattern line; empty lines would match empty input lines.
  grep -Fxv -f <(
    tr -d "'" < "$VOCABULARY" \
      | tr -s '[:space:]' '\n' \
      | sed '/^$/d'
  )
}
195 grep_v_english_perl()
196 {
197 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
198 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
199 open(VOC, $ENV{VOCABULARY})
200 or die "Can't open VOCABULARY";
201 while (<VOC>){
202 chomp;
203 #s/'//g;
204 $voc{$_}="1";
205 }
206 while(<>) {
207 chomp;
208 if (not defined($voc{$_})) { print "$_\n"; }
209 }
210 PERL_SCRIPT
211 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
212 export VOCABULARY
213 perl $PERL_SCRIPT_TEMP_NAME
214 rm $PERL_SCRIPT_TEMP_NAME
215 }
217 group_words()
218 {
219 #if [ "$LANGUAGE" != "en" ]
220 #then
221 # cat
222 # return
223 #fi
224 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
225 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
226 #!/usr/bin/perl
228 eval {
229 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
230 require String::Similarity;
231 String::Similarity->import();
232 };
233 unless($@)
234 {
235 our $HAVE_String_Similarity=1;
236 }
238 sub similar($$){
239 my $a=shift;
240 my $b=shift;
241 if ($HAVE_String_Similarity) {
242 return $Similarity{"$a $b"};
243 }
244 else {
245 return 0;
246 }
247 }
249 sub normalize($)
250 {
251 if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
252 elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
253 else { return shift ; }
254 }
256 sub normalize_german($)
257 {
258 $_=lc(shift);
260 s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
261 s/en$//; s/er$//;
263 s/lich$//; s/ig$//;
264 s/al$//; s/isch$//;
265 s/ell$//; s/haft$//;
267 s/bar$//; s/sam$//; s/lich$//;
269 @prefixes=qw(
270 ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
271 be emp ent er ge miss ver zer durch über um unter wieder);
272 @prefixes=();
273 for $pref (@prefixes) {
274 s/^$pref//;
275 }
278 return $_;
279 }
281 sub normalize_english($)
282 {
283 $_=lc(shift);
285 s/s$//;
287 s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;
289 s/ed$//;
290 s/en$//;
291 s/er$//;
292 s/est$//;
293 s/ing$//;
295 s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
296 s/ify$//; s/fy$//; s/ly$//;
297 s/ise$//; s/ize$//;
299 s/e$//;
300 return $_;
301 }
304 sub compare($$)
305 {
306 my $a=shift;
307 my $b=shift;
308 $a =~ s/^\s*//;
309 $b =~ s/^\s*//;
310 my ($a1, $a2)= split /\s+/,$a,2;
311 my ($b1, $b2)= split /\s+/,$b,2;
313 my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
315 if ($cmp) {
316 return $cmp;
317 }
318 else {
319 if (normalize($a2) ne normalize($b2)) {
320 return normalize($a2) cmp normalize($b2);
321 }
322 else {
323 return $a1 <=> $b1;
324 }
325 }
326 }
328 our %Vocabulary;
329 open(VOC, $ENV{VOCABULARY})
330 or die "Can't open VOCABULARY";
331 while (<VOC>){
332 chomp;
333 #s/'//g;
334 $Vocabulary{normalize($_)}="1";
335 }
336 close(VOC);
338 @lines=<STDIN>;
339 for $L (@lines) {
340 chomp($L);
341 $l=$L;
342 $l =~ s/^\s*//;
343 my ($a, $b)=split(/\s+/,$l,2);
344 $group_weight{normalize($b)}+=$a;
345 }
346 if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
347 for $k (keys %group_weight) {
348 if (defined($Vocabulary{$k})) {
349 $group_weight{$k} *= 2;
350 }
351 }
352 }
353 @lines2 = sort { compare($b,$a) } @lines;
354 for $l (@lines2) {
355 print "$l\n";
356 }
357 PERL_SCRIPT
358 export VOCABULARY
359 export NEED_TO_USE_VOCABULARY_WHEN_SORT
360 export LANGUAGE
361 perl $PERL_SCRIPT_TEMP_NAME
362 rm $PERL_SCRIPT_TEMP_NAME
363 }
365 text_from_url()
366 {
367 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
368 }
370 add_marks()
371 {
372 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
373 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
374 $file = $ARGV[0];
375 our $dict;
376 if (open(NOTES, $ENV{NOTES_FILE})) {
377 while(<NOTES>) {
378 chomp;
379 s/^\s+//;
380 my ($a,$b)=split /\s+/,$_,2;
381 $dict{$a}=$b;
382 }
383 }
384 if (open(F, $file)) {
385 @lines=<F>;
386 close(F);
388 if (open(F, ">$file")) {
389 for (@lines) {
390 m/\s+\S+\s+(\S+)/;
391 $name=$1;
392 if (not /^#/ and defined($dict{$name})) {
393 chomp;
394 $mark=$dict{$name};
395 $space=" "x(30-length($_));
396 print F "$_$space$mark\n";
397 }
398 else {
399 print F "$_";
400 }
401 }
402 close(F);
403 }
404 }
405 PERL_SCRIPT
406 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
407 export NOTES_FILE
408 perl $PERL_SCRIPT_TEMP_NAME "$1"
409 rm $PERL_SCRIPT_TEMP_NAME
410 }
412 remove_marks()
413 {
414 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
415 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
416 $file = $ARGV[0];
417 our %dict;
418 if (open(F, $file)) {
419 @lines=<F>;
420 close(F);
422 if (open(F, ">$file")) {
423 for (@lines) {
424 chomp;
425 if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
426 my $name=$4;
427 my $comment=$6;
428 $dict{$name}=$comment;
429 print F "$1$2$3$4\n";
430 }
431 else {
432 print F "$_\n";
433 }
434 }
435 }
436 }
437 if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
438 @lines=<NOTES>;
439 close(NOTES);
441 if (open(NOTES, ">".$ENV{NOTES_FILE})) {
442 for (@lines) {
443 chomp;
444 s/^\s+//;
445 my ($a,$b)=split /\s+/,$_,2;
446 if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
447 print NOTES "$_\n";
448 if (defined($dict{$a})) { unset($dict{$a}); }
449 }
450 }
451 for (keys %dict) {
452 $mark=$dict{$_};
453 $space=" "x(30-length($_));
454 print NOTES "$_$space$mark\n";
455 }
456 }
457 }
458 PERL_SCRIPT
459 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
460 export NOTES_FILE
461 export DONT_ADD_MARKS
462 perl $PERL_SCRIPT_TEMP_NAME "$1"
463 rm $PERL_SCRIPT_TEMP_NAME
464 }
466 part()
467 {
468 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
469 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
470 #!/usr/bin/perl
472 my @lines=<STDIN>;
473 my $lines=$#lines;
474 my $interval=$ARGV[0];
475 if (not $interval) {
476 print @lines;
477 }
478 else {
479 my ($start,$stop,$total);
480 if ($interval =~ m@(.*)/(.*)@) {
481 $start = $1;
482 $total = $2;
483 }
484 else {
485 $start=$interval;
486 $total=0;
487 }
488 if ($start =~ m@(.*)-(.*)@) {
489 $start = $1;
490 $stop = $2;
491 }
492 if ($start =~ m@(.*)\+(.*)@) {
493 $start = $1;
494 $stop = $start+$2;
495 }
497 $start=int($lines/$total*$start);
498 $stop=int($lines/$total*$stop);
500 for($i=$start;$i<$stop;$i++){
501 print $lines[$i];
502 }
503 }
504 PERL_SCRIPT
505 perl $PERL_SCRIPT_TEMP_NAME "$1"
506 rm $PERL_SCRIPT_TEMP_NAME
507 }
# Vocabulary and notes live in the work directory; relative input paths are
# resolved against the directory the program was started from.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Input: a URL, a file, or stdin. Output of the pipeline: the list of
# unknown words in $TEMP1 and an identical working copy in $TEMP2.
{
  if [[ "$1" == *http:* || "$1" == *https:* ]]; then
    text_from_url "$1"
  elif [ "$#" -ne 0 ]; then
    case "$1" in
      /*) cat -- "$1" ;;
      *) cat -- "$oldpwd/$1" ;;
    esac
  else
    cat
  fi
} \
  | part "$PART_TO_PROCESS" \
  | tee "$ORIGINAL_TEXT" \
  | two_and_three_words \
  | get_words "${TEMP1}-full" \
  | group_words \
  | add_stat "${TEMP1}-full" \
  | tee "$TEMP1" > "$TEMP2"

if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]; then
  cat "$TEMP1"
else
  [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
  if [ "$editor" = vim ]; then
    # stdin may be the text itself, so the editor talks to the terminal.
    vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
  else
    # shellcheck disable=SC2086 -- EDITOR may carry arguments
    $editor "$TEMP2"
  fi
  remove_marks "$TEMP2"
fi

# Whatever differs between the generated list and the edited copy is taken
# as known from now on. Only the word column of real entries is stored:
# diff's own "3d2" / "---" lines and "# ..." comment lines are skipped.
diff "$TEMP1" "$TEMP2" \
  | awk '/^[<>]/ && $2 != "#" && $3 != "" { print $3 }' \
  | sort -u >> "$VOCABULARY"
rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"