new-words

view new-words.sh @ 29:c631833fa2be

minifixes related to unicode support
author Igor Chubin <igor@chub.in>
date Mon Jun 21 19:46:58 2010 +0300 (2010-06-21)
parents 7db7bbf96fad
children 48ca8248e9cc
line source
1 #!/bin/bash
# Print the command-line help of new-words.
# Arguments: none.
# Outputs:   the help text, on standard error.
show_usage()
{
  # Write through the already-open descriptor (>&2) rather than re-opening
  # /dev/stderr: re-opening the path truncates the target when stderr has
  # been redirected to a regular file.
  cat <<'HELP' >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# Show the help screen and stop when -h is the first argument.
if [ "$1" = "-h" ]; then
  show_usage
  exit 0
fi

# Directory holding the vocabularies, and the scratch files used by one run.
WORK_DIR=~/.new-words/
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# Remove the scratch files on every exit path.  The script has several early
# exits (usage error, tag listing, tag removal) that would otherwise leave
# the files created above behind in /tmp.
cleanup_temp_files()
{
  rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "${TEMP1}-full2" "$ORIGINAL_TEXT"
}
trap cleanup_temp_files EXIT

# language detection

LANGUAGE=en

# Program name without directory and without everything from the first "-":
# "de-words" gives "de", "new-words" gives "new".
my_name=${0##*/}
my_name=${my_name%%-*}

# A Wikipedia URL among the arguments selects the language of its subdomain
# (http://de.wikipedia.org/wiki/... gives "de"); https URLs are accepted too.
for arg; do
  if printf '%s\n' "$arg" | grep -q 'https\{0,1\}://...wikipedia.org/wiki/'; then
    LANGUAGE=$(printf '%s\n' "$arg" | sed 's@https\{0,1\}://@@' | sed 's@.wikipedia.*@@')
  fi
done

# When started through a link such as de-words, the link name wins.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing

# Defaults.  Every flag is initialised here so that a variable of the same
# name inherited from the environment cannot switch a mode on by accident
# (REMOVE_TAG in particular guards a code path that deletes files).
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
REMOVE_TAG=NO

while getopts hl:skanp:t:Tm:Mr:23 opt; do
  case "$opt" in
    h) show_usage
       exit 0 ;;
    s) STAT_ONLY=YES ;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
    l) LANGUAGE="$OPTARG" ;;
    a) DONT_ADD_MARKS=YES ;;
    n) NON_INTERACTIVE_MODE=YES ;;
    p) PART_TO_PROCESS="$OPTARG" ;;
    t) TAG_NAME="$OPTARG" ;;
    T) TAGS_LIST_ONLY="YES" ;;
    # -m and -r may be repeated; the tag names accumulate, space separated.
    m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG" ;;
    M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES" ;;
    r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG" ;;
    2) GROUP_WORDS_BY_TWO=YES ;;
    3) GROUP_WORDS_BY_THREE=YES ;;
    \?) # unknown flag
       show_usage
       exit 1 ;;
  esac
done
shift $((OPTIND - 1))

# A "-l lang" pair that follows the other options is still accepted.
if [ "$1" = "-l" ]; then
  LANGUAGE="$2"
  shift 2
fi

# Per-language file names, relative to the working directory.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

#----------------------------------------------------
# Turn the text on stdin into a frequency list of unknown words.
# Arguments: $1 - file that receives the complete word stream (one word per
#                 line, known words included) before filtering.
# Outputs:   "count word" lines on stdout, highest count first.
get_words()
{
  # Apostrophes are hidden behind a placeholder while the perl filter blanks
  # out punctuation and keeps only lines made of letters, apostrophes,
  # underscores and hyphens; they are restored afterwards.
  tr ' ' '\n' \
    | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
    | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/' \
    | sed "s/__APOSTROPHE__/'/g" \
    | tr ' ' '\n' \
    | tee "$1" \
    | grep_v_english_perl \
    | sort \
    | uniq -c \
    | awk '$2 != ""' \
    | sort -rn
}
# Add statistics to the "count word" list arriving on stdin.
# Globals:   DONT_ADD_MARKLINES, STAT_ONLY, LANGUAGE, ORIGINAL_TEXT (read)
# Arguments: $1 - file with the complete word stream of the text; a scratch
#                 file named "${1}2" is created and removed again.
# Outputs:   - DONT_ADD_MARKLINES=YES: stdin, unchanged;
#            - STAT_ONLY=YES: a header line and one line of figures;
#            - otherwise: a "# lang, known%, <known/total>" line followed by
#              the list with "# N" marker lines inserted where the known
#              share would reach N percent.
# Percentages are truncated to one decimal and always printed with it.
add_stat()
{
  if [ "$DONT_ADD_MARKLINES" = "YES" ]; then
    cat
    return
  fi

  local before after total total_unknown total_known
  local percentage unknown_percentage sentences
  before=$1
  after=${before}2

  # Drain stdin first: only after that is the word file complete.
  cat > "$after"

  total=$(( $(wc -w < "$before") + 0 ))
  # "s + 0" makes an empty list count as 0 instead of an empty string.
  total_unknown=$(awk '{ s += $1 } END { print s + 0 }' "$after")
  total_known=$(( total - total_unknown ))

  # A text without words is reported as 0.0% known instead of dividing by 0.
  percentage=$(awk -v known="$total_known" -v total="$total" \
    'BEGIN { if (total > 0) printf "%.1f", int(1000 * known / total) / 10; else printf "0.0" }')
  unknown_percentage=$(awk -v p="$percentage" 'BEGIN { printf "%.1f", 100 - p }')

  # Sentences are approximated by the number of full stops; a text without
  # any full stop counts as a single sentence.
  sentences=$(( $(tr -cd . < "$ORIGINAL_TEXT" | wc -c) + 0 ))
  [ "$sentences" -gt 0 ] || sentences=1

  if [ "$STAT_ONLY" = "YES" ]; then
    echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
    echo "$LANGUAGE $percentage $unknown_percentage $total_known $total $(( total / sentences )) $(( 10 * total_unknown / sentences )) "
    rm -f -- "$after"
    return 0
  fi

  echo "# $LANGUAGE, $percentage, <$total_known/$total>"

  if [ "$total" -gt 0 ]; then
    # Marker lines come in steps of 5 percent below 90 and of 1 percent from
    # 90 upwards; at most one marker is printed after each list line.
    perl -e '
      my $total = shift(@ARGV);
      my $total_known = shift(@ARGV);
      my $s = 0;
      my $mark_line = int($total_known * 100 / $total / 5) * 5;
      if ($mark_line >= 90) {
        $mark_line = int($total_known * 100 / $total) + 1;
      } else {
        $mark_line += 5;
      }
      while (<>) {
        print;
        /^\s*([0-9]*)\s*/;
        $s += $1;
        if (($total_known + $s) * 100 / $total >= $mark_line) {
          print "# $mark_line\n";
          if ($mark_line >= 90) { $mark_line += 1; } else { $mark_line += 5; }
        }
      }
    ' "$total" "$total_known" "$after"
  else
    cat "$after"
  fi
  rm -f -- "$after"
}
177 two_and_three_words()
178 {
179 if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
180 then
181 cat
182 else
183 cat
185 export GROUP_WORDS_BY_THREE
186 export GROUP_WORDS_BY_TWO
187 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
188 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
189 #!/usr/bin/perl
190 local $/;
191 $words=<>;
192 $words=~ s@[!?;,:#1-9".]@ @g;
193 $words =~ s@\s+@ @g;
194 @words = split /\s+/, $words;
195 for ($i=0; $i<$#words-3;$i++) {
196 my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
197 if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
198 print "${a}_${b}_${c}\n";
199 };
200 if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
201 print "${a}_${b}\n";
202 };
203 }
204 PERL_SCRIPT
205 perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
206 rm $PERL_SCRIPT_TEMP_NAME
207 fi
208 }
# Drop from stdin every line that is a word of the vocabulary.
# Globals:   VOCABULARY (read) - vocabulary file; created empty if missing
# Outputs:   the remaining lines on stdout.
# Returns:   0 unless grep itself fails.
grep_v_english()
{
  [ -e "$VOCABULARY" ] || touch "$VOCABULARY"

  # The vocabulary is handed to grep as a list of fixed strings instead of
  # being pasted into a command line and run through eval: its contents can
  # neither be executed by the shell nor act as regular expressions.
  # As before, apostrophes are removed from the vocabulary words and any
  # white space separates words.
  local status=0
  grep -Fxv -f <(tr -d "'" < "$VOCABULARY" \
                   | awk '{ for (i = 1; i <= NF; i++) print $i }') \
    || status=$?

  # Status 1 only means that no line was left over.
  [ "$status" -le 1 ]
}
216 grep_v_english_perl()
217 {
218 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
219 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
220 $voc_files=$ENV{VOC_FILES};
221 $voc_files=~s@^ @@;
222 for $voc_file (split /\s+/,$voc_files) {
223 if (open(VOC, $voc_file)) {
224 while (<VOC>){
225 chomp;
226 #s/'//g;
227 $voc{$_}="1";
228 }
229 }
230 }
231 while(<>) {
232 chomp;
233 if (not defined($voc{$_})) { print "$_\n"; }
234 }
235 PERL_SCRIPT
236 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
237 export VOCABULARY VOC_FILES
238 VOC_FILES=$VOCABULARY
239 for i in $TAG_NAME
240 do
241 VOC_FILES="${VOC_FILES} `tag_file_name $i`"
242 done
243 perl $PERL_SCRIPT_TEMP_NAME
244 rm $PERL_SCRIPT_TEMP_NAME
245 }
247 group_words()
248 {
249 #if [ "$LANGUAGE" != "en" ]
250 #then
251 # cat
252 # return
253 #fi
254 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
255 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
256 #!/usr/bin/perl
258 use Encode;
259 use utf8;
261 eval {
262 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
263 require String::Similarity;
264 String::Similarity->import();
265 };
266 unless($@)
267 {
268 our $HAVE_String_Similarity=1;
269 }
271 sub similar($$){
272 my $a=shift;
273 my $b=shift;
274 if ($HAVE_String_Similarity) {
275 return $Similarity{"$a $b"};
276 }
277 else {
278 return 0;
279 }
280 }
282 sub normalize($)
283 {
284 if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
285 elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
286 elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
287 else { return shift ; }
288 }
290 sub normalize_ukrainian($)
291 {
292 $_=lc(shift);
293 s/[юіоеуаи]$//g;
294 return $_;
295 }
297 sub normalize_german($)
298 {
299 $_=lc(shift);
301 s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
302 s/en$//; s/er$//;
304 s/lich$//; s/ig$//;
305 s/al$//; s/isch$//;
306 s/ell$//; s/haft$//;
308 s/bar$//; s/sam$//; s/lich$//;
310 @prefixes=qw(
311 ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
312 be emp ent er ge miss ver zer durch über um unter wieder);
313 @prefixes=();
314 for $pref (@prefixes) {
315 s/^$pref//;
316 }
319 return $_;
320 }
322 sub normalize_english($)
323 {
324 $_=lc(shift);
326 s/s$//;
328 s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;
330 s/ed$//;
331 s/en$//;
332 s/er$//;
333 s/est$//;
334 s/ing$//;
336 s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
337 s/ify$//; s/fy$//; s/ly$//;
338 s/ise$//; s/ize$//;
340 s/e$//;
341 return $_;
342 }
345 sub compare($$)
346 {
347 my $a=shift;
348 my $b=shift;
349 $a =~ s/^\s*//;
350 $b =~ s/^\s*//;
351 my ($a1, $a2)= split /\s+/,$a,2;
352 my ($b1, $b2)= split /\s+/,$b,2;
354 my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
356 if ($cmp) {
357 return $cmp;
358 }
359 else {
360 if (normalize($a2) ne normalize($b2)) {
361 return normalize($a2) cmp normalize($b2);
362 }
363 else {
364 return $a1 <=> $b1;
365 }
366 }
367 }
369 our %Vocabulary;
370 open(VOC, $ENV{VOCABULARY})
371 or die "Can't open VOCABULARY";
372 while (<VOC>){
373 chomp;
374 #s/'//g;
375 $Vocabulary{normalize($_)}="1";
376 }
377 close(VOC);
379 binmode STDIN,":utf8";
380 @lines=<STDIN>;
381 for $L (@lines) {
382 chomp($L);
383 #$L = decode( "utf8", $L);
384 $l=$L;
385 $l =~ s/^\s*//;
386 my ($a, $b)=split(/\s+/,$l,2);
387 $group_weight{normalize($b)}+=$a;
388 }
389 if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
390 for $k (keys %group_weight) {
391 if (defined($Vocabulary{$k})) {
392 $group_weight{$k} *= 2;
393 }
394 }
395 }
396 @lines2 = sort { compare($b,$a) } @lines;
397 binmode STDOUT, ":utf8";
398 for $l (@lines2) {
399 print "$l\n";
400 }
401 PERL_SCRIPT
402 export VOCABULARY
403 export NEED_TO_USE_VOCABULARY_WHEN_SORT
404 export LANGUAGE
405 perl $PERL_SCRIPT_TEMP_NAME
406 rm $PERL_SCRIPT_TEMP_NAME
407 }
409 text_from_url()
410 {
411 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
412 }
414 add_marks()
415 {
416 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
417 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
418 use Encode;
420 $file = $ARGV[0];
421 our $dict;
422 if (open(NOTES, $ENV{NOTES_FILE})) {
423 while(<NOTES>) {
424 $_ = decode( "utf8", $_);
425 chomp;
426 s/^\s+//;
427 my ($a,$b)=split /\s+/,$_,2;
428 $dict{$a}=$b;
429 }
430 }
431 if (open(F, $file)) {
432 @lines=<F>;
433 close(F);
434 for (@lines) {$_ = decode( "utf8", $_);};
436 if (open(F, ">$file")) {
437 binmode F, ":utf8";
438 for (@lines) {
439 m/\s+\S+\s+(\S+)/;
440 $name=$1;
441 if (not /^#/ and defined($dict{$name})) {
442 chomp;
443 $mark=$dict{$name};
444 $space=" "x(30-length($_));
445 print F "$_$space$mark\n";
446 }
447 else {
448 print F "$_";
449 }
450 }
451 close(F);
452 }
453 }
454 PERL_SCRIPT
455 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
456 export NOTES_FILE
457 perl $PERL_SCRIPT_TEMP_NAME "$1"
458 rm $PERL_SCRIPT_TEMP_NAME
459 }
461 remove_marks()
462 {
463 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
464 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
465 $file = $ARGV[0];
466 our %dict;
467 if (open(F, $file)) {
468 @lines=<F>;
469 close(F);
471 if (open(F, ">$file")) {
472 for (@lines) {
473 chomp;
474 if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
475 my $name=$4;
476 my $comment=$6;
477 $dict{$name}=$comment;
478 print F "$1$2$3$4\n";
479 }
480 else {
481 print F "$_\n";
482 }
483 }
484 }
485 }
486 if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
487 @lines=<NOTES>;
488 close(NOTES);
490 if (open(NOTES, ">".$ENV{NOTES_FILE})) {
491 for (@lines) {
492 chomp;
493 s/^\s+//;
494 my ($a,$b)=split /\s+/,$_,2;
495 if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
496 print NOTES "$_\n";
497 if (defined($dict{$a})) { unset($dict{$a}); }
498 }
499 }
500 for (keys %dict) {
501 $mark=$dict{$_};
502 $space=" "x(30-length($_));
503 print NOTES "$_$space$mark\n";
504 }
505 }
506 }
507 PERL_SCRIPT
508 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
509 export NOTES_FILE
510 export DONT_ADD_MARKS
511 perl $PERL_SCRIPT_TEMP_NAME "$1"
512 rm $PERL_SCRIPT_TEMP_NAME
513 }
515 part()
516 {
517 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
518 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
519 #!/usr/bin/perl
521 my @lines=<STDIN>;
522 my $lines=$#lines;
523 my $interval=$ARGV[0];
524 if (not $interval) {
525 print @lines;
526 }
527 else {
528 my ($start,$stop,$total);
529 if ($interval =~ m@(.*)/(.*)@) {
530 $start = $1;
531 $total = $2;
532 }
533 else {
534 $start=$interval;
535 $total=0;
536 }
537 if ($start =~ m@(.*)-(.*)@) {
538 $start = $1;
539 $stop = $2;
540 }
541 if ($start =~ m@(.*)\+(.*)@) {
542 $start = $1;
543 $stop = $start+$2;
544 }
546 $start=int($lines/$total*$start);
547 $stop=int($lines/$total*$stop);
549 for($i=$start;$i<$stop;$i++){
550 print $lines[$i];
551 }
552 }
553 PERL_SCRIPT
554 perl $PERL_SCRIPT_TEMP_NAME "$1"
555 rm $PERL_SCRIPT_TEMP_NAME
556 }
# -T: print the names of the tags that have a sub-vocabulary for the current
# language, one per line, and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]; then
  # Without the working directory there is nothing to list; do not fall
  # back to listing whatever matches in the current directory.
  cd "${WORK_DIR}" || exit 1
  for tag_file in "${LANGUAGE}"_*.txt; do
    # An unmatched pattern is left as it is by the shell; skip it.
    [ -e "$tag_file" ] || continue
    # Strip exactly the "<language>_" prefix and the ".txt" suffix.
    tag=${tag_file#"${LANGUAGE}_"}
    printf '%s\n' "${tag%.txt}"
  done
  exit 0
fi
# Name of the file that holds the sub-vocabulary of a tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout.
tag_file_name()
{
  local tag=$1
  printf '%s\n' "${LANGUAGE}_${tag}.txt"
}
# -r: delete the sub-vocabulary of every tag in TAG_NAME and stop.
if [ "$REMOVE_TAG" = "YES" ]; then
  cd "${WORK_DIR}" || exit 1
  # Split the tag list on white space without letting the shell expand
  # wildcard characters in it.
  read -r -a tags_to_remove <<< "$TAG_NAME"
  for i in "${tags_to_remove[@]}"; do
    # Refuse names that could point outside the working directory or act as
    # a pattern.  The check has to look at the current tag, $i.
    case "$i" in
      *[/*?]*) continue ;;
    esac
    f=$(tag_file_name "$i")
    if [ -e "$f" ]; then
      rm -f -- "$f" && echo Tag "'$i'" removed
    else
      echo Unknown tag "'$i'"
    fi
  done
  exit 0
fi
# All vocabulary files are addressed relative to the working directory, so
# the rest of the script must not run anywhere else.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || {
  echo "new-words: cannot change to $WORK_DIR" >&2
  rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
  exit 1
}

# Select the input text (tagged vocabularies, a URL, a file or stdin) and run
# it through the processing chain.  The finished list ends up twice: in
# $TEMP1 as the reference copy and in $TEMP2 as the copy given to the editor.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]; then
  VOC_FILES=''
  for i in $MERGE_THIS_TAGS; do
    f=$(tag_file_name "$i")
    [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
  done
  if [ -z "$VOC_FILES" ]; then
    echo Unknown tags: $MERGE_THIS_TAGS >&2
  else
    # VOC_FILES is a space separated list and has to be split here.
    cat $VOC_FILES
  fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]; then
  cat "${LANGUAGE}"_*.txt
elif [[ "$1" == *http:* || "$1" == *https:* ]]; then
  text_from_url "$1"
elif [ "$#" != 0 ]; then
  # Relative file names refer to the directory the program was started in.
  case "$1" in
    /*) cat "$1" ;;
    *)  cat "$oldpwd/$1" ;;
  esac
else
  cat
fi \
  | part "$PART_TO_PROCESS" \
  | tee "$ORIGINAL_TEXT" \
  | two_and_three_words \
  | get_words "${TEMP1}-full" \
  | group_words \
  | add_stat "${TEMP1}-full" \
  | tee "$TEMP1" > "$TEMP2"
# Show the result.  In statistics and non-interactive mode the list is just
# printed; otherwise it is opened in the editor and every word whose line
# was deleted there is added to the vocabulary as a known word.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]; then
  cat "$TEMP1"
elif [ -s "$TEMP2" ] && [ "$(( $(wc -l < "$TEMP2") ))" -ne 0 ]; then
  [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
  if [ "$editor" = vim ]; then
    vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
  else
    # Deliberately unquoted: $EDITOR may carry options ("emacs -nw").
    $editor "$TEMP2"
  fi
  remove_marks "$TEMP2"

  vocabulary="$VOCABULARY"
  [ -n "$TAG_NAME" ] && vocabulary=$(tag_file_name "$TAG_NAME")

  # Only lines present in the reference copy and missing from the edited
  # copy ("<" lines of the diff) name known words.  Hunk headers, "---"
  # separators, "#" header and marker lines and re-added lines are ignored,
  # so no blank lines or numbers reach the vocabulary.
  diff "$TEMP1" "$TEMP2" \
    | awk '$1 == "<" && $3 != "" && $2 !~ /^#/ { print $3 }' \
    | sort -u >> "$vocabulary"
fi

rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"