new-words

view new-words.sh @ 31:48ca8248e9cc

+esperanto normalization
author Igor Chubin <igor@chub.in>
date Tue Aug 17 21:35:57 2010 +0200 (2010-08-17)
parents c631833fa2be
children 753fb84437aa
line source
1 #!/bin/bash
# Print the command-line help to standard error.
# Takes no arguments and writes nothing to standard output.
show_usage()
{
    # Write through the already-open descriptor (>&2) rather than reopening
    # /dev/stderr by path: reopening truncates a log file that stderr was
    # redirected to and can fail when the path is not writable.
    cat <<HELP >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# Show the help and stop when -h is the very first argument.
if [ "$1" = "-h" ]; then
    show_usage
    exit 0
fi

# Directory that holds the vocabulary, tag and notes files.
WORK_DIR=~/.new-words/

# Scratch files shared by the rest of the script. Abort early when one of
# them cannot be created instead of continuing with an empty path.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

# Remove the scratch files on every exit path, including the early exits
# taken before the final clean-up at the end of the script.
trap 'rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"' EXIT

# Editor used for the interactive word list; defaults to vim.
editor=${EDITOR:-vim}
# language detection

# Print the language code implied by a program name and its arguments.
#   $1      - path or name the program was invoked as
#   $2...   - command-line arguments
# The default is "en". An argument containing a Wikipedia article URL with a
# two-letter host prefix (http or https) selects that prefix; the last such
# argument wins. A program name whose part before the first "-" is not "new"
# (for example "de-words") overrides both.
detect_language()
{
    local program=${1##*/}
    shift
    local lang=en arg rest

    for arg in "$@"; do
        case "$arg" in
            *http://??.wikipedia.org/wiki/* | *https://??.wikipedia.org/wiki/*)
                rest=${arg#*://}
                lang=${rest%%.wikipedia*}
                ;;
        esac
    done

    program=${program%%-*}
    [ "$program" = "new" ] || lang=$program
    printf '%s\n' "$lang"
}

# Name the script was called by, without directory and without "-suffix".
my_name=${0##*/}
my_name=${my_name%%-*}
LANGUAGE=$(detect_language "$0" "$@")
#----------------------------------------------------
# command line options processing

# Parse the switches in "$@" into the global option variables.
# Every option variable is reset first, so a value inherited from the
# environment (REMOVE_TAG in particular) can never switch a mode on.
# LANGUAGE is only changed when -l is given. OPTIND is left pointing at the
# first non-option argument so the caller can shift the switches away.
# Exits with the usage text on -h (status 0) or an unknown switch (status 1).
parse_options()
{
    local opt

    STAT_ONLY=NO
    NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
    DONT_ADD_MARKS=NO
    NON_INTERACTIVE_MODE=NO
    PART_TO_PROCESS=''
    GROUP_WORDS_BY_THREE=NO
    GROUP_WORDS_BY_TWO=NO
    TAG_NAME=''
    MERGE_THIS_TAGS=''
    TAGS_LIST_ONLY=NO
    MERGE_TAGGED_WORDS=NO
    MERGE_ALL_TAGGED=NO
    DONT_ADD_MARKLINES=NO
    REMOVE_TAG=NO

    OPTIND=1
    while getopts l:skanp:t:Tm:Mr:23h opt; do
        case "$opt" in
            s) STAT_ONLY=YES ;;
            k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
            l) LANGUAGE="$OPTARG" ;;
            a) DONT_ADD_MARKS=YES ;;
            n) NON_INTERACTIVE_MODE=YES ;;
            p) PART_TO_PROCESS="$OPTARG" ;;
            t) TAG_NAME="$OPTARG" ;;
            T) TAGS_LIST_ONLY="YES" ;;
            m)
                DONT_ADD_MARKLINES="YES"
                MERGE_TAGGED_WORDS="YES"
                # A tag given earlier with -t is merged as well.
                MERGE_THIS_TAGS="$TAG_NAME $OPTARG"
                ;;
            M)
                DONT_ADD_MARKLINES="YES"
                MERGE_ALL_TAGGED="YES"
                ;;
            r)
                REMOVE_TAG="YES"
                # Repeated -r switches accumulate a space-separated list.
                TAG_NAME="$TAG_NAME $OPTARG"
                ;;
            2) GROUP_WORDS_BY_TWO=YES ;;
            3) GROUP_WORDS_BY_THREE=YES ;;
            h)
                show_usage
                exit 0
                ;;
            \?) # unknown flag
                show_usage
                exit 1
                ;;
        esac
    done
}

parse_options "$@"
shift $((OPTIND - 1))

# A "-l lang" pair that is still first after getopts stopped is honoured too.
if [ "$1" = "-l" ] && [ "$#" -ge 2 ]; then
    LANGUAGE="$2"
    shift 2
fi

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
110 #----------------------------------------------------
# Turn the text on stdin into a frequency list of words, most frequent first.
#   $1 - file that receives a copy of every extracted word, one per line,
#        before the list is passed through grep_v_english_perl
# Output lines have the "uniq -c" form: a count followed by the word.
get_words()
{
    local all_words_file=$1

    # One token per line; "--" becomes a separator. Apostrophes are hidden
    # behind a placeholder while Perl replaces punctuation with spaces and
    # drops every line that is not made of letters, spaces, "'", "_" or "-".
    tr ' ' '\n' \
        | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
        | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/' \
        | sed "s/__APOSTROPHE__/'/g" \
        | tr ' ' '\n' \
        | tee "$all_words_file" \
        | grep_v_english_perl \
        | sort \
        | uniq -c \
        | awk '$2 != ""' \
        | sort -rn
}
# Prefix the unknown-word list on stdin with statistics and insert
# percentage marker lines ("# 75", "# 80", ...) into it.
#   $1 - file with all words of the text (used for the total word count);
#        "${1}2" is used as a scratch copy of stdin and removed afterwards
# Globals read: DONT_ADD_MARKLINES, STAT_ONLY, LANGUAGE, ORIGINAL_TEXT.
# With DONT_ADD_MARKLINES=YES stdin is passed through unchanged.
# With STAT_ONLY=YES only a two-line statistics table is printed.
add_stat()
{
    if [ "$DONT_ADD_MARKLINES" = "YES" ]; then
        cat
        return
    fi

    local before=$1
    local after=${before}2
    local total total_unknown total_known percentage sentences

    cat > "$after"

    total=$(wc -w < "$before" | awk '{print $1}')
    total=${total:-0}
    # "s + 0" yields 0 rather than an empty string when the list is empty,
    # i.e. when every word of the text is already known.
    total_unknown=$(awk '{s += $1} END {print s + 0}' "$after")
    total_known=$((total - total_unknown))

    if [ "$total" -gt 0 ]; then
        # bc -l gives a long fraction; keep a single digit after the point.
        percentage=$(echo "100*($total-$total_unknown)/$total" | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        percentage=0
    fi

    # The number of "." characters in the original text stands for the
    # number of sentences; a text without any counts as one sentence so the
    # per-sentence figures below never divide by zero.
    sentences=$(perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);' "$ORIGINAL_TEXT")
    [ "${sentences:-0}" -gt 0 ] || sentences=1

    if [ "$STAT_ONLY" = "YES" ]; then
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $(echo "(100-$percentage)" | bc -l) $total_known $total $(echo "$total/$sentences" | bc) $(echo "10*$total_unknown/$sentences" | bc) "
        rm -f -- "$after"
        return 0
    fi

    echo "# $LANGUAGE, $percentage, <$total_known/$total>"

    # Copy the list and print "# N" after the line at which the known words
    # plus the unknown words listed so far reach N percent of the text.
    # Markers step by 5 below 90 and by 1 from 90 on.
    perl - "$total" "$total_known" "$after" <<'PERL_SCRIPT'
my $total=shift(@ARGV);
my $total_known=shift(@ARGV);
if ($total <= 0) {
    # Nothing to measure against: copy the list without markers.
    print while <>;
    exit 0;
}
my $s=0;
my $mark_line=int($total_known*100/$total/5)*5;
if ($mark_line>=90) {
    $mark_line=int($total_known*100/$total)+1;
} else { $mark_line +=5; };
while(<>)
{
    print;
    /^\s*([0-9]*)\s*/;
    $s+=$1;
    if (($total_known+$s)*100/$total>=$mark_line) {
        print "# $mark_line\n";
        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
    }
}
PERL_SCRIPT
    rm -f -- "$after"
}
# Pass stdin through and, when GROUP_WORDS_BY_TWO and/or
# GROUP_WORDS_BY_THREE is "YES", append every run of two / three consecutive
# words of $ORIGINAL_TEXT joined with "_" (one sequence per line).
two_and_three_words()
{
    cat

    if [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ]; then
        return 0
    fi

    export GROUP_WORDS_BY_THREE GROUP_WORDS_BY_TWO
    perl - "$ORIGINAL_TEXT" <<'PERL_SCRIPT'
local $/;
my $text = <>;
# NOTE(review): the class strips the digits 1-9 but not 0 - confirm intent.
$text =~ s@[!?;,:#1-9".]@ @g;
$text =~ s@\s+@ @g;
my @words = split /\s+/, $text;
# Visit every position that still has a following word. The last position
# has no third word, so only a pair is produced there.
for (my $i = 0; $i < $#words; $i++) {
    my ($first, $second, $third) = ($words[$i], $words[$i+1], $words[$i+2]);
    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($first && $second && $third)) {
        print "${first}_${second}_${third}\n";
    }
    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($first && $second)) {
        print "${first}_${second}\n";
    }
}
PERL_SCRIPT
}
# Copy stdin to stdout without the lines that are exactly equal to a word
# of the vocabulary file $VOCABULARY (created empty when missing).
# Apostrophes are removed from the vocabulary words before comparing and a
# vocabulary line holding several words contributes each of them.
# Words are compared as literal strings: nothing read from the vocabulary
# is ever evaluated by the shell or used as a regular expression.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    NEW_WORDS_VOC_FILE="$VOCABULARY" awk '
        BEGIN {
            voc = ENVIRON["NEW_WORDS_VOC_FILE"]
            while ((getline line < voc) > 0) {
                gsub(/\047/, "", line)        # \047 is the apostrophe
                n = split(line, words, " ")
                for (i = 1; i <= n; i++)
                    known[words[i]] = 1
            }
            close(voc)
        }
        !($0 in known)
    '
}
# Copy stdin to stdout without the lines that are listed (as whole lines)
# in the vocabulary file or in the tag file of any tag named in $TAG_NAME.
# Globals read: VOCABULARY (created empty when missing), TAG_NAME.
# Globals written and exported: VOC_FILES, the space-separated list of the
# files consulted; VOCABULARY is exported as well.
# Files that cannot be opened are skipped.
grep_v_english_perl()
{
    local -a tags=()
    local tag

    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"

    VOC_FILES=$VOCABULARY
    # Split the tag list on whitespace without expanding glob characters.
    read -r -a tags <<< "$TAG_NAME"
    for tag in "${tags[@]}"; do
        VOC_FILES="${VOC_FILES} $(tag_file_name "$tag")"
    done
    export VOCABULARY VOC_FILES

    perl -e '
        my %voc;
        (my $voc_files = $ENV{VOC_FILES}) =~ s@^ @@;
        for my $voc_file (split /\s+/, $voc_files) {
            open(my $fh, "<", $voc_file) or next;
            while (<$fh>) {
                chomp;
                $voc{$_} = 1;
            }
            close($fh);
        }
        while (<STDIN>) {
            chomp;
            print "$_\n" unless defined $voc{$_};
        }
    '
}
# Reorder the "count word" lines on stdin so that words sharing the same
# normalized form (a crude, language-specific stem) stay together.
# Groups are ordered by the sum of their counts, largest first, then by the
# normalized form in descending order; inside a group the larger counts
# come first.
# Globals read (and exported for the Perl helper):
#   LANGUAGE   - selects the normalizer: en, de, uk, eo/io; any other value
#                leaves words unchanged
#   VOCABULARY - file of known words; it must be readable
#   NEED_TO_USE_VOCABULARY_WHEN_SORT - "YES" doubles the weight of groups
#                whose normalized form also occurs in the vocabulary
group_words()
{
    local script status

    script=$(mktemp /tmp/perl-group-words-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
#!/usr/bin/perl

use Encode;
use utf8;

our %group_weight;
our %Vocabulary;

# Each normalizer works on a private copy, so the caller's $_ is untouched.

sub normalize_ukrainian
{
    my $word = lc(shift);
    $word =~ s/[юіоеуаи]$//g;
    return $word;
}

sub normalize_esperanto
{
    my $word = lc(shift);
    # verbs
    $word =~ s/i$//; $word =~ s/is$//; $word =~ s/os$//;
    $word =~ s/as$//; $word =~ s/us$//;
    # nouns
    $word =~ s/j?n?$//;
    return $word;
}

sub normalize_german
{
    my $word = lc(shift);

    $word =~ s/heit$//; $word =~ s/keit$//; $word =~ s/tum$//;
    $word =~ s/ung$//; $word =~ s/nis$//; $word =~ s/schaft$//;
    $word =~ s/ist$//;
    $word =~ s/en$//; $word =~ s/er$//;

    $word =~ s/lich$//; $word =~ s/ig$//;
    $word =~ s/al$//; $word =~ s/isch$//;
    $word =~ s/ell$//; $word =~ s/haft$//;

    $word =~ s/bar$//; $word =~ s/sam$//; $word =~ s/lich$//;

    my @prefixes = qw(
        ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
        be emp ent er ge miss ver zer durch über um unter wieder);
    # The list is emptied straight away, so no prefix is stripped.
    @prefixes = ();
    for my $pref (@prefixes) {
        $word =~ s/^$pref//;
    }

    return $word;
}

sub normalize_english
{
    my $word = lc(shift);

    $word =~ s/s$//;

    $word =~ s/ation$//; $word =~ s/ness$//; $word =~ s/ship$//;
    $word =~ s/ally$//; $word =~ s/ance$//; $word =~ s/ity$//;
    $word =~ s/ment$//;

    $word =~ s/ed$//;
    $word =~ s/en$//;
    $word =~ s/er$//;
    $word =~ s/est$//;
    $word =~ s/ing$//;

    $word =~ s/ism$//; $word =~ s/ist$//; $word =~ s/ful$//;
    $word =~ s/able$//; $word =~ s/ably$//;
    $word =~ s/ify$//; $word =~ s/fy$//; $word =~ s/ly$//;
    $word =~ s/ise$//; $word =~ s/ize$//;

    $word =~ s/e$//;
    return $word;
}

# Dispatch on the LANGUAGE environment variable. "eo" is the ISO 639-1 code
# of Esperanto; "io" is accepted as well.
sub normalize
{
    my $word = shift;
    my $lang = $ENV{LANGUAGE};
    if    ($lang eq "en") { return normalize_english($word); }
    elsif ($lang eq "de") { return normalize_german($word); }
    elsif ($lang eq "uk") { return normalize_ukrainian($word); }
    elsif ($lang eq "eo" or $lang eq "io") { return normalize_esperanto($word); }
    else  { return $word; }
}

# Order two "count word" lines: group weight first, then normalized form,
# then the line's own count.
sub compare
{
    my ($x, $y) = @_;
    $x =~ s/^\s*//;
    $y =~ s/^\s*//;
    my ($x_count, $x_word) = split /\s+/, $x, 2;
    my ($y_count, $y_word) = split /\s+/, $y, 2;
    my $x_norm = normalize($x_word);
    my $y_norm = normalize($y_word);

    return ($group_weight{$x_norm} <=> $group_weight{$y_norm})
        || ($x_norm cmp $y_norm)
        || ($x_count <=> $y_count);
}

# The vocabulary is decoded like stdin, so normalized forms of non-ASCII
# words are compared character by character on both sides.
open(VOC, "<:utf8", $ENV{VOCABULARY})
    or die "Can't open VOCABULARY";
while (my $known = <VOC>) {
    chomp($known);
    $Vocabulary{normalize($known)} = "1";
}
close(VOC);

binmode STDIN, ":utf8";
my @lines = <STDIN>;
chomp(@lines);
for my $line (@lines) {
    (my $trimmed = $line) =~ s/^\s*//;
    my ($count, $word) = split(/\s+/, $trimmed, 2);
    $group_weight{normalize($word)} += $count;
}
if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
    for my $key (keys %group_weight) {
        if (defined($Vocabulary{$key})) {
            $group_weight{$key} *= 2;
        }
    }
}
my @sorted = sort { compare($b, $a) } @lines;
binmode STDOUT, ":utf8";
for my $line (@sorted) {
    print "$line\n";
}
PERL_SCRIPT
    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE
    perl "$script"
    status=$?
    rm -f -- "$script"
    return "$status"
}
# Print the text rendering of the page at URL $1 (as produced by
# "lynx -dump") with every http:// or https:// URL removed from it.
text_from_url()
{
    # /g removes all URLs on a line, not just the first one.
    lynx -dump "$1" \
        | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
# Annotate the word list in file $1 in place with the notes stored in
# $NOTES_FILE (created empty when missing; exported for the Perl helper).
# A notes line is "word note text". A list line is annotated when its
# second whitespace-separated field, found after leading whitespace, is a
# word that has a note; the note is appended after padding the line to
# 30 characters. Lines starting with "#" and lines without such a field
# are written back unchanged.
add_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE

    perl - "$1" <<'PERL_SCRIPT'
use Encode;

my $file = $ARGV[0];
my %dict;

if (open(my $notes, "<", $ENV{NOTES_FILE})) {
    while (<$notes>) {
        $_ = decode("utf8", $_);
        chomp;
        s/^\s+//;
        my ($word, $note) = split /\s+/, $_, 2;
        $dict{$word} = $note;
    }
    close($notes);
}

open(my $in, "<", $file) or exit 0;
my @lines = map { decode("utf8", $_) } <$in>;
close($in);

open(my $out, ">", $file) or exit 0;
binmode $out, ":utf8";
for (@lines) {
    # Take the capture from this very match; a line that does not match
    # must not inherit the word found on an earlier line.
    my ($name) = m/\s+\S+\s+(\S+)/;
    if (not /^#/ and defined($name) and defined($dict{$name})) {
        chomp;
        my $pad = " " x (30 - length($_));
        print $out "$_$pad$dict{$name}\n";
    }
    else {
        print $out "$_";
    }
}
close($out);
PERL_SCRIPT
}
# Strip the notes from the word list in file $1 (in place) and, unless
# DONT_ADD_MARKS is "YES", store them in $NOTES_FILE.
# A list line of the form "<ws>count<ws>word<ws>note" is cut back to
# "<ws>count<ws>word" and its note is remembered; lines starting with "#"
# and lines without a note are written back unchanged.
# Notes file update: lines for words that got no note in this list and
# lines whose note is unchanged are kept where they are; changed and new
# notes are appended, sorted by word, as the word padded to 30 characters
# followed by the note.
# $NOTES_FILE is created empty when missing; NOTES_FILE and DONT_ADD_MARKS
# are exported for the Perl helper.
remove_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS

    perl - "$1" <<'PERL_SCRIPT'
my $file = $ARGV[0];
my %dict;

if (open(my $in, "<", $file)) {
    my @lines = <$in>;
    close($in);

    if (open(my $out, ">", $file)) {
        for (@lines) {
            chomp;
            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                $dict{$4} = $6;
                print $out "$1$2$3$4\n";
            }
            else {
                print $out "$_\n";
            }
        }
        close($out);
    }
}

if (($ENV{DONT_ADD_MARKS} ne "YES") and open(my $notes_in, "<", $ENV{NOTES_FILE})) {
    my @notes = <$notes_in>;
    close($notes_in);

    if (open(my $notes_out, ">", $ENV{NOTES_FILE})) {
        for (@notes) {
            chomp;
            s/^\s+//;
            my ($word, $note) = split /\s+/, $_, 2;
            if (!defined($dict{$word})) {
                # Word was not annotated in this list: keep the old line.
                print $notes_out "$_\n";
            }
            elsif ($dict{$word} eq $note) {
                # Unchanged note: keep the old line, do not append it again.
                print $notes_out "$_\n";
                delete $dict{$word};
            }
            # A changed note is left out here and appended below.
        }
        for (sort keys %dict) {
            my $pad = " " x (30 - length($_));
            print $notes_out "$_$pad$dict{$_}\n";
        }
        close($notes_out);
    }
}
PERL_SCRIPT
}
# Copy a range of the lines on stdin to stdout.
#   $1 - interval; empty (or "0") copies everything. Accepted forms:
#          start-stop/total   parts start (inclusive) to stop (exclusive)
#                             of the text cut into "total" equal parts
#          start+count/total  the same range, given as start plus a length
#          start/total        the single part number "start"
#        Parts are counted from 0.
# NOTE(review): an interval without "/total" divided by zero in the
# original code; it is treated here as zero-based line offsets - confirm
# that this is the intended meaning.
part()
{
    perl -e '
        my @lines = <STDIN>;
        my $count = scalar(@lines);
        my $interval = $ARGV[0];

        if (not $interval) {
            print @lines;
            exit 0;
        }

        my ($start, $stop, $total) = ($interval, undef, 0);
        if ($interval =~ m@(.*)/(.*)@) {
            $start = $1;
            $total = $2;
        }
        if ($start =~ m@(.*)-(.*)@) {
            $start = $1;
            $stop = $2;
        }
        if ($start =~ m@(.*)\+(.*)@) {
            $start = $1;
            $stop = $start + $2;
        }
        $stop = $start + 1 unless defined $stop;

        if ($total > 0) {
            # Scale part numbers to line numbers using the number of lines
            # (not the last index), so the final part reaches the last line.
            $start = int($count / $total * $start);
            $stop  = int($count / $total * $stop);
        }
        else {
            $start = int($start);
            $stop  = int($stop);
        }
        $start = 0 if $start < 0;
        $stop = $count if $stop > $count;

        for (my $i = $start; $i < $stop; $i++) {
            print $lines[$i];
        }
    ' -- "$1"
}
# Print the tags that exist for $LANGUAGE, one per line: for every file
# "${LANGUAGE}_<tag>.txt" in $WORK_DIR the <tag> part is printed.
# Prints nothing when the directory or matching files do not exist; the
# current directory is never searched instead.
list_tags()
{
    local file tag

    [ -d "$WORK_DIR" ] || return 0
    for file in "${WORK_DIR%/}/${LANGUAGE}"_*.txt; do
        # An unmatched pattern is left as it is; skip it.
        [ -e "$file" ] || continue
        tag=${file##*/}
        tag=${tag#*_}
        tag=${tag%.txt}
        printf '%s\n' "$tag"
    done
}

if [ "$TAGS_LIST_ONLY" = "YES" ]; then
    list_tags
    exit 0
fi
# Print the name of the file that stores the words of tag $1 for the
# current $LANGUAGE: "<language>_<tag>.txt".
tag_file_name()
{
    local tag=$1
    printf '%s\n' "${LANGUAGE}_${tag}.txt"
}
# Delete the tag files of every tag listed (space-separated) in $TAG_NAME
# from $WORK_DIR and report each result on stdout.
# Tag names containing "/", "*" or "?" are skipped silently so that a tag
# can never address a file outside the tag naming scheme.
# Returns 1 without touching anything when $WORK_DIR cannot be entered.
remove_tags()
{
    local -a tags=()
    local tag file

    if [ -z "$WORK_DIR" ] || ! cd "$WORK_DIR"; then
        return 1
    fi

    # Split on whitespace without expanding glob characters.
    read -r -a tags <<< "$TAG_NAME"
    for tag in "${tags[@]}"; do
        case "$tag" in
            *[/*?]*) continue ;;
        esac
        file=$(tag_file_name "$tag")
        if [ -e "$file" ]; then
            rm -f -- "$file" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
}

if [ "$REMOVE_TAG" = "YES" ]; then
    remove_tags
    exit 0
fi
# Work inside the vocabulary directory; relative input paths are resolved
# against the directory the script was started from.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Pick the text to analyse and feed it through the processing chain:
#   -m tags : the word files of the given tags
#   -M      : the word files of all tags of the language
#   URL     : the text of the page (http or https)
#   file    : the file, absolute or relative to the starting directory
#   nothing : standard input
# The resulting word list ends up in both "$TEMP1" and "$TEMP2".
if [ "$MERGE_TAGGED_WORDS" = "YES" ]; then
    VOC_FILES=''
    for i in $MERGE_THIS_TAGS; do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
    done
    if [ -z "$VOC_FILES" ]; then
        # Unquoted on purpose: collapses the blanks of the tag list.
        echo Unknown tags: $MERGE_THIS_TAGS >&2
    else
        # Unquoted on purpose: VOC_FILES is a space-separated file list.
        cat $VOC_FILES
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]; then
    cat "${LANGUAGE}"_*.txt
elif case "$1" in *http:* | *https:*) true ;; *) false ;; esac; then
    text_from_url "$1"
elif [ "$#" != 0 ]; then
    case "$1" in
        /*) cat "$1" ;;
        *) cat "$oldpwd/$1" ;;
    esac
else
    cat
fi \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | two_and_three_words \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"
# Print the words (third field) of the list lines that differ between the
# files $1 and $2, sorted and without duplicates.
# diff's own bookkeeping lines ("2,3d1", "---") have no third field and are
# skipped, as are "#" header/marker lines, so neither blank entries nor
# marker values such as "95" reach the vocabulary.
changed_words()
{
    diff "$1" "$2" | awk '$2 != "#" && $3 != "" {print $3}' | sort -u
}

if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]; then
    # Statistics and non-interactive mode only print the result.
    cat "$TEMP1"
else
    line_count=0
    [ -f "$TEMP2" ] && line_count=$(($(wc -l < "$TEMP2")))
    if [ "$line_count" -ne 0 ]; then
        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
        if [ "$editor" = vim ]; then
            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
        else
            # Unquoted on purpose: $EDITOR may carry its own arguments.
            $editor "$TEMP2"
        fi
        remove_marks "$TEMP2"

        # Lines that differ between the generated list and the edited copy
        # are appended to the vocabulary, or to the tag file when a tag was
        # given.
        vocabulary="$VOCABULARY"
        [ -n "$TAG_NAME" ] && vocabulary="$(tag_file_name $TAG_NAME)"
        changed_words "$TEMP1" "$TEMP2" >> "$vocabulary"
    fi
fi

rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"