new-words
view new-words.sh @ 27:0a80b2fa3ed8
initial tagging support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri May 21 19:53:24 2010 +0300 (2010-05-21) |
parents | 4a10c0f4510c |
children | 7db7bbf96fad |
line source
1 #!/bin/bash
# show_usage
#   Print the command-line help to standard error.
#   Takes no arguments and does not exit; the caller picks the exit status.
show_usage()
{
    # Write to the already-open descriptor 2.  Re-opening /dev/stderr by
    # name truncates the target when stderr is redirected to a regular
    # file, and fails when the device node cannot be opened.
    cat <<'HELP' >&2

USAGE:

    new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

    -h          print this screen
    -k          put higher words that are similar to the known words (only for English)
    -l lang     override language settings
    -n          non-interactive mode (don't run vi)
    -a          don't add marks (and don't save marks added by user)
    -p pages    work with specified pages only (pages = start-stop/total )
    -s          show the text statistics (percentage of known words and so on) and exit
    -t tag      tag known words with tag
    -T          show list of active tags
    -m tag      merge the words tagged with "tag" into the main vocabulary
    -M          merge the words tagged with any tag into the main vocabulary
    -r tag      remove subvocabulary for the "tag"
    -2 -3       find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

    de-words URL
    new-words -l de URL

HELP
}
# Answer a leading -h before any temporary file is created.
if [ "$1" = "-h" ]
then
    show_usage
    exit 0
fi

# Directory holding the vocabulary, tag and notes files.
WORK_DIR=~/.new-words/

# Scratch files.  The X's are kept at the end of the template because
# BSD mktemp only substitutes trailing X's; each call is checked so a
# failure cannot leave an empty path behind.
TEMP1=$(mktemp /tmp/new-words-temp1-XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2-XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig-XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

# Remove the scratch files on every exit path (including the early exits
# taken by the tag listing / tag removal modes and by usage errors).
# "${TEMP1}-full" and "${TEMP1}-full2" are the names derived from TEMP1
# by the processing pipeline.
trap 'rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "${TEMP1}-full2" "$ORIGINAL_TEXT"' EXIT

# Editor used in interactive mode.
editor=${EDITOR:-vim}
# language detection
#
# Default is English.  A Wikipedia URL among the arguments selects the
# language of its subdomain, and a program name other than "new-*"
# (e.g. a link called de-words) overrides both.

LANGUAGE=en

# Program name without directory and without everything from the first
# dash on: /usr/bin/de-words -> de
my_name=${0##*/}
my_name=${my_name%%-*}

for arg
do
    case "$arg" in
        http://*.wikipedia.org/wiki/* | https://*.wikipedia.org/wiki/*)
            # Strip the scheme, then keep the first host label.
            LANGUAGE=${arg#*://}
            LANGUAGE=${LANGUAGE%%.*}
            ;;
    esac
done

[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing
#
# Every mode flag gets an explicit default so that a variable of the
# same name inherited from the environment cannot switch a mode on.

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
REMOVE_TAG=NO

while getopts hl:skanp:t:Tm:Mr:23 opt
do
    case "$opt" in
        h)  # help requested after other switches
            rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
            show_usage
            exit 0;;
        s)  STAT_ONLY=YES;;
        k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
        l)  LANGUAGE="$OPTARG";;
        a)  DONT_ADD_MARKS=YES;;
        n)  NON_INTERACTIVE_MODE=YES;;
        p)  PART_TO_PROCESS="$OPTARG";;
        t)  TAG_NAME="$OPTARG";;
        T)  TAGS_LIST_ONLY="YES";;
        m)  DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
        M)  DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
        r)  REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
        2)  GROUP_WORDS_BY_TWO=YES;;
        3)  GROUP_WORDS_BY_THREE=YES;;
        \?) # unknown flag: drop the scratch files created earlier, then fail
            rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
            show_usage
            exit 1;;
    esac
done
shift $((OPTIND - 1))

# A "-l lang" pair that getopts did not consume (e.g. after "--").
if [ "$1" = "-l" ]
then
    LANGUAGE="$2"
    shift 2
fi

# Per-language files, resolved relative to the working directory.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
110 #----------------------------------------------------
# get_words FULL_LIST
#   Filter: reads text on stdin and writes "COUNT WORD" lines, most
#   frequent first, for the words that grep_v_english_perl lets through.
#   The complete word stream (before that filtering) is saved to
#   FULL_LIST as a side effect.
get_words()
{
    local full_list="$1"

    # One token per line; "--" becomes a separator; apostrophes are
    # protected while punctuation is blanked out, then restored.
    tr ' ' '\n' |
        sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" |
        perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/' |
        sed "s/__APOSTROPHE__/'/g" |
        tr ' ' '\n' |
        tee "$full_list" |
        grep_v_english_perl |
        sort |
        uniq -c |
        awk '$2 != ""' |
        sort -rn
}
# add_stat FULL_LIST
#   Filter: reads the "COUNT WORD" list of unknown words on stdin.
#   FULL_LIST is the file holding every word of the text.
#
#   Globals read: DONT_ADD_MARKLINES, STAT_ONLY, LANGUAGE, ORIGINAL_TEXT.
#
#   - DONT_ADD_MARKLINES=YES: stdin is copied to stdout unchanged.
#   - STAT_ONLY=YES: prints a header line and one statistics line.
#   - otherwise: prints "# LANG, PERCENT, <KNOWN/TOTAL>", then the list
#     with "# N" lines inserted where the cumulative share of covered
#     words reaches N percent.
#
#   Uses "${FULL_LIST}2" as a scratch file and removes it before returning.
add_stat()
{
    if [ "$DONT_ADD_MARKLINES" = "YES" ]
    then
        cat
        return
    fi

    local before="$1"
    local after="${before}2"
    local total total_unknown total_known percentage sentences mark_script

    cat > "$after"

    total=$(wc -w < "$before" | awk '{n = $1} END {print n + 0}')
    # "s + 0" yields 0 instead of an empty string for an empty list.
    total_unknown=$(awk '{s += $1} END {print s + 0}' "$after")
    total_known=$((total - total_unknown))

    if [ "$total" -eq 0 ]
    then
        # No words at all: avoid dividing by zero.
        percentage=0
    else
        # bc -l keeps 20 decimals; sed cuts the result to one decimal.
        percentage=$(echo "100*($total-$total_unknown)/$total" | bc -l | sed 's/\.\(.\).*/.\1/')
    fi

    # Sentences are approximated by the number of '.' characters.
    sentences=$(tr -cd '.' < "$ORIGINAL_TEXT" | wc -c)
    sentences=$((sentences))
    # A text without any period counts as a single sentence, which also
    # keeps the per-sentence figures below from dividing by zero.
    [ "$sentences" -gt 0 ] || sentences=1

    if [ "$STAT_ONLY" = "YES" ]
    then
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $(echo "(100-$percentage)" | bc -l) $total_known $total $((total / sentences)) $((10 * total_unknown / sentences)) "
        rm -f -- "$after"
        return 0
    else
        echo "# $LANGUAGE, $percentage, <$total_known/$total>"
    fi

    if [ "$total" -eq 0 ]
    then
        cat "$after"
        rm -f -- "$after"
        return 0
    fi

    # The Perl program is passed with -e, so no temporary script file is
    # needed and the word list can never be mistaken for the program.
    IFS= read -r -d '' mark_script <<'PERL_SCRIPT' || true
my $total=shift(@ARGV);
my $total_known=shift(@ARGV);
my $s=0;
# First mark: next multiple of 5 above the known share; 1% steps from 90 on.
my $mark_line=int($total_known*100/$total/5)*5;
if ($mark_line>=90) {
    $mark_line=int($total_known*100/$total)+1;
} else { $mark_line +=5; };
while(<>)
{
    print;
    /^\s*([0-9]*)\s*/;
    $s+=$1;
    if (($total_known+$s)*100/$total>=$mark_line) {
        print "# $mark_line\n";
        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
    }
}
PERL_SCRIPT
    perl -e "$mark_script" -- "$total" "$total_known" "$after"
    rm -f -- "$after"
}
# two_and_three_words
#   Filter: copies stdin to stdout.  When GROUP_WORDS_BY_TWO and/or
#   GROUP_WORDS_BY_THREE is YES, it then appends every sequence of two
#   and/or three consecutive words of the file named by ORIGINAL_TEXT,
#   one per line, joined with underscores (a_b, a_b_c).
two_and_three_words()
{
    local ngram_script

    # The text itself always goes through first.
    cat

    if [ "$GROUP_WORDS_BY_THREE" != "YES" ] && [ "$GROUP_WORDS_BY_TWO" != "YES" ]
    then
        return 0
    fi

    export GROUP_WORDS_BY_THREE
    export GROUP_WORDS_BY_TWO

    IFS= read -r -d '' ngram_script <<'PERL_SCRIPT' || true
local $/;
$words=<>;
$words=~ s@[!?;,:#1-9".]@ @g;
$words =~ s@\s+@ @g;
@words = split /\s+/, $words;
# Visit every start position; positions too close to the end yield
# undefined neighbours, which the truth tests below reject.
for ($i=0; $i<=$#words; $i++) {
    my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
        print "${a}_${b}_${c}\n";
    };
    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
        print "${a}_${b}\n";
    };
}
PERL_SCRIPT
    perl -e "$ngram_script" -- "$ORIGINAL_TEXT"
}
# grep_v_english
#   Filter: drops every stdin line that is exactly equal to a word of the
#   file named by VOCABULARY (created empty when missing).  Apostrophes
#   are removed from the vocabulary words before comparing, and the
#   vocabulary is split on whitespace.
#
#   Words are matched as fixed strings read from a pattern list, so the
#   vocabulary content is never evaluated as shell code nor interpreted
#   as a regular expression.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    # grep exits with 1 when no line survives; that is not an error here.
    grep -vxF -f <(tr -d "'" < "$VOCABULARY" | tr -s '[:space:]' '\n' | grep -v '^$') \
        || [ $? -eq 1 ]
}
# grep_v_english_perl
#   Filter: prints the stdin lines that are not listed in any vocabulary
#   file.  The vocabulary files are VOCABULARY (created empty when
#   missing) plus the tag file of every tag in the space-separated
#   TAG_NAME list.  Exports VOCABULARY and VOC_FILES.
grep_v_english_perl()
{
    local filter_script tag

    # Passed to perl with -e: no temporary script file is involved, so a
    # failed mktemp can no longer make perl read the word stream as code.
    IFS= read -r -d '' filter_script <<'PERL_SCRIPT' || true
my %voc;
for my $voc_file (grep { length } split /\s+/, $ENV{VOC_FILES}) {
    # Unreadable files are skipped.
    if (open(my $fh, '<', $voc_file)) {
        while (<$fh>) {
            chomp;
            $voc{$_} = 1;
        }
        close($fh);
    }
}
while (<STDIN>) {
    chomp;
    print "$_\n" unless defined $voc{$_};
}
PERL_SCRIPT

    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    export VOCABULARY VOC_FILES
    VOC_FILES=$VOCABULARY
    # TAG_NAME is a space-separated list; the split is intentional.
    for tag in $TAG_NAME
    do
        VOC_FILES="${VOC_FILES} $(tag_file_name "$tag")"
    done
    perl -e "$filter_script"
}
# group_words
#   Filter: reads "COUNT WORD" lines and prints them reordered so that
#   words sharing a normalized form stay together.  Groups are ordered by
#   the sum of their counts (largest first), then by normalized form
#   (descending), then by count (descending).
#
#   Globals read (and exported): VOCABULARY, LANGUAGE,
#   NEED_TO_USE_VOCABULARY_WHEN_SORT.  When the latter is YES, the weight
#   of a group whose normalized form also occurs in the vocabulary is
#   doubled.  Normalization is defined for LANGUAGE "en" and "de"; other
#   languages use the word unchanged.
group_words()
{
    local group_script

    # Passed to perl with -e so that no temporary script file is needed.
    IFS= read -r -d '' group_script <<'PERL_SCRIPT' || true
sub normalize_german
{
    my $w = lc(shift);
    for ($w) {
        s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//; s/schaft$//; s/ist$//;
        s/en$//; s/er$//;
        s/lich$//; s/ig$//;
        s/al$//; s/isch$//;
        s/ell$//; s/haft$//;
        s/bar$//; s/sam$//; s/lich$//;
    }
    # No prefix stripping: the prefix list was emptied before use in the
    # original implementation, so no prefix was ever removed.
    return $w;
}

sub normalize_english
{
    my $w = lc(shift);
    for ($w) {
        s/s$//;
        s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//; s/ity$//; s/ment$//;
        s/ed$//;
        s/en$//;
        s/er$//;
        s/est$//;
        s/ing$//;
        s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
        s/ify$//; s/fy$//; s/ly$//;
        s/ise$//; s/ize$//;
        s/e$//;
    }
    return $w;
}

sub normalize
{
    my $w = shift;
    if    ( $ENV{LANGUAGE} eq "en" ) { return normalize_english($w); }
    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german($w); }
    else                             { return $w; }
}

my %Vocabulary;
open(my $voc, '<', $ENV{VOCABULARY})
    or die "Can't open VOCABULARY";
while (my $word = <$voc>) {
    chomp($word);
    $Vocabulary{normalize($word)} = "1";
}
close($voc);

# Normalize every line once and remember the result, instead of
# re-normalizing inside the sort comparison.
my (%group_weight, %key_of, %count_of);
my @lines = <STDIN>;
chomp(@lines);
for my $line (@lines) {
    (my $l = $line) =~ s/^\s*//;
    my ($count, $word) = split(/\s+/, $l, 2);
    my $key = normalize($word);
    $key_of{$line}   = $key;
    $count_of{$line} = $count;
    $group_weight{$key} += $count;
}

if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
    for my $k (keys %group_weight) {
        $group_weight{$k} *= 2 if defined($Vocabulary{$k});
    }
}

my @sorted = sort {
       $group_weight{$key_of{$b}} <=> $group_weight{$key_of{$a}}
    or $key_of{$b} cmp $key_of{$a}
    or $count_of{$b} <=> $count_of{$a}
} @lines;

print "$_\n" for @sorted;
PERL_SCRIPT

    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE
    perl -e "$group_script"
}
# text_from_url URL
#   Print the page at URL as plain text (rendered by "lynx -dump") with
#   every http:// and https:// link removed from the output.
text_from_url()
{
    # In the bracket expression "]" comes first and "-" last so that both
    # are taken literally; the g flag strips every link on a line.
    lynx -dump "$1" | sed -E 's@https?://[]a-zA-Z&_.:/0-9%?=,#+()[~-]*@@g'
}
# add_marks FILE
#   Rewrites FILE in place: every word line ("  COUNT WORD") whose WORD
#   has an entry in the file named by NOTES_FILE gets that note appended,
#   padded so the note starts at column 31.  Lines starting with "#" and
#   lines that do not look like a word line are left unchanged.
#   NOTES_FILE is created empty when missing, and is exported.
add_marks()
{
    local marks_script

    # Passed to perl with -e so that no temporary script file is needed.
    IFS= read -r -d '' marks_script <<'PERL_SCRIPT' || true
my $file = $ARGV[0];
my %dict;
if (open(my $notes, '<', $ENV{NOTES_FILE})) {
    while (<$notes>) {
        chomp;
        s/^\s+//;
        my ($word, $mark) = split /\s+/, $_, 2;
        $dict{$word} = $mark if defined $word;
    }
    close($notes);
}
if (open(my $in, '<', $file)) {
    my @lines = <$in>;
    close($in);

    if (open(my $out, '>', $file)) {
        for (@lines) {
            # The word is only looked up when the pattern matched, so a
            # capture left over from an earlier line is never reused.
            if (not /^#/ and /\s+\S+\s+(\S+)/ and defined($dict{$1})) {
                my $mark = $dict{$1};
                chomp;
                my $space = " " x (30 - length($_));
                print $out "$_$space$mark\n";
            }
            else {
                print $out "$_";
            }
        }
        close($out);
    }
}
PERL_SCRIPT

    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl -e "$marks_script" -- "$1"
}
# remove_marks FILE
#   Rewrites FILE in place, cutting each word line back to
#   "  COUNT WORD" and collecting the text that followed the word as
#   that word's note.  Lines starting with "#" are left unchanged.
#
#   Unless DONT_ADD_MARKS is YES, the file named by NOTES_FILE is then
#   rewritten: entries whose word got no note, or the same note, keep
#   their place; entries whose note changed are dropped and written again
#   at the end together with new notes, sorted by word.
#   NOTES_FILE is created empty when missing.  Exports NOTES_FILE and
#   DONT_ADD_MARKS.
remove_marks()
{
    local unmark_script

    # Passed to perl with -e so that no temporary script file is needed.
    IFS= read -r -d '' unmark_script <<'PERL_SCRIPT' || true
my $file = $ARGV[0];
my %dict;
if (open(my $in, '<', $file)) {
    my @lines = <$in>;
    close($in);

    if (open(my $out, '>', $file)) {
        for (@lines) {
            chomp;
            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                $dict{$4} = $6;
                print $out "$1$2$3$4\n";
            }
            else {
                print $out "$_\n";
            }
        }
        close($out);
    }
}
if (($ENV{DONT_ADD_MARKS} ne "YES") and open(my $notes, '<', $ENV{NOTES_FILE})) {
    my @lines = <$notes>;
    close($notes);

    if (open(my $out, '>', $ENV{NOTES_FILE})) {
        for (@lines) {
            chomp;
            s/^\s+//;
            my ($word, $mark) = split /\s+/, $_, 2;
            next unless defined $word;      # blank line
            # Keep the entry when the user did not touch it; forget the
            # collected note so that it is not written a second time.
            if (!defined($dict{$word}) || $dict{$word} eq $mark) {
                print $out "$_\n";
                delete $dict{$word};
            }
        }
        # Sorted so the file content does not depend on hash order.
        for (sort keys %dict) {
            my $mark  = $dict{$_};
            my $space = " " x (30 - length($_));
            print $out "$_$space$mark\n";
        }
        close($out);
    }
}
PERL_SCRIPT

    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS
    perl -e "$unmark_script" -- "$1"
}
# part [INTERVAL]
#   Filter: prints the part of stdin selected by INTERVAL.
#   With an empty or missing INTERVAL everything is printed.
#   INTERVAL is START-STOP/TOTAL or START+LENGTH/TOTAL: the input lines
#   are divided into TOTAL equal pages and pages START (inclusive) to
#   STOP (exclusive) are printed, e.g. "0-1/2" is the first half.
#   An INTERVAL without a positive TOTAL or without a STOP prints nothing
#   and reports the problem on stderr.
part()
{
    local part_script

    # Passed with -e because the program reads its data from stdin.
    IFS= read -r -d '' part_script <<'PERL_SCRIPT' || true
my @lines = <STDIN>;
my $count = scalar(@lines);     # number of lines, not the last index
my $interval = $ARGV[0];
if (not $interval) {
    print @lines;
    exit 0;
}

my ($start, $stop, $total);
if ($interval =~ m@(.*)/(.*)@) {
    $start = $1;
    $total = $2;
}
else {
    $start = $interval;
    $total = 0;
}
if ($start =~ m@(.*)-(.*)@) {
    $start = $1;
    $stop = $2;
}
if ($start =~ m@(.*)\+(.*)@) {
    $start = $1;
    $stop = $start+$2;
}

unless ($total > 0 and defined($stop)) {
    print STDERR "new-words: bad page range '$interval' (expected start-stop/total)\n";
    exit 1;
}

# Multiply before dividing so that whole pages map to exact line numbers.
$start = int($count*$start/$total);
$stop  = int($count*$stop/$total);
$stop  = $count if $stop > $count;

for (my $i=$start; $i<$stop; $i++) {
    print $lines[$i];
}
PERL_SCRIPT
    perl -e "$part_script" -- "$1"
}
# list_active_tags
#   Print, one per line, the tags that have a "<LANGUAGE>_<tag>.txt" file
#   in WORK_DIR.  Prints nothing when there is none; returns non-zero
#   when WORK_DIR cannot be entered.  Runs in a subshell, so the caller's
#   working directory is not changed.
list_active_tags()
{
    (
        cd "${WORK_DIR}" || exit 1
        for f in "${LANGUAGE}"_*.txt
        do
            # An unmatched pattern stays literal; skip it.
            [ -e "$f" ] || continue
            tag=${f#*_}
            printf '%s\n' "${tag%.txt}"
        done
    )
}

# -T: list the tags and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
    # Nothing is processed in this mode: drop the scratch files first.
    rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
    list_active_tags
    exit
fi
# tag_file_name TAG
#   Print the name of the vocabulary file that belongs to TAG for the
#   current LANGUAGE: "<LANGUAGE>_<TAG>.txt".
tag_file_name()
{
    printf '%s_%s.txt\n' "$LANGUAGE" "$1"
}
# remove_tags TAG...
#   Delete the vocabulary file of each TAG in the current directory and
#   report the result on stdout.  A TAG containing "/", "*" or "?" is
#   rejected (message on stderr) so that it can never name a file outside
#   the current directory or match several files.
remove_tags()
{
    local tag f
    for tag in "$@"
    do
        case "$tag" in
            */* | *\** | *\?*)
                echo "Invalid tag '$tag'" >&2
                continue
                ;;
        esac
        f=$(tag_file_name "$tag")
        if [ -e "$f" ]
        then
            rm -f -- "$f" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
}

# -r: remove the requested tags and stop.
if [ "$REMOVE_TAG" = "YES" ]
then
    # Nothing is processed in this mode: drop the scratch files first.
    rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
    # Never delete anything when the working directory is unavailable.
    cd "${WORK_DIR}" || exit 1
    # Split the space-separated list without expanding glob characters.
    read -r -a tags_to_remove <<< "$TAG_NAME"
    remove_tags "${tags_to_remove[@]}"
    exit 0
fi
# Main processing.
#
# All vocabulary, tag and notes files are addressed relative to WORK_DIR,
# so the script must not continue when that directory is unavailable.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
if ! cd "$WORK_DIR"
then
    rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
    exit 1
fi

# Pick the input text, then run it through the pipeline:
#   part                 keep only the requested pages
#   tee ORIGINAL_TEXT    keep a copy of the text
#   two_and_three_words  optionally append word sequences
#   get_words            count the words that are not in the vocabulary
#   group_words          put related words next to each other
#   add_stat             add statistics / mark lines
# The result is stored twice: TEMP1 stays untouched, TEMP2 is the copy
# that is edited in interactive mode.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
    VOC_FILES=''
    # MERGE_THIS_TAGS is a space-separated list; the split is intentional.
    for i in $MERGE_THIS_TAGS
    do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
    done
    if [ -z "$VOC_FILES" ]
    then
        echo Unknown tags: $MERGE_THIS_TAGS >&2
    else
        # Space-separated list of existing files; the split is intentional.
        cat $VOC_FILES
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
    cat "${LANGUAGE}"_*.txt
else
    case "$1" in
        *http:* | *https:*)
            text_from_url "$1"
            ;;
        *)
            if [ "$#" != 0 ]
            then
                # Relative names refer to the directory the user started in.
                case "$1" in
                    /*) cat "$1" ;;
                    *)  cat "$oldpwd/$1" ;;
                esac
            else
                cat
            fi
            ;;
    esac
fi \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | two_and_three_words \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"
# Output stage.
#
# Statistics and non-interactive mode print the result.  Otherwise the
# list is opened in the editor; every word line the user deleted or
# changed there is appended to the vocabulary (or to the tag's
# vocabulary when -t was given).
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
elif [ -s "$TEMP2" ]
then
    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
    else
        # Unquoted on purpose: EDITOR may carry arguments.
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"

    vocabulary="$VOCABULARY"
    [ -n "$TAG_NAME" ] && vocabulary="$(tag_file_name "$TAG_NAME")"
    # diff prints "< COUNT WORD" / "> COUNT WORD"; the word is field 3.
    # Hunk headers and "---" have no third field, and "#" lines are
    # statistics or mark lines: neither may end up in the vocabulary.
    diff "$TEMP1" "$TEMP2" \
        | awk '$3 != "" && $2 !~ /^#/ {print $3}' \
        | sort -u >> "$vocabulary"
fi

rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"