new-words
view new-words.sh @ 68:846240941452
added -C key: compress to lines; fixed bug with #90-line
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Sep 23 16:07:29 2012 +0300 (2012-09-23) |
parents | f95804355b0f |
children |
line source
1 #!/bin/bash
# Print the command-line help.
#
# Arguments: none
# Outputs:   the usage text on standard error
show_usage()
{
    # Write through the inherited descriptor (>&2). Re-opening /dev/stderr
    # truncates the target when stderr is redirected to a regular file and
    # fails where the device node is missing or not writable.
    cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-f file show only words that are related to the words from the file
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}
# A leading -h is answered before any temporary file is created.
if [ "$1" = "-h" ]
then
    show_usage
    exit 0
fi

# Directory holding the vocabulary, tag and notes files.
WORK_DIR=~/.new-words/

# Scratch files: TEMP1 keeps the generated word list, TEMP2 is the copy that
# is edited, ORIGINAL_TEXT keeps the raw input (exported for the helpers).
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# Remove every scratch file this run may have created.
cleanup_temp_files()
{
    rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "${TEMP1}-full2" "$ORIGINAL_TEXT"
}

# Several code paths leave through a bare "exit"; the trap keeps them from
# leaving scratch files behind in /tmp.
trap cleanup_temp_files EXIT
# language detection
#
# English is the default. A Wikipedia article URL among the arguments selects
# the language of its two-character subdomain (http and https alike). A
# program name other than "new-..." (e.g. a "de-words" link) overrides both.
LANGUAGE=en
my_name=${0##*/}
my_name=${my_name%%-*}
for arg
do
    case "$arg" in
        *http://??.wikipedia.org/wiki/*|*https://??.wikipedia.org/wiki/*)
            LANGUAGE=${arg#*://}
            LANGUAGE=${LANGUAGE%%.wikipedia*}
            ;;
    esac
done
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing

# Defaults. Every flag is initialised here so that a value inherited from
# the environment cannot switch a mode on by accident.
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
ALLOWED_WORDS_FILENAME=''
REMOVE_TAG=NO

while getopts cl:sSkanNp:t:Tm:Mr:23h opt
do
    case "$opt" in
        c) COMPRESSED_WORDLIST=YES;;
        s) STAT_ONLY=YES;;
        S) SHOW_VOC_STAT=YES;;
        k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
        l) LANGUAGE="$OPTARG";;
        a) DONT_ADD_MARKS=YES;;
        n) NON_INTERACTIVE_MODE=YES;;
        N) FILTER_WORDS=NO;;
        p) PART_TO_PROCESS="$OPTARG";;
        t) TAG_NAME="$OPTARG";;
        T) TAGS_LIST_ONLY="YES";;
        # NOTE: the merge list is built from TAG_NAME (tags given by an
        # earlier -t/-r) plus this tag, so a repeated -m keeps only the last one.
        m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
        M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
        r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
        2) GROUP_WORDS_BY_TWO=YES;;
        3) GROUP_WORDS_BY_THREE=YES;;
        h) # help requested anywhere among the switches
            show_usage
            exit 0;;
        \?) # unknown flag
            show_usage
            exit 1;;
    esac
done
shift $((OPTIND - 1))

# A "-l lang" pair is also accepted after the regular switches.
if [ "$1" = "-l" ]
then
    LANGUAGE="$2"
    shift 2
fi
# Per-language file names, relative to WORK_DIR.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this program over the vocabulary file itself and print the fifth
# field of the summary line (the "<groups/words>" pair) without the brackets.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -1 | awk '{print $5}' | tr -d "<>"
    # the scratch files created at start-up are not needed on this path
    rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
    exit 0
fi
127 #----------------------------------------------------
# Turn the text on stdin into a frequency list of words.
#
# Arguments: $1 - file that receives the complete word list (one word per
#                 line) before known words are filtered out
# Globals:   FILTER_WORDS (exported for grep_v_english_perl)
# Outputs:   "count word" lines on stdout, most frequent first
get_words()
{
    export FILTER_WORDS
    # Apostrophes are hidden behind a placeholder while Perl blanks out the
    # punctuation, then restored; only lines made of letters, spaces,
    # apostrophes, '_' and '-' survive the Perl filter.
    tr ' ' '\n' \
        | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
        | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/' \
        | sed "s/__APOSTROPHE__/'/g" \
        | tr ' ' '\n' \
        | tee "$1" \
        | grep_v_english_perl \
        | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
}
# Prepend a summary line to the word list on stdin and insert "# NN"
# mark lines at the points where the cumulative share of covered words
# crosses NN percent.
#
# Arguments: $1 - file with the complete word list of the text
# Globals:   DONT_ADD_MARKLINES, STAT_ONLY, LANGUAGE, ORIGINAL_TEXT (read)
# Outputs:   the annotated list on stdout; with STAT_ONLY=YES a two-line
#            statistics table instead; with DONT_ADD_MARKLINES=YES stdin
#            is passed through unchanged
add_stat()
{
    if [ "$DONT_ADD_MARKLINES" = "YES" ]
    then
        cat
        return
    fi

    local before="$1"
    local after="${before}2"
    local total total_unknown total_known percentage sentences groups words
    local mark_script

    cat > "$after"

    # "+ 0" makes awk print a number even for empty input, so the arithmetic
    # below never sees an empty operand.
    total=$(wc -w < "$before" | awk '{ print $1 + 0 }')
    total_unknown=$(awk '{ s += $1 } END { print s + 0 }' "$after")
    total_known=$((total - total_unknown))

    if [ "$total" -gt 0 ]
    then
        # known share, truncated to one decimal place
        percentage=$(echo "100*($total-$total_unknown)/$total" | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        percentage=0
    fi

    # the number of full stops stands in for the number of sentences
    sentences=$(perl -e 'local $/; $_ = <>; $_ = "" unless defined $_; print tr/.//;' "$ORIGINAL_TEXT")
    sentences=${sentences:-0}

    if [ "$STAT_ONLY" = "YES" ]
    then
        # a text without any full stop counts as one sentence, which also
        # keeps the divisions below defined
        [ "$sentences" -gt 0 ] || sentences=1
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $(echo "100-$percentage" | bc -l) $total_known $total $((total / sentences)) $((10 * total_unknown / sentences)) "
        rm -f -- "$after"
        return 0
    fi

    groups=$(awk '/# groups/ { print $3; exit }' "$after")
    words=$(grep -vc '^#' "$after")
    echo "# $LANGUAGE, $percentage, <$total_known/$total>, <${groups:-0}/$words>"

    mark_script=$(mktemp /tmp/perl-add-stat-XXXXXXXX) || { rm -f -- "$after"; return 1; }
    cat <<'PERL_SCRIPT' > "$mark_script"
my $total=shift(@ARGV);
my $total_known=shift(@ARGV);
my $s=0;

# Nothing to measure against: just drop the "# groups" line.
if (!$total) {
    while(<>) { print unless /^#\s*groups\s*/; }
    exit 0;
}

# First mark: the next multiple of 5 above the known share; from 90% on
# the marks come in steps of 1.
my $mark_line=int($total_known*100/$total/5)*5;
if ($mark_line>=90) {
    $mark_line=int($total_known*100/$total)+1;
} else { $mark_line +=5; };

while(<>)
{
    next if /^#\s*groups\s*/;
    print;
    /^\s*([0-9]*)\s*/;
    $s+=$1;
    if (($total_known+$s)*100/$total>=$mark_line) {
        print "# $mark_line\n";
        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
    }
}
PERL_SCRIPT
    perl "$mark_script" "$total" "$total_known" "$after"
    rm -f -- "$mark_script"
    rm -f -- "$after"
}
# Pass stdin through and, when -2 / -3 was given, append every sequence of
# two / three neighbouring words of the original text, joined with "_".
#
# Globals: GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE, ORIGINAL_TEXT (read)
# Outputs: stdin unchanged, followed by the generated sequences
two_and_three_words()
{
    local script

    # The text itself always goes through; cat returns only after the whole
    # input has arrived, so ORIGINAL_TEXT is complete when it is read below.
    cat
    if [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ]
    then
        return
    fi

    export GROUP_WORDS_BY_THREE
    export GROUP_WORDS_BY_TWO
    script=$(mktemp /tmp/perl-two-and-three-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
#!/usr/bin/perl
local $/;
my $text = <>;
$text = "" unless defined $text;
# punctuation and digits separate words
$text =~ s@[!?;,:#0-9".]@ @g;
my @words = split ' ', $text;
# Walk up to the last word: slots past the end of the list are undefined
# and are rejected by the checks below, so the final sequences are kept.
for (my $i = 0; $i <= $#words; $i++) {
    my ($w1, $w2, $w3) = @words[$i .. $i + 2];
    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and defined $w2 and defined $w3) {
        print "${w1}_${w2}_${w3}\n";
    }
    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and defined $w2) {
        print "${w1}_${w2}\n";
    }
}
PERL_SCRIPT
    perl "$script" "$ORIGINAL_TEXT"
    rm -f -- "$script"
}
# Drop from stdin every line that is exactly one of the words listed in
# the vocabulary file (apostrophes in the vocabulary are ignored).
#
# Globals: VOCABULARY (read; the file is created when missing)
# Outputs: the remaining lines on stdout
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    # The vocabulary is handed to grep as a list of fixed strings instead of
    # being pasted into an eval'ed command line, so its contents can never
    # be executed or misread as a regular expression.
    grep -Fxv -f <(tr -d "'" < "$VOCABULARY" | tr -s '[:space:]' '\n' | sed '/^$/d') \
        || [ "$?" -eq 1 ]   # "no line selected" is not an error here
}
# Drop from stdin every line that is listed in the main vocabulary or in
# the vocabulary of one of the active tags.
#
# Globals: VOCABULARY, TAG_NAME (read); VOC_FILES (written, exported);
#          FILTER_WORDS (read from the environment by the Perl script;
#          "NO" turns the filter off)
# Outputs: the remaining lines on stdout
grep_v_english_perl()
{
    local script i
    script=$(mktemp /tmp/perl-grep-v-english-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
if ($ENV{FILTER_WORDS} eq "NO") {
    print while <>;
    exit(0);
}

# Every line of every vocabulary file is a known word.
my %voc;
for my $voc_file (split ' ', $ENV{VOC_FILES}) {
    # three-argument open: the name is never parsed for a mode or pipe
    if (open(VOC, "<", $voc_file)) {
        while (<VOC>) {
            chomp;
            $voc{$_} = "1";
        }
        close(VOC);
    }
}

while (<>) {
    chomp;
    print "$_\n" unless defined $voc{$_};
}
PERL_SCRIPT
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    export VOCABULARY VOC_FILES
    VOC_FILES=$VOCABULARY
    # TAG_NAME is a space-separated list, so it is split on purpose
    for i in $TAG_NAME
    do
        VOC_FILES="${VOC_FILES} $(tag_file_name "$i")"
    done
    perl "$script"
    rm -f -- "$script"
}
# Reorder the "count word" list on stdin so that forms of the same word
# (same normalised stem, or linked through an "@word" note) are adjacent
# and the heaviest groups come first.
#
# Globals: VOCABULARY, NOTES_FILE, LANGUAGE, COMPRESSED_WORDLIST,
#          NEED_TO_USE_VOCABULARY_WHEN_SORT (read, exported for Perl)
# Outputs: a "# groups N" line followed by the sorted list; with
#          COMPRESSED_WORDLIST=YES one line per group (summed count and
#          the shortest, least capitalised member)
group_words()
{
    local script
    script=$(mktemp /tmp/perl-group-words-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
#!/usr/bin/perl

use Encode;
use utf8;

# The Snowball stemmer is used for German only. It is loaded on demand so
# that a missing module does not stop the other languages from working.
our $HAVE_Snowball = 0;
eval {
    require Lingua::Stem::Snowball;
    $HAVE_Snowball = 1;
};

# Read NOTES_FILE into a hash: first field => rest of the line.
sub load_notes_dict
{
    my %dict;
    if (open(NOTES, "<", $ENV{NOTES_FILE})) {
        while (<NOTES>) {
            $_ = decode("utf8", $_);
            chomp;
            s/^\s+//;
            my ($word, $note) = split /\s+/, $_, 2;
            next unless defined $word;
            $dict{$word} = $note;
        }
        close(NOTES);
    }
    return %dict;
}

sub normalize_ukrainian
{
    my $w = lc(shift);
    $w =~ s/[юіоеуаи]$//;
    return $w;
}

sub normalize_esperanto
{
    my $w = lc(shift);
    # verbs
    $w =~ s/i$//; $w =~ s/is$//; $w =~ s/os$//; $w =~ s/as$//; $w =~ s/us$//;
    # nouns
    $w =~ s/j?n?$//;
    return $w;
}

# Hand-written German suffix stripper, used when Snowball is unavailable.
sub normalize_german_fallback
{
    my $w = lc(shift);
    for my $suffix (qw(heit keit tum ung nis schaft ist
                       en er
                       lich ig
                       al isch
                       ell haft
                       bar sam lich)) {
        $w =~ s/$suffix$//;
    }
    return $w;
}

sub normalize_german
{
    my $word = shift;
    return normalize_german_fallback($word) unless $HAVE_Snowball;
    my @stems = Lingua::Stem::Snowball::stem('de', [ $word ]);
    return $stems[0];
}

sub normalize_english
{
    my $w = lc(shift);
    # the order of the suffixes matters: each one is tried once, in turn
    for my $suffix (qw(s
                       ation ness ship ally ance ity ment
                       ed en er est ing
                       ism ist ful able ably
                       ify fy ly
                       ise ize
                       e)) {
        $w =~ s/$suffix$//;
    }
    return $w;
}

# Language-specific stem of a word; unknown languages keep the word as is.
sub normalize_without_linked
{
    my $word = shift;
    my $lang = $ENV{LANGUAGE};
    if    ($lang eq "en") { return normalize_english($word); }
    elsif ($lang eq "de") { return normalize_german($word); }
    elsif ($lang eq "uk") { return normalize_ukrainian($word); }
    # "io" is kept for compatibility; "eo" is the Esperanto language code
    elsif ($lang eq "io" or $lang eq "eo") { return normalize_esperanto($word); }
    else  { return $word; }
}

# Stem of a word, redirected to the group it was linked to in the notes.
sub normalize_with_linked
{
    my $word = normalize_without_linked(shift);
    return $linked_words{$word} ? $linked_words{$word} : $word;
}

sub normalize
{
    return normalize_with_linked(shift);
}

# Split a "   count word" line into (count, word).
sub split_line
{
    my $line = shift;
    $line =~ s/^\s*//;
    return split(/\s+/, $line, 2);
}

# Order two lines by group weight, then group name, then own count.
sub compare
{
    my ($left, $right) = @_;
    my ($lcount, $lword) = split_line($left);
    my ($rcount, $rword) = split_line($right);
    my ($lgroup, $rgroup) = (normalize($lword), normalize($rword));

    return ($group_weight{$lgroup} <=> $group_weight{$rgroup})
        || ($lgroup cmp $rgroup)
        || ($lcount <=> $rcount);
}

# A note containing "@other" links the word to the group of "other".
sub find_linked_words
{
    my $dict = shift;
    my %linked;
    for my $key (keys %$dict) {
        my $val = $dict->{$key};
        if (defined $val and $val =~ /\@([a-z]*)/) {
            my $target = $1;   # saved before normalize() runs its own regexes
            $linked{normalize($key)} = normalize($target);
        }
    }
    return %linked;
}

# Number of characters that are not plain lowercase a-z.
sub lc_length
{
    my $w = shift;
    $w =~ s/[a-z]//g;
    return length($w);
}

our %dict = load_notes_dict();
our %linked_words = find_linked_words(\%dict);
our %group_weight;

our %Vocabulary;
open(VOC, "<", $ENV{VOCABULARY})
    or die "Can't open VOCABULARY";
# decode like STDIN below, so that stems of non-ASCII words compare equal
binmode VOC, ":utf8";
while (<VOC>) {
    chomp;
    $Vocabulary{normalize($_)} = "1";
}
close(VOC);

binmode STDIN, ":utf8";
my @lines = <STDIN>;
chomp(@lines);
for my $line (@lines) {
    my ($count, $word) = split_line($line);
    $group_weight{normalize($word)} += $count;
}
# -k: groups that contain a known word weigh twice as much
if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
    for my $k (keys %group_weight) {
        $group_weight{$k} *= 2 if defined($Vocabulary{$k});
    }
}
my @sorted = sort { compare($b, $a) } @lines;

binmode STDOUT, ":utf8";
print "# groups ".scalar(keys(%group_weight))."\n";
if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
    my ($started, $sum, $best, $main_word, $prev_group) = (0, 0, 0, '', '');
    for my $line (@sorted) {
        my ($count, $word) = split_line($line);
        my $group = normalize($word);
        my $score = length($word) + 2*lc_length($word);
        if ($started and $group ne $prev_group) {
            # group finished: print it and start the next one
            printf "%7s %s\n", $sum, $main_word;
            ($sum, $best, $main_word) = ($count, $score, $word);
        }
        else {
            $sum += $count;
            if (!$started or $score < $best) {
                ($best, $main_word) = ($score, $word);
            }
        }
        ($started, $prev_group) = (1, $group);
    }
    # the last group has no successor that would trigger its output
    printf "%7s %s\n", $sum, $main_word if $started;
}
else {
    print "$_\n" for @sorted;
}
PERL_SCRIPT
    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE
    export COMPRESSED_WORDLIST
    # the Perl script dies without a vocabulary file, so make sure it exists
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl "$script"
    rm -f -- "$script"
}
# Fetch a web page as plain text and strip the URLs from it.
#
# Arguments: $1 - URL of the page
# Outputs:   the rendered text on stdout
text_from_url()
{
    # /g removes every URL on a line, not just the first; https links are
    # stripped like http ones
    lynx -dump "$1" | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
# Append the stored note of each word to its line in the word list file.
#
# Arguments: $1 - word list file ("count word" lines), rewritten in place
# Globals:   NOTES_FILE (read, exported; created when missing)
add_marks()
{
    local script
    script=$(mktemp /tmp/perl-add-marks-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
use Encode;

# Read NOTES_FILE into a hash: first field => rest of the line.
sub load_notes_dict
{
    my %dict;
    if (open(NOTES, "<", $ENV{NOTES_FILE})) {
        while (<NOTES>) {
            $_ = decode("utf8", $_);
            chomp;
            s/^\s+//;
            my ($word, $note) = split /\s+/, $_, 2;
            next unless defined $word;
            $dict{$word} = $note;
        }
        close(NOTES);
    }
    return %dict;
}

my %dict = load_notes_dict();

my $file = $ARGV[0];
if (open(F, "<", $file)) {
    my @lines = map { decode("utf8", $_) } <F>;
    close(F);

    if (open(F, ">", $file)) {
        binmode F, ":utf8";
        for (@lines) {
            # the word is taken from this line's own match only
            if (not /^#/ and m/\s+\S+\s+(\S+)/ and defined($dict{$1})) {
                my $mark = $dict{$1};
                chomp;
                # notes start in column 31; longer lines still get one
                # separating space so the note can be split off again
                my $pad = 30 - length($_);
                $pad = 1 if $pad < 1;
                print F $_, " " x $pad, $mark, "\n";
            }
            else {
                print F $_;
            }
        }
        close(F);
    }
}
PERL_SCRIPT
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl "$script" "$1"
    rm -f -- "$script"
}
# Strip the notes from the lines of the word list file and store them in
# the notes file.
#
# Arguments: $1 - word list file, rewritten in place without the notes
# Globals:   NOTES_FILE (read, rewritten; created when missing),
#            DONT_ADD_MARKS ("YES" leaves the notes file untouched)
remove_marks()
{
    local script
    script=$(mktemp /tmp/perl-remove-marks-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
my $file = $ARGV[0];
my %dict;   # word => note, as found in the word list

if (open(F, "<", $file)) {
    my @lines = <F>;
    close(F);

    if (open(F, ">", $file)) {
        for (@lines) {
            chomp;
            # "<indent>count<space>word<space>note": keep the first four parts
            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                $dict{$4} = $6;
                print F "$1$2$3$4\n";
            }
            else {
                print F "$_\n";
            }
        }
        close(F);
    }
}

if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, "<", $ENV{NOTES_FILE})) {
    my @notes = <NOTES>;
    close(NOTES);

    if (open(NOTES, ">", $ENV{NOTES_FILE})) {
        # Stored notes survive only for words that are not in the current
        # list; entries without a note are dropped.
        for (@notes) {
            chomp;
            s/^\s+//;
            my ($word, $note) = split /\s+/, $_, 2;
            next unless defined $word and length $word;
            next if defined $dict{$word};
            next unless defined $note and length $note;
            print NOTES "$_\n";
        }
        # Words of the current list are written from the list itself.
        for my $word (sort keys %dict) {
            my $mark = $dict{$word};
            next unless length $mark;
            # always keep a separator, or a long word and its note would
            # merge into a single field
            my $pad = 30 - length($word);
            $pad = 1 if $pad < 1;
            print NOTES $word, " " x $pad, $mark, "\n";
        }
        close(NOTES);
    }
}
PERL_SCRIPT
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS
    perl "$script" "$1"
    rm -f -- "$script"
}
# Print only the requested part of stdin.
#
# Arguments: $1 - interval "start-stop/total" or "start+count/total": the
#                 input is cut into "total" equal pages and the pages
#                 start..stop-1 are printed. Without "/total" the numbers
#                 are taken as zero-based line indices. A lone start selects
#                 a single page (or line). An empty interval prints
#                 everything.
# Outputs:   the selected lines on stdout
part()
{
    local script
    script=$(mktemp /tmp/perl-part-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
#!/usr/bin/perl

my @lines = <STDIN>;
my $count = scalar(@lines);   # number of lines, not the last index
my $interval = $ARGV[0];

if (not $interval) {
    print @lines;
    exit 0;
}

my ($start, $stop, $total) = ($interval, undef, 0);
if ($interval =~ m@(.*)/(.*)@) {
    $start = $1;
    $total = $2;
}
if ($start =~ m@(.*)-(.*)@) {
    $start = $1;
    $stop = $2;
}
if ($start =~ m@(.*)\+(.*)@) {
    $start = $1;
    $stop = $start+$2;
}
$stop = $start + 1 unless defined $stop;

# With a page total the bounds are scaled to line numbers; without one
# they already are line numbers (and there is nothing to divide by).
if ($total > 0) {
    $start = int($count*$start/$total);
    $stop  = int($count*$stop/$total);
}
else {
    $start = int($start);
    $stop  = int($stop);
}
$start = 0 if $start < 0;
$stop = $count if $stop > $count;

for (my $i = $start; $i < $stop; $i++) {
    print $lines[$i];
}
PERL_SCRIPT
    perl "$script" "$1"
    rm -f -- "$script"
}
# Print the names of the tags that have a vocabulary file for LANGUAGE.
#
# Globals: WORK_DIR, LANGUAGE (read)
# Outputs: one tag name per line on stdout
list_tags()
{
    (
        cd "${WORK_DIR}" || exit 1
        for f in "${LANGUAGE}"_*.txt
        do
            [ -e "$f" ] || continue   # the pattern matched nothing
            tag=${f#*_}               # drop "<language>_"
            printf '%s\n' "${tag%.txt}"
        done
    )
}

# -T: list the active tags and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
    list_tags
    # the scratch files created at start-up are not needed on this path
    rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
    exit 0
fi
# Name of the vocabulary file that belongs to a tag.
#
# Arguments: $1 - tag name
# Globals:   LANGUAGE (read)
# Outputs:   "<language>_<tag>.txt" on stdout
tag_file_name()
{
    printf '%s\n' "${LANGUAGE}_${1}.txt"
}
# Delete the vocabulary files of the given tags in the current directory.
#
# Arguments: tag names
# Outputs:   one status line per tag on stdout; rejected names on stderr
remove_tags()
{
    local tag f
    for tag in "$@"
    do
        # A tag becomes part of a file name: refuse anything that could
        # leave the directory or act as a pattern. The name that is checked
        # is the one that is about to be removed.
        case "$tag" in
            */*|*'*'*|*'?'*)
                echo "Invalid tag name '$tag'" >&2
                continue;;
        esac
        f=$(tag_file_name "$tag")
        if [ -e "$f" ]
        then
            rm -f -- "$f" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
}

# -r: remove the listed tags and stop.
if [ "$REMOVE_TAG" = "YES" ]
then
    cd "${WORK_DIR}" || exit 1
    # TAG_NAME is a space-separated list, so it is split on purpose
    remove_tags $TAG_NAME
    # the scratch files created at start-up are not needed on this path
    rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
    exit 0
fi
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
# every vocabulary, tag and notes file name below is relative to WORK_DIR
cd "$WORK_DIR" || exit 1

# Select the input text (merged tag vocabularies, a web page, a file or
# stdin) and run it through the processing chain. The finished word list
# ends up in both TEMP1 (reference copy) and TEMP2 (copy to be edited).
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
    VOC_FILES=''
    for i in $MERGE_THIS_TAGS
    do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
    done
    if [ -z "$VOC_FILES" ]
    then
        echo Unknown tags: $MERGE_THIS_TAGS >&2
    else
        # VOC_FILES is a space-separated list, so it is split on purpose
        cat $VOC_FILES
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
    cat "${LANGUAGE}"_*.txt
elif [[ "$1" == *http:* || "$1" == *https:* ]]
then
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    # relative names refer to the directory the program was started in
    case "$1" in
        /*) cat "$1" ;;
        *)  cat "$oldpwd/$1" ;;
    esac
else
    cat
fi \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | two_and_three_words \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"
# Show the result, or let the user edit it and learn from the edit.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
elif [ -s "$TEMP2" ]
then
    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
    else
        # EDITOR may carry arguments, so it is split on purpose
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"

    vocabulary="$VOCABULARY"
    # shellcheck disable=SC2086 -- only the first tag of the list is used
    [ -n "$TAG_NAME" ] && vocabulary="$(tag_file_name $TAG_NAME)"
    # Every word list line that differs between the two copies names a word
    # the user marked as known. Only "count word" lines are taken: diff's
    # own control lines and the "# ..." comment lines are not vocabulary.
    diff "$TEMP1" "$TEMP2" \
        | awk '($1 == "<" || $1 == ">") && $2 ~ /^[0-9]+$/ && $3 != "" { print $3 }' \
        | sort -u >> "$vocabulary"
fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"