new-words
view new-words.sh @ 37:be6336e98b3c
new-words.py started
author | igor@chub.in |
---|---|
date | Fri Jan 21 15:59:45 2011 +0200 (2011-01-21) |
parents | 3827cce83602 |
children | 4e931db74618 |
line source
1 #!/bin/bash
# Print the command-line help screen.
# Arguments: none
# Outputs:   the usage text on standard error; nothing on standard output
show_usage()
{
    # Write to file descriptor 2 directly: /dev/stderr is a path that may be
    # missing or not writable (e.g. after switching users), fd 2 always works.
    cat <<'HELP' >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-c show compressed wordlist: one word per group
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# Early help request: "-h" as the very first argument prints the usage
# screen and exits before any temporary file is created.
if [ "$1" = "-h" ]
then
    show_usage
    exit 0
fi

# Working directory (vocabularies, notes) and scratch files of this run.
WORK_DIR=~/.new-words/
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX)
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX)
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX)
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# Remove the scratch files on every exit path of the main shell, including
# the early "exit" statements further down that used to leave them behind.
trap 'rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"' EXIT

# language detection

LANGUAGE=en

# Program name without directory and without everything from the first "-":
# "de-words" gives "de", "new-words" gives "new".
my_name=${0##*/}
my_name=${my_name%%-*}

# A Wikipedia article URL among the arguments selects the language from its
# host name prefix; both http:// and https:// URLs are recognised.
for arg
do
    if printf '%s\n' "$arg" | grep -q 'https\{0,1\}://...wikipedia.org/wiki/'
    then
        LANGUAGE=$(printf '%s\n' "$arg" | sed -e 's@https\{0,1\}://@@' -e 's@.wikipedia.*@@')
    fi
done

# Any program name other than "new-..." overrides the detected language.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing

# Parse the command-line switches passed as arguments.
# Globals written: all option variables initialised below, LANGUAGE (only
#                  with -l) and OPTIND (index of the first non-option
#                  argument, for the caller's "shift").
# Exits: status 0 after printing the usage for -h, status 1 for an unknown
#        switch.
parse_options()
{
    local opt

    STAT_ONLY=NO
    NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
    DONT_ADD_MARKS=NO
    NON_INTERACTIVE_MODE=NO
    PART_TO_PROCESS=''
    GROUP_WORDS_BY_THREE=NO
    GROUP_WORDS_BY_TWO=NO
    TAG_NAME=''
    MERGE_THIS_TAGS=''
    TAGS_LIST_ONLY=NO
    MERGE_TAGGED_WORDS=NO
    MERGE_ALL_TAGGED=NO
    DONT_ADD_MARKLINES=NO
    FILTER_WORDS=YES
    SHOW_VOC_STAT=NO
    COMPRESSED_WORDLIST=NO
    # Initialised explicitly so that a value inherited from the environment
    # cannot trigger tag removal.
    REMOVE_TAG=NO

    OPTIND=1
    # "h" is part of the option string so that -h is honoured in any
    # position, not only as the very first argument.
    while getopts hcl:sSkanNp:t:Tm:Mr:23 opt
    do
        case "$opt" in
            h) show_usage
               exit 0;;
            c) COMPRESSED_WORDLIST=YES;;
            s) STAT_ONLY=YES;;
            S) SHOW_VOC_STAT=YES;;
            k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
            l) LANGUAGE="$OPTARG";;
            a) DONT_ADD_MARKS=YES;;
            n) NON_INTERACTIVE_MODE=YES;;
            N) FILTER_WORDS=NO;;
            p) PART_TO_PROCESS="$OPTARG";;
            t) TAG_NAME="$OPTARG";;
            T) TAGS_LIST_ONLY="YES";;
            m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
            M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
            r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
            2) GROUP_WORDS_BY_TWO=YES;;
            3) GROUP_WORDS_BY_THREE=YES;;
            \?) # unknown flag
                show_usage
                exit 1;;
        esac
    done
}

parse_options "$@"
shift $((OPTIND - 1))

# "-l lang" is also accepted directly after the switches handled above.
if [ "$1" = "-l" ]
then
    LANGUAGE="$2"
    shift 2
fi

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this program non-interactively on the vocabulary file itself and
# print the fifth field of the first output line with "<" and ">" removed.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -1 | awk '{print $5}' | tr -d "<>"
    exit 0
fi
125 #----------------------------------------------------
# Turn the text on standard input into a frequency list of its words.
# Arguments: $1 - file that receives every accepted word, one per line,
#                 before known words are filtered out
# Outputs:   "count word" lines on stdout, highest count first
get_words()
{
    # grep_v_english_perl reads this switch from its environment.
    export FILTER_WORDS

    # 1. one token per line; "--" becomes a space
    # 2. apostrophes are masked so they pass through the Perl filter
    # 3. Perl replaces punctuation with spaces and keeps only lines that
    #    consist of letters, spaces, apostrophes, "_" and "-"
    # 4. apostrophes are restored and the tokens split again
    # 5. the word list is saved to "$1", known words are removed, and the
    #    rest is counted and sorted by descending count
    tr ' ' '\n' |
        sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" |
        perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/' |
        sed "s/__APOSTROPHE__/'/g" |
        tr ' ' '\n' |
        tee "$1" |
        grep_v_english_perl |
        sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
}
# Prepend a statistics header to the grouped word list read from stdin and
# insert "# NN" mark lines at known-percentage thresholds.
# Globals:   DONT_ADD_MARKLINES, STAT_ONLY, LANGUAGE, ORIGINAL_TEXT (read)
# Arguments: $1 - file holding every word of the text (one per line); the
#                 word list from stdin is buffered in "${1}2" and removed
#                 again before returning
# Outputs:   with DONT_ADD_MARKLINES=YES the input unchanged;
#            with STAT_ONLY=YES a two-line statistics table;
#            otherwise a "# lang, known%, <known/total>, <groups/words>"
#            header followed by the word list with mark lines
add_stat()
{
    if [ "$DONT_ADD_MARKLINES" = "YES" ]
    then
        cat
        return
    fi

    local before="$1"
    local after="${before}2"
    local total total_unknown total_known percentage sentences
    local groups words mark_script

    cat > "$after"

    total=$(wc -w < "$before" | awk '{print $1}')
    # "s+0" prints 0 instead of an empty string when the list is empty.
    total_unknown=$(awk '{s=s+$1} END{print s+0}' "$after")
    total_known=$((total - total_unknown))

    # Known-word percentage cut to one decimal; an empty text has no words,
    # so the division is skipped and 0 is reported.
    if [ "$total" -gt 0 ]
    then
        percentage=$(echo "100*($total-$total_unknown)/$total" | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        percentage=0
    fi

    # Sentences are counted as the number of "." characters in the text.
    # A text without any full stop is treated as one sentence so that the
    # per-sentence figures below never divide by zero.
    sentences=$(perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);' "$ORIGINAL_TEXT")
    case "$sentences" in
        ''|0) sentences=1;;
    esac

    if [ "$STAT_ONLY" = "YES" ]
    then
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $(echo "(100-$percentage)" | bc -l) $total_known $total $((total / sentences)) $((10 * total_unknown / sentences)) "
        rm -f -- "$after"
        return 0
    else
        groups=$(echo $(grep '# groups' "$after" | awk '{print $3}'))
        words=$(grep -vc '^#' "$after")
        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
    fi

    mark_script=$(mktemp /tmp/perl-grep-v-english-XXXXXXXX)
    cat <<'PERL_SCRIPT' > "$mark_script"
my $total=shift(@ARGV);
my $total_known=shift(@ARGV);
# An empty text has zero words; divide by 1 then so that every percentage
# is 0 instead of the script dying on a division by zero.
my $divisor = $total > 0 ? $total : 1;
my $s=0;
# First threshold: next multiple of 5 above the known percentage, or the
# next whole percent once 90% is reached.
my $mark_line=int($total_known*100/$divisor/5)*5;
if ($mark_line>=90) {
    $mark_line=int($total_known*100/$divisor)+1;
} else { $mark_line +=5; };
while(<>)
{
    next if /^#\s*groups\s*/;
    print;
    /^\s*([0-9]*)\s*/;
    $s+=$1;
    # Emit a mark line when the known words plus the words listed so far
    # reach the current threshold.
    if (($total_known+$s)*100/$divisor>=$mark_line) {
        print "# $mark_line\n";
        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
    }
}
PERL_SCRIPT
    perl "$mark_script" "$total" "$total_known" "$after"
    rm -f -- "$mark_script"
    rm -f -- "$after"
}
# Copy stdin to stdout and, when requested, append the two- and three-word
# sequences of the original text as extra "words" joined by "_".
# Globals: GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE (read, exported),
#          ORIGINAL_TEXT (read) - file the sequences are built from
# Outputs: the input text, followed by one sequence per line
two_and_three_words()
{
    if [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ]
    then
        cat
        return
    fi

    cat

    export GROUP_WORDS_BY_THREE
    export GROUP_WORDS_BY_TWO
    local seq_script
    seq_script=$(mktemp /tmp/perl-two-and-three-XXXXXXXX)
    cat <<'PERL_SCRIPT' > "$seq_script"
#!/usr/bin/perl
local $/;
$words=<>;
# Punctuation and all digits separate words (the class used to be 1-9,
# which left stray "0" tokens behind).
$words =~ s@[!?;,:#0-9".]@ @g;
$words =~ s@\s+@ @g;
@words = split /\s+/, $words;

sub usable { return defined($_[0]) && length($_[0]); }

# Visit every position that still has a following word. The old bound
# "$i<$#words-3" stopped early and lost the last sequences of the text.
for ($i=0; $i<$#words; $i++) {
    my ($w1,$w2,$w3)= ($words[$i],$words[$i+1],$words[$i+2]);
    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and usable($w1) and usable($w2) and usable($w3)) {
        print "${w1}_${w2}_${w3}\n";
    };
    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and usable($w1) and usable($w2)) {
        print "${w1}_${w2}\n";
    };
}
PERL_SCRIPT
    perl "$seq_script" "$ORIGINAL_TEXT"
    rm -f -- "$seq_script"
}
# Filter stdin, dropping every line that is exactly one of the words in the
# vocabulary file.
# Globals: VOCABULARY (read) - vocabulary file, created empty if missing
# Outputs: the lines of stdin that are not vocabulary words
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    # The vocabulary is handed to grep as a list of fixed strings instead of
    # being pasted into a command line and run through eval: file content
    # can no longer be executed as shell code, and characters such as "."
    # in a word are not treated as regex wildcards.
    # As before, apostrophes are stripped from the vocabulary words and
    # whitespace separates words. Exit status 1 of grep only means that no
    # line was printed, which is not an error here.
    grep -vxF -f <(tr -d "'" < "$VOCABULARY" | tr -s '[:space:]' '\n' | sed '/^$/d') \
        || [ $? -eq 1 ]
}
# Filter stdin, dropping every line that is a known word.
# Known words are the lines of the main vocabulary file plus the lines of
# the vocabulary file of each tag in TAG_NAME.
# Globals: VOCABULARY (read, exported; the file is created if missing),
#          TAG_NAME (read), VOC_FILES (written, exported),
#          FILTER_WORDS (read by the Perl script from the environment;
#          the value "NO" disables filtering)
# Outputs: the lines of stdin that are not known words
grep_v_english_perl()
{
    local filter_script tag
    filter_script=$(mktemp /tmp/perl-grep-v-english-XXXXXXXX)
    cat <<'PERL_SCRIPT' > "$filter_script"
if ($ENV{FILTER_WORDS} eq "NO") {
    while(<>) { print; }
    exit(0);
}
$voc_files=$ENV{VOC_FILES};
$voc_files=~s@^ @@;
for $voc_file (split /\s+/,$voc_files) {
    # Three-argument open: the file name comes from the user-selected
    # language and must never be read as an open mode such as ">name".
    if (open(VOC, "<", $voc_file)) {
        while (<VOC>){
            chomp;
            $voc{$_}="1";
        }
        close(VOC);
    }
}
while(<>) {
    chomp;
    if (not defined($voc{$_})) { print "$_\n"; }
}
PERL_SCRIPT
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    export VOCABULARY VOC_FILES
    VOC_FILES=$VOCABULARY
    # TAG_NAME is a space-separated list, hence the unquoted expansion.
    for tag in $TAG_NAME
    do
        VOC_FILES="${VOC_FILES} $(tag_file_name "$tag")"
    done
    perl "$filter_script"
    rm -f -- "$filter_script"
}
# Group the "count word" lines from stdin by normalized word form and print
# them ordered by descending group weight.
# Globals: VOCABULARY, NOTES_FILE (read, exported; both files are created
#          empty if missing), LANGUAGE, COMPRESSED_WORDLIST,
#          NEED_TO_USE_VOCABULARY_WHEN_SORT (read, exported)
# Outputs: a "# groups N" line, then either every input line
#          (sorted) or, with COMPRESSED_WORDLIST=YES, one line per group
#          holding the summed count and the group's representative word
group_words()
{
    local group_script
    group_script=$(mktemp /tmp/perl-group-words-XXXXXXXX)
    cat <<'PERL_SCRIPT' > "$group_script"
#!/usr/bin/perl

use Encode;
use utf8;

# String::Similarity is optional; see
# http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
eval {
    require String::Similarity;
    String::Similarity->import();
};
unless($@)
{
    our $HAVE_String_Similarity=1;
}

# Read the notes file into a hash: first field => rest of the line.
sub load_notes_dict()
{
    my %dict;
    if (open(NOTES, "<", $ENV{NOTES_FILE})) {
        while(<NOTES>) {
            $_ = decode( "utf8", $_);
            chomp;
            s/^\s+//;
            my ($a,$b)=split /\s+/,$_,2;
            $dict{$a}=$b;
        }
        close(NOTES);
    }
    return %dict;
}

sub similar($$){
    my $a=shift;
    my $b=shift;
    if ($HAVE_String_Similarity) {
        return $Similarity{"$a $b"};
    }
    else {
        return 0;
    }
}

# Language-specific normalization; unknown languages leave the word as is.
sub normalize_without_linked($)
{
    if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
    elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
    else { return shift ; }
}

# Normalization that additionally follows the "@word" links from the notes.
sub normalize_with_linked($)
{
    my $word = normalize_without_linked(shift);
    if ($linked_words{$word}) {
        return $linked_words{$word};
    }
    else {
        return $word;
    }
}

sub normalize($)
{
    return normalize_with_linked(shift);
}

sub normalize_ukrainian($)
{
    $_=lc(shift);
    s/[юіоеуаи]$//g;
    return $_;
}

sub normalize_esperanto($)
{
    $_=lc(shift);
    # verbs
    s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;

    # nouns
    s/j?n?$//;

    return $_;
}

sub normalize_german($)
{
    # The stemmer is loaded on first use, so the module is only required
    # when German text is actually processed.
    require Lingua::Stem::Snowball;
    my @stems = Lingua::Stem::Snowball::stem('de', \@_);
    return $stems[0];
}

# Hand-written German normalization; not called, kept for reference.
sub normalize_german_($)
{
    $_=lc(shift);

    s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
    s/en$//; s/er$//;

    s/lich$//; s/ig$//;
    s/al$//; s/isch$//;
    s/ell$//; s/haft$//;

    s/bar$//; s/sam$//; s/lich$//;

    @prefixes=qw(
    ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
    be emp ent er ge miss ver zer durch über um unter wieder);
    @prefixes=();
    for $pref (@prefixes) {
        s/^$pref//;
    }

    return $_;
}

sub normalize_english($)
{
    $_=lc(shift);

    s/s$//;

    s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;

    s/ed$//;
    s/en$//;
    s/er$//;
    s/est$//;
    s/ing$//;

    s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
    s/ify$//; s/fy$//; s/ly$//;
    s/ise$//; s/ize$//;

    s/e$//;
    return $_;
}

# Order two "count word" lines: by group weight, then by group name, then
# by the word's own count.
sub compare($$)
{
    my $a=shift;
    my $b=shift;
    $a =~ s/^\s*//;
    $b =~ s/^\s*//;
    my ($a1, $a2)= split /\s+/,$a,2;
    my ($b1, $b2)= split /\s+/,$b,2;

    my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};

    if ($cmp) {
        return $cmp;
    }
    else {
        if (normalize($a2) ne normalize($b2)) {
            return normalize($a2) cmp normalize($b2);
        }
        else {
            return $a1 <=> $b1;
        }
    }
}

# Debug logging; disabled by the early return.
sub log_($)
{
    return;
    open(LOG, ">>", "/tmp/log1");
    print LOG $_[0];
    close(LOG);
}

# Build the map "normalized word => normalized link target" from notes
# whose text contains "@target".
sub find_linked_words($)
{
    my %linked_words;
    my $dict = shift;
    log_("1");
    log_(join(" ", keys(%$dict)));

    for $key (keys(%$dict)) {
        $val = $dict->{$key};
        log_($key."\n");
        if ($val =~ /\@([a-z]*)/) {
            $linked_words{normalize($key)} = normalize($1);
            log_(normalize($key)." = ".normalize($1)."\n");
        }
    }
    return %linked_words;
}

# Number of characters that are not lowercase ASCII letters.
sub lc_length($)
{
    my $a= shift;
    $a =~ s/[a-z]//g;
    return length($a);
}

our %dict = load_notes_dict();
our %linked_words = find_linked_words(\%dict);

our %Vocabulary;
open(VOC, "<", $ENV{VOCABULARY})
    or die "Can't open VOCABULARY";
while (<VOC>){
    chomp;
    $Vocabulary{normalize($_)}="1";
}
close(VOC);

binmode STDIN,":utf8";
@lines=<STDIN>;
for $L (@lines) {
    chomp($L);
    $l=$L;
    $l =~ s/^\s*//;
    my ($a, $b)=split(/\s+/,$l,2);
    $group_weight{normalize($b)}+=$a;
}
if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
    for $k (keys %group_weight) {
        if (defined($Vocabulary{$k})) {
            $group_weight{$k} *= 2;
        }
    }
}
@lines2 = sort { compare($b,$a) } @lines;
binmode STDOUT, ":utf8";
print "# groups ".scalar(keys(%group_weight))."\n";
if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
    my $sum = 0;
    my $min = 9999;
    for $L (@lines2) {
        chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
        $group_name = normalize($b);
        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
            # A new group starts: print the finished one.
            print +(" "x(7-length($sum))),"$sum $main_word\n";
            $sum = $a;
            $min = length($b) + 2*lc_length($b);
            $main_word = $b;
        }
        else {
            $sum += $a;
            # The representative word is the one with the lowest score
            # (length plus a penalty for non-lowercase characters).
            if ($min > length($b) + 2*lc_length($b)) {
                $min = length($b) + 2*lc_length($b);
                $main_word = $b;
            }
        }
        $prev_group_name = $group_name;
    }
    # A group is only printed when its successor starts, so the last group
    # has to be flushed here; it used to be dropped.
    if (@lines2) {
        print +(" "x(7-length($sum))),"$sum $main_word\n";
    }
}
else {
    for $l (@lines2) {
        print "$l\n";
    }
}
PERL_SCRIPT
    # This function runs as one stage of a pipeline whose stages start
    # together, so the vocabulary file may not have been created by another
    # stage yet; the Perl script dies if it cannot open it.
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE
    export COMPRESSED_WORDLIST
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl "$group_script"
    rm -f -- "$group_script"
}
# Fetch a web page as plain text and strip a URL from each line.
# Arguments: $1 - address handed to "lynx -dump"
# Outputs:   the page text on stdout; the first http:// or https:// URL of
#            every line is removed (https URLs used to be left in the text)
text_from_url()
{
    lynx -dump "$1" | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
}
# Append the user's notes to the matching word lines of a word-list file.
# Globals:   NOTES_FILE (read, exported) - "word note..." lines; created
#            empty if missing
# Arguments: $1 - word-list file ("count word" lines), rewritten in place
add_marks()
{
    local marks_script
    marks_script=$(mktemp /tmp/perl-grep-v-english-XXXXXXXX)
    cat <<'PERL_SCRIPT' > "$marks_script"
use Encode;

# Read the notes file into a hash: first field => rest of the line.
sub load_notes_dict()
{
    my %dict;
    if (open(NOTES, "<", $ENV{NOTES_FILE})) {
        while(<NOTES>) {
            $_ = decode( "utf8", $_);
            chomp;
            s/^\s+//;
            my ($word,$note)=split /\s+/,$_,2;
            next unless defined($word) and length($word);
            $dict{$word}=$note;
        }
        close(NOTES);
    }
    return %dict;
}

%dict = load_notes_dict();

$file = $ARGV[0];
if (open(F, "<", $file)) {
    @lines=<F>;
    close(F);
    for (@lines) {$_ = decode( "utf8", $_);};

    if (open(F, ">", $file)) {
        binmode F, ":utf8";
        for (@lines) {
            # Capture in list context: a line that does not match yields
            # undef instead of the word left over from an earlier line.
            my ($name) = m/\s+\S+\s+(\S+)/;
            if (not /^#/ and defined($name) and defined($dict{$name})) {
                chomp;
                my $mark=$dict{$name};
                # Pad to column 30, but always keep at least one space so
                # that a long line is not glued to its note.
                my $pad=30-length($_);
                $pad=1 if $pad < 1;
                print F $_, " " x $pad, $mark, "\n";
            }
            else {
                print F "$_";
            }
        }
        close(F);
    }
}
PERL_SCRIPT
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl "$marks_script" "$1"
    rm -f -- "$marks_script"
}
# Strip the notes from an edited word-list file and store them in the
# notes file.
# Globals:   NOTES_FILE (read, exported; created empty if missing),
#            DONT_ADD_MARKS (read, exported) - "YES" leaves the notes file
#            untouched
# Arguments: $1 - word-list file, rewritten in place without the notes
remove_marks()
{
    local marks_script
    marks_script=$(mktemp /tmp/perl-grep-v-english-XXXXXXXX)
    cat <<'PERL_SCRIPT' > "$marks_script"
$file = $ARGV[0];
our %dict;
if (open(F, "<", $file)) {
    @lines=<F>;
    close(F);

    if (open(F, ">", $file)) {
        for (@lines) {
            chomp;
            # "count word note...": remember the note, keep "count word".
            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                my $name=$4;
                my $comment=$6;
                $dict{$name}=$comment;
                print F "$1$2$3$4\n";
            }
            else {
                print F "$_\n";
            }
        }
        close(F);
    }
}
if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, "<", $ENV{NOTES_FILE})) {
    @lines=<NOTES>;
    close(NOTES);

    if (open(NOTES, ">", $ENV{NOTES_FILE})) {
        for (@lines) {
            chomp;
            s/^\s+//;
            my ($word,$note)=split /\s+/,$_,2;
            # Keep existing notes for words that carried no note in the
            # edited list; entries without note text are dropped. Words
            # found in the list are written from %dict below. This spells
            # out what the former condition did in effect; it also
            # contained a call to a non-existent "unset" function that
            # could never be reached.
            if (not defined($dict{$word}) and defined($note) and $note ne '') {
                print NOTES "$_\n";
            }
        }
        for (sort keys %dict) {
            my $mark=$dict{$_};
            # Pad to column 30, but always keep at least one space so that
            # a long word is not glued to its note.
            my $pad=30-length($_);
            $pad=1 if $pad < 1;
            print NOTES $_, " " x $pad, $mark, "\n";
        }
        close(NOTES);
    }
}
PERL_SCRIPT
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS
    perl "$marks_script" "$1"
    rm -f -- "$marks_script"
}
# Pass through only a part of the lines read from stdin.
# Arguments: $1 - interval "start-stop/total" or "start+length/total": the
#                 input is cut into "total" equal parts and the lines from
#                 boundary "start" up to boundary "stop" are printed.
#                 An empty (or "0") interval prints everything.
# Outputs:   the selected lines on stdout
# Errors:    an interval without a positive total makes the Perl script
#            die with a message on stderr and no output
part()
{
    local part_script
    part_script=$(mktemp /tmp/perl-part-XXXXXXXX)
    cat <<'PERL_SCRIPT' > "$part_script"
#!/usr/bin/perl

my @lines=<STDIN>;
# Number of lines. The last index ($#lines) was used here before, which
# shifted every boundary and never printed the final line.
my $count=scalar(@lines);
my $interval=$ARGV[0];
if (not $interval) {
    print @lines;
}
else {
    my ($start,$stop,$total);
    if ($interval =~ m@(.*)/(.*)@) {
        $start = $1;
        $total = $2;
    }
    else {
        $start=$interval;
        $total=0;
    }
    if ($start =~ m@(.*)-(.*)@) {
        $start = $1;
        $stop = $2;
    }
    if ($start =~ m@(.*)\+(.*)@) {
        $start = $1;
        $stop = $start+$2;
    }

    # Without a total the boundaries cannot be scaled; fail with a clear
    # message rather than "Illegal division by zero".
    die "part: interval must look like start-stop/total with total > 0\n"
        unless $total > 0;
    $stop = 0 unless defined $stop;

    # Multiply before dividing so that exact boundaries stay exact.
    $start=int($count*$start/$total);
    $stop=int($count*$stop/$total);
    $start = 0 if $start < 0;
    $stop = $count if $stop > $count;

    for(my $i=$start;$i<$stop;$i++){
        print $lines[$i];
    }
}
PERL_SCRIPT
    perl "$part_script" "$1"
    rm -f -- "$part_script"
}
# Print the names of the tags that exist for the current language, one per
# line. A tag is a file "<LANGUAGE>_<tag>.txt" in the work directory.
# Globals: WORK_DIR, LANGUAGE (read)
# Outputs: tag names on stdout; nothing when there are no tag files
list_tags()
{
    local tag_file tag
    # The glob is expanded in place instead of going through "echo | tr",
    # so file names are never split on spaces; an unmatched pattern is
    # skipped by the -e test.
    for tag_file in "${WORK_DIR}"/"${LANGUAGE}"_*.txt
    do
        [ -e "$tag_file" ] || continue
        tag=${tag_file##*/}
        tag=${tag#"${LANGUAGE}_"}
        printf '%s\n' "${tag%.txt}"
    done
}

# -T: list the active tags and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
    list_tags
    exit 0
fi
# Print the vocabulary file name that belongs to a tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
tag_file_name()
{
    # printf prints the name literally, whatever characters the tag holds;
    # echo may interpret backslash sequences depending on shell options.
    printf '%s\n' "${LANGUAGE}_${1}.txt"
}
# Delete the vocabulary files of the tags listed in TAG_NAME.
# Globals: WORK_DIR, TAG_NAME (read)
# Outputs: "Tag 'x' removed" or "Unknown tag 'x'" per tag on stdout
# Returns: 1 if the work directory cannot be entered, so that nothing is
#          deleted relative to some other directory
remove_tags()
{
    local tag tag_file
    local -a tags

    cd "${WORK_DIR}" || return 1

    # Split the space-separated list without pathname expansion.
    read -r -a tags <<< "$TAG_NAME"
    for tag in "${tags[@]}"
    do
        # Skip names with path or glob characters. This check used to test
        # an undefined variable ($TAGNAME) and therefore never fired, which
        # allowed names such as "x/../file" to reach rm.
        case "$tag" in
            *[/*?]*) continue;;
        esac
        tag_file=$(tag_file_name "$tag")
        if [ -e "$tag_file" ]
        then
            rm -f -- "$tag_file" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
}

# -r: remove the requested tags and stop.
if [ "$REMOVE_TAG" = "YES" ]
then
    remove_tags
    exit 0
fi
# Main processing: pick the input text, then run it through the pipeline
# part -> two_and_three_words -> get_words -> group_words -> add_stat.
# The result is stored twice: TEMP1 is the reference copy, TEMP2 the copy
# that may be edited afterwards.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
# Every vocabulary and notes file name is relative to the work directory,
# so there is no point in going on without it.
cd "$WORK_DIR" || exit 1

if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
    # -m: the input is the content of the requested tag vocabularies.
    VOC_FILES=''
    for i in $MERGE_THIS_TAGS
    do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
    done
    if [ -z "$VOC_FILES" ]
    then
        echo Unknown tags: $MERGE_THIS_TAGS >&2
    else
        # VOC_FILES is a space-separated list, hence unquoted.
        cat $VOC_FILES
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
    # -M: the input is the content of every tag vocabulary.
    cat "${LANGUAGE}"_*.txt
elif [[ "$1" == *http:* || "$1" == *https:* ]]
then
    # Web address; https addresses used to be mistaken for local files.
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    # Local file; relative names refer to the directory the program was
    # started from, not to the work directory.
    case "$1" in
        /*) cat "$1";;
        *)  cat "$oldpwd/$1";;
    esac
else
    cat
fi \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | two_and_three_words \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"
# Output stage. With -s or -n the result is printed as is. Otherwise the
# list is opened in the editor; the differences between the reference copy
# (TEMP1) and the edited copy (TEMP2) are appended to the vocabulary.
if [ "$STAT_ONLY" = "YES" ]
then
    cat "$TEMP1"
elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
else
    if [ "$(wc -l < "$TEMP2" | awk '{print $1}')" != 0 ]
    then
        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
        if [ "$editor" = vim ]
        then
            # The terminal is attached explicitly because stdin may be the
            # text being analysed.
            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
        else
            # Unquoted on purpose: EDITOR may carry arguments.
            $editor "$TEMP2"
        fi
        remove_marks "$TEMP2"

        vocabulary="$VOCABULARY"
        [ -n "$TAG_NAME" ] && vocabulary=$(tag_file_name "$TAG_NAME")
        # Only the word lines of the diff are taken. Hunk headers and "---"
        # separators used to add blank lines to the vocabulary, and changed
        # "#" lines added fragments such as "5" or "en,".
        diff "$TEMP1" "$TEMP2" \
            | awk '($1 == "<" || $1 == ">") && $2 !~ /^#/ && $3 != "" {print $3}' \
            | sort -u >> "$vocabulary"
    fi
fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"