new-words

view new-words-py.sh @ 39:a598e0d25784

add_notes (add_marks) + remove_notes (remove_marks) implemented in python
author Igor Chubin <igor@chub.in>
date Sun Jan 23 14:25:52 2011 +0100 (2011-01-23)
parents adbc809d3924
children c3a50c0d2400
line source
1 #!/bin/bash
# Print the command-line help text on standard error.
# Arguments: none.
# Returns:   the exit status of cat.
show_usage()
{
    # Duplicate the already-open descriptor 2 instead of re-opening
    # /dev/stderr: re-opening truncates a file that stderr was redirected
    # to and fails where /dev/stderr cannot be opened.
    cat <<HELP >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-c show compressed wordlist: one word per group
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# Help request: when the very first argument is -h, print the usage text
# and leave with status 0 before anything else is set up.
case "$1" in
    -h)
        show_usage
        exit 0
        ;;
esac
# Locations used by the rest of the script.
# NEW_WORDS_PY: python backend; the built-in path is only a default and can
#               be overridden by presetting NEW_WORDS_PY in the environment.
# WORK_DIR:     directory holding the vocabulary, tag and notes files.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
WORK_DIR=~/.new-words/

# Scratch files. Assignment is kept separate from "export" so that a failing
# mktemp is not masked by the exit status of export.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

# Remove the scratch files on every exit path, including the early
# "exit 0" branches taken by the -S, -T and -r modes.
trap 'rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"' EXIT

# Editor used in interactive mode; vim unless EDITOR says otherwise.
editor=${EDITOR:-vim}
# language detection
#
# Precedence (lowest to highest): the default "en", the language subdomain
# of a Wikipedia article URL found among the arguments, and finally the
# name the program was started under (de-words -> de), unless that name
# is the generic "new" of new-words.
LANGUAGE=en

# Program name without directory and without everything from the first "-".
my_name=${0##*/}
my_name=${my_name%%-*}

# Accept both http:// and https:// article URLs; the capture group is the
# two- or three-letter language subdomain.
wikipedia_url_re='https?://([[:alpha:]]{2,3})\.wikipedia\.org/wiki/'
for arg
do
    if [[ "$arg" =~ $wikipedia_url_re ]]
    then
        LANGUAGE="${BASH_REMATCH[1]}"
    fi
done
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing
#
# Every mode flag gets an explicit default here, so a variable of the same
# name inherited from the environment can never switch a mode on by accident
# (REMOVE_TAG in particular guards a branch that deletes files).
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
REMOVE_TAG=NO
OLD_STYLE="NO"
while getopts hOcl:sSkanNp:t:Tm:Mr:23 opt
do
    case "$opt" in
    h) # help requested after other switches
        show_usage
        exit 0;;
    O) OLD_STYLE=YES;;
    c) COMPRESSED_WORDLIST=YES;;
    s) STAT_ONLY=YES;;
    S) SHOW_VOC_STAT=YES;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    l) LANGUAGE="$OPTARG";;
    a) DONT_ADD_MARKS=YES;;
    n) NON_INTERACTIVE_MODE=YES;;
    N) FILTER_WORDS=NO;;
    p) PART_TO_PROCESS="$OPTARG";;
    t) TAG_NAME="$OPTARG";;
    T) TAGS_LIST_ONLY="YES";;
    # -m and -r accumulate: the value given with an earlier -t is kept
    # and the new tag is appended, separated by a space.
    m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
    M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
    r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
    2) GROUP_WORDS_BY_TWO=YES;;
    3) GROUP_WORDS_BY_THREE=YES;;
    \?) # unknown flag
        show_usage
        exit 1;;
    esac
done
shift $((OPTIND - 1))

# "-l lang" is also accepted as the first two remaining arguments.
if [ "$1" = "-l" ]
then
    if [ "$#" -lt 2 ]
    then
        show_usage
        exit 1
    fi
    LANGUAGE="$2"
    shift 2
fi
# Per-language data files; the names are relative and are used after the
# script has changed into WORK_DIR.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this script on the vocabulary file itself (no filtering, no
# editor) and print the fifth field of the first output line with the
# angle brackets removed, then stop.
# $0 and the path are quoted so that directories with spaces work.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -1 | awk '{print $5}' | tr -d "<>"
    exit 0
fi
128 #----------------------------------------------------
# Turn the text on stdin into a word list on stdout.
# With OLD_STYLE=NO the python backend is run with "-f get_words" and $1;
# any other value hands all arguments to the shell/perl implementation.
get_words()
{
    case "$OLD_STYLE" in
        NO)
            $NEW_WORDS_PY -l "$LANGUAGE" -f get_words "$1"
            ;;
        *)
            get_words_OLD "$@"
            ;;
    esac
}
# Shell/perl word extractor: reads text on stdin and prints
# "count word" lines, highest count first.
# $1 - file that receives the complete word stream (one word per line)
#      before known words are filtered out.
get_words_OLD()
{
    export FILTER_WORDS
    # 1. one token per line; "--" becomes a blank; apostrophes are
    #    protected while punctuation is blanked out by perl
    # 2. perl keeps only lines made of letters, blanks, ' _ and -
    # 3. apostrophes restored, tokens split again, copy saved to $1
    # 4. known words dropped, the rest counted and sorted by count
    tr ' ' '\n' \
    | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
    | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/' \
    | sed "s/__APOSTROPHE__/'/g" \
    | tr ' ' '\n' \
    | tee "$1" \
    | grep_v_english_perl \
    | sort \
    | uniq -c \
    | awk '$2 != "" { print }' \
    | sort -rn
}
# Add statistics to the grouped word list read from stdin.
#
# $1 - file with all words of the text, one per line (written by get_words);
#      a scratch copy of stdin is kept in "${1}2" and removed before return.
# Globals read: DONT_ADD_MARKLINES, STAT_ONLY, LANGUAGE, ORIGINAL_TEXT.
#
# Output:
#   DONT_ADD_MARKLINES=YES  stdin is copied through unchanged;
#   STAT_ONLY=YES           a column header and one line of numbers;
#   otherwise               a "# lang, known%, <known/total>, <groups/words>"
#                           header followed by the word lines with "# NN"
#                           percentage mark lines inserted between them.
add_stat()
{
    if [ "$DONT_ADD_MARKLINES" = "YES" ]
    then
        cat
        return
    fi

    local before="$1"
    local after="${before}2"
    local total total_unknown total_known percentage sentences
    local groups words stat_script

    cat > "$after"

    total=$(wc -w < "$before" | awk '{print $1}')
    total=${total:-0}
    # "s + 0" makes awk print 0 rather than an empty string for empty input.
    total_unknown=$(awk '{ s += $1 } END { print s + 0 }' "$after")
    total_known=$(echo "$total-$total_unknown" | bc)

    # Known-word percentage, cut (not rounded) to one decimal place.
    if [ "$total" -gt 0 ]
    then
        percentage=$(echo "100*($total-$total_unknown)/$total" | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        percentage=0
    fi

    # The number of "." characters in the original text stands in for the
    # number of sentences. A text without any is counted as one sentence so
    # that the per-sentence figures below never divide by zero.
    sentences=$(perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);' "$ORIGINAL_TEXT")
    [ "${sentences:-0}" -gt 0 ] || sentences=1

    if [ "$STAT_ONLY" = "YES" ]
    then
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $(echo "(100-$percentage)" | bc -l) $total_known $total $(echo "$total/$sentences" | bc) $(echo "10*$total_unknown/$sentences" | bc) "
        rm "$after"
        return 0
    else
        groups=$(grep '# groups' "$after" | awk '{print $3}' | xargs echo)
        words=$(grep -v '^#' "$after" | wc -l | awk '{print $1}')
        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
    fi

    stat_script=$(mktemp /tmp/perl-add-stat-XXXXXXXX) || return
    cat <<'PERL_SCRIPT' > "$stat_script"
my $total=shift(@ARGV);
my $total_known=shift(@ARGV);
my $s=0;
if ($total <= 0) {
    # No words to measure against: pass the lines on without mark lines.
    while(<>) { print unless /^#\s*groups\s*/; }
    exit(0);
}
# First mark: the next multiple of 5 above the known percentage; from 90
# upwards marks advance in steps of 1.
my $mark_line=int($total_known*100/$total/5)*5;
if ($mark_line>=90) {
    $mark_line=int($total_known*100/$total)+1;
} else { $mark_line +=5; };
while(<>)
{
    next if /^#\s*groups\s*/;
    print;
    /^\s*([0-9]*)\s*/;
    $s+=$1;
    # Learning every word printed so far would raise coverage to this mark.
    if (($total_known+$s)*100/$total>=$mark_line) {
        print "# $mark_line\n";
        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
    }
}
PERL_SCRIPT
    perl "$stat_script" "$total" "$total_known" "$after"
    rm "$stat_script"
    rm "$after"
}
# Copy stdin to stdout and, when -2 and/or -3 was given, append the two-
# and three-word sequences of the original text, joined with "_", one
# sequence per line.
# Globals read: GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE, ORIGINAL_TEXT
# (the sequences are built from the file ORIGINAL_TEXT, not from stdin).
two_and_three_words()
{
    if [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ]
    then
        cat
        return
    fi

    cat

    export GROUP_WORDS_BY_THREE
    export GROUP_WORDS_BY_TWO
    local ngram_script
    ngram_script=$(mktemp /tmp/perl-two-and-three-XXXXXXXX) || return
    cat <<'PERL_SCRIPT' > "$ngram_script"
#!/usr/bin/perl
local $/;
$words=<>;
# Punctuation and all digits (0 included) separate words.
$words=~ s@[!?;,:#0-9".]@ @g;
$words =~ s@\s+@ @g;
@words = split /\s+/, $words;
# Visit every position that still has a following word, so the last pair
# and the last triple of the text are produced too; a missing third word
# is undefined and fails the three-word test below.
for ($i=0; $i<$#words; $i++) {
    my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
        print "${a}_${b}_${c}\n";
    };
    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
        print "${a}_${b}\n";
    };
}
PERL_SCRIPT
    perl "$ngram_script" "$ORIGINAL_TEXT"
    rm "$ngram_script"
}
# Filter stdin: drop every line that, as a whole, matches one of the words
# in the VOCABULARY file; print all other lines.
#
# The vocabulary is read as whitespace-separated words with apostrophes
# removed, and each word is used as an extended regular expression.
# The patterns are handed to grep as data (-f) instead of being pasted
# into a command line that is then eval'ed, so vocabulary content can
# never be executed as shell code.
# Returns 0 also when no line survives the filter.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    grep -E -x -v \
        -f <(tr -d "'" < "$VOCABULARY" | awk '{ for (i = 1; i <= NF; i++) print $i }') \
        || [ $? -eq 1 ]
}
# Filter stdin (one word per line): print only the words that are not
# listed in the main vocabulary file or in the file of any tag named in
# TAG_NAME. With FILTER_WORDS=NO in the environment every line passes.
# Globals: VOCABULARY, TAG_NAME (read); VOC_FILES (set and exported);
#          PERL_SCRIPT_TEMP_NAME (scratch script, removed at the end).
grep_v_english_perl()
{
    PERL_SCRIPT_TEMP_NAME=$(mktemp /tmp/perl-grep-v-english-XXXXXXXX)
    cat > $PERL_SCRIPT_TEMP_NAME <<'PERL_SCRIPT'
if ($ENV{FILTER_WORDS} eq "NO") {
while(<>) { print; }
exit(0);
}
$voc_files=$ENV{VOC_FILES};
$voc_files=~s@^ @@;
for $voc_file (split /\s+/,$voc_files) {
if (open(VOC, $voc_file)) {
while (<VOC>){
chomp;
#s/'//g;
$voc{$_}="1";
}
}
}
while(<>) {
chomp;
if (not defined($voc{$_})) { print "$_\n"; }
}
PERL_SCRIPT
    # The main vocabulary must exist, even if empty.
    if [ ! -e "$VOCABULARY" ]; then
        touch "$VOCABULARY"
    fi

    # Space-separated list of word files handed to perl via the environment.
    VOC_FILES=$VOCABULARY
    export VOCABULARY VOC_FILES
    local tag
    for tag in $TAG_NAME; do
        VOC_FILES="${VOC_FILES} $(tag_file_name $tag)"
    done

    perl $PERL_SCRIPT_TEMP_NAME
    rm $PERL_SCRIPT_TEMP_NAME
}
# Group the "count word" lines on stdin.
# Any OLD_STYLE value other than NO selects the perl implementation and
# passes all arguments on; otherwise the python backend is run with
# "-f group_words" and $1.
group_words()
{
    if [ "$OLD_STYLE" != NO ]; then
        group_words_OLD "$@"
        return
    fi
    $NEW_WORDS_PY -l "$LANGUAGE" -f group_words "$1"
}
# Perl implementation of word grouping.
#
# Reads "count word" lines on stdin. Words that normalize to the same stem
# (language-specific suffix stripping, plus "@word" links found in the
# notes file) form a group; the group weight is the sum of its counts.
# Lines are printed heaviest group first, after a "# groups N" header.
# With COMPRESSED_WORDLIST=YES only one line per group is printed: the
# summed count and the group's representative word.
#
# Globals read (exported for perl): VOCABULARY, NOTES_FILE, LANGUAGE,
# NEED_TO_USE_VOCABULARY_WHEN_SORT, COMPRESSED_WORDLIST.
# Requires the perl module Lingua::Stem::Snowball.
group_words_OLD()
{
    #if [ "$LANGUAGE" != "en" ]
    #then
    #    cat
    #    return
    #fi
    local group_script
    group_script=$(mktemp /tmp/perl-group-words-XXXXXXXX) || return
    cat <<'PERL_SCRIPT' > "$group_script"
#!/usr/bin/perl

use Encode;
use utf8;
use Lingua::Stem::Snowball qw(stem);

eval {
    # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
    require String::Similarity;
    String::Similarity->import();
};
unless($@)
{
    our $HAVE_String_Similarity=1;
}

# Read the notes file into a hash: first field => rest of the line.
sub load_notes_dict()
{
    my %dict;
    if (open(NOTES, $ENV{NOTES_FILE})) {
        while(<NOTES>) {
            $_ = decode( "utf8", $_);
            chomp;
            s/^\s+//;
            my ($a,$b)=split /\s+/,$_,2;
            $dict{$a}=$b;
        }
    }
    return %dict;
}

sub similar($$){
    my $a=shift;
    my $b=shift;
    if ($HAVE_String_Similarity) {
        return $Similarity{"$a $b"};
    }
    else {
        return 0;
    }
}

# Language dispatch for stemming; unknown languages leave the word as is.
sub normalize_without_linked($)
{
    if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
    elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
    else { return shift ; }
}

# Stem the word, then follow a link from the notes file if there is one.
sub normalize_with_linked($)
{
    my $word = normalize_without_linked(shift);
    #return $word;
    if ($linked_words{$word}) {
        return $linked_words{$word};
    }
    else {
        return $word;
    }
}

sub normalize($)
{
    return normalize_with_linked(shift);
}

sub normalize_ukrainian($)
{
    $_=lc(shift);
    s/[юіоеуаи]$//g;
    return $_;
}

sub normalize_esperanto($)
{
    $_=lc(shift);
    # verbs
    s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;

    # nouns
    s/j?n?$//;

    return $_;
}

sub normalize_german($)
{
    @stems = stem('de', \@_);
    return $stems[0];
}

sub normalize_german_($)
{
    $_=lc(shift);

    s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
    s/en$//; s/er$//;

    s/lich$//; s/ig$//;
    s/al$//; s/isch$//;
    s/ell$//; s/haft$//;

    s/bar$//; s/sam$//; s/lich$//;

    @prefixes=qw(
        ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
        be emp ent er ge miss ver zer durch über um unter wieder);
    @prefixes=();
    for $pref (@prefixes) {
        s/^$pref//;
    }

    return $_;
}

sub normalize_english($)
{
    $_=lc(shift);

    s/s$//;

    s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;

    s/ed$//;
    s/en$//;
    s/er$//;
    s/est$//;
    s/ing$//;

    s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
    s/ify$//; s/fy$//; s/ly$//;
    s/ise$//; s/ize$//;

    s/e$//;
    return $_;
}

# Order two "count word" lines: by group weight, then by group name,
# then by the line's own count.
sub compare($$)
{
    my $a=shift;
    my $b=shift;
    $a =~ s/^\s*//;
    $b =~ s/^\s*//;
    my ($a1, $a2)= split /\s+/,$a,2;
    my ($b1, $b2)= split /\s+/,$b,2;

    my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};

    if ($cmp) {
        return $cmp;
    }
    else {
        if (normalize($a2) ne normalize($b2)) {
            return normalize($a2) cmp normalize($b2);
        }
        else {
            return $a1 <=> $b1;
        }
    }
}

# Debug logging; disabled by the early return.
sub log_($)
{
    return;
    open(LOG, ">>", "/tmp/log1");
    print LOG $_[0];
    close(LOG);
}

# Notes of the form "@word" link the noted word's group to another word.
sub find_linked_words($)
{
    my %linked_words;
    my $dict = shift;
    log_("1");
    log_(join(" ", keys(%$dict)));

    for $key (keys(%$dict)) {
        $val = $dict->{$key};
        log_($key."\n");
        if ($val =~ /\@([a-z]*)/) {
            $linked_words{normalize($key)} = normalize($1);
            log_(normalize($key)." = ".normalize($1)."\n");
        }
    }
    return %linked_words;
}

# Number of characters that are not lowercase ASCII letters.
sub lc_length($)
{
    my $a= shift;
    $a =~ s/[a-z]//g;
    return length($a);
}

our %dict = load_notes_dict();
our %linked_words = find_linked_words(\%dict);

our %Vocabulary;
open(VOC, $ENV{VOCABULARY})
    or die "Can't open VOCABULARY";
while (<VOC>){
    chomp;
    #s/'//g;
    $Vocabulary{normalize($_)}="1";
}
close(VOC);

binmode STDIN,":utf8";
@lines=<STDIN>;
for $L (@lines) {
    chomp($L);
    #$L = decode( "utf8", $L);
    $l=$L;
    $l =~ s/^\s*//;
    my ($a, $b)=split(/\s+/,$l,2);
    $group_weight{normalize($b)}+=$a;
}
if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
    # Groups whose stem is already in the vocabulary count double.
    for $k (keys %group_weight) {
        if (defined($Vocabulary{$k})) {
            $group_weight{$k} *= 2;
        }
    }
}
@lines2 = sort { compare($b,$a) } @lines;
binmode STDOUT, ":utf8";
print "# groups ".scalar(keys(%group_weight))."\n";
if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
    my $sum = 0;
    my $min = 9999;
    for $L (@lines2) {
        chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
        $group_name = normalize($b);
        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
            # A new group starts: emit the finished one.
            print +(" "x(7-length($sum))),"$sum $main_word\n";
            $sum = $a;
            $min = length($b) + 2*lc_length($b);
            $main_word = $b;
        }
        else {
            $sum += $a;
            # Representative: the lowest length + 2*lc_length score.
            if ($min > length($b) + 2*lc_length($b)) {
                $min = length($b) + 2*lc_length($b);
                $main_word = $b;
            }
        }
        $prev_group_name = $group_name;
    }
    # The loop only prints a group when the next one begins, so the last
    # group has to be flushed here.
    if (defined($main_word)) {
        print +(" "x(7-length($sum))),"$sum $main_word\n";
    }
}
else {
    for $l (@lines2) {
        print "$l\n";
    }
}
PERL_SCRIPT
    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE
    export COMPRESSED_WORDLIST
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl "$group_script"
    rm "$group_script"
}
# Print the text of the web page $1 as rendered by "lynx -dump", with
# URLs removed so that link targets are not counted as words.
# Every http:// and https:// URL on a line is removed, not just the first.
text_from_url()
{
    lynx -dump "$1" | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
# Append the user's saved notes to the word lines of file $1 (in place).
# OLD_STYLE=NO runs the python backend with "-f add_notes"; any other
# value runs the perl implementation add_marks_OLD (the old-style branch
# used to call the word grouper by mistake).
add_marks()
{
    if [ "$OLD_STYLE" = NO ]
    then
        $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1"
    else
        add_marks_OLD "$@"
    fi
}
# Perl implementation of add_marks: rewrite file $1 in place, appending to
# every non-comment " count word" line the note stored for that word in
# NOTES_FILE. The note starts in column 31, or after a single space when
# the line is already 30 characters or longer, so that remove_marks_OLD
# can always split the note off again.
# Globals read: NOTES_FILE (created empty if missing, exported for perl).
add_marks_OLD()
{
    local marks_script
    marks_script=$(mktemp /tmp/perl-add-marks-XXXXXXXX) || return
    cat <<'PERL_SCRIPT' > "$marks_script"
use Encode;

# Read the notes file into a hash: first field => rest of the line.
sub load_notes_dict()
{
    my %dict;
    if (open(NOTES, $ENV{NOTES_FILE})) {
        while(<NOTES>) {
            $_ = decode( "utf8", $_);
            chomp;
            s/^\s+//;
            my ($a,$b)=split /\s+/,$_,2;
            $dict{$a}=$b;
        }
    }
    return %dict;
}

%dict = load_notes_dict();

$file = $ARGV[0];
if (open(F, $file)) {
    @lines=<F>;
    close(F);
    for (@lines) {$_ = decode( "utf8", $_);};

    if (open(F, ">$file")) {
        binmode F, ":utf8";
        for (@lines) {
            # Take the word only from a successful match; otherwise $1
            # would still hold the word of an earlier line.
            my $name;
            $name = $1 if m/\s+\S+\s+(\S+)/;
            if (not /^#/ and defined($name) and defined($dict{$name})) {
                chomp;
                $mark=$dict{$name};
                # At least one blank must separate the word and the note.
                my $pad = 30-length($_);
                $pad = 1 if $pad < 1;
                $space=" "x$pad;
                print F "$_$space$mark\n";
            }
            else {
                print F "$_";
            }
        }
        close(F);
    }
}
PERL_SCRIPT
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl "$marks_script" "$1"
    rm "$marks_script"
}
# Strip the notes from the word lines of file $1 again (in place).
# OLD_STYLE=NO runs the python backend with "-f remove_notes"; any other
# value runs the perl implementation remove_marks_OLD (the old-style
# branch used to call the word grouper by mistake).
remove_marks()
{
    if [ "$OLD_STYLE" = NO ]
    then
        $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
    else
        remove_marks_OLD "$@"
    fi
}
# Perl implementation of remove_marks.
#
# 1. Rewrites file $1 in place: every non-comment line of the form
#    " count word note..." is cut back to " count word"; the removed note
#    is remembered per word.
# 2. Unless DONT_ADD_MARKS=YES, rewrites NOTES_FILE: an existing entry
#    stays where it is when its word got no note in $1 or the note is
#    unchanged; entries whose note changed are dropped, and all new or
#    changed notes are appended in sorted word order.
# Globals read: NOTES_FILE (created empty if missing), DONT_ADD_MARKS.
remove_marks_OLD()
{
    local unmark_script
    unmark_script=$(mktemp /tmp/perl-remove-marks-XXXXXXXX) || return
    cat <<'PERL_SCRIPT' > "$unmark_script"
$file = $ARGV[0];
our %dict;
if (open(F, $file)) {
    @lines=<F>;
    close(F);

    if (open(F, ">$file")) {
        for (@lines) {
            chomp;
            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                my $name=$4;
                my $comment=$6;
                $dict{$name}=$comment;
                print F "$1$2$3$4\n";
            }
            else {
                print F "$_\n";
            }
        }
        close(F);
    }
}
if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
    @lines=<NOTES>;
    close(NOTES);

    if (open(NOTES, ">".$ENV{NOTES_FILE})) {
        for (@lines) {
            chomp;
            s/^\s+//;
            my ($a,$b)=split /\s+/,$_,2;
            # "not" binds weaker than "||", so the first test needs its
            # own parentheses.
            if ((not defined($dict{$a})) || ($dict{$a} eq $b)) {
                print NOTES "$_\n";
                # Already written: do not append it a second time below.
                delete $dict{$a} if defined($dict{$a});
            }
        }
        for (sort keys %dict) {
            $mark=$dict{$_};
            $space=" "x(30-length($_));
            print NOTES "$_$space$mark\n";
        }
        close(NOTES);
    }
}
PERL_SCRIPT
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS
    perl "$unmark_script" "$1"
    rm "$unmark_script"
}
# Copy a slice of stdin to stdout.
#
# $1 - page specification "start-stop/total" or "start+count/total": the
#      input lines are divided into "total" equal pages and pages start
#      (inclusive) to stop (exclusive) are printed. An empty specification
#      copies everything.
#
# A specification without a positive total is reported on stderr and
# produces no output.
part()
{
    local part_script
    part_script=$(mktemp /tmp/perl-part-XXXXXXXX) || return
    cat <<'PERL_SCRIPT' > "$part_script"
#!/usr/bin/perl

my @lines=<STDIN>;
# Number of lines (not the last index), so that the final page really
# extends to the last line.
my $count=scalar(@lines);
my $interval=$ARGV[0];
if (not $interval) {
    print @lines;
}
else {
    my ($start,$stop,$total);
    if ($interval =~ m@(.*)/(.*)@) {
        $start = $1;
        $total = $2;
    }
    else {
        $start=$interval;
        $total=0;
    }
    if ($start =~ m@(.*)-(.*)@) {
        $start = $1;
        $stop = $2;
    }
    if ($start =~ m@(.*)\+(.*)@) {
        $start = $1;
        $stop = $start+$2;
    }

    unless ($total > 0) {
        die "part: page specification must look like start-stop/total\n";
    }

    # Multiply before dividing to keep the page borders exact.
    $start=int($count*$start/$total);
    $stop=int($count*$stop/$total);

    for($i=$start;$i<$stop;$i++){
        print $lines[$i];
    }
}
PERL_SCRIPT
    perl "$part_script" "$1"
    rm "$part_script"
}
# -T: print the names of all tags of the current language, one per line,
# and stop. A tag is every file "<LANGUAGE>_<tag>.txt" in WORK_DIR.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
    # Never list some other directory when the work directory is missing.
    cd "${WORK_DIR}" || exit 1
    for tag_file in "${LANGUAGE}"_*.txt
    do
        # An unmatched pattern is left as the literal string; skip it.
        [ -e "$tag_file" ] || continue
        tag=${tag_file#*_}
        printf '%s\n' "${tag%.txt}"
    done
    exit 0
fi
# Print the name of the word file that belongs to tag $1 for the current
# LANGUAGE: "<LANGUAGE>_<tag>.txt".
tag_file_name()
{
    printf '%s_%s.txt\n' "${LANGUAGE}" "${1}"
}
# -r: delete the word files of the tags listed in TAG_NAME and stop.
if [ "$REMOVE_TAG" = "YES" ]
then
    # Never delete files in some other directory.
    cd "${WORK_DIR}" || exit 1
    # Split the space-separated tag list without pathname expansion, so a
    # tag such as "*" stays a literal word and can be rejected below.
    read -r -a tags_to_remove <<< "$TAG_NAME"
    for i in "${tags_to_remove[@]}"
    do
        # Refuse names that could address files outside the tag namespace.
        # (The check used to look at an undefined variable and never fired.)
        case "$i" in
            */* | *\** | *\?*) continue ;;
        esac
        f="$(tag_file_name "$i")"
        if [ -e "$f" ]
        then
            rm -f -- "$f" && echo Tag "'$i'" removed
        else
            echo Unknown tag "'$i'"
        fi
    done
    exit 0
fi
# Main processing: choose the input text, then run it through the
# pipeline. All vocabulary, tag and notes files are addressed relative to
# WORK_DIR, so the script changes into it; oldpwd keeps the caller's
# directory for resolving a relative input file name.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Input, in order of precedence:
#   -m tags   the word files of the named tags
#   -M        the word files of all tags of this language
#   URL       the page text (http: or https:)
#   FILE      the file, relative names taken from the original directory
#   nothing   standard input
# The result is cut to the requested pages, saved as ORIGINAL_TEXT,
# extended with word sequences (-2/-3), turned into a counted word list
# (all words are saved to ${TEMP1}-full), grouped and annotated with
# statistics. The final list is stored twice: TEMP1 stays untouched,
# TEMP2 is the copy that gets edited.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
    VOC_FILES=''
    for i in $MERGE_THIS_TAGS
    do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
    done
    if [ -z "$VOC_FILES" ]
    then
        # Unquoted on purpose: collapses the leading blank of the list.
        echo Unknown tags: $MERGE_THIS_TAGS >&2
    else
        # Unquoted on purpose: VOC_FILES is a space-separated file list.
        cat $VOC_FILES
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
    cat "${LANGUAGE}"_*.txt
elif [[ "$1" == *http:* || "$1" == *https:* ]]
then
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    if [[ "$1" == /* ]]
    then
        cat "$1"
    else
        cat "$oldpwd/$1"
    fi
else
    cat
fi \
| part "$PART_TO_PROCESS" \
| tee "$ORIGINAL_TEXT" \
| two_and_three_words \
| get_words "${TEMP1}-full" \
| group_words \
| add_stat "${TEMP1}-full" \
| tee "$TEMP1" > "$TEMP2"
# Final phase.
#   -s / -n       print the prepared list and finish.
#   interactive   show the list (with notes unless -a) in the editor; the
#                 words of the lines that differ between the untouched
#                 list (TEMP1) and the edited one (TEMP2) are appended to
#                 the vocabulary, or to the tag file when -t was given.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
elif [ "$(wc -l < "$TEMP2")" -gt 0 ]
then
    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
    else
        # Unquoted on purpose: EDITOR may carry its own arguments.
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"

    vocabulary="$VOCABULARY"
    # Unquoted on purpose: only the first tag of TAG_NAME names the file.
    [ -n "$TAG_NAME" ] && vocabulary="$(tag_file_name $TAG_NAME)"
    # Field 3 of a diff line such as "< 12 word" is the word. Hunk headers
    # and "---" separators have no third field and are skipped, so no
    # empty entry ends up in the vocabulary.
    diff "$TEMP1" "$TEMP2" | awk '$3 != "" {print $3}' | sort -u >> "$vocabulary"
fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"