new-words

view new-words-py.sh @ 38:adbc809d3924

Transition to Python started

new-words-py.sh is a wrapper around the new-words.py version,
which is not finished yet.
author Igor Chubin <igor@chub.in>
date Sat Jan 22 23:42:31 2011 +0100 (2011-01-22)
parents
children a598e0d25784
line source
1 #!/bin/bash
# show_usage
#
# Print the command-line help to standard error.
# Takes no arguments; returns the exit status of cat.
show_usage()
{
    # Duplicate descriptor 2 (>&2) instead of opening /dev/stderr by name:
    # opening the path truncates the target when stderr is redirected to a
    # regular file, and fails when the path cannot be opened.
    cat <<HELP >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}
# Show the help and stop when -h is the first argument.
if [ "$1" = "-h" ]
then
    show_usage
    exit 0
fi

# Path of the Python implementation; may be overridden from the environment.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
# Directory holding the vocabulary, tag and notes files.
WORK_DIR=~/.new-words/

# Scratch files used by the processing pipeline.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

# Remove the scratch files; installed as EXIT handler so that the early
# "exit" paths (statistics, tag listing, tag removal, usage errors) do not
# leave them behind in /tmp.
cleanup_temp_files()
{
    rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
}
trap cleanup_temp_files EXIT

editor=${EDITOR:-vim}
# language detection
#
# LANGUAGE defaults to "en". A Wikipedia article URL among the arguments
# selects the language of that Wikipedia. Finally, unless the program was
# started under a name beginning with "new" (new-words), the part of the
# program name before the first "-" is used as the language (de-words -> de).

# wikipedia_language ARG
#
# Print the two-character language code when ARG contains a Wikipedia
# article URL (http or https); print nothing otherwise.
wikipedia_language()
{
    local url_re='https?://(..)\.wikipedia\.org/wiki/'
    if [[ "$1" =~ $url_re ]]
    then
        printf '%s\n' "${BASH_REMATCH[1]}"
    fi
}

LANGUAGE=en
# basename of $0 with everything from the first "-" removed
my_name=${0##*/}
my_name=${my_name%%-*}
for arg
do
    wiki_lang=$(wikipedia_language "$arg")
    if [ -n "$wiki_lang" ]
    then
        LANGUAGE="$wiki_lang"
    fi
done
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing
#
# Every switch is recorded in a YES/NO (or string) variable that the rest of
# the script reads. All of them are initialised here so that a value
# inherited from the environment can never switch a mode on by accident.

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
OLD_STYLE="NO"
REMOVE_TAG=NO
while getopts hOcl:sSkanNp:t:Tm:Mr:23 opt
do
    case "$opt" in
    h) # help requested somewhere after the first argument
        show_usage
        exit 0;;
    O) OLD_STYLE=YES;;
    c) COMPRESSED_WORDLIST=YES;;
    s) STAT_ONLY=YES;;
    S) SHOW_VOC_STAT=YES;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    l) LANGUAGE="$OPTARG";;
    a) DONT_ADD_MARKS=YES;;
    n) NON_INTERACTIVE_MODE=YES;;
    N) FILTER_WORDS=NO;;
    p) PART_TO_PROCESS="$OPTARG";;
    t) TAG_NAME="$OPTARG";;
    T) TAGS_LIST_ONLY="YES";;
    m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
    M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
    r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
    2) GROUP_WORDS_BY_TWO=YES;;
    3) GROUP_WORDS_BY_THREE=YES;;
    \?) # unknown flag
        show_usage
        exit 1;;
    esac
done
shift $((OPTIND - 1))

# "-l lang" is also accepted right after the switches handled by getopts.
if [ "$1" = "-l" ]
then
    LANGUAGE="$2"
    shift 2
fi

# Per-language file names; used relative to the current directory.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this script over the vocabulary file itself (no filtering, no
# editor) and print the fifth field of the first output line without the
# angle brackets.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -1 | awk '{print $5}' | tr -d "<>"
    exit 0
fi
128 #----------------------------------------------------
# get_words FILE
#
# Dispatch the word extraction step: with OLD_STYLE=NO the program named by
# NEW_WORDS_PY is run with "-l LANGUAGE -f get_words FILE"; for any other
# value all arguments are handed to get_words_OLD.
get_words()
{
    case "$OLD_STYLE" in
    NO)
        $NEW_WORDS_PY -l "$LANGUAGE" -f get_words "$1"
        ;;
    *)
        get_words_OLD "$@"
        ;;
    esac
}
# get_words_OLD FILE
#
# Shell/Perl implementation of the word extraction (get_words calls it when
# OLD_STYLE is not NO). Reads text on stdin and writes "count word" lines,
# highest count first, to stdout.
#
# Pipeline stages, in order:
#   - put every space-separated token on its own line and turn "--" into a
#     space;
#   - hide apostrophes behind the __APOSTROPHE__ placeholder while the Perl
#     stage runs, then restore them;
#   - Perl: decode the line as UTF-8, replace the listed punctuation
#     characters with spaces and print only lines made of letters, spaces,
#     apostrophes, "_" and "-";
#   - split on spaces again and save the resulting word stream to FILE ($1);
#   - drop known words (grep_v_english_perl);
#   - count identical words, discard entries whose word field is empty and
#     sort by count, descending.
140 get_words_OLD()
141 {
# Exported because the Perl filter started by grep_v_english_perl reads
# FILTER_WORDS from its environment.
142 export FILTER_WORDS
143 tr ' ' '\n' | sed 's/--/ /g' \
144 | sed "s/'/__APOSTROPHE__/g" \
145 | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
146 | sed "s/__APOSTROPHE__/'/g" \
147 | tr ' ' '\n' \
148 | tee "$1" \
149 | grep_v_english_perl \
150 | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
151 }
# add_stat WORDS_FILE
#
# Reads the grouped "count word" list on stdin and adds statistics to it.
#
#   WORDS_FILE  file with every word of the text (its word count is the
#               total); "WORDS_FILE2" is used as scratch file and removed.
#
# Globals read: DONT_ADD_MARKLINES, STAT_ONLY, LANGUAGE, ORIGINAL_TEXT.
#
# Output:
#   DONT_ADD_MARKLINES=YES  stdin is copied unchanged.
#   STAT_ONLY=YES           a header line and one line of figures
#                           (WPS = words per period found in ORIGINAL_TEXT).
#   otherwise               a "# LANG, known%, <known/total>, <groups/words>"
#                           line followed by the list, with a "# N" marker
#                           line printed each time the cumulative share of
#                           covered words reaches the next mark (steps of 5
#                           below 90, steps of 1 from 90 on).
add_stat()
{
    if [ "$DONT_ADD_MARKLINES" = "YES" ]
    then
        cat
        return
    fi

    local before="$1"
    local after="${before}2"
    local total total_unknown total_known percentage sentences groups words

    cat > "$after"

    # "+0" makes awk print 0 instead of an empty string for empty input.
    total=$(wc -w < "$before" | awk '{n=$1} END{print n+0}')
    total_unknown=$(awk '{s=s+$1} END{print s+0}' "$after")
    total_known=$((total - total_unknown))

    if [ "$total" -gt 0 ]
    then
        # keep one digit after the decimal point (truncated, not rounded)
        percentage=$(echo "100*($total-$total_unknown)/$total" | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        # no words at all: avoid dividing by zero
        percentage=0
    fi

    # number of "." characters in the original text
    sentences=$(perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);' "$ORIGINAL_TEXT")
    # a text without any period counts as one sentence (avoids division by zero)
    [ "${sentences:-0}" -gt 0 ] || sentences=1

    if [ "$STAT_ONLY" = "YES" ]
    then
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $(echo "(100-$percentage)" | bc -l) $total_known $total $(echo "$total/$sentences" | bc) $(echo "10*$total_unknown/$sentences" | bc) "
        rm -f -- "$after"
        return 0
    fi

    groups=$(echo $(grep '# groups' "$after" | awk '{print $3}'))
    words=$(grep -vc '^#' "$after")
    echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"

    # The program is passed on stdin ("perl -"), the data file as argument.
    perl - "$total" "$total_known" "$after" <<'PERL_SCRIPT'
my $total=shift(@ARGV);
my $total_known=shift(@ARGV);
my $s=0;
if (!$total) {
    # nothing to compute percentages from: just copy the list
    while(<>) {
        next if /^#\s*groups\s*/;
        print;
    }
    exit(0);
}
# first mark: next multiple of 5 above the known share, or the next integer
# once the known share is 90% or more
my $mark_line=int($total_known*100/$total/5)*5;
if ($mark_line>=90) {
    $mark_line=int($total_known*100/$total)+1;
} else { $mark_line +=5; };
while(<>)
{
    next if /^#\s*groups\s*/;
    print;
    /^\s*([0-9]*)\s*/;
    $s+=$1;
    if (($total_known+$s)*100/$total>=$mark_line) {
        print "# $mark_line\n";
        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
    }
}
PERL_SCRIPT
    rm -f -- "$after"
}
# two_and_three_words
#
# Copies stdin to stdout. When GROUP_WORDS_BY_TWO and/or GROUP_WORDS_BY_THREE
# is YES, the sequences of two / three consecutive words of the text stored
# in ORIGINAL_TEXT are appended, one per line, joined with "_".
two_and_three_words()
{
    if [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ]
    then
        cat
        return
    fi

    cat

    export GROUP_WORDS_BY_THREE
    export GROUP_WORDS_BY_TWO
    # The program is passed on stdin ("perl -"), the text file as argument.
    perl - "$ORIGINAL_TEXT" <<'PERL_SCRIPT'
local $/;
$words=<>;
# punctuation and digits separate words
$words=~ s@[!?;,:#0-9".]@ @g;
$words =~ s@\s+@ @g;
@words = split /\s+/, $words;
# Visit every start position; positions too close to the end simply have no
# second/third word and are skipped by the checks below.
for ($i=0; $i<=$#words; $i++) {
    my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
        print "${a}_${b}_${c}\n";
    };
    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
        print "${a}_${b}\n";
    };
}
PERL_SCRIPT
}
# grep_v_english
#
# Filter stdin (one word per line): lines equal to a word of the VOCABULARY
# file are dropped. Apostrophes are removed from the vocabulary words before
# comparing. The vocabulary file is created empty when it does not exist.
#
# The words are handed to grep as fixed strings; nothing read from the
# vocabulary file is ever evaluated by the shell.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    # One pattern per vocabulary word; with no patterns every line passes.
    # grep exits with 1 when it prints nothing, which is not an error here.
    grep -Fxv -f <(tr -d "'" < "$VOCABULARY" | tr -s ' \t' '\n' | grep -v '^$') \
        || [ "$?" -eq 1 ]
}
# grep_v_english_perl
#
# Filter stdin (one word per line) against the known words: lines found in
# the VOCABULARY file or in the tag file of any tag listed in TAG_NAME are
# dropped, unless FILTER_WORDS (read from the environment by the Perl
# program) is "NO", in which case everything is copied unchanged.
# The vocabulary file is created empty when it does not exist.
grep_v_english_perl()
{
    PERL_SCRIPT_TEMP_NAME=$(mktemp /tmp/perl-grep-v-english-XXXXXXXX)
    cat > $PERL_SCRIPT_TEMP_NAME <<'PERL_SCRIPT'
# pass everything through when filtering is switched off
if ($ENV{FILTER_WORDS} eq "NO") {
    print while <>;
    exit(0);
}

# collect the lines of all vocabulary files as hash keys
$voc_files = $ENV{VOC_FILES};
$voc_files =~ s@^ @@;
foreach $voc_file (split /\s+/, $voc_files) {
    next unless open(VOC, $voc_file);
    while ($word = <VOC>) {
        chomp($word);
        $voc{$word} = "1";
    }
}

# print only the words that are not in the vocabulary
while ($word = <>) {
    chomp($word);
    print "$word\n" unless defined($voc{$word});
}
PERL_SCRIPT
    if [ ! -e "$VOCABULARY" ]
    then
        touch "$VOCABULARY"
    fi
    export VOCABULARY VOC_FILES
    VOC_FILES=$VOCABULARY
    for i in $TAG_NAME
    do
        VOC_FILES="${VOC_FILES} $(tag_file_name $i)"
    done
    perl $PERL_SCRIPT_TEMP_NAME
    rm $PERL_SCRIPT_TEMP_NAME
}
# group_words [ARG]
#
# Dispatch the word grouping step: with OLD_STYLE=NO the program named by
# NEW_WORDS_PY is run with "-l LANGUAGE -f group_words ARG"; for any other
# value all arguments are handed to group_words_OLD.
group_words()
{
    case "$OLD_STYLE" in
    NO)
        $NEW_WORDS_PY -l "$LANGUAGE" -f group_words "$1"
        ;;
    *)
        group_words_OLD "$@"
        ;;
    esac
}
# group_words_OLD
#
# Shell/Perl implementation of the grouping step. Reads "count word" lines on
# stdin and writes to stdout a "# groups N" line followed by the list sorted
# so that words with the same normalized form (word group) are adjacent and
# the heaviest groups (sum of counts) come first.
#
# Globals read (exported for the Perl program): VOCABULARY, NOTES_FILE,
# LANGUAGE, NEED_TO_USE_VOCABULARY_WHEN_SORT (YES doubles the weight of
# groups found in the vocabulary), COMPRESSED_WORDLIST (YES prints one line
# per group: summed count and the group's representative word).
# The notes file is created empty when it does not exist.
# Returns the exit status of the Perl program.
group_words_OLD()
{
    #if [ "$LANGUAGE" != "en" ]
    #then
    #    cat
    #    return
    #fi
    local script status
    script=$(mktemp /tmp/perl-group-words-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
#!/usr/bin/perl

use Encode;
use utf8;
use Lingua::Stem::Snowball qw(stem);

# String::Similarity is optional; remember whether it could be loaded.
eval {
    # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
    require String::Similarity;
    String::Similarity->import();
};
unless($@)
{
    our $HAVE_String_Similarity=1;
}


# Read the notes file: first field of a line is the word, the rest its note.
sub load_notes_dict()
{
    my %dict;
    if (open(NOTES, $ENV{NOTES_FILE})) {
        while(<NOTES>) {
            $_ = decode( "utf8", $_);
            chomp;
            s/^\s+//;
            my ($a,$b)=split /\s+/,$_,2;
            $dict{$a}=$b;
        }
    }
    return %dict;
}

# Look up a precomputed similarity of two words; 0 without String::Similarity.
sub similar($$){
    my $a=shift;
    my $b=shift;
    if ($HAVE_String_Similarity) {
        return $Similarity{"$a $b"};
    }
    else {
        return 0;
    }
}


# Normalize a word with the rules of the current language; languages without
# rules leave the word unchanged.
sub normalize_without_linked($)
{
    if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
    elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
    else { return shift ; }
}

# Like normalize_without_linked, but a word linked to another one in the
# notes ("@other") is mapped to the normalized form of that other word.
sub normalize_with_linked($)
{
    my $word = normalize_without_linked(shift);
    #return $word;
    if ($linked_words{$word}) {
        return $linked_words{$word};
    }
    else {
        return $word;
    }
}

sub normalize($)
{
    return normalize_with_linked(shift);
}

sub normalize_ukrainian($)
{
    $_=lc(shift);
    s/[юіоеуаи]$//g;
    return $_;
}

sub normalize_esperanto($)
{
    $_=lc(shift);
    # verbs
    s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;

    # nouns
    s/j?n?$//;

    return $_;
}

# German words are stemmed with the Snowball stemmer.
sub normalize_german($)
{
    @stems = stem('de', \@_);
    return $stems[0];
}

# Hand-written German suffix stripping; not called by the dispatcher above.
sub normalize_german_($)
{
    $_=lc(shift);

    s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
    s/en$//; s/er$//;

    s/lich$//; s/ig$//;
    s/al$//; s/isch$//;
    s/ell$//; s/haft$//;

    s/bar$//; s/sam$//; s/lich$//;

    @prefixes=qw(
        ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
        be emp ent er ge miss ver zer durch über um unter wieder);
    # prefix stripping is switched off by emptying the list
    @prefixes=();
    for $pref (@prefixes) {
        s/^$pref//;
    }


    return $_;
}

sub normalize_english($)
{
    $_=lc(shift);

    s/s$//;

    s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;

    s/ed$//;
    s/en$//;
    s/er$//;
    s/est$//;
    s/ing$//;

    s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
    s/ify$//; s/fy$//; s/ly$//;
    s/ise$//; s/ize$//;

    s/e$//;
    return $_;
}


# Order two "count word" lines: by weight of the word's group, then by group
# name, then by the count of the line itself.
sub compare($$)
{
    my $a=shift;
    my $b=shift;
    $a =~ s/^\s*//;
    $b =~ s/^\s*//;
    my ($a1, $a2)= split /\s+/,$a,2;
    my ($b1, $b2)= split /\s+/,$b,2;

    my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};

    if ($cmp) {
        return $cmp;
    }
    else {
        if (normalize($a2) ne normalize($b2)) {
            return normalize($a2) cmp normalize($b2);
        }
        else {
            return $a1 <=> $b1;
        }
    }
}

# Debug logging; disabled by the early return.
sub log_($)
{
    return;
    open(LOG, ">>", "/tmp/log1");
    print LOG $_[0];
    close(LOG);
}

# Build the map "normalized word -> normalized linked word" from the notes
# whose text contains "@word".
sub find_linked_words($)
{
    my %linked_words;
    my $dict = shift;
    log_("1");
    log_(join(" ", keys(%$dict)));

    for $key (keys(%$dict)) {
        $val = $dict->{$key};
        log_($key."\n");
        if ($val =~ /\@([a-z]*)/) {
            $linked_words{normalize($key)} = normalize($1);
            log_(normalize($key)." = ".normalize($1)."\n");
        }
    }
    return %linked_words;
}

# Number of characters of the argument that are not a-z.
sub lc_length($)
{
    my $a= shift;
    $a =~ s/[a-z]//g;
    return length($a);
}

our %dict = load_notes_dict();
our %linked_words = find_linked_words(\%dict);

our %Vocabulary;
open(VOC, $ENV{VOCABULARY})
    or die "Can't open VOCABULARY";
while (<VOC>){
    chomp;
    #s/'//g;
    $Vocabulary{normalize($_)}="1";
}
close(VOC);

# weight of a group = sum of the counts of its words
binmode STDIN,":utf8";
@lines=<STDIN>;
for $L (@lines) {
    chomp($L);
    #$L = decode( "utf8", $L);
    $l=$L;
    $l =~ s/^\s*//;
    my ($a, $b)=split(/\s+/,$l,2);
    $group_weight{normalize($b)}+=$a;
}
if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
    for $k (keys %group_weight) {
        if (defined($Vocabulary{$k})) {
            $group_weight{$k} *= 2;
        }
    }
}
@lines2 = sort { compare($b,$a) } @lines;
binmode STDOUT, ":utf8";
print "# groups ".scalar(keys(%group_weight))."\n";
if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
    my $sum = 0;
    my $min = 9999;
    for $L (@lines2) {
        chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
        $group_name = normalize($b);
        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
            # a new group starts: print the finished one; the unary + keeps
            # print from taking the parenthesis as its whole argument list
            print +(" "x(7-length($sum))),"$sum $main_word\n";
            $sum = $a;
            $min = length($b) + 2*lc_length($b);
            $main_word = $b;
        }
        else {
            # same group: add the count and keep the word with the lowest
            # score (length, with non a-z characters counted three times)
            $sum += $a;
            if ($min > length($b) + 2*lc_length($b)) {
                $min = length($b) + 2*lc_length($b);
                $main_word = $b;
            }
        }
        $prev_group_name = $group_name;
    }
    # the last group is not followed by a group change, print it here
    if (@lines2) {
        print +(" "x(7-length($sum))),"$sum $main_word\n";
    }
}
else {
    for $l (@lines2) {
        print "$l\n";
    }
}
PERL_SCRIPT
    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE
    export COMPRESSED_WORDLIST
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl "$script"
    status=$?
    rm -f -- "$script"
    return "$status"
}
# text_from_url URL
#
# Print the page at URL rendered as plain text by "lynx -dump", with every
# http:// and https:// address removed from the text.
text_from_url()
{
    lynx -dump "$1" | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
# add_marks FILE
#
# Rewrite FILE (the word list shown in the editor) in place: every line
# "<spaces>count<spaces>word" whose word has a note in NOTES_FILE gets that
# note appended, aligned at column 30 and always separated from the word by
# at least one space. Lines starting with "#" and words without a note are
# left unchanged. NOTES_FILE is created empty when it does not exist.
add_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    # The program is passed on stdin ("perl -"), the list file as argument.
    perl - "$1" <<'PERL_SCRIPT'
use Encode;

# Read the notes file: first field of a line is the word, the rest its note.
sub load_notes_dict()
{
    my %dict;
    if (open(NOTES, $ENV{NOTES_FILE})) {
        while(<NOTES>) {
            $_ = decode( "utf8", $_);
            chomp;
            s/^\s+//;
            my ($a,$b)=split /\s+/,$_,2;
            $dict{$a}=$b;
        }
    }
    return %dict;
}

%dict = load_notes_dict();

$file = $ARGV[0];
if (open(F, $file)) {
    @lines=<F>;
    close(F);
    for (@lines) {$_ = decode( "utf8", $_);};

    if (open(F, ">$file")) {
        binmode F, ":utf8";
        for (@lines) {
            # take the word only from a successful match, never a stale $1
            my $name = m/\s+\S+\s+(\S+)/ ? $1 : undef;
            if (not /^#/ and defined($name) and defined($dict{$name})) {
                chomp;
                $mark=$dict{$name};
                # pad to column 30, but keep at least one space so that
                # remove_marks can split the note off again
                my $pad = 30-length($_);
                $pad = 1 if $pad < 1;
                $space=" "x$pad;
                print F "$_$space$mark\n";
            }
            else {
                print F "$_";
            }
        }
        close(F);
    }
}
PERL_SCRIPT
}
# remove_marks FILE
#
# Undo add_marks after the editor session. FILE is rewritten in place with
# everything after the word removed from each "<spaces>count<spaces>word
# note" line (lines starting with "#" are kept as they are). Unless
# DONT_ADD_MARKS is YES, the notes found in FILE are then merged into
# NOTES_FILE: existing notes that are unchanged or whose word is not in FILE
# stay where they are, changed notes are replaced and new ones appended
# (sorted by word). NOTES_FILE is created empty when it does not exist.
remove_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS
    # The program is passed on stdin ("perl -"), the list file as argument.
    perl - "$1" <<'PERL_SCRIPT'
$file = $ARGV[0];
our %dict;
if (open(F, $file)) {
    @lines=<F>;
    close(F);

    if (open(F, ">$file")) {
        for (@lines) {
            chomp;
            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                # $4 is the word, $6 the note typed or shown after it
                my $name=$4;
                my $comment=$6;
                $dict{$name}=$comment;
                print F "$1$2$3$4\n";
            }
            else {
                print F "$_\n";
            }
        }
        close(F);
    }
}
if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
    @lines=<NOTES>;
    close(NOTES);

    if (open(NOTES, ">".$ENV{NOTES_FILE})) {
        for (@lines) {
            chomp;
            s/^\s+//;
            my ($a,$b)=split /\s+/,$_,2;
            # keep the stored line when the list has no note for the word
            # or the note is unchanged; a changed note is written below
            if (!defined($dict{$a}) || ($dict{$a} eq $b)) {
                print NOTES "$_\n";
                delete $dict{$a};
            }
        }
        # new and changed notes
        for (sort keys %dict) {
            $mark=$dict{$_};
            # align at column 30, keeping at least one separating space
            my $pad = 30-length($_);
            $pad = 1 if $pad < 1;
            $space=" "x$pad;
            print NOTES "$_$space$mark\n";
        }
        close(NOTES);
    }
}
PERL_SCRIPT
}
# part INTERVAL
#
# Copy a part of stdin (whole lines) to stdout.
#
#   INTERVAL  empty (or "0"): everything is copied.
#             "start-stop/total" or "start+count/total": the input is
#             divided into "total" equal pages and the pages from "start"
#             up to, but not including, "stop" (= start+count) are printed;
#             pages are numbered from 0.
#             "start/total": the single page "start".
#             Without "/total", start and stop are 0-based line numbers.
#
# Returns the exit status of the Perl program.
part()
{
    local script status
    script=$(mktemp /tmp/perl-part-XXXXXXXX) || return 1
    cat <<'PERL_SCRIPT' > "$script"
#!/usr/bin/perl

my @lines=<STDIN>;
# number of lines (not the last index, which would lose the final line)
my $lines=scalar(@lines);
my $interval=$ARGV[0];
if (not $interval) {
    print @lines;
}
else {
    my ($start,$stop,$total);
    if ($interval =~ m@(.*)/(.*)@) {
        $start = $1;
        $total = $2;
    }
    else {
        $start=$interval;
        $total=0;
    }
    if ($start =~ m@(.*)-(.*)@) {
        $start = $1;
        $stop = $2;
    }
    if ($start =~ m@(.*)\+(.*)@) {
        $start = $1;
        $stop = $start+$2;
    }
    # only a start was given: one page (or one line)
    $stop = $start+1 unless defined($stop);
    # no usable total: interpret start/stop as line numbers
    $total = $lines unless $total > 0;

    if ($total > 0) {
        # multiply before dividing to stay exact for whole pages
        $start=int($lines*$start/$total);
        $stop=int($lines*$stop/$total);
        $start=0 if $start < 0;
        $stop=$lines if $stop > $lines;

        for($i=$start;$i<$stop;$i++){
            print $lines[$i];
        }
    }
}
PERL_SCRIPT
    perl "$script" "$1"
    status=$?
    rm -f -- "$script"
    return "$status"
}
# list_tags
#
# Print, one per line, the tag names of the current language: for every file
# LANGUAGE_<tag>.txt in the current directory the part between the first "_"
# and the ".txt" suffix. Prints nothing when there is no such file.
list_tags()
{
    local f
    for f in "${LANGUAGE}"_*.txt
    do
        # an unmatched pattern stays as it is; skip it
        [ -e "$f" ] || continue
        f=${f#*_}
        printf '%s\n' "${f%.txt}"
    done
}

# -T: list the active tags and exit.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
    cd "${WORK_DIR}" || exit 1
    list_tags
    exit 0
fi
# tag_file_name TAG
#
# Print the name of the sub-vocabulary file that belongs to TAG for the
# current LANGUAGE: "<LANGUAGE>_<TAG>.txt".
tag_file_name()
{
    printf '%s\n' "${LANGUAGE}_${1}.txt"
}
# remove_tags TAG...
#
# Delete the sub-vocabulary file (see tag_file_name) of every TAG in the
# current directory and report the result on stdout. A TAG containing "/",
# "*" or "?" is skipped silently, so a tag can never name a file outside the
# directory or match other files.
remove_tags()
{
    local tag f
    for tag in "$@"
    do
        case "$tag" in
        */*|*'*'*|*'?'*) continue ;;
        esac
        f=$(tag_file_name "$tag")
        if [ -e "$f" ]
        then
            rm -f -- "$f" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
}

# -r: remove the listed tags and exit.
if [ "$REMOVE_TAG" = "YES" ]
then
    cd "${WORK_DIR}" || exit 1
    # split the space separated list without expanding glob characters
    read -r -a tags_to_remove <<< "$TAG_NAME"
    remove_tags "${tags_to_remove[@]}"
    exit 0
fi
# Main part: collect the text, run it through the processing pipeline and,
# in interactive mode, let the user edit the resulting word list.

mkdir -p "$WORK_DIR"
oldpwd="$PWD"
# Everything below works on files relative to the work directory; stop
# rather than read and write vocabulary files somewhere else.
cd "$WORK_DIR" || exit 1

# Source of the text, in order of precedence:
#   -m  the files of the listed tags        -M  all tag files of the language
#   an argument containing http: or https:  the page, as text
#   any other argument                      that file (relative to the
#                                           directory the script was started in)
#   no argument                             stdin
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
    VOC_FILES=''
    for i in $MERGE_THIS_TAGS
    do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
    done
    if [ -z "$VOC_FILES" ]
    then
        # unquoted on purpose: collapses the spaces of the tag list
        echo Unknown tags: $MERGE_THIS_TAGS >&2
    else
        # unquoted on purpose: VOC_FILES is a space separated list
        cat $VOC_FILES
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
    cat "${LANGUAGE}"_*.txt
elif [[ "$1" == *http:* || "$1" == *https:* ]]
then
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    case "$1" in
    /*) cat "$1" ;;
    *)  cat "$oldpwd/$1" ;;
    esac
else
    cat
fi \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | two_and_three_words \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"

if [ "$STAT_ONLY" = "YES" ]
then
    cat "$TEMP1"
elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
else
    if [ "$(wc -l < "$TEMP2" | awk '{print $1}')" != 0 ]
    then
        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
        if [ "$editor" = vim ]
        then
            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
        else
            $editor "$TEMP2"
        fi
        remove_marks "$TEMP2"

        # The differences between the list before ($TEMP1) and after the
        # editing ($TEMP2) go to the vocabulary, or to the tag file when a
        # tag was given with -t.
        vocabulary="$VOCABULARY"
        if [ -n "$TAG_NAME" ]
        then
            # unquoted on purpose: only the first word of TAG_NAME is used
            vocabulary="$(tag_file_name $TAG_NAME)"
        fi
        # Third field of each diff line; diff's position lines ("3d2", "---")
        # have no third field and must not add empty lines to the vocabulary.
        diff "$TEMP1" "$TEMP2" | awk '$3 != "" {print $3}' | sort -u >> "$vocabulary"
    fi
fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"