new-words
view new-words.sh @ 36:f95804355b0f
compressed wordlist support
| author | Igor Chubin <igor@chub.in> | 
|---|---|
| date | Sat Jan 01 19:47:39 2011 +0100 (2011-01-01) | 
| parents | 3827cce83602 | 
| children | 4e931db74618 | 
 line source
     1 #!/bin/bash
     3 show_usage()
     4 {
     5 cat <<HELP > /dev/stderr
     7 USAGE: 
     9     new-words [ -l lang ] [ -s ] [ ARG ] 
    11 SWITCHES: 
    13     -h          print this screen
    14     -c          show compressed wordlist: one word per group
    15     -k          put higher words that are similar to the known words (only for English)
    16     -l lang     override language settings
    17     -n          non-interactive mode (don't run vi)
    18     -N          turn off known words filtering
    19     -a          don't add marks (and don't save marks added by user)
    20     -p pages    work with specified pages only (pages = start-stop/total )
    21     -s          show the text statistics (percentage of known words and so on) and exit
    22     -S          show your vocabulary statistics (number of words and word groups)
    23     -t tag      tag known words with tag
    24     -T          show list of active tags
    25     -m tag      merge the words tagged with "tag" into the main vocabulary
    26     -M          merge the words tagged with any tag into the main vocabulary 
    27     -r tag      remove subvocabulary for the "tag"
    28     -2 -3       find 2 and 3 words' sequences
    30 The language of the text can be specified also
    31 by name of the program new-words (correspondent link must be created before).
    32 For example, these calls are equivalent:
    34     de-words URL
    35     new-words -l de URL
    37 HELP
    38 }
    40 if [ "$1" = "-h" ]
    41 then
    42     show_usage
    43     exit 0
    44 fi
    46 WORK_DIR=~/.new-words/
    47 TEMP1=`mktemp /tmp/new-words-temp1.XXXXXXXXXX`
    48 TEMP2=`mktemp /tmp/new-words-temp2.XXXXXXXXXX`
    49 export ORIGINAL_TEXT=`mktemp /tmp/new-words-orig.XXXXXXXXXX`
    50 editor=${EDITOR:-vim}
    52 # language detection
    54 LANGUAGE=en
    55 my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
    56 for arg
    57 do
    58     if echo "$arg" | grep -q http://...wikipedia.org/wiki/
    59     then
    60     LANGUAGE="`echo $arg | sed s@http://@@ | sed s@.wikipedia.*@@`"
    61     fi
    62 done
    63 [ "${my_name}" = "new" ] || LANGUAGE="$my_name"
    65 #----------------------------------------------------
    66 # command line options processing
    68 STAT_ONLY=NO
    69 NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
    70 DONT_ADD_MARKS=NO
    71 NON_INTERACTIVE_MODE=NO
    72 PART_TO_PROCESS=''
    73 GROUP_WORDS_BY_THREE=NO
    74 GROUP_WORDS_BY_TWO=NO
    75 TAG_NAME=''
    76 MERGE_THIS_TAGS=''
    77 TAGS_LIST_ONLY=NO
    78 MERGE_TAGGED_WORDS=NO
    79 MERGE_ALL_TAGGED=NO
    80 DONT_ADD_MARKLINES=NO
    81 FILTER_WORDS=YES
    82 SHOW_VOC_STAT=NO
    83 COMPRESSED_WORDLIST=NO
    84 while getopts cl:sSkanNp:t:Tm:Mr:23 opt
    85 do
    86     case "$opt" in
    87       c)  COMPRESSED_WORDLIST=YES;;
    88       s)  STAT_ONLY=YES;;
    89       S)  SHOW_VOC_STAT=YES;;
    90       k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    91       l)  LANGUAGE="$OPTARG";;
    92       a)  DONT_ADD_MARKS=YES;;
    93       n)  NON_INTERACTIVE_MODE=YES;;
    94       N)  FILTER_WORDS=NO;;
    95       p)  PART_TO_PROCESS="$OPTARG";;
    96       t)  TAG_NAME="$OPTARG";;
    97       T)  TAGS_LIST_ONLY="YES";;
    98       m)  DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
    99       M)  DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
   100       r)  REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
   101       2)  GROUP_WORDS_BY_TWO=YES;;
   102       3)  GROUP_WORDS_BY_THREE=YES;;
   103       \?)       # unknown flag
   104           show_usage
   105           exit 1;;
   106     esac
   107 done
   108 shift `expr $OPTIND - 1`
   110 if [ "$1" = "-l" ]
   111 then
   112     LANGUAGE="$2"
   113     shift 2
   114 fi
   116 VOCABULARY=${LANGUAGE}.txt
   117 NOTES_FILE=notes-${LANGUAGE}.txt
   119 if [ "${SHOW_VOC_STAT}" = "YES" ]
   120 then
   121   $0 -l "${LANGUAGE}" -N -n ${WORK_DIR}/${VOCABULARY} | head -1 | awk '{print $5}' | tr -d "<>"
   122   exit 0
   123 fi
   125 #----------------------------------------------------
   127 get_words()
   128 {
   129     export FILTER_WORDS
   130 tr ' ' '\n' | sed 's/--/ /g' \
   131 | sed "s/'/__APOSTROPHE__/g" \
   132 | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
   133 | sed "s/__APOSTROPHE__/'/g" \
   134 | tr ' ' '\n' \
   135 | tee "$1" \
   136 | grep_v_english_perl \
   137 | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
   138 }
   140 add_stat()
   141 {
   142     if [ "$DONT_ADD_MARKLINES" = "YES" ]
   143     then
   144         cat
   145         return
   146     fi
   147     before="$1"
   148     after=${before}2
   149     cat > "$after"
   150     total="`wc -w $1 | awk '{print $1}'`"
   151     total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
   152     total_known="`echo $total-$total_unknown|bc`"
   153     percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
   154     #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
   155     sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
   158     if [ "$STAT_ONLY" = "YES" ]
   159     then
   160         echo "LANG  KNOWN%  UNKNOWN%  KNOWN     TOTAL     WPS  UWPS*10"
   161         echo "$LANGUAGE    $percentage    `echo \(100-$percentage\) | bc -l`      $total_known    $total    `echo $total/$sentences|bc`   `echo 10*$total_unknown/$sentences|bc` "
   162         rm $after
   163         return 0
   164     else 
   165         groups="`echo $(grep '# groups' $after | awk '{print $3}')`"
   166         words="`echo $(grep -v '^#' $after | wc -l)`"
   167         echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
   168     fi
   170     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   171     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   172 my $total=shift(@ARGV);
   173 my $total_known=shift(@ARGV);
   174 my $s=0;
   175 my $mark_line=int($total_known*100/$total/5)*5;
   176 if ($mark_line>=90) { 
   177     $mark_line=int($total_known*100/$total)+1;
   178 } else { $mark_line +=5; };
   179 while(<>)
   180 {
   181     next if /^#\s*groups\s*/;
   182     print;
   183     /^\s*([0-9]*)\s*/;
   184     $s+=$1;
   185     if (($total_known+$s)*100/$total>=$mark_line) {
   186         print "# $mark_line\n";
   187         if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
   188     }
   189 }
   190 PERL_SCRIPT
   191     perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
   192     rm $PERL_SCRIPT_TEMP_NAME
   193     rm $after
   194 }
   196 two_and_three_words()
   197 {
   198     if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
   199     then 
   200         cat
   201     else
   202         cat 
   204     export GROUP_WORDS_BY_THREE
   205     export GROUP_WORDS_BY_TWO
   206     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
   207     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   208 #!/usr/bin/perl
   209 local $/;
   210 $words=<>;
   211 $words=~ s@[!?;,:#1-9".]@ @g;
   212 $words =~ s@\s+@ @g;
   213 @words = split /\s+/, $words;
   214 for ($i=0; $i<$#words-3;$i++) {
   215     my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
   216     if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
   217         print "${a}_${b}_${c}\n";
   218     };  
   219     if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
   220         print "${a}_${b}\n";
   221     };
   222 }
   223 PERL_SCRIPT
   224     perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
   225     rm $PERL_SCRIPT_TEMP_NAME
   226     fi
   227 }
   229 grep_v_english()
   230 {
   231 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
   232 eval $(cat $VOCABULARY | tr -d "'" | xargs -n10 echo | tr ' ' '|' | sed 's/^/egrep -xv "RRRRRRR|/' | sed 's/$/"/' | tr '\n' '|')cat
   233 }
   235 grep_v_english_perl()
   236 {
   237     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   238     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   239     if ($ENV{FILTER_WORDS} eq "NO") {
   240         while(<>) { print; }
   241         exit(0);
   242     }
   243 $voc_files=$ENV{VOC_FILES};
   244 $voc_files=~s@^ @@;
   245 for $voc_file (split /\s+/,$voc_files) {
   246     if (open(VOC, $voc_file)) {
   247         while (<VOC>){
   248             chomp;
   249             #s/'//g;
   250             $voc{$_}="1";
   251         }
   252     }
   253 }
   254 while(<>) {
   255     chomp;
   256     if (not defined($voc{$_})) { print "$_\n"; }
   257 }
   258 PERL_SCRIPT
   259     [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
   260     export VOCABULARY VOC_FILES
   261     VOC_FILES=$VOCABULARY
   262     for i in $TAG_NAME
   263     do
   264         VOC_FILES="${VOC_FILES} `tag_file_name $i`"
   265     done
   266     perl $PERL_SCRIPT_TEMP_NAME
   267     rm $PERL_SCRIPT_TEMP_NAME
   268 }
   270 group_words()
   271 {
   272     #if [ "$LANGUAGE" != "en" ]
   273     #then
   274     #    cat 
   275     #    return
   276     #fi
   277     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
   278     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   279 #!/usr/bin/perl
   281 use Encode;
   282 use utf8;
   283 use Lingua::Stem::Snowball qw(stem);
   285 eval {
   286 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
   287     require String::Similarity;
   288     String::Similarity->import();
   289 };
   290 unless($@)
   291 {
   292     our $HAVE_String_Similarity=1;
   293 }
   296 sub load_notes_dict()
   297 {
   298     my %dict;
   299     if (open(NOTES, $ENV{NOTES_FILE})) {
   300         while(<NOTES>) {
   301             $_ = decode( "utf8", $_);
   302             chomp;
   303             s/^\s+//;
   304             my ($a,$b)=split /\s+/,$_,2;
   305             $dict{$a}=$b;
   306         }
   307     }
   308     return %dict;
   309 }
   311 sub similar($$){
   312     my $a=shift;
   313     my $b=shift;
   314     if ($HAVE_String_Similarity) {
   315         return $Similarity{"$a $b"};
   316     } 
   317     else {
   318         return 0;
   319     }
   320 }
   323 sub normalize_without_linked($)
   324 {
   325     if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
   326     elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
   327     elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
   328     elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
   329     else { return shift ; }
   330 }
   332 sub normalize_with_linked($)
   333 {
   334     my $word = normalize_without_linked(shift);
   335         #return $word;
   336     if ($linked_words{$word}) {
   337         return $linked_words{$word};
   338     }
   339     else {
   340         return $word;
   341     }
   342 }
   344 sub normalize($)
   345 {
   346     return normalize_with_linked(shift);
   347 }
   349 sub normalize_ukrainian($)
   350 {
   351     $_=lc(shift);
   352     s/[юіоеуаи]$//g;
   353     return $_;
   354 }
   356 sub normalize_esperanto($)
   357 {
   358     $_=lc(shift);
   359 # verbs
   360     s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;
   362 # nouns
   363     s/j?n?$//;
   365     return $_;
   366 }
   368 sub normalize_german($)
   369 {
   370     @stems = stem('de', \@_);
   371     return $stems[0];
   372 }
   374 sub normalize_german_($)
   375 {
   376     $_=lc(shift);
   378     s/heit$//;  s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//; 
   379     s/en$//; s/er$//;
   381     s/lich$//; s/ig$//;
   382     s/al$//; s/isch$//;
   383     s/ell$//; s/haft$//;
   385     s/bar$//; s/sam$//; s/lich$//;
   387     @prefixes=qw(
   388         ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
   389         be emp ent er ge miss ver zer durch über um unter wieder);
   390     @prefixes=();
   391     for $pref (@prefixes) {
   392         s/^$pref//;
   393     }
   396     return $_;
   397 }
   399 sub normalize_english($)
   400 {
   401     $_=lc(shift);
   403     s/s$//;
   405     s/ation$//;  s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//; 
   407     s/ed$//;
   408     s/en$//;
   409     s/er$//;
   410     s/est$//;
   411     s/ing$//;
   413     s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
   414     s/ify$//; s/fy$//; s/ly$//;
   415     s/ise$//; s/ize$//;
   417     s/e$//;
   418     return $_;
   419 }
   422 sub compare($$)
   423 {
   424     my $a=shift;
   425     my $b=shift;
   426     $a =~ s/^\s*//;
   427     $b =~ s/^\s*//;
   428     my ($a1, $a2)= split /\s+/,$a,2;
   429     my ($b1, $b2)= split /\s+/,$b,2;
   431     my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
   433     if ($cmp) {
   434         return $cmp;
   435     }
   436     else {
   437         if (normalize($a2) ne normalize($b2)) {
   438             return normalize($a2) cmp normalize($b2);
   439         }
   440         else {
   441             return $a1 <=> $b1;
   442         }
   443     }
   444 }
   446 sub log_($)
   447 {
   448     return;
   449     open(LOG, ">>", "/tmp/log1");
   450     print LOG $_[0];
   451     close(LOG);
   452 }
   454 sub find_linked_words($)
   455 {
   456     my %linked_words;
   457     my $dict = shift;
   458     log_("1");
   459     log_(join(" ", keys(%$dict)));
   461     for $key (keys(%$dict)) {
   462         $val = $dict->{$key};
   463         log_($key."\n");
   464         if ($val =~ /\@([a-z]*)/) {
   465             $linked_words{normalize($key)} = normalize($1);
   466             log_(normalize($key)." = ".normalize($1)."\n");
   467         }
   468     }
   469     return %linked_words;
   470 }
   472 sub lc_length($)
   473 {
   474     my $a= shift;
   475     $a =~ s/[a-z]//g;
   476     return length($a);
   477 }
   479 our %dict = load_notes_dict();
   480 our %linked_words = find_linked_words(\%dict);
   482 our %Vocabulary;
   483 open(VOC, $ENV{VOCABULARY})
   484  or die "Can't open VOCABULARY";
   485 while (<VOC>){
   486     chomp;
   487     #s/'//g;
   488     $Vocabulary{normalize($_)}="1";
   489 }
   490 close(VOC);
   492 binmode STDIN,":utf8";
   493 @lines=<STDIN>;
   494 for $L (@lines) {
   495     chomp($L);
   496     #$L = decode( "utf8", $L);
   497     $l=$L;
   498     $l =~ s/^\s*//;
   499     my ($a, $b)=split(/\s+/,$l,2);
   500     $group_weight{normalize($b)}+=$a;
   501 }
   502 if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
   503     for $k (keys %group_weight) {
   504         if (defined($Vocabulary{$k})) {
   505             $group_weight{$k} *= 2;
   506         }
   507     }
   508 }
   509 @lines2 = sort { compare($b,$a) } @lines;
   510 binmode STDOUT, ":utf8";
   511 print "# groups ".scalar(keys(%group_weight))."\n";
   512 if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
   513     my $sum = 0;
   514     my $min = 9999;
   515     for $L (@lines2) {
   516         chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
   517         $group_name = normalize($b);
   518         if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
   519             #print (" "x(7-length($sum))),"$sum $main_word\n";
   520             print +(" "x(7-length($sum))),"$sum $main_word\n";
   521             $sum = $a;
   522             $min = length($b) + 2*lc_length($b);
   523             $main_word = $b;
   524         }
   525         else {
   526             $sum += $a;
   527             if ($min > length($b) + 2*lc_length($b)) {
   528                 $min = length($b) + 2*lc_length($b);
   529                 $main_word = $b;
   530             }
   531         }
   532         $prev_group_name = $group_name;
   533     }
   534 }
   535 else {
   536     for $l (@lines2) {
   537         print "$l\n";
   538     }
   539 }
   540 PERL_SCRIPT
   541     export VOCABULARY
   542     export NEED_TO_USE_VOCABULARY_WHEN_SORT
   543     export LANGUAGE
   544     export COMPRESSED_WORDLIST
   545     [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
   546     export NOTES_FILE
   547     perl $PERL_SCRIPT_TEMP_NAME
   548     rm $PERL_SCRIPT_TEMP_NAME
   549 }
   551 text_from_url()
   552 {
   553 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
   554 }
   556 add_marks()
   557 {
   558     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   559     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   560 use Encode;
   562 sub load_notes_dict()
   563 {
   564     my %dict;
   565     if (open(NOTES, $ENV{NOTES_FILE})) {
   566         while(<NOTES>) {
   567             $_ = decode( "utf8", $_);
   568             chomp;
   569             s/^\s+//;
   570             my ($a,$b)=split /\s+/,$_,2;
   571             $dict{$a}=$b;
   572         }
   573     }
   574     return %dict;
   575 }
   577 %dict = load_notes_dict();
   579 $file = $ARGV[0];
   580 if (open(F, $file)) {
   581     @lines=<F>;
   582     close(F);
   583     for (@lines) {$_ = decode( "utf8", $_);};
   585     if (open(F, ">$file")) {
   586         binmode F, ":utf8";
   587         for (@lines) {
   588             m/\s+\S+\s+(\S+)/;
   589             $name=$1;
   590             if (not /^#/ and defined($dict{$name})) {
   591                 chomp;
   592                 $mark=$dict{$name};
   593                 $space=" "x(30-length($_));
   594                 print F "$_$space$mark\n";
   595             }
   596             else {
   597                 print F "$_";
   598             }
   599         }
   600         close(F);
   601     }
   602 }
   603 PERL_SCRIPT
   604     [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
   605     export NOTES_FILE
   606     perl $PERL_SCRIPT_TEMP_NAME "$1"
   607     rm $PERL_SCRIPT_TEMP_NAME
   608 }
   610 remove_marks()
   611 {
   612     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   613     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   614 $file = $ARGV[0];
   615 our %dict;
   616 if (open(F, $file)) {
   617     @lines=<F>;
   618     close(F);
   620     if (open(F, ">$file")) {
   621         for (@lines) {
   622             chomp;
   623             if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
   624                 my $name=$4;
   625                 my $comment=$6;
   626                 $dict{$name}=$comment;
   627                 print F "$1$2$3$4\n";
   628             }
   629             else {
   630                 print F "$_\n";
   631             }
   632         }
   633     }
   634 }
   635 if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
   636     @lines=<NOTES>;
   637     close(NOTES);
   639     if (open(NOTES, ">".$ENV{NOTES_FILE})) {
   640         for (@lines) {
   641             chomp;
   642             s/^\s+//;
   643             my ($a,$b)=split /\s+/,$_,2;
   644             if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
   645                 print NOTES "$_\n";
   646                 if (defined($dict{$a})) { unset($dict{$a}); }
   647             }
   648         }
   649         for (keys %dict) {
   650             $mark=$dict{$_};
   651             $space=" "x(30-length($_));
   652             print NOTES "$_$space$mark\n";
   653         }
   654     }
   655 }
   656 PERL_SCRIPT
   657     [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
   658     export NOTES_FILE
   659     export DONT_ADD_MARKS
   660     perl $PERL_SCRIPT_TEMP_NAME "$1"
   661     rm $PERL_SCRIPT_TEMP_NAME
   662 }
   664 part()
   665 {
   666     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
   667     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   668 #!/usr/bin/perl
   670 my @lines=<STDIN>;
   671 my $lines=$#lines;
   672 my $interval=$ARGV[0];
   673 if (not $interval) {
   674     print @lines;
   675 }
   676 else {
   677     my ($start,$stop,$total);
   678     if ($interval =~ m@(.*)/(.*)@) {
   679         $start = $1;
   680         $total = $2;
   681     }
   682     else {
   683         $start=$interval;
   684         $total=0;
   685     }
   686     if ($start =~ m@(.*)-(.*)@) {
   687         $start = $1;
   688         $stop = $2;
   689     }
   690     if ($start =~ m@(.*)\+(.*)@) {
   691         $start = $1;
   692         $stop = $start+$2;
   693     }
   695     $start=int($lines/$total*$start);
   696     $stop=int($lines/$total*$stop);
   698     for($i=$start;$i<$stop;$i++){
   699         print $lines[$i];
   700     }
   701 }
   702 PERL_SCRIPT
   703     perl $PERL_SCRIPT_TEMP_NAME "$1"
   704     rm $PERL_SCRIPT_TEMP_NAME
   705 }
   707 if [ "$TAGS_LIST_ONLY" = "YES" ] 
   708 then
   709     cd "${WORK_DIR}"
   710     echo ${LANGUAGE}_*.txt | tr ' ' '\n'  | grep -v '*' | sed 's/[^_]*_//;s/.txt$//'
   711     exit 0
   712 fi
   714 tag_file_name()
   715 {
   716     echo "${LANGUAGE}_${1}.txt"
   717 }
   719 if [ "$REMOVE_TAG" = "YES" ]
   720 then
   721     cd "${WORK_DIR}"
   722     for i in $TAG_NAME 
   723     do
   724         echo "$TAGNAME" | grep -q '[/*?]' && continue
   725         f="`tag_file_name $i`"
   726         if [ -e "$f" ] 
   727         then
   728             rm -f "$f" && echo Tag "'$i'" removed
   729         else
   730             echo Unknown tag "'$i'"
   731         fi
   732     done
   733     exit 0
   734 fi
   736 mkdir -p $WORK_DIR
   737 oldpwd="$PWD"
   738 cd $WORK_DIR
   739 if [ "$MERGE_TAGGED_WORDS" = "YES" ]
   740 then
   741     VOC_FILES=''
   742     for i in $MERGE_THIS_TAGS
   743     do
   744         f=`tag_file_name $i`
   745         [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
   746     done
   747     if [ -z "$VOC_FILES" ]
   748     then 
   749         echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
   750     else
   751         cat $VOC_FILES
   752     fi
   753 elif [ "$MERGE_ALL_TAGGED" = "YES" ]
   754 then
   755     cat ${LANGUAGE}_*.txt
   756 elif echo "$1" | grep -q http: 
   757 then 
   758     text_from_url "$1"
   759 elif [ "$#" != 0 ]
   760 then
   761     if echo $1 | grep -q ^/
   762     then
   763         cat "$1"
   764     else
   765         cat "$oldpwd/$1"
   766     fi
   767 else 
   768     cat
   769 fi \
   770    | part $PART_TO_PROCESS \
   771    | tee $ORIGINAL_TEXT \
   772    | two_and_three_words \
   773    | get_words ${TEMP1}-full \
   774    | group_words \
   775    | add_stat ${TEMP1}-full \
   776    | tee "$TEMP1" > "$TEMP2"
   778 if [ "$STAT_ONLY" = "YES" ]
   779 then
   780     cat "$TEMP1"
   781 elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
   782 then
   783     cat "$TEMP1"
   784 else
   785     if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ] 
   786     then
   787         [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
   788         if [ "$editor" = vim ]
   789         then
   790             vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
   791         else
   792             $editor "$TEMP2"
   793         fi
   794         remove_marks "$TEMP2"
   796         vocabulary="$VOCABULARY"
   797         [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`"
   798         diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
   799     fi
   800 fi
   802 rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
