| rev | line source | 
| igor@26 | 1 #!/bin/bash | 
| igor@20 | 2 | 
# Print the command-line help.
# Outputs: the usage text on stderr (file descriptor 2); nothing on stdout.
show_usage()
{
    # Duplicate fd 2 (>&2) instead of re-opening /dev/stderr: re-opening
    # truncates a log file that stderr was redirected to, and fails where
    # /dev/stderr cannot be opened.  The quoted delimiter keeps the text
    # literal (no accidental expansion).
    cat <<'HELP' >&2

USAGE:

    new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

    -h          print this screen
    -k          put higher words that are similar to the known words (only for English)
    -l lang     override language settings
    -n          non-interactive mode (don't run vi)
    -m          don't add marks (and don't save marks added by user)
    -p pages    work with specified pages only (pages = start-stop/total )
    -s          show the text statistics (percentage of known words and so on) and exit
    -2 -3       find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

    de-words URL
    new-words -l de URL

HELP
}
| igor@0 | 31 | 
# "-h" as the very first argument: print the help before any temporary file
# is created.
if [[ "${1-}" == "-h" ]]
then
    show_usage
    exit 0
fi

# Directory that holds the vocabulary and notes files.
WORK_DIR=~/.new-words/

TEMP1=''
TEMP2=''
ORIGINAL_TEXT=''

# Remove every temporary file of this run.  "${TEMP1}-full" is the full word
# list and "${TEMP1}-full2" the scratch copy that add_stat derives from it.
cleanup_temp_files()
{
    local path
    for path in "$TEMP1" "$TEMP2" \
                "${TEMP1:+${TEMP1}-full}" "${TEMP1:+${TEMP1}-full2}" \
                "$ORIGINAL_TEXT"
    do
        if [[ -n "$path" ]]
        then
            rm -f -- "$path"
        fi
    done
}
# Installed before the first mktemp so that early exits (unknown option,
# failing mktemp, ...) do not leave files behind in /tmp.
trap cleanup_temp_files EXIT

TEMP1=$(mktemp /tmp/new-words-XXXXXXXXXX-temp1) || exit 1
TEMP2=$(mktemp /tmp/new-words-XXXXXXXXXX-temp2) || exit 1
# Assignment and export are separate so a failing mktemp is not masked.
ORIGINAL_TEXT=$(mktemp /tmp/new-words-XXXXXXXXXX-orig) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}
| igor@0 | 43 | 
# language detection

# Work out the default language of the text.
#   $1    - the name the program was started as (normally "$0")
#   $2... - the command-line arguments
# Output: the language code on stdout.
# Rules, in increasing priority:
#   1. "en";
#   2. the two-letter subdomain of a Wikipedia article URL found among the
#      arguments (http and https; the last such argument wins);
#   3. the part of the program name before the first "-", unless it is "new"
#      (so a link named "de-words" selects "de").
detect_language()
{
    local program="${1##*/}"
    shift
    local prefix="${program%%-*}"
    local language=en
    local arg

    for arg in "$@"
    do
        if [[ "$arg" =~ https?://([a-z][a-z])\.wikipedia\.org/wiki/ ]]
        then
            language="${BASH_REMATCH[1]}"
        fi
    done
    [[ "$prefix" == "new" ]] || language="$prefix"
    printf '%s\n' "$language"
}

# my_name is kept as a global for any code that still reads it.
my_name="${0##*/}"
my_name="${my_name%%-*}"
LANGUAGE=$(detect_language "$0" "$@")
| igor@24 | 56 | 
#----------------------------------------------------
# command line options processing

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO

# Parse the option switches given as arguments and store the result in the
# global settings above (and in LANGUAGE for -l).
# On return OPTIND is the index of the first non-option argument.
# -h prints the help and exits 0 wherever it appears; an unknown switch or a
# missing option argument prints the help and exits 1.
parse_options()
{
    local opt
    OPTIND=1
    while getopts 'hl:skmnp:23' opt
    do
        case "$opt" in
          h)  show_usage
              exit 0;;
          s)  STAT_ONLY=YES;;
          k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
          l)  LANGUAGE="$OPTARG";;
          m)  DONT_ADD_MARKS=YES;;
          n)  NON_INTERACTIVE_MODE=YES;;
          p)  PART_TO_PROCESS="$OPTARG";;
          2)  GROUP_WORDS_BY_TWO=YES;;
          3)  GROUP_WORDS_BY_THREE=YES;;
          *)  # unknown flag or missing option argument
              show_usage
              exit 1;;
        esac
    done
}

parse_options "$@"
shift $((OPTIND - 1))

# Legacy form: "-l lang" found after the regular options (e.g. after "--").
if [[ "${1-}" == "-l" ]]
then
    LANGUAGE="${2-}"
    shift 2
fi

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
| igor@0 | 93 | 
| igor@21 | 94 #---------------------------------------------------- | 
| igor@21 | 95 | 
# Turn the text on stdin into a frequency list of words that are not in the
# vocabulary.
#   $1 - file that receives every accepted word, one per line, before the
#        vocabulary filter is applied
# Output: "count word" lines in uniq -c format, most frequent first.
get_words()
{
    local all_words_file="$1"

    # Apostrophes travel through the perl stage as __APOSTROPHE__ and are
    # restored afterwards.  The perl stage replaces the listed punctuation
    # with spaces and prints only lines made solely of letters, apostrophes,
    # underscores and hyphens.
    # NOTE(review): a token that still contains a space after the punctuation
    # replacement does not pass that filter - confirm this is intended.
    tr ' ' '\n' |
        sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" |
        perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/                        /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/' |
        sed "s/__APOSTROPHE__/'/g" |
        tr ' ' '\n' |
        tee "$all_words_file" |
        grep_v_english_perl |
        sort |
        uniq -c |
        awk '$2 != ""' |
        sort -rn
}
| igor@0 | 107 | 
# Prepend statistics to the unknown-words list read from stdin.
#   $1 - file with every word of the text (one per line); "${1}2" is used as
#        a scratch copy of stdin and removed before returning
# Globals read: STAT_ONLY, LANGUAGE, ORIGINAL_TEXT
# Output:
#   STAT_ONLY=YES - a header line and one line of figures (known/unknown
#                   percentage, known and total word counts, words per
#                   sentence, unknown words per sentence times ten);
#   otherwise     - "# LANG, PERCENT, <KNOWN/TOTAL>", then the list itself
#                   with "# N" marker lines inserted where the cumulative
#                   share of covered words reaches N percent.
# An empty text yields "# LANG, 0, <0/0>" instead of division-by-zero errors,
# and a text without any "." is counted as one sentence.
add_stat()
{
    local before="$1"
    local after="${before}2"
    local total total_unknown total_known percentage sentences

    cat > "$after"

    total=$(( $(wc -w < "$before") ))
    # "s + 0" prints 0 (not an empty string) for an empty list.
    total_unknown=$(awk '{ s += $1 } END { print s + 0 }' "$after")
    total_known=$(( total - total_unknown ))
    if (( total > 0 ))
    then
        # bc -l gives 20 decimals; sed keeps exactly one of them.
        percentage=$(echo "100*($total-$total_unknown)/$total" \
            | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        percentage=0
    fi
    # A sentence is approximated by a "." in the original text.
    sentences=$(( $(tr -cd '.' < "$ORIGINAL_TEXT" | wc -c) ))
    (( sentences > 0 )) || sentences=1

    if [[ "$STAT_ONLY" == "YES" ]]
    then
        echo "LANG  KNOWN%  UNKNOWN%  KNOWN     TOTAL     WPS  UWPS*10"
        printf '%s    %s    %s      %s    %s    %s   %s \n' \
            "$LANGUAGE" "$percentage" "$(echo "100-$percentage" | bc -l)" \
            "$total_known" "$total" \
            "$(( total / sentences ))" "$(( 10 * total_unknown / sentences ))"
        rm -f -- "$after"
        return 0
    fi

    echo "# $LANGUAGE, $percentage, <$total_known/$total>"

    if (( total > 0 ))
    then
        # Markers go up in steps of 5 percent below 90 and in steps of 1 from
        # 90 on; at most one marker is printed after each word line.
        perl -e '
my $total = shift @ARGV;
my $total_known = shift @ARGV;
my $seen = 0;
my $mark_line = int($total_known * 100 / $total / 5) * 5;
if ($mark_line >= 90) {
    $mark_line = int($total_known * 100 / $total) + 1;
} else {
    $mark_line += 5;
}
while (<>) {
    print;
    /^\s*([0-9]*)\s*/;
    $seen += $1;
    if (($total_known + $seen) * 100 / $total >= $mark_line) {
        print "# $mark_line\n";
        if ($mark_line >= 90) { $mark_line += 1; } else { $mark_line += 5; }
    }
}
' "$total" "$total_known" "$after"
    fi
    rm -f -- "$after"
}
| igor@3 | 155 | 
# Copy stdin to stdout and, when -2 and/or -3 was requested, append the two-
# and three-word sequences of the original text, one per line, with the words
# joined by "_".
# Globals read: GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE, ORIGINAL_TEXT
# The sequences are built from the file $ORIGINAL_TEXT, not from stdin.
# Every sequence of the text is produced, including the ones at its very end;
# digits (0-9) and the punctuation !?;,:#". act as plain word separators.
two_and_three_words()
{
    # The text itself always passes through unchanged.
    cat

    if [[ "$GROUP_WORDS_BY_THREE" == NO && "$GROUP_WORDS_BY_TWO" == NO ]]
    then
        return 0
    fi

    GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
    GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
        perl -e '
local $/;
my $text = <>;
$text = "" unless defined $text;
$text =~ s@[!?;,:#0-9".]@ @g;
$text =~ s@\s+@ @g;
$text =~ s@^\s+@@;
my @words = split /\s+/, $text;
my $want_three = ($ENV{GROUP_WORDS_BY_THREE} || "") eq "YES";
my $want_two   = ($ENV{GROUP_WORDS_BY_TWO}   || "") eq "YES";
for my $i (0 .. $#words) {
    my ($first, $second, $third) = @words[$i, $i + 1, $i + 2];
    if ($want_three and defined $second and defined $third) {
        print "${first}_${second}_${third}\n";
    }
    if ($want_two and defined $second) {
        print "${first}_${second}\n";
    }
}
' "$ORIGINAL_TEXT"
}
| igor@26 | 188 | 
# Filter stdin (one word per line): drop every line that equals a word of the
# vocabulary file.
# Globals read: VOCABULARY (the file is created empty when it does not exist)
# As before, apostrophes are removed from the vocabulary entries and entries
# are split on blanks before matching.  The entries are matched as fixed
# strings against whole lines; the vocabulary content is never evaluated by
# the shell, so an entry such as "$(cmd)" is just a word.
# Returns 0 also when every input line was filtered out.
grep_v_english()
{
    [[ -e "$VOCABULARY" ]] || touch "$VOCABULARY"

    local status=0
    grep -Fxv -f <(tr -d "'" < "$VOCABULARY" | tr -s ' \t' '\n' | grep -v '^$') \
        || status=$?
    # grep exits with 1 when it selected no line; that is not an error here.
    (( status <= 1 ))
}
| igor@0 | 194 | 
# Filter stdin (one word per line): print only the lines that are not listed
# in the vocabulary file.  The comparison is exact and case-sensitive.
# Globals read: VOCABULARY (the file is created empty when it does not exist)
# The vocabulary is opened with the three-argument form of open, so its name
# is always taken as a plain file name, whatever characters it contains.
grep_v_english_perl()
{
    [[ -e "$VOCABULARY" ]] || touch "$VOCABULARY"

    VOCABULARY="$VOCABULARY" perl -e '
my $path = $ENV{VOCABULARY};
open(my $voc, "<", $path)
    or die "Cannot open vocabulary $path: $!\n";
my %known;
while (my $word = <$voc>) {
    chomp $word;
    $known{$word} = 1;
}
close($voc);
while (my $line = <STDIN>) {
    chomp $line;
    print "$line\n" unless exists $known{$line};
}
'
}
| igor@0 | 216 | 
# Reorder the "count word" list on stdin so that related words stay together.
# Globals read: LANGUAGE, VOCABULARY, NEED_TO_USE_VOCABULARY_WHEN_SORT
#
# Every word is reduced to a stem: for LANGUAGE "en" or "de" it is lowercased
# and a fixed list of suffixes is stripped, one after another in list order;
# for any other language the word is its own stem.  Words with the same stem
# form a group whose weight is the sum of its counts.  With
# NEED_TO_USE_VOCABULARY_WHEN_SORT=YES the weight of a group is doubled when
# the stem of some vocabulary word equals the group stem.
#
# Output: the input lines, unchanged, ordered by
#   1. group weight, highest first;
#   2. stem, in descending string order;
#   3. count, highest first.
#
# The settings are handed to perl for this one call only (no global export).
# Stems are computed once per distinct word instead of on every comparison.
group_words()
{
    [[ -e "$VOCABULARY" ]] || touch "$VOCABULARY"

    VOCABULARY="$VOCABULARY" \
    NEED_TO_USE_VOCABULARY_WHEN_SORT="$NEED_TO_USE_VOCABULARY_WHEN_SORT" \
    LANGUAGE="$LANGUAGE" \
        perl -e '
# Suffixes are removed strictly in this order, each at most once, so the
# order is part of the algorithm ("lich" is listed twice on purpose).
my %suffixes_for = (
    en => [qw(
        s
        ation ness ship ally ance ity ment
        ed en er est ing
        ism ist ful able ably
        ify fy ly
        ise ize
        e
    )],
    de => [qw(
        heit keit tum ung nis schaft ist
        en er
        lich ig
        al isch
        ell haft
        bar sam lich
    )],
);
my $language = defined $ENV{LANGUAGE} ? $ENV{LANGUAGE} : "";
my $suffixes = $suffixes_for{$language};

my %stem_of;

# Return the stem of a word (memoized).
sub normalize {
    my ($word) = @_;
    $word = "" unless defined $word;
    unless (exists $stem_of{$word}) {
        my $stem = $word;
        if ($suffixes) {
            $stem = lc $stem;
            for my $suffix (@$suffixes) {
                $stem =~ s/$suffix$//;
            }
        }
        $stem_of{$word} = $stem;
    }
    return $stem_of{$word};
}

# Read the list: remember each line with its stem and count, sum the weights.
my (@records, %group_weight);
while (my $line = <STDIN>) {
    chomp $line;
    (my $trimmed = $line) =~ s/^\s*//;
    my ($count, $word) = split /\s+/, $trimmed, 2;
    $count = 0 unless defined $count;
    my $stem = normalize($word);
    $group_weight{$stem} += $count;
    push @records, [$line, $stem, $count];
}

# Optionally promote the groups that are related to already known words.
my $use_vocabulary = defined $ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT}
    ? $ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} : "";
if ($use_vocabulary eq "YES") {
    open(my $voc, "<", $ENV{VOCABULARY})
        or die "Cannot open vocabulary $ENV{VOCABULARY}: $!\n";
    my %known_stem;
    while (my $known = <$voc>) {
        chomp $known;
        $known_stem{ normalize($known) } = 1;
    }
    close($voc);
    for my $stem (keys %group_weight) {
        $group_weight{$stem} *= 2 if $known_stem{$stem};
    }
}

my @sorted = sort {
       $group_weight{ $b->[1] } <=> $group_weight{ $a->[1] }
    or $b->[1] cmp $a->[1]
    or $b->[2] <=> $a->[2]
} @records;

print "$_->[0]\n" for @sorted;
'
}
| igor@11 | 364 | 
# Print the text of a web page with the URLs removed.
#   $1 - address of the page, handed to "lynx -dump"
# Every http:// and https:// URL on a line is deleted, so that addresses do
# not end up in the word list.
text_from_url()
{
    # LC_ALL=C keeps the a-z/A-Z ranges limited to ASCII letters.
    # "[]" right after the opening bracket puts a literal "]" into the set.
    lynx -dump "$1" \
        | LC_ALL=C sed -E 's@https?://[][a-zA-Z0-9&_.:/%?=,#+()~-]*@@g'
}
| igor@0 | 369 | 
# Append the stored notes (marks) to the word list, in place.
#   $1 - the word list file ("count word" lines; lines starting with "#" are
#        comments and are never changed)
# Globals read: NOTES_FILE - lines of the form "word mark..." (the file is
#               created empty when it does not exist)
# A line whose word has a note is padded with spaces to 30 columns and the
# note is appended; there is always at least one space before the note, so
# long lines stay parseable.  A line that does not look like "count word" is
# copied unchanged.
add_marks()
{
    [[ -e "$NOTES_FILE" ]] || touch "$NOTES_FILE"

    NOTES_FILE="$NOTES_FILE" perl -e '
my ($file) = @ARGV;

my %mark_for;
if (open(my $notes, "<", $ENV{NOTES_FILE})) {
    while (my $note = <$notes>) {
        chomp $note;
        $note =~ s/^\s+//;
        my ($word, $mark) = split /\s+/, $note, 2;
        $mark_for{$word} = $mark if defined $word;
    }
    close($notes);
}

my @lines;
if (open(my $in, "<", $file)) {
    @lines = <$in>;
    close($in);
} else {
    warn "add_marks: cannot read $file: $!\n";
    exit 0;
}

if (open(my $out, ">", $file)) {
    for my $line (@lines) {
        # The capturing match comes last, so $1 always belongs to this line.
        if ($line !~ /^#/
            and $line =~ /\s+\S+\s+(\S+)/
            and defined $mark_for{$1}) {
            my $mark = $mark_for{$1};
            chomp $line;
            my $padding = 30 - length($line);
            $padding = 1 if $padding < 1;
            print {$out} $line, " " x $padding, $mark, "\n";
        } else {
            print {$out} $line;
        }
    }
    close($out);
} else {
    warn "add_marks: cannot write $file: $!\n";
}
' "$1"
}
| igor@2 | 411 | 
# Strip the notes (marks) from the word list, in place, and save them.
#   $1 - the word list file as left by the editor
# Globals read: NOTES_FILE (created empty when it does not exist),
#               DONT_ADD_MARKS
# Every non-comment line of the form "count word note..." is cut back to
# "count word" and the note is remembered for the word.  Unless
# DONT_ADD_MARKS is YES the notes file is then rewritten:
#   - a note for a word that has no mark in the list is kept;
#   - a note that equals the mark in the list is kept where it is;
#   - a note whose mark was changed is dropped, and the new marks are
#     appended, sorted by word, as "word", padding to 30 columns (at least
#     one space), mark.
remove_marks()
{
    [[ -e "$NOTES_FILE" ]] || touch "$NOTES_FILE"

    NOTES_FILE="$NOTES_FILE" DONT_ADD_MARKS="$DONT_ADD_MARKS" perl -e '
my ($file) = @ARGV;

my %mark_for;
if (open(my $in, "<", $file)) {
    my @lines = <$in>;
    close($in);

    if (open(my $out, ">", $file)) {
        for my $line (@lines) {
            chomp $line;
            if ($line !~ /^#/
                and $line =~ /(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                $mark_for{$4} = $6;
                print {$out} "$1$2$3$4\n";
            } else {
                print {$out} "$line\n";
            }
        }
        close($out);
    }
}

my $dont_save = defined $ENV{DONT_ADD_MARKS} ? $ENV{DONT_ADD_MARKS} : "";
if ($dont_save ne "YES" and open(my $notes_in, "<", $ENV{NOTES_FILE})) {
    my @notes = <$notes_in>;
    close($notes_in);

    if (open(my $notes_out, ">", $ENV{NOTES_FILE})) {
        for my $note (@notes) {
            chomp $note;
            $note =~ s/^\s+//;
            my ($word, $mark) = split /\s+/, $note, 2;
            $word = "" unless defined $word;
            $mark = "" unless defined $mark;
            if (!exists $mark_for{$word}) {
                print {$notes_out} "$note\n";
            } elsif ($mark_for{$word} eq $mark) {
                print {$notes_out} "$note\n";
                delete $mark_for{$word};
            }
        }
        for my $word (sort keys %mark_for) {
            my $padding = 30 - length($word);
            $padding = 1 if $padding < 1;
            print {$notes_out} $word, " " x $padding, $mark_for{$word}, "\n";
        }
        close($notes_out);
    }
}
' "$1"
}
| igor@2 | 465 | 
# Copy only a part of stdin to stdout.
#   $1 - interval "START-STOP/TOTAL" or "START+COUNT/TOTAL"; when it is
#        missing or empty, everything is copied
# The input is treated as TOTAL equal pages; the lines from the beginning of
# page START up to (not including) the beginning of page STOP are printed,
# pages being counted from 0.  "0-TOTAL/TOTAL" therefore selects the whole
# input, the last line included.
# An interval without a STOP selects nothing.  An interval without a positive
# TOTAL is rejected with a message on stderr and a non-zero exit status.
part()
{
    perl -e '
my @lines = <STDIN>;
my $interval = shift @ARGV;

if (!$interval) {
    print @lines;
    exit 0;
}

my ($start, $total) = ($interval =~ m@(.*)/(.*)@) ? ($1, $2) : ($interval, 0);
my $stop;
if ($start =~ m@(.*)-(.*)@) {
    ($start, $stop) = ($1, $2);
}
if ($start =~ m@(.*)\+(.*)@) {
    my ($from, $length) = ($1, $2);
    ($start, $stop) = ($from, $from + $length);
}
$start = 0 unless $start;
$stop  = 0 unless $stop;

{
    no warnings;
    die "part: the interval must look like START-STOP/TOTAL with TOTAL > 0\n"
        unless $total > 0;
}

# Multiply before dividing so that exact page borders stay exact.
my $count = scalar @lines;
my $first = int($count * $start / $total);
my $last  = int($count * $stop / $total);
$first = 0 if $first < 0;
$last = $count if $last > $count;

print @lines[$first .. $last - 1] if $last > $first;
' "${1-}"
}
| igor@25 | 508 | 
# Main flow: read the text, build the list of unknown words, let the user
# delete the words he knows, and add those words to the vocabulary.

mkdir -p "$WORK_DIR"
oldpwd="$PWD"
# Vocabulary and notes are addressed relative to the work directory.
cd "$WORK_DIR" || { echo "new-words: cannot enter $WORK_DIR" >&2; exit 1; }

# Source of the text: a URL (http or https), a file (relative names are
# resolved against the directory the program was started in) or stdin.
if [[ "${1-}" == *http:* || "${1-}" == *https:* ]]
then
    text_from_url "$1"
elif [[ "$#" != 0 ]]
then
    if [[ "$1" == /* ]]
    then
        cat "$1"
    else
        cat "$oldpwd/$1"
    fi
else
    cat
fi \
   | part "$PART_TO_PROCESS" \
   | tee "$ORIGINAL_TEXT" \
   | two_and_three_words \
   | get_words "${TEMP1}-full" \
   | group_words \
   | add_stat "${TEMP1}-full" \
   | tee "$TEMP1" > "$TEMP2"

# TEMP1 keeps the generated list, TEMP2 is the copy the user edits.
if [[ "$STAT_ONLY" == "YES" || "$NON_INTERACTIVE_MODE" == "YES" ]]
then
    cat "$TEMP1"
else
    [[ "$DONT_ADD_MARKS" == "YES" ]] || add_marks "$TEMP2"
    if [[ "$editor" == vim ]]
    then
        # stdin/stdout may be pipes, so vim is attached to the terminal.
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
    else
        # Deliberately unquoted: EDITOR may carry arguments ("emacs -nw").
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"
fi

# Every word of a line that differs between the two lists becomes a known
# word.  Only the "<" and ">" lines of the diff are used; the diff headers,
# the "# ..." marker lines and empty fields are skipped so that they do not
# end up in the vocabulary.
diff "$TEMP1" "$TEMP2" \
    | awk '($1 == "<" || $1 == ">") && $2 != "#" && $3 != "" { print $3 }' \
    | sort -u >> "$VOCABULARY"
rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"