new-words

view new-words.sh @ 25:d1eb7dc37feb

minifix: -p pages
author Igor Chubin <igor@chub.in>
date Mon May 17 14:48:34 2010 +0300 (2010-05-17)
parents 1318aa5898ee
children 4a10c0f4510c
line source
1 #!/bin/sh
# Print the command-line help to standard error.
# Takes no arguments; nothing is written to standard output.
show_usage()
{
    # ">&2" duplicates the already open descriptor instead of opening the
    # /dev/stderr device node, which is not available on every system.
    cat <<'HELP' >&2

USAGE:

    new-words [ -h ] [ -k ] [ -l lang ] [ -m ] [ -n ] [ -p pages ] [ -s ] [ ARG ]

SWITCHES:

    -h          print this screen
    -k          put higher words that are similar to the known words (only for English)
    -l lang     override language settings
    -n          non-interactive mode (don't run vi)
    -m          don't add marks (and don't save marks added by user)
    -p pages    work with specified pages only (pages = start-stop/total )
    -s          show the text statistics (percentage of known words and so on) and exit

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

    de-words URL
    new-words -l de URL

HELP
}
# Handle "-h" before anything else so that asking for help never creates
# temporary files.
if [ "${1-}" = "-h" ]
then
    show_usage
    exit 0
fi

WORK_DIR=~/.new-words/

# Remove every temporary file this script creates under /tmp.
# "${TEMP1}-full" and "${TEMP1}-full2" are the derived names used by the
# word-extraction and statistics steps.
cleanup_temp_files()
{
    if [ -n "$TEMP1" ]
    then
        rm -f -- "$TEMP1" "${TEMP1}-full" "${TEMP1}-full2"
    fi
    if [ -n "$TEMP2" ]
    then
        rm -f -- "$TEMP2"
    fi
    if [ -n "$ORIGINAL_TEXT" ]
    then
        rm -f -- "$ORIGINAL_TEXT"
    fi
    return 0
}

TEMP1=
TEMP2=
ORIGINAL_TEXT=
# Clean up on every exit path; the signal traps turn an interruption into a
# normal exit so that the EXIT trap runs in every shell.
trap cleanup_temp_files EXIT
trap 'exit 1' HUP INT TERM

# The random part is placed at the end of each template: some mktemp
# implementations only substitute trailing X's, which would otherwise yield
# a fixed, predictable file name.
TEMP1=$(mktemp /tmp/new-words-temp1-XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2-XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig-XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

editor=${EDITOR:-vim}
# language detection

# detect_language PROGRAM_PATH [ARG...]
#
# Print the language code to use:
#   * "en" by default;
#   * the host label of the last argument that is a Wikipedia article URL
#     (http or https), e.g. "de" for http://de.wikipedia.org/wiki/Berlin
#     and "en" for https://en.m.wikipedia.org/wiki/Berlin;
#   * the part of the program's base name before the first "-" whenever that
#     part is not "new" (a link called de-words selects "de"); this choice
#     takes precedence over a URL.
# Only parameter expansion is used, so arguments containing blanks or glob
# characters are handled safely and no external process is started.
detect_language()
{
    dl_language=en
    dl_program=${1##*/}
    dl_program=${dl_program%%-*}
    if [ "$#" -gt 0 ]
    then
        shift
    fi

    for dl_arg
    do
        case "$dl_arg" in
            *http://*.wikipedia.org/wiki/*|*https://*.wikipedia.org/wiki/*)
                dl_host=${dl_arg#*://}
                dl_host=${dl_host%%.wikipedia.org/wiki/*}
                case "$dl_host" in
                    ''|*/*)
                        # "wikipedia.org/wiki/" only appears later in the
                        # path or query string: not a Wikipedia host.
                        ;;
                    *)
                        # Keep the first label only ("en.m" -> "en").
                        dl_language=${dl_host%%.*}
                        ;;
                esac
                ;;
        esac
    done

    [ "$dl_program" = "new" ] || dl_language=$dl_program
    printf '%s\n' "$dl_language"
}

my_name=${0##*/}
my_name=${my_name%%-*}
LANGUAGE=$(detect_language "$0" "$@")
#----------------------------------------------------
# command line options processing

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''

# parse_options ARG...
#
# Parse the leading switches of the given argument list and set the option
# variables above (and LANGUAGE for -l).  The number of arguments consumed by
# the switches is stored in PARSED_OPTION_ARGS so that the caller can shift
# them away.  -h prints the help and exits 0; an unknown switch or a missing
# switch argument prints the help and exits 1.
parse_options()
{
    OPTIND=1
    while getopts hl:skmnp: opt
    do
        case "$opt" in
            h)  show_usage
                exit 0;;
            s)  STAT_ONLY=YES;;
            k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
            l)  LANGUAGE="$OPTARG";;
            m)  DONT_ADD_MARKS=YES;;
            n)  NON_INTERACTIVE_MODE=YES;;
            p)  PART_TO_PROCESS="$OPTARG";;
            \?) # unknown flag or missing option argument
                show_usage
                exit 1;;
        esac
    done
    # Computed here, right after getopts finished, so the value does not
    # depend on how a particular shell scopes OPTIND inside functions.
    PARSED_OPTION_ARGS=$((OPTIND - 1))
}

parse_options "$@"
shift "$PARSED_OPTION_ARGS"

# A "-l lang" pair that getopts did not consume is still honoured
# (NOTE(review): presumably only reachable when it follows "--"; confirm).
# A lone "-l" without a value is rejected instead of failing inside "shift 2".
if [ "${1-}" = "-l" ]
then
    if [ "$#" -lt 2 ]
    then
        show_usage
        exit 1
    fi
    LANGUAGE="$2"
    shift 2
fi

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
#----------------------------------------------------

# get_words WORD_LIST_FILE
#
# Read text on standard input and print the frequency table of the words that
# are not in the vocabulary: "count word" lines, highest count first.
#
# Stages of the pipeline, in order:
#   1. put every blank-separated token on its own line and turn "--" into a
#      blank;
#   2. rewrite each apostrophe to the token __APOSTROPHE__ (underscores are
#      not accepted by the filter in step 5, so such tokens do not pass it);
#   3. map the dash-like and punctuation characters listed in the two tr
#      sets to "-" or to a blank;
#   4. split on blanks once more;
#   5. keep only lines made entirely of letters, apostrophes and hyphens;
#   6. save that complete word list to WORD_LIST_FILE;
#   7. drop the vocabulary words, count the rest, discard entries without a
#      word and sort by count.
get_words()
{
    tr ' ' '\n' \
        | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
        | tr '—·-' '-----' \
        | tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' ' ' \
        | tr ' ' '\n' \
        | grep -x "[[:alpha:]'-]*" \
        | tee "$1" \
        | grep_v_english_perl \
        | sort \
        | uniq -c \
        | awk '$2 != ""' \
        | sort -rn
}
# add_stat WORD_LIST_FILE
#
# Read the frequency table of unknown words ("count word" lines) on standard
# input and add statistics to it.
#
#   WORD_LIST_FILE  file holding every word of the text; its word count is the
#                   total number of words.
# Globals read: ORIGINAL_TEXT (file whose full stops are counted as
#               sentences), STAT_ONLY, LANGUAGE.
# Globals written: before, after, total, total_unknown, total_known,
#               percentage, sentences (same names as before this rewrite).
#
# With STAT_ONLY=YES only a header line and one line of figures are printed.
# Otherwise a "# lang, known%, <known/total>" line is printed, followed by
# the table with "# N" marker lines inserted where the cumulative share of
# covered words reaches N percent.
#
# The table is buffered in "${WORD_LIST_FILE}2", which is removed on return.
# Empty input and text without any full stop no longer cause division by
# zero: a text without a full stop counts as one sentence.
add_stat()
{
    before="$1"
    after="${before}2"
    cat > "$after"

    total=$(wc -w < "$before" | awk '{print $1 + 0}')
    total=${total:-0}
    # "s + 0" prints 0 rather than an empty string for an empty table.
    total_unknown=$(awk '{s += $1} END {print s + 0}' "$after")
    total_known=$((total - total_unknown))

    if [ "$total" -gt 0 ]
    then
        # bc -l prints many decimals; sed cuts them down to one (no rounding).
        percentage=$(echo "100*($total-$total_unknown)/$total" | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        percentage=0
    fi

    sentences=0
    if [ -r "$ORIGINAL_TEXT" ]
    then
        sentences=$(tr -cd '.' < "$ORIGINAL_TEXT" | wc -c | awk '{print $1 + 0}')
    fi
    if [ "${sentences:-0}" -le 0 ]
    then
        sentences=1
    fi

    if [ "$STAT_ONLY" = "YES" ]
    then
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $(echo "100-$percentage" | bc -l) $total_known $total $((total / sentences)) $((10 * total_unknown / sentences)) "
        rm -f -- "$after"
        return 0
    fi

    echo "# $LANGUAGE, $percentage, <$total_known/$total>"

    if [ "$total" -gt 0 ]
    then
        # The program is given with -e, so no temporary script file is
        # needed; it must not contain an apostrophe.
        perl -e '
            my $total = shift(@ARGV);
            my $total_known = shift(@ARGV);
            my $s = 0;
            # First marker: next multiple of 5 above the known share; from 90
            # percent upwards markers advance in steps of 1.
            my $mark_line = int($total_known*100/$total/5)*5;
            if ($mark_line >= 90) {
                $mark_line = int($total_known*100/$total) + 1;
            } else {
                $mark_line += 5;
            }
            while (<>) {
                print;
                /^\s*([0-9]*)\s*/;
                $s += $1;
                if (($total_known + $s)*100/$total >= $mark_line) {
                    print "# $mark_line\n";
                    if ($mark_line >= 90) { $mark_line += 1; } else { $mark_line += 5; }
                }
            }
        ' "$total" "$total_known" "$after"
    fi
    rm -f -- "$after"
}
# grep_v_english
#
# Filter standard input: print only the lines that are not words of the
# vocabulary file named by $VOCABULARY (created empty when missing).
# Vocabulary entries are split on blanks and apostrophes are removed from
# them before the comparison.
#
# Entries are compared as plain strings in a single awk process.  Nothing
# read from the vocabulary is evaluated by the shell or interpreted as a
# regular expression, so entries such as "a.c" or "$(command)" are harmless.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    # The file name travels through the environment so that awk does not
    # apply escape processing to it (as it would for a -v assignment).
    VOCABULARY_FILE="$VOCABULARY" awk -v apostrophe="'" '
        BEGIN {
            file = ENVIRON["VOCABULARY_FILE"]
            while ((getline entry < file) > 0) {
                gsub(apostrophe, "", entry)
                count = split(entry, words)
                for (i = 1; i <= count; i++)
                    known[words[i]] = 1
            }
            close(file)
        }
        !($0 in known)
    '
}
# grep_v_english_perl
#
# Filter standard input: print every line that is not, as a whole, a line of
# the vocabulary file named by $VOCABULARY.  The file is created empty when
# it does not exist, and VOCABULARY is exported for the Perl process.
#
# The Perl program is passed with -e instead of being written to a temporary
# file first.  This removes the per-call temp file and, more importantly,
# the failure mode where an empty script name would make perl read its
# program from standard input, i.e. from the text being filtered.
# Because the program is one single-quoted shell word it must not contain an
# apostrophe; \x27 stands in for it inside the error message.
grep_v_english_perl()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    export VOCABULARY
    perl -e '
        use strict;
        my %voc;
        open(my $vocabulary, "<", $ENV{VOCABULARY})
            or die "Can\x27t open VOCABULARY";
        while (my $word = <$vocabulary>) {
            chomp $word;
            $voc{$word} = 1;
        }
        close($vocabulary);
        while (my $line = <STDIN>) {
            chomp $line;
            print "$line\n" unless defined $voc{$line};
        }
    '
}
# group_words
#
# Reorder a frequency table ("count word" lines on standard input) so that
# words sharing a stem are adjacent and the heaviest groups come first.
#
# Ordering, all descending: total count of the stem group, then the stem
# itself, then the count of the individual line.  Stems are computed by
# crude suffix stripping for LANGUAGE "en" and "de"; for any other language
# the word itself is the stem.  With NEED_TO_USE_VOCABULARY_WHEN_SORT=YES the
# weight of a group whose stem also occurs in the vocabulary is doubled.
#
# Globals read and exported: VOCABULARY (created empty when missing),
# NEED_TO_USE_VOCABULARY_WHEN_SORT, LANGUAGE.
#
# Changes compared with the previous implementation (output order is the
# same):
#   * the Perl program is passed with -e, so there is no temporary script
#     file and no risk of perl reading the input text as its program;
#   * every stem is computed once per line instead of several times per
#     sort comparison;
#   * code that could never run was dropped (the unused String::Similarity
#     helper and the German prefix list that was emptied before use).
# The program is one single-quoted shell word and therefore must not contain
# an apostrophe (\x27 is used in the error message).
group_words()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE

    perl -e '
        use strict;

        my $language = defined $ENV{LANGUAGE} ? $ENV{LANGUAGE} : "";
        my $use_vocabulary = defined $ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT}
            ? $ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} : "";

        # Strip common German suffixes.  Prefix stripping stays disabled.
        sub normalize_german {
            my $word = lc(shift);
            for ($word) {
                s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//; s/schaft$//; s/ist$//;
                s/en$//; s/er$//;
                s/lich$//; s/ig$//;
                s/al$//; s/isch$//;
                s/ell$//; s/haft$//;
                s/bar$//; s/sam$//; s/lich$//;
            }
            return $word;
        }

        # Strip common English suffixes.
        sub normalize_english {
            my $word = lc(shift);
            for ($word) {
                s/s$//;
                s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//; s/ity$//; s/ment$//;
                s/ed$//;
                s/en$//;
                s/er$//;
                s/est$//;
                s/ing$//;
                s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
                s/ify$//; s/fy$//; s/ly$//;
                s/ise$//; s/ize$//;
                s/e$//;
            }
            return $word;
        }

        # Stem of a word for the current language.
        sub normalize {
            my $word = shift;
            $word = "" unless defined $word;
            return normalize_english($word) if $language eq "en";
            return normalize_german($word)  if $language eq "de";
            return $word;
        }

        my %vocabulary;
        open(my $voc, "<", $ENV{VOCABULARY})
            or die "Can\x27t open VOCABULARY";
        while (my $entry = <$voc>) {
            chomp $entry;
            $vocabulary{normalize($entry)} = 1;
        }
        close($voc);

        # One record per input line: [ original line, count, stem ].
        my (@records, %group_weight);
        while (my $line = <STDIN>) {
            chomp $line;
            (my $trimmed = $line) =~ s/^\s*//;
            my ($count, $word) = split /\s+/, $trimmed, 2;
            $count = 0 unless defined $count;
            my $stem = normalize($word);
            $group_weight{$stem} += $count;
            push @records, [ $line, $count, $stem ];
        }

        if ($use_vocabulary eq "YES") {
            for my $stem (keys %group_weight) {
                $group_weight{$stem} *= 2 if defined $vocabulary{$stem};
            }
        }

        my @sorted = sort {
               $group_weight{ $b->[2] } <=> $group_weight{ $a->[2] }
            || $b->[2] cmp $a->[2]
            || $b->[1] <=> $a->[1]
        } @records;

        print "$_->[0]\n" for @sorted;
    '
}
# text_from_url URL
#
# Print the page at URL as plain text (rendered by "lynx -dump") with the
# URLs removed from it, so that parts of addresses are not counted as words.
#
# Every http:// and https:// URL on a line is removed; previously only the
# first http:// URL of each line was, and https:// URLs were left in place.
# The bracket expression lists the characters accepted inside a URL; "]"
# comes first so that it is taken literally.
text_from_url()
{
    lynx -dump "$1" \
        | sed 's@https\{0,1\}://[][a-zA-Z&_.:/0-9%?=,#+()~-]*@@g'
}
# add_marks FILE
#
# Append the saved notes to the word lines of FILE, rewriting it in place.
#
# Notes come from the file named by $NOTES_FILE (created empty when missing,
# exported for Perl); each of its lines is "word<blanks>note".  A line of
# FILE that does not start with "#" and whose second field is a word with a
# note gets the note appended, padded with blanks to column 30.
#
# Fixes compared with the previous implementation:
#   * a line of 30 or more characters got its note glued directly to the
#     word; at least one blank is now always inserted;
#   * a line that did not look like "count word" reused the word matched on
#     an earlier line; such lines are now left untouched.
# The Perl program is passed with -e (no temporary script file); being one
# single-quoted shell word it must not contain an apostrophe.
add_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl -e '
        use strict;
        my $file = $ARGV[0];
        my %dict;

        if (open(my $notes, "<", $ENV{NOTES_FILE})) {
            while (my $entry = <$notes>) {
                chomp $entry;
                $entry =~ s/^\s+//;
                my ($word, $note) = split /\s+/, $entry, 2;
                next unless defined $word;
                $dict{$word} = $note;
            }
            close($notes);
        }

        if (open(my $in, "<", $file)) {
            my @lines = <$in>;
            close($in);

            if (open(my $out, ">", $file)) {
                for my $line (@lines) {
                    if ($line !~ /^#/
                        and $line =~ m/\s+\S+\s+(\S+)/
                        and defined $dict{$1}) {
                        my $mark = $dict{$1};
                        chomp $line;
                        my $pad = 30 - length($line);
                        $pad = 1 if $pad < 1;
                        print $out $line, " " x $pad, $mark, "\n";
                    }
                    else {
                        print $out $line;
                    }
                }
                close($out);
            }
        }
    ' "$1"
}
# remove_marks FILE
#
# Strip the notes from the word lines of FILE (rewritten in place) and, unless
# DONT_ADD_MARKS is YES, store them in the file named by $NOTES_FILE (created
# empty when missing).  Both variables are exported for Perl.
#
# A word line is "[blanks]count<blanks>word<blanks>note"; lines starting with
# "#" are copied unchanged.  In the notes file, entries for words that were
# not seen or whose note is unchanged keep their line and position; an entry
# whose note changed is dropped and the new note is appended at the end,
# together with the notes of new words, sorted by word.
#
# Fixes compared with the previous implementation:
#   * the notes update called unset(), which is not a Perl function, and its
#     condition "not A || B" negated the whole expression, so unchanged notes
#     were dropped and appended again in random hash order;
#   * lines without leading blanks were parsed one field too far to the
#     right, losing the count;
#   * words of 30 or more characters were glued to their note.
# The Perl program is passed with -e (no temporary script file); being one
# single-quoted shell word it must not contain an apostrophe.
remove_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS
    perl -e '
        use strict;
        my $file = $ARGV[0];
        my %dict;

        if (open(my $in, "<", $file)) {
            my @lines = <$in>;
            close($in);

            if (open(my $out, ">", $file)) {
                for my $line (@lines) {
                    chomp $line;
                    if ($line !~ /^#/
                        and $line =~ m/^(\s*)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                        $dict{$4} = $6;
                        print $out "$1$2$3$4\n";
                    }
                    else {
                        print $out "$line\n";
                    }
                }
                close($out);
            }
        }

        my $dont_add = defined $ENV{DONT_ADD_MARKS} ? $ENV{DONT_ADD_MARKS} : "";
        if ($dont_add ne "YES" and open(my $notes_in, "<", $ENV{NOTES_FILE})) {
            my @notes = <$notes_in>;
            close($notes_in);

            if (open(my $notes_out, ">", $ENV{NOTES_FILE})) {
                for my $entry (@notes) {
                    chomp $entry;
                    $entry =~ s/^\s+//;
                    my ($word, $note) = split /\s+/, $entry, 2;
                    $word = "" unless defined $word;
                    $note = "" unless defined $note;
                    if (not exists $dict{$word}) {
                        print $notes_out "$entry\n";
                    }
                    elsif ($dict{$word} eq $note) {
                        print $notes_out "$entry\n";
                        # Already stored: do not append it again below.
                        delete $dict{$word};
                    }
                    # Otherwise the note changed; the new text is appended.
                }
                for my $word (sort keys %dict) {
                    my $pad = 30 - length($word);
                    $pad = 1 if $pad < 1;
                    print $notes_out $word, " " x $pad, $dict{$word}, "\n";
                }
                close($notes_out);
            }
        }
    ' "$1"
}
# part [PAGES]
#
# Copy standard input to standard output, restricted to the requested part.
#
#   PAGES  empty, missing or "0": everything is copied.
#          Otherwise "start-stop/total", "start+count/total" or
#          "start/total": the text is divided into TOTAL equal pages and
#          pages START (counted from 0) up to but not including STOP are
#          printed.  "start+count" means stop = start + count; a lone
#          "start" selects that single page.
#
# Returns 1 with a message on standard error, and prints nothing, when PAGES
# is malformed or has no positive total.
#
# Fixes compared with the previous implementation:
#   * the page size was derived from the index of the last line instead of
#     the number of lines, so the last line of the text was never printed;
#   * a missing total ended in "Illegal division by zero";
#   * a missing stop printed nothing.
# It is implemented with awk; no temporary script file is written.
part()
{
    part_spec=${1-}
    if [ -z "$part_spec" ] || [ "$part_spec" = "0" ]
    then
        cat
        return
    fi

    # Split "range/total" at the last "/".
    case "$part_spec" in
        */*) part_range=${part_spec%/*}
             part_total=${part_spec##*/} ;;
        *)   part_range=$part_spec
             part_total=0 ;;
    esac

    # Split the range into "start-stop" or "start+count".
    part_stop=
    part_count=
    case "$part_range" in
        *-*) part_start=${part_range%-*}
             part_stop=${part_range##*-} ;;
        *)   part_start=$part_range ;;
    esac
    case "$part_start" in
        *+*) part_count=${part_start##*+}
             part_start=${part_start%+*} ;;
    esac

    for part_number in "$part_start" "$part_total" "${part_stop:-0}" "${part_count:-0}"
    do
        case "$part_number" in
            ''|*[!0-9]*)
                echo "new-words: invalid pages specification: $part_spec" >&2
                return 1 ;;
        esac
    done
    if [ "$part_total" -eq 0 ]
    then
        echo "new-words: pages specification needs a positive total (start-stop/total): $part_spec" >&2
        return 1
    fi

    awk -v start="$part_start" -v stop="$part_stop" \
        -v count="$part_count" -v total="$part_total" '
        { line[NR] = $0 }
        END {
            if (length(count) > 0)
                stop = start + count
            else if (length(stop) == 0)
                stop = start + 1
            # Multiply before dividing so that whole pages land exactly on
            # line boundaries.
            first = int(NR * start / total)
            last = int(NR * stop / total)
            if (last > NR)
                last = NR
            for (i = first; i < last; i++)
                print line[i + 1]
        }
    '
}
# Main flow.  Everything runs inside the work directory, where the
# vocabulary and notes files are kept.
mkdir -p "$WORK_DIR" || exit 1
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Text source: standard input when no argument is left, a web page when the
# argument is an http/https URL, otherwise a file.  A relative file name is
# resolved against the directory the program was started from, because the
# current directory is the work directory by now.
#
# The text is cut down to the requested part, saved for the sentence count,
# reduced to a frequency table of unknown words, grouped by stem and
# annotated with statistics.  The result is stored twice: TEMP1 stays as
# reference, TEMP2 is the copy that gets edited.
{
    if [ "$#" -eq 0 ]
    then
        cat
    else
        case "$1" in
            *http:*|*https://*) text_from_url "$1" ;;
            /*)                 cat "$1" ;;
            *)                  cat "$oldpwd/$1" ;;
        esac
    fi
} \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"

if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
else
    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        # The terminal is attached explicitly because standard input may have
        # been used for the text.
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
    else
        # $editor is left unquoted on purpose so that EDITOR may carry
        # arguments.  (A leftover debugging "echo 2" was removed here.)
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"
fi

# Lines that differ between the reference and the edited copy name the words
# to add to the vocabulary.  Only real diff content lines are used, and
# "#" marker/header lines and lines without a word are skipped, so neither
# empty lines nor marker numbers end up in the vocabulary.
diff "$TEMP1" "$TEMP2" \
    | awk '($1 == "<" || $1 == ">") && $2 != "#" && $3 != "" {print $3}' \
    | sort -u >> "$VOCABULARY"
rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"