new-words

view new-words.sh @ 24:1318aa5898ee

minifix:getopts
author Igor Chubin <igor@chub.in>
date Mon May 17 12:14:44 2010 +0300 (2010-05-17)
parents 4b9d13c78de2
children d1eb7dc37feb
line source
1 #!/bin/sh
# Print the command-line help on standard error.
#
# Takes no arguments and writes nothing to standard output.
# The text goes through the already-open descriptor 2 (">&2") rather than
# through the path /dev/stderr: re-opening that path truncates the target
# when stderr is redirected to a regular file, and the path does not exist
# on every system.
show_usage()
{
    cat <<'HELP' >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-m don't add marks (and don't save marks added by user)
-s show the text statistics (percentage of known words and so on) and exit
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# A leading -h is answered right away, before any scratch file is created.
case "$1" in
    -h)
        show_usage
        exit 0
        ;;
esac
# Directory that holds the vocabulary and notes files.
WORK_DIR=~/.new-words/

TEMP1=
TEMP2=
ORIGINAL_TEXT=

# Remove every scratch file of this run, including the two files that are
# derived from TEMP1 (the full word list and the copy add_stat makes of it).
cleanup_temp_files()
{
    [ -z "$TEMP1" ] || rm -f -- "$TEMP1" "${TEMP1}-full" "${TEMP1}-full2"
    [ -z "$TEMP2" ] || rm -f -- "$TEMP2"
    [ -z "$ORIGINAL_TEXT" ] || rm -f -- "$ORIGINAL_TEXT"
}

# Clean up on every exit path (for example the usage error raised while
# parsing options) and turn the usual termination signals into an exit.
trap cleanup_temp_files EXIT
trap 'exit 1' HUP INT TERM

# The X's are the last characters of each template: only GNU mktemp
# substitutes X's that are followed by a suffix.
TEMP1=$(mktemp /tmp/new-words-temp1-XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2-XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig-XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

editor=${EDITOR:-vim}

# language detection
# Order of precedence within this block (lowest first): the default "en",
# the language prefix of a Wikipedia URL argument, the program name.

LANGUAGE=en

# Program name without directory and without everything from the first "-",
# so that a link called "de-words" yields "de" and "new-words" yields "new".
my_name=${0##*/}
my_name=${my_name%%-*}

for arg
do
    if printf '%s\n' "$arg" | grep -q 'https\{0,1\}://...wikipedia.org/wiki/'
    then
        LANGUAGE=$(printf '%s\n' "$arg" \
            | sed -e 's@https\{0,1\}://@@' -e 's@.wikipedia.*@@')
    fi
done

[ "$my_name" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing
#
# Sets STAT_ONLY, NEED_TO_USE_VOCABULARY_WHEN_SORT, DONT_ADD_MARKS and
# NON_INTERACTIVE_MODE to YES/NO, may override LANGUAGE, and finally derives
# the vocabulary and notes file names from LANGUAGE.

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO

# "l:" and "p:" take an argument (both branches read $OPTARG).
# "h" is listed so that -h also works when it is not the first argument.
while getopts hl:skmnp: opt
do
    case "$opt" in
        h)  show_usage
            exit 0;;
        s)  STAT_ONLY=YES;;
        k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
        l)  LANGUAGE="$OPTARG";;
        m)  DONT_ADD_MARKS=YES;;
        n)  NON_INTERACTIVE_MODE=YES;;
        # PAGES is only stored here; nothing in the visible script reads it.
        p)  PAGES="$OPTARG";;
        \?) # unknown flag
            show_usage
            exit 1;;
    esac
done
shift $((OPTIND - 1))

# A "-l lang" pair that getopts did not consume (for instance after "--")
# is still honoured.
if [ "$1" = "-l" ]
then
    if [ "$#" -lt 2 ]
    then
        show_usage
        exit 1
    fi
    LANGUAGE="$2"
    shift 2
fi

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
87 #----------------------------------------------------
# Turn the text on stdin into a frequency list of not-yet-known words.
#
# $1      file that receives every accepted word, one per line, before the
#         known words are filtered out
# stdout  "count word" lines as printed by uniq -c, highest count first
#
# NOTE(review): apostrophes are rewritten to __APOSTROPHE__ before the
# grep -x filter, whose character class has no underscore, so words with an
# apostrophe appear to be dropped here - confirm that this is intended.
get_words()
{
    tr ' ' '\n' \
    | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
    | tr '—·-' '-----' \
    | tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' ' ' \
    | tr ' ' '\n' \
    | grep -x "[[:alpha:]'-]*" \
    | tee "$1" \
    | grep_v_english_perl \
    | sort \
    | uniq -c \
    | awk '$2 != ""' \
    | sort -rn
}
# Prefix the grouped word list with statistics, or print statistics only.
#
# $1      file with every word of the text (written by get_words)
# stdin   "count word" lines of the unknown words
# stdout  STAT_ONLY=YES: a header line and one line of figures;
#         otherwise: a "# LANG, KNOWN%, <known/total>" line followed by the
#         input lines, with "# N" lines inserted wherever the share of
#         covered words reaches N percent
#
# Reads LANGUAGE, STAT_ONLY and ORIGINAL_TEXT.  The input is buffered in
# "${1}2", which is removed before returning.
#
# Empty input and text without a full stop no longer cause divisions by
# zero: counts default to 0 and a text without "." counts as one sentence.
add_stat()
{
    before="$1"
    after="${before}2"
    cat > "$after"

    total=$(wc -w < "$before" | awk '{n += $1} END {print n + 0}')
    total_unknown=$(awk '{s += $1} END {print s + 0}' "$after")
    total_known=$((total - total_unknown))

    # Share of known words, cut (not rounded) to one decimal place.
    if [ "$total" -gt 0 ]
    then
        percentage=$(echo "100*($total-$total_unknown)/$total" \
            | bc -l | sed 's/\.\(.\).*/.\1/')
    else
        percentage=0
    fi

    # The number of "." characters in the original text stands in for the
    # number of sentences.
    sentences=0
    if [ -r "$ORIGINAL_TEXT" ]
    then
        sentences=$(tr -cd '.' < "$ORIGINAL_TEXT" | wc -c | awk '{print $1 + 0}')
    fi
    [ "$sentences" -gt 0 ] || sentences=1

    if [ "$STAT_ONLY" = "YES" ]
    then
        unknown_percentage=$(echo "(100-$percentage)" | bc -l)
        words_per_sentence=$((total / sentences))
        unknown_per_sentence_x10=$((10 * total_unknown / sentences))
        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
        echo "$LANGUAGE $percentage $unknown_percentage $total_known $total $words_per_sentence $unknown_per_sentence_x10 "
        rm -f -- "$after"
        return 0
    fi

    echo "# $LANGUAGE, $percentage, <$total_known/$total>"

    # With no words at all there is nothing to mark (and nothing to divide by).
    if [ "$total" -gt 0 ]
    then
        perl -e '
            my $total = shift @ARGV;
            my $known = shift @ARGV;
            my $seen  = 0;

            # First threshold: the next multiple of 5 above the known share,
            # or the next whole percent once that share is 90 or more.
            my $mark = int($known * 100 / $total / 5) * 5;
            if ($mark >= 90) {
                $mark = int($known * 100 / $total) + 1;
            }
            else {
                $mark += 5;
            }

            while (my $line = <>) {
                print $line;
                my ($count) = $line =~ /^\s*([0-9]*)/;
                $seen += $count;
                if (($known + $seen) * 100 / $total >= $mark) {
                    print "# $mark\n";
                    $mark += ($mark >= 90) ? 1 : 5;
                }
            }
        ' "$total" "$total_known" "$after"
    fi

    rm -f -- "$after"
}
# Copy stdin to stdout, leaving out every line that equals a vocabulary word.
#
# Reads the file named by VOCABULARY (created empty if missing).  Each
# whitespace-separated token of that file counts as one known word, and
# apostrophes are removed from the tokens before comparison.
#
# The vocabulary is only ever used as data: words are compared literally,
# and nothing read from the file is handed to the shell or interpreted as a
# regular expression.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    VOCABULARY="$VOCABULARY" awk '
        BEGIN {
            vocabulary = ENVIRON["VOCABULARY"]
            while ((getline entry < vocabulary) > 0) {
                gsub("\047", "", entry)          # \047 is the apostrophe
                count = split(entry, tokens)
                for (i = 1; i <= count; i++)
                    known[tokens[i]] = 1
            }
            close(vocabulary)
        }
        !($0 in known)
    '
}
# Copy stdin to stdout, leaving out every line that is a vocabulary entry.
#
# Reads the file named by VOCABULARY (created empty if missing, and exported
# for Perl).  Every complete line of that file is one known word.
#
# The Perl code is passed with -e, so no temporary script file is needed,
# and the vocabulary is opened with three-argument open so that characters
# such as ">" or "|" in the file name cannot change the open mode.
grep_v_english_perl()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    export VOCABULARY
    perl -e '
        my %known;

        open(my $vocabulary, "<", $ENV{VOCABULARY})
            or die "Can'\''t open VOCABULARY";
        while (my $word = <$vocabulary>) {
            chomp $word;
            $known{$word} = 1;
        }
        close($vocabulary);

        while (my $word = <STDIN>) {
            chomp $word;
            print "$word\n" unless exists $known{$word};
        }
    '
}
# Reorder "count word" lines so that related word forms end up together.
#
# stdin   lines "<count> <word>" as printed by uniq -c
# stdout  the same lines; groups of words that share a normalized form are
#         ordered by their summed count (highest first), then by normalized
#         form, then by the count of the single word
#
# Reads LANGUAGE ("en" and "de" get suffix stripping, everything else is
# compared unchanged), VOCABULARY (must exist) and
# NEED_TO_USE_VOCABULARY_WHEN_SORT (YES doubles the weight of groups whose
# normalized form also occurs in the vocabulary).  All three are exported.
#
# The Perl code is passed with -e, so no temporary script file is needed.
group_words()
{
    export VOCABULARY
    export NEED_TO_USE_VOCABULARY_WHEN_SORT
    export LANGUAGE
    perl -e '
        my $language = defined $ENV{LANGUAGE} ? $ENV{LANGUAGE} : "";

        # Strip common German suffixes from a lower-cased word.
        sub normalize_german {
            my $word = lc(shift);
            for ($word) {
                s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//; s/schaft$//; s/ist$//;
                s/en$//; s/er$//;
                s/lich$//; s/ig$//;
                s/al$//; s/isch$//;
                s/ell$//; s/haft$//;
                s/bar$//; s/sam$//; s/lich$//;

                # Prefix stripping is switched off: the list is emptied
                # right after it is built, so the loop below never runs.
                my @prefixes = qw(
                    ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
                    be emp ent er ge miss ver zer durch über um unter wieder);
                @prefixes = ();
                for my $prefix (@prefixes) {
                    s/^$prefix//;
                }
            }
            return $word;
        }

        # Strip common English suffixes from a lower-cased word.
        sub normalize_english {
            my $word = lc(shift);
            for ($word) {
                s/s$//;
                s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//; s/ity$//; s/ment$//;
                s/ed$//;
                s/en$//;
                s/er$//;
                s/est$//;
                s/ing$//;
                s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
                s/ify$//; s/fy$//; s/ly$//;
                s/ise$//; s/ize$//;
                s/e$//;
            }
            return $word;
        }

        # Map a word to its group key.  Results are cached because the same
        # words are looked up again and again.
        my %normalized;
        sub normalize {
            my $word = shift;
            $word = "" unless defined $word;
            unless (exists $normalized{$word}) {
                $normalized{$word} =
                      $language eq "en" ? normalize_english($word)
                    : $language eq "de" ? normalize_german($word)
                    :                     $word;
            }
            return $normalized{$word};
        }

        my %vocabulary;
        open(my $voc, "<", $ENV{VOCABULARY})
            or die "Can'\''t open VOCABULARY";
        while (my $word = <$voc>) {
            chomp $word;
            $vocabulary{normalize($word)} = 1;
        }
        close($voc);

        # One record per input line: [ original line, count, group key ].
        my @entries;
        my %group_weight;
        while (my $line = <STDIN>) {
            chomp $line;
            (my $trimmed = $line) =~ s/^\s*//;
            my ($count, $word) = split /\s+/, $trimmed, 2;
            my $key = normalize($word);
            $group_weight{$key} += $count;
            push @entries, [ $line, $count, $key ];
        }

        if (defined $ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT}
            and $ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
            for my $key (keys %group_weight) {
                $group_weight{$key} *= 2 if exists $vocabulary{$key};
            }
        }

        # Every criterion is applied in descending order.
        my @sorted = sort {
               $group_weight{ $b->[2] } <=> $group_weight{ $a->[2] }
            || $b->[2] cmp $a->[2]
            || $b->[1] <=> $a->[1]
        } @entries;

        print "$_->[0]\n" for @sorted;
    '
}
# Print the page at URL $1 as plain text with the link addresses removed.
#
# The page is rendered by "lynx -dump".  Every http:// or https:// address
# on a line is deleted, not just the first one, so that parts of addresses
# are not counted as words.
text_from_url()
{
    lynx -dump "$1" \
    | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
# Append the stored note of each word to its line in the word list.
#
# $1  word list file with lines "<spaces><count> <word>"; rewritten in place
#
# Reads the file named by NOTES_FILE (created empty if missing, and exported
# for Perl); each of its lines is "<word> <note>".  Lines starting with "#"
# and lines whose word has no note are written back unchanged.  A note
# starts in column 31 where the line is short enough, and is always
# separated from the line by at least one space.
add_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    perl -e '
        my $file = $ARGV[0];

        my %dict;
        if (open(my $notes, "<", $ENV{NOTES_FILE})) {
            while (my $entry = <$notes>) {
                chomp $entry;
                $entry =~ s/^\s+//;
                my ($word, $mark) = split /\s+/, $entry, 2;
                $dict{$word} = $mark;
            }
            close($notes);
        }

        if (open(my $in, "<", $file)) {
            my @lines = <$in>;
            close($in);

            if (open(my $out, ">", $file)) {
                for my $line (@lines) {
                    # $1 is only consulted after a successful match, so a
                    # line that does not match cannot pick up the word of
                    # an earlier line.
                    if ($line !~ /^#/
                        and $line =~ /\s+\S+\s+(\S+)/
                        and defined $dict{$1}) {
                        my $mark = $dict{$1};
                        chomp $line;
                        my $pad = 30 - length($line);
                        $pad = 1 if $pad < 1;
                        print {$out} $line, " " x $pad, $mark, "\n";
                    }
                    else {
                        print {$out} $line;
                    }
                }
                close($out);
            }
        }
    ' "$1"
}
# Strip the notes from the word list again and store them in the notes file.
#
# $1  word list file; each line "<spaces><count> <word> <note>" that does not
#     start with "#" is cut back to "<spaces><count> <word>" in place
#
# Unless DONT_ADD_MARKS is YES, the file named by NOTES_FILE (created empty
# if missing) is then rewritten:
#   - entries for words without a note in the word list stay where they are;
#   - entries whose note is unchanged stay where they are;
#   - entries whose note changed are dropped, and the notes found in the word
#     list that are new or changed are appended in alphabetical order.
# NOTES_FILE and DONT_ADD_MARKS are exported for Perl.
remove_marks()
{
    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    export NOTES_FILE
    export DONT_ADD_MARKS
    perl -e '
        my $file = $ARGV[0];

        # word => note, as found in the word list
        my %dict;

        if (open(my $in, "<", $file)) {
            my @lines = <$in>;
            close($in);

            if (open(my $out, ">", $file)) {
                for my $line (@lines) {
                    chomp $line;
                    if ($line !~ /^#/
                        and $line =~ /(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
                        $dict{$4} = $6;
                        print {$out} "$1$2$3$4\n";
                    }
                    else {
                        print {$out} "$line\n";
                    }
                }
                close($out);
            }
        }

        my $skip = defined $ENV{DONT_ADD_MARKS} && $ENV{DONT_ADD_MARKS} eq "YES";
        if (!$skip and open(my $notes_in, "<", $ENV{NOTES_FILE})) {
            my @entries = <$notes_in>;
            close($notes_in);

            if (open(my $notes_out, ">", $ENV{NOTES_FILE})) {
                for my $entry (@entries) {
                    chomp $entry;
                    $entry =~ s/^\s+//;
                    next if $entry eq "";
                    my ($word, $mark) = split /\s+/, $entry, 2;
                    $mark = "" unless defined $mark;

                    if (!defined $dict{$word}) {
                        print {$notes_out} "$entry\n";
                    }
                    elsif ($dict{$word} eq $mark) {
                        # Unchanged: keep the entry in place and make sure
                        # it is not appended a second time below.
                        print {$notes_out} "$entry\n";
                        delete $dict{$word};
                    }
                    # Otherwise the note changed and is written below.
                }

                for my $word (sort keys %dict) {
                    my $pad = 30 - length($word);
                    $pad = 1 if $pad < 1;
                    print {$notes_out} $word, " " x $pad, $dict{$word}, "\n";
                }
                close($notes_out);
            }
        }
    ' "$1"
}
# All vocabulary and notes files are addressed relative to WORK_DIR, so the
# script must not carry on in some other directory when cd fails.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Text source, chosen from the first remaining argument:
#   contains "http:" or "https:"  -> fetched with text_from_url
#   starts with "/"               -> read as an absolute path
#   any other argument            -> read relative to the starting directory
#   no argument                   -> standard input
#
# After the pipeline, ORIGINAL_TEXT holds the raw text, "${TEMP1}-full" the
# full word list, and TEMP1 and TEMP2 two identical copies of the result.
{
    case "$1" in
        *http:* | *https:*)
            text_from_url "$1"
            ;;
        /*)
            cat -- "$1"
            ;;
        *)
            if [ "$#" -ne 0 ]
            then
                cat -- "$oldpwd/$1"
            else
                cat
            fi
            ;;
    esac
} \
| tee "$ORIGINAL_TEXT" \
| get_words "${TEMP1}-full" \
| group_words \
| add_stat "${TEMP1}-full" \
| tee "$TEMP1" > "$TEMP2"
# Statistics mode and non-interactive mode just print the result.  Otherwise
# the word list is opened in the editor: notes are added beforehand (unless
# DONT_ADD_MARKS is YES) and stripped again afterwards.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
else
    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        # vim is attached to the terminal explicitly because standard input
        # may have been used for the text itself.
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
    else
        # $editor is left unquoted on purpose so that a value such as
        # "emacs -nw" is split into the command and its arguments.
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"
fi
# Whatever differs between the generated list (TEMP1) and the edited one
# (TEMP2) is appended to the vocabulary; the word is the third field of a
# diff line such as "<       3 word".  Hunk headers ("3d2", "---") have no
# third field and are skipped so that no blank entries pile up in the file.
diff "$TEMP1" "$TEMP2" | awk '$3 != "" {print $3}' | sort -u >> "$VOCABULARY"
rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"