new-words

view new-words.sh @ 22:46e987f4636d

part.pl script + german normalization support
author Igor Chubin <igor@chub.in>
date Sun May 16 18:20:18 2010 +0300 (2010-05-16)
parents 190d4ac6b07c
children 4b9d13c78de2
line source
1 #!/bin/sh
# show_usage: print the command-line help for new-words.
#
# Takes no arguments. The text is written to standard error (via the
# already-open descriptor 2, so it works even where /dev/stderr does not
# exist). The synopsis lists every switch the script handles.
show_usage()
{
    cat <<HELP >&2

USAGE:

    new-words [ -l lang ] [ -s ] [ -k ] [ -m ] [ -n ] [ ARG ]

SWITCHES:

    -h      print this screen
    -k      put higher words that are similar to the known words (only for English)
    -l lang override language settings
    -n      non-interactive mode (don't run vi)
    -m      don't add marks (and don't save marks added by user)
    -s      show the text statistics (percentage of known words and so on) and exit

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

    de-words URL
    new-words -l de URL

HELP
}
WORK_DIR=~/.new-words/
editor=${EDITOR:-vim}

#----------------------------------------------------
# command line options processing
#
# Language precedence, lowest to highest:
#   1. the default "en";
#   2. the language sub-domain of a Wikipedia URL given as an argument;
#   3. the program name (e.g. a link called "de-words" selects "de"),
#      unless the program is called "new-...";
#   4. an explicit "-l lang" switch.

LANGUAGE=en
# Strip the directory part and everything from the first "-" of the name.
my_name=$(printf '%s\n' "$0" | sed -e 's@.*/@@' -e 's/-.*//')
for arg
do
    if printf '%s\n' "$arg" | grep -Eq '^https?://[A-Za-z-]+\.wikipedia\.org/wiki/'
    then
        LANGUAGE=$(printf '%s\n' "$arg" | sed -e 's@^https\{0,1\}://@@' -e 's@\.wikipedia\..*@@')
    fi
done
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO

# getopts accepts the switches in any order (and combined, e.g. -sn); the
# historical fixed order "-l lang -s -k -m -n" keeps working unchanged.
while getopts hkl:mns opt
do
    case "$opt" in
        h) show_usage; exit 0 ;;
        k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
        l) LANGUAGE="$OPTARG" ;;
        m) DONT_ADD_MARKS=YES ;;
        n) NON_INTERACTIVE_MODE=YES ;;
        s) STAT_ONLY=YES ;;
        *) show_usage; exit 1 ;;
    esac
done
shift $((OPTIND - 1))

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# Temporary files are created only after the options were accepted, so that
# "-h" or a usage error leaves nothing behind in /tmp.
TEMP1=$(mktemp /tmp/new-words-XXXXXXXXXX-temp1) || exit 1
TEMP2=$(mktemp /tmp/new-words-XXXXXXXXXX-temp2) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-XXXXXXXXXX-orig) || exit 1
export ORIGINAL_TEXT

# Remove the temporary files however the script ends; "${TEMP1}-full" and
# "${TEMP1}-full2" are the names this script uses for the intermediate
# word list and its statistics copy.
trap 'rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "${TEMP1}-full2" "$ORIGINAL_TEXT"' EXIT
trap 'exit 1' HUP INT TERM
92 #----------------------------------------------------
# get_words FULL_LIST_FILE
#
# Reads text on standard input and writes "COUNT WORD" lines (most frequent
# first) for the words that are not in the vocabulary.
#
#   $1  file that receives every extracted word, one per line, BEFORE the
#       vocabulary filter is applied (used later for the statistics).
#
# Relies on grep_v_english_perl (defined in this file) to drop known words.
get_words()
{
    # The em dash and the middle dot are multi-byte in UTF-8.  They are
    # replaced with sed, which matches the complete byte sequence; a
    # byte-oriented tr would rewrite each byte separately and damage other
    # characters that merely share one of those bytes.
    #
    # NOTE(review): apostrophes become __APOSTROPHE__ and are never restored,
    # so words containing one are rejected by the grep below ("_" is not
    # accepted).  Kept as is -- confirm whether that is intended.
    tr ' ' '\n' \
        | sed -e 's/--/ /g' \
              -e "s/'/__APOSTROPHE__/g" \
              -e 's/—/-/g' \
              -e 's/·/-/g' \
        | tr '*\r,.:#@()+=<>$;"?!|[]^%&' ' ' \
        | tr ' ' '\n' \
        | grep -x '[[:alpha:]'"'"'-]*' \
        | tee "$1" \
        | grep_v_english_perl \
        | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
}
107 add_stat()
108 {
109 before="$1"
110 after=${before}2
111 cat > "$after"
112 total="`wc -w $1 | awk '{print $1}'`"
113 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
114 total_known="`echo $total-$total_unknown|bc`"
115 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
116 #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
117 sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
120 if [ "$STAT_ONLY" = "YES" ]
121 then
122 echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
123 echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "
124 rm $after
125 return 0
126 else
127 echo "# $LANGUAGE, $percentage, <$total_known/$total>"
128 fi
130 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
131 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
132 my $total=shift(@ARGV);
133 my $total_known=shift(@ARGV);
134 my $s=0;
135 my $mark_line=int($total_known*100/$total/5)*5;
136 if ($mark_line>=90) {
137 $mark_line=int($total_known*100/$total)+1;
138 } else { $mark_line +=5; };
139 while(<>)
140 {
141 print;
142 /^\s*([0-9]*)\s*/;
143 $s+=$1;
144 if (($total_known+$s)*100/$total>=$mark_line) {
145 print "# $mark_line\n";
146 if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
147 }
148 }
149 PERL_SCRIPT
150 perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
151 rm $PERL_SCRIPT_TEMP_NAME
152 rm $after
153 }
# grep_v_english
#
# Filter: copies standard input to standard output, dropping every line that
# is exactly equal to a word of the vocabulary file "$VOCABULARY" (created
# empty when missing).  Apostrophes are removed from the vocabulary words
# before comparing, and a vocabulary line holding several
# whitespace-separated words contributes each of them.
#
# The words are handed to grep as fixed strings from a pattern file instead
# of being pasted into an eval'ed command line, so vocabulary content can
# neither be executed by the shell nor be interpreted as a regular expression.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    _known_words_file=$(mktemp /tmp/new-words-known-XXXXXXXX) || return 1
    tr -d "'" < "$VOCABULARY" | tr -s ' \t' '\n' | sed '/^$/d' > "$_known_words_file"
    # An empty pattern file matches nothing, so -v then passes every line.
    grep -vxF -f "$_known_words_file"
    rm -f "$_known_words_file"
}
161 grep_v_english_perl()
162 {
163 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
164 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
165 open(VOC, $ENV{VOCABULARY})
166 or die "Can't open VOCABULARY";
167 while (<VOC>){
168 chomp;
169 #s/'//g;
170 $voc{$_}="1";
171 }
172 while(<>) {
173 chomp;
174 if (not defined($voc{$_})) { print "$_\n"; }
175 }
176 PERL_SCRIPT
177 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
178 export VOCABULARY
179 perl $PERL_SCRIPT_TEMP_NAME
180 rm $PERL_SCRIPT_TEMP_NAME
181 }
183 group_words()
184 {
185 #if [ "$LANGUAGE" != "en" ]
186 #then
187 # cat
188 # return
189 #fi
190 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
191 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
192 #!/usr/bin/perl
194 eval {
195 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
196 require String::Similarity;
197 String::Similarity->import();
198 };
199 unless($@)
200 {
201 our $HAVE_String_Similarity=1;
202 }
204 sub similar($$){
205 my $a=shift;
206 my $b=shift;
207 if ($HAVE_String_Similarity) {
208 return $Similarity{"$a $b"};
209 }
210 else {
211 return 0;
212 }
213 }
215 sub normalize($)
216 {
217 if ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
218 elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
219 else { return shift ; }
220 }
222 sub normalize_german($)
223 {
224 $_=lc(shift);
226 s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
227 s/en$//; s/er$//;
229 s/lich$//; s/ig$//;
230 s/al$//; s/isch$//;
231 s/ell$//; s/haft$//;
233 s/bar$//; s/sam$//; s/lich$//;
235 @prefixes=qw(
236 ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
237 be emp ent er ge miss ver zer durch über um unter wieder);
238 for $pref (@prefixes) {
239 s/^$pref//;
240 }
243 return $_;
244 }
246 sub normalize_english($)
247 {
248 $_=lc(shift);
250 s/s$//;
252 s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;
254 s/ed$//;
255 s/en$//;
256 s/er$//;
257 s/est$//;
258 s/ing$//;
260 s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
261 s/ify$//; s/fy$//; s/ly$//;
262 s/ise$//; s/ize$//;
264 s/e$//;
265 return $_;
266 }
269 sub compare($$)
270 {
271 my $a=shift;
272 my $b=shift;
273 $a =~ s/^\s*//;
274 $b =~ s/^\s*//;
275 my ($a1, $a2)= split /\s+/,$a,2;
276 my ($b1, $b2)= split /\s+/,$b,2;
278 my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
280 if ($cmp) {
281 return $cmp;
282 }
283 else {
284 if (normalize($a2) ne normalize($b2)) {
285 return normalize($a2) cmp normalize($b2);
286 }
287 else {
288 return $a1 <=> $b1;
289 }
290 }
291 }
293 our %Vocabulary;
294 open(VOC, $ENV{VOCABULARY})
295 or die "Can't open VOCABULARY";
296 while (<VOC>){
297 chomp;
298 #s/'//g;
299 $Vocabulary{normalize($_)}="1";
300 }
301 close(VOC);
303 @lines=<STDIN>;
304 for $L (@lines) {
305 chomp($L);
306 $l=$L;
307 $l =~ s/^\s*//;
308 my ($a, $b)=split(/\s+/,$l,2);
309 $group_weight{normalize($b)}+=$a;
310 }
311 if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
312 for $k (keys %group_weight) {
313 if (defined($Vocabulary{$k})) {
314 $group_weight{$k} *= 2;
315 }
316 }
317 }
318 @lines2 = sort { compare($b,$a) } @lines;
319 for $l (@lines2) {
320 print "$l\n";
321 }
322 PERL_SCRIPT
323 export VOCABULARY
324 export NEED_TO_USE_VOCABULARY_WHEN_SORT
325 export LANGUAGE
326 perl $PERL_SCRIPT_TEMP_NAME
327 rm $PERL_SCRIPT_TEMP_NAME
328 }
330 text_from_url()
331 {
332 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
333 }
335 add_marks()
336 {
337 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
338 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
339 $file = $ARGV[0];
340 our $dict;
341 if (open(NOTES, $ENV{NOTES_FILE})) {
342 while(<NOTES>) {
343 chomp;
344 s/^\s+//;
345 my ($a,$b)=split /\s+/,$_,2;
346 $dict{$a}=$b;
347 }
348 }
349 if (open(F, $file)) {
350 @lines=<F>;
351 close(F);
353 if (open(F, ">$file")) {
354 for (@lines) {
355 m/\s+\S+\s+(\S+)/;
356 $name=$1;
357 if (not /^#/ and defined($dict{$name})) {
358 chomp;
359 $mark=$dict{$name};
360 $space=" "x(30-length($_));
361 print F "$_$space$mark\n";
362 }
363 else {
364 print F "$_";
365 }
366 }
367 close(F);
368 }
369 }
370 PERL_SCRIPT
371 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
372 export NOTES_FILE
373 perl $PERL_SCRIPT_TEMP_NAME "$1"
374 rm $PERL_SCRIPT_TEMP_NAME
375 }
377 remove_marks()
378 {
379 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
380 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
381 $file = $ARGV[0];
382 our %dict;
383 if (open(F, $file)) {
384 @lines=<F>;
385 close(F);
387 if (open(F, ">$file")) {
388 for (@lines) {
389 chomp;
390 if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
391 my $name=$4;
392 my $comment=$6;
393 $dict{$name}=$comment;
394 print F "$1$2$3$4\n";
395 }
396 else {
397 print F "$_\n";
398 }
399 }
400 }
401 }
402 if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
403 @lines=<NOTES>;
404 close(NOTES);
406 if (open(NOTES, ">".$ENV{NOTES_FILE})) {
407 for (@lines) {
408 chomp;
409 s/^\s+//;
410 my ($a,$b)=split /\s+/,$_,2;
411 if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
412 print NOTES "$_\n";
413 if (defined($dict{$a})) { unset($dict{$a}); }
414 }
415 }
416 for (keys %dict) {
417 $mark=$dict{$_};
418 $space=" "x(30-length($_));
419 print NOTES "$_$space$mark\n";
420 }
421 }
422 }
423 PERL_SCRIPT
424 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
425 export NOTES_FILE
426 export DONT_ADD_MARKS
427 perl $PERL_SCRIPT_TEMP_NAME "$1"
428 rm $PERL_SCRIPT_TEMP_NAME
429 }
#----------------------------------------------------
# main: build the list of unknown words, let the user edit it, and add
# every word the user removed from the list to the vocabulary.
#
# The text comes from a URL (http or https), from a file (relative paths
# are resolved against the directory the script was started in) or, when
# no argument is given, from standard input.

mkdir -p "$WORK_DIR"
oldpwd="$PWD"
# Vocabulary and notes files are addressed relative to the work directory.
cd "$WORK_DIR" || exit 1

if printf '%s\n' "$1" | grep -q '^https\{0,1\}://'
then
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    case "$1" in
        /*) cat "$1" ;;
        *)  cat "$oldpwd/$1" ;;
    esac
else
    cat
fi \
    | tee "$ORIGINAL_TEXT" \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"

if [ "$STAT_ONLY" = "YES" ]
then
    cat "$TEMP1"
elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    cat "$TEMP1"
else
    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty
    else
        # $editor is deliberately unquoted so that EDITOR may carry options.
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"
fi

# Lines that differ between the generated and the edited list name the
# words the user knows.  Only real diff content lines are used: hunk headers
# ("3,4d2") and "#" statistic/marker lines would otherwise put blank or
# bogus entries into the vocabulary.
diff "$TEMP1" "$TEMP2" \
    | awk '/^[<>]/ && $2 != "#" && $3 != "" { print $3 }' \
    | sort -u >> "$VOCABULARY"
rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"