new-words

view new-words.sh @ 16:c65ffd60cc18

ключ -s: Информация об известных словах в тексте

Может использоваться в пакетном режиме,
например, для того чтобы из множества файлов
выбрать тот, в котором процент известных слов наибольший
author Igor Chubin <igor@chub.in>
date Sun Apr 04 19:03:30 2010 +0300 (2010-04-04)
parents 9b18c7efe31c
children 35eeaf2620ce
line source
#!/bin/sh
#
# new-words: list the words of a text that are not yet in the user's
# vocabulary file, let the user edit that list, and record the words the
# user removed as known.
#
# The here-document below is the built-in usage text. It is sent to
# /dev/null, so it only serves as documentation inside the script. The
# delimiter is quoted so nothing inside it is ever expanded.
cat <<'HELP' > /dev/null

USAGE:

    new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

    -s          show text statistics and exit
    -l lang     override language settings

Поддержка нескольких языков:

    new-words -l lang URL

Например, для немецких текстов:

    new-words -l de URL

Или, предварительно создав соответствующую ссылку:

    de-words URL

HELP

WORK_DIR=~/.new-words/
TEMP1=$(mktemp /tmp/news-words-XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/news-words-XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/news-words-XXXXXXXXXX-orig) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# The part of the invocation name before the first "-" selects the language
# when the script is called through a link such as "de-words".
my_name=$(printf '%s\n' "$0" | sed -e 's@.*/@@' -e 's/-.*//')

# Switches are consumed first so that the language auto-detection below
# always looks at the real ARG, not at "-l" or "-s".
language_given=NO
if [ "$1" = "-l" ]
then
    language_given=YES
    given_language="$2"
    shift 2
fi

STAT_ONLY=NO
if [ "$1" = "-s" ]
then
    STAT_ONLY=YES
    shift
fi

# Language priority, lowest to highest: default "en", Wikipedia host name,
# invocation name, explicit -l switch.
LANGUAGE=en
if printf '%s\n' "$1" | grep -Eq '^https?://[a-z-]+\.wikipedia\.org/wiki/'
then
    LANGUAGE=$(printf '%s\n' "$1" | sed -e 's@^https\{0,1\}://@@' -e 's@\.wikipedia.*@@')
fi
[ "$my_name" = "new" ] || LANGUAGE="$my_name"
[ "$language_given" = "NO" ] || LANGUAGE="$given_language"

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
# get_words FULL_LIST_FILE
#
# Reads text on stdin, splits it into one token per line and keeps the
# tokens made only of letters, apostrophes and hyphens. The complete token
# list is saved to FULL_LIST_FILE; tokens found in the vocabulary are then
# removed by grep_v_english_perl, and the rest is printed as
# "COUNT WORD" lines, most frequent first.
#
# The em dash and the middle dot are rewritten with sed rather than tr:
# GNU tr works on single bytes, so listing multibyte characters in its
# sets rewrites those bytes inside every other UTF-8 character as well.
#
# NOTE(review): apostrophes become "__APOSTROPHE__", which the letters-only
# filter below rejects, so tokens containing an apostrophe never reach the
# output. Confirm whether that is intended.
get_words()
{
    tr ' ' '\n' \
        | sed -e 's/--/ /g' \
              -e "s/'/__APOSTROPHE__/g" \
              -e 's/—/-/g' \
              -e 's/·/-/g' \
        | tr '*\r,.:#@()+=<>$;"?!|[]^%&' ' ' \
        | tr ' ' '\n' \
        | grep -x '[[:alpha:]'"'"'-]*' \
        | tee "$1" \
        | grep_v_english_perl \
        | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
}
69 add_stat()
70 {
71 before="$1"
72 after=${before}2
73 cat > "$after"
74 total="`wc -w $1 | awk '{print $1}'`"
75 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
76 total_known="`echo $total-$total_unknown|bc`"
77 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
78 if [ "$STAT_ONLY" = "YES" ]
79 then
80 echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL"
81 echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total"
82 return 0
83 else
84 echo "# $LANGUAGE, $percentage, <$total_known/$total>"
85 fi
87 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
88 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
89 my $total=shift(@ARGV);
90 my $total_known=shift(@ARGV);
91 my $s=0;
92 my $mark_line=int($total_known*100/$total/5)*5;
93 if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
94 while(<>)
95 {
96 print;
97 /^\s*([0-9]*)\s*/;
98 $s+=$1;
99 if (int(($total_known+$s)*100/$total)>=$mark_line) {
100 print "# $mark_line\n";
101 if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
102 }
103 }
104 PERL_SCRIPT
105 perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
106 rm $PERL_SCRIPT_TEMP_NAME
107 rm $after
108 }
# grep_v_english
#
# Pure-shell/awk counterpart of grep_v_english_perl: copies stdin to
# stdout, dropping every line that is exactly equal to a word of the
# $VOCABULARY file. Apostrophes are stripped from the vocabulary entries
# before comparison and entries are split on whitespace. The vocabulary
# file is created empty when it does not exist.
#
# The vocabulary is loaded into an awk table instead of being pasted into
# an eval'ed egrep command line, so its contents are always treated as
# data and never interpreted by the shell or as a regular expression.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    awk -v vocabulary="$VOCABULARY" -v apostrophe="'" '
        BEGIN {
            while ((getline entry < vocabulary) > 0) {
                gsub(apostrophe, "", entry)
                count = split(entry, words)
                for (i = 1; i <= count; i++)
                    known[words[i]] = 1
            }
            close(vocabulary)
        }
        !($0 in known)
    '
}
116 grep_v_english_perl()
117 {
118 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
119 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
120 open(VOC, $ENV{VOCABULARY})
121 or die "Can't open VOCABULARY";
122 while (<VOC>){
123 chomp;
124 #s/'//g;
125 $voc{$_}="1";
126 }
127 while(<>) {
128 chomp;
129 if (not defined($voc{$_})) { print "$_\n"; }
130 }
131 PERL_SCRIPT
132 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
133 export VOCABULARY
134 perl $PERL_SCRIPT_TEMP_NAME
135 rm $PERL_SCRIPT_TEMP_NAME
136 }
138 group_words()
139 {
140 if [ "$LANGUAGE" != "en" ]
141 then
142 cat
143 return
144 fi
145 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
146 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
147 #!/usr/bin/perl
149 eval {
150 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
151 require String::Similarity;
152 String::Similarity->import();
153 };
154 unless($@)
155 {
156 our $HAVE_String_Similarity=1;
157 }
159 sub similar($$){
160 my $a=shift;
161 my $b=shift;
162 if ($HAVE_String_Similarity) {
163 return $Similarity{"$a $b"};
164 }
165 else {
166 return 0;
167 }
168 }
170 sub normalize($)
171 {
172 $_=lc(shift);
174 s/s$//;
175 s/ed$//;
176 s/en$//;
177 s/er$//;
178 s/est$//;
179 s/ing$//;
181 s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
182 s/ation$//; s/ness$//; s/ship$//; s/ally$//;
183 s/ment$//; s/ify$//; s/ity$//; s/fy$//; s/ly$//;
184 s/ise$//; s/ize$//;
186 s/e$//;
187 return $_;
188 }
191 sub compare($$)
192 {
193 my $a=shift;
194 my $b=shift;
195 $a =~ s/^\s*//;
196 $b =~ s/^\s*//;
197 my ($a1, $a2)= split /\s+/,$a,2;
198 my ($b1, $b2)= split /\s+/,$b,2;
200 my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
202 if ($cmp) {
203 return $cmp;
204 }
205 else {
206 if (normalize($a2) ne normalize($b2)) {
207 return normalize($a2) cmp normalize($b2);
208 }
209 else {
210 return $a1 <=> $b1;
211 }
212 }
213 }
215 @lines=<>;
216 for $L (@lines) {
217 chomp($L);
218 $l=$L;
219 $l =~ s/^\s*//;
220 my ($a, $b)=split(/\s+/,$l,2);
221 $group_weight{normalize($b)}+=$a;
222 }
223 @lines2 = sort { compare($b,$a) } @lines;
224 for $l (@lines2) {
225 print "$l\n";
226 }
227 PERL_SCRIPT
228 perl $PERL_SCRIPT_TEMP_NAME
229 rm $PERL_SCRIPT_TEMP_NAME
230 }
# text_from_url URL
#
# Prints the page at URL as plain text using "lynx -dump" and removes the
# link targets lynx includes in its output, so that URL fragments are not
# counted as words. Both http:// and https:// links are removed, and every
# occurrence on a line is handled.
text_from_url()
{
    lynx -dump "$1" | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
237 add_marks()
238 {
239 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
240 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
241 $file = $ARGV[0];
242 our $dict;
243 if (open(NOTES, $ENV{NOTES_FILE})) {
244 while(<NOTES>) {
245 chomp;
246 s/^\s+//;
247 my ($a,$b)=split /\s+/,$_,2;
248 $dict{$a}=$b;
249 }
250 }
251 if (open(F, $file)) {
252 @lines=<F>;
253 close(F);
255 if (open(F, ">$file")) {
256 for (@lines) {
257 m/\s+\S+\s+(\S+)/;
258 $name=$1;
259 if (not /^#/ and defined($dict{$name})) {
260 chomp;
261 $mark=$dict{$name};
262 $space=" "x(30-length($_));
263 print F "$_$space$mark\n";
264 }
265 else {
266 print F "$_";
267 }
268 }
269 close(F);
270 }
271 }
272 PERL_SCRIPT
273 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
274 export NOTES_FILE
275 perl $PERL_SCRIPT_TEMP_NAME "$1"
276 rm $PERL_SCRIPT_TEMP_NAME
277 }
279 remove_marks()
280 {
281 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
282 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
283 $file = $ARGV[0];
284 our %dict;
285 if (open(F, $file)) {
286 @lines=<F>;
287 close(F);
289 if (open(F, ">$file")) {
290 for (@lines) {
291 chomp;
292 if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
293 my $name=$4;
294 my $comment=$6;
295 $dict{$name}=$comment;
296 print F "$1$2$3$4\n";
297 }
298 else {
299 print F "$_\n";
300 }
301 }
302 }
303 }
304 if (open(NOTES, $ENV{NOTES_FILE})) {
305 @lines=<NOTES>;
306 close(NOTES);
308 if (open(NOTES, ">".$ENV{NOTES_FILE})) {
309 for (@lines) {
310 chomp;
311 s/^\s+//;
312 my ($a,$b)=split /\s+/,$_,2;
313 if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
314 print NOTES "$_\n";
315 if (defined($dict{$a})) { unset($dict{$a}); }
316 }
317 }
318 for (keys %dict) {
319 $mark=$dict{$_};
320 $space=" "x(30-length($_));
321 print NOTES "$_$space$mark\n";
322 }
323 }
324 }
325 PERL_SCRIPT
326 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
327 export NOTES_FILE
328 perl $PERL_SCRIPT_TEMP_NAME "$1"
329 rm $PERL_SCRIPT_TEMP_NAME
330 }
# Main flow: everything below runs inside $WORK_DIR, where the vocabulary
# and notes files live. $oldpwd remembers where the user started so that
# relative file arguments still resolve.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Input source: a URL is fetched with lynx, a file argument is read
# (relative names are taken from the starting directory), and without any
# argument the text comes from stdin.
if printf '%s\n' "$1" | grep -Eq '^https?://'
then
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    case "$1" in
        /*) cat "$1" ;;
        *)  cat "$oldpwd/$1" ;;
    esac
else
    cat
fi \
    | tee "$ORIGINAL_TEXT" \
    | get_words "${TEMP1}-full" \
    | group_words \
    | add_stat "${TEMP1}-full" \
    | tee "$TEMP1" > "$TEMP2"

if [ "$STAT_ONLY" = "YES" ]
then
    cat "$TEMP1"
else
    add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        # The script's own stdin may be the text being analysed, so vim is
        # attached to the terminal explicitly.
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty
    else
        # Left unquoted on purpose so that $EDITOR may carry arguments.
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"
fi

# Every line that differs between the generated list and the edited one
# contributes its word (third field of the diff output) to the vocabulary.
# Diff lines without a third field are skipped so that no blank lines are
# appended.
diff "$TEMP1" "$TEMP2" | awk '$3 != "" {print $3}' | sort -u >> "$VOCABULARY"
rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "${TEMP1}-full2" "$ORIGINAL_TEXT"