new-words

view new-words.sh @ 11:34d0332f238c

Группировка похожих слов (пока что только для английского языка).

Выполняется группировка похожих слов:
слова сортируются не по отдельности, а группами —
по суммарному количеству вхождений всех слов группы.
В пределах группы слова сортируются по количеству вхождений
каждого слова.
author Igor Chubin <igor@chub.in>
date Fri Apr 02 19:46:44 2010 +0300 (2010-04-02)
parents 0ff259b8b6a3
children 4bbe553c1ee2
line source
#!/bin/sh
# new-words: list the words of a text that are not yet in the user's
# vocabulary file, open the list in an editor, and append the words that
# differ after editing to the vocabulary.

# Usage notes (Russian); the text is discarded at runtime.  The delimiter is
# quoted so the shell never tries to expand anything inside it.
cat <<'HELP' > /dev/null
Поддержка нескольких языков:
new-words -l lang URL
Например, для немецких текстов:
new-words -l de URL
Или, предварительно создав соответствующую ссылку:
de-words URL
HELP

# Vocabulary and notes files are looked up relative to this directory.
WORK_DIR=~/.new-words/

# Scratch files: TEMP1 receives the generated word list, TEMP2 is the copy
# handed to the editor, ORIGINAL_TEXT receives the unprocessed input text.
TEMP1=$(mktemp /tmp/news-words-XXXXXXXXXX)
TEMP2=$(mktemp /tmp/news-words-XXXXXXXXXX)
ORIGINAL_TEXT=$(mktemp /tmp/news-words-XXXXXXXXXX-orig)
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

LANGUAGE=en

# Name the script was started under, cut at the first dash:
# "new-words" -> "new", "de-words" -> "de".
my_name=${0##*/}
my_name=${my_name%%-*}

# A Wikipedia article URL carries its language as the host prefix.
case "$1" in
    http://??.wikipedia.org/wiki/* | https://??.wikipedia.org/wiki/*)
        LANGUAGE=${1#*://}
        LANGUAGE=${LANGUAGE%%.wikipedia*}
        ;;
esac

# Any invocation name other than "new-..." selects the language.
[ "$my_name" = "new" ] || LANGUAGE="$my_name"

# An explicit "-l LANG" overrides everything above.
if [ "$1" = "-l" ]
then
    LANGUAGE="$2"
    shift 2
fi

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
# get_words FULL_LIST
#
# Reads text on stdin and prints the words that are not in the vocabulary as
# "COUNT WORD" lines, highest count first.  Every accepted word, known or
# not, is also written one per line to the file FULL_LIST.
#
# The em dash and the middle dot are turned into a hyphen with sed rather
# than tr: tr works on single bytes, so naming these multibyte characters in
# a tr set also rewrites the same bytes inside other UTF-8 letters and
# destroys the words that contain them.
get_words()
{
    tr ' ' '\n' | sed 's/--/ /g' \
    | sed "s/'/__APOSTROPHE__/g" \
    | sed -e 's/—/-/g' -e 's/·/-/g' \
    | tr '*\r,.:#@()+=<>$;"?!|[]^%&' ' ' \
    | tr ' ' '\n' \
    | grep -x '[[:alpha:]'"'"'-]*' \
    | tee "$1" \
    | grep_v_english_perl \
    | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
    # Note: tokens that contained an apostrophe now carry underscores from
    # the __APOSTROPHE__ placeholder, so the grep -x above rejects them.
}
53 add_stat()
54 {
55 before="$1"
56 after=${before}2
57 cat > "$after"
58 total="`wc -w $1 | awk '{print $1}'`"
59 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
60 total_known="`echo $total-$total_unknown|bc`"
61 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
62 echo "# $LANGUAGE, $percentage, <$total_known/$total>"
63 cat "$after"
64 rm $after
65 }
# grep_v_english
#
# Filter: copies stdin to stdout without the lines that are exactly equal to
# a line of the vocabulary file $VOCABULARY (created empty if missing).
#
# Vocabulary entries are matched as fixed strings.  The file content is never
# evaluated as shell code, so quotes or shell metacharacters in it are
# harmless.
grep_v_english()
{
    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    # grep exits with 1 when it prints nothing; that is not an error here.
    grep -Fxv -f "$VOCABULARY" || [ "$?" -eq 1 ]
}
73 grep_v_english_perl()
74 {
75 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
76 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
77 open(VOC, $ENV{VOCABULARY})
78 or die "Can't open VOCABULARY";
79 while (<VOC>){
80 chomp;
81 #s/'//g;
82 $voc{$_}="1";
83 }
84 while(<>) {
85 chomp;
86 if (not defined($voc{$_})) { print "$_\n"; }
87 }
88 PERL_SCRIPT
89 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
90 export VOCABULARY
91 perl $PERL_SCRIPT_TEMP_NAME
92 rm $PERL_SCRIPT_TEMP_NAME
93 }
95 group_words()
96 {
97 if [ "$LANGUAGE" != "en" ]
98 then
99 cat
100 return
101 fi
102 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
103 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
104 #!/usr/bin/perl
106 sub normalize($)
107 {
108 $_=lc(shift);
109 s///;
110 s/s$//;
111 s/ed$//;
112 s/ing$//;
113 return $_;
114 }
116 sub compare($$)
117 {
118 my $a=shift;
119 my $b=shift;
120 $a =~ s/^\s*//;
121 $b =~ s/^\s*//;
122 my ($a1, $a2)= split /\s+/,$a,2;
123 my ($b1, $b2)= split /\s+/,$b,2;
125 my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
126 if ($cmp) {
127 return $cmp;
128 }
129 else {
130 if (normalize($a2) ne normalize($b2)) {
131 return normalize($a2) cmp normalize($b2);
132 }
133 else {
134 return $a1 <=> $b1;
135 }
136 }
137 }
139 @lines=<>;
140 for $L (@lines) {
141 chomp($L);
142 $l=$L;
143 $l =~ s/^\s*//;
144 my ($a, $b)=split(/\s+/,$l,2);
145 $group_weight{normalize($b)}+=$a;
146 }
147 @lines2 = sort { compare($b,$a) } @lines;
148 for $l (@lines2) {
149 print "$l\n";
150 }
151 PERL_SCRIPT
152 perl $PERL_SCRIPT_TEMP_NAME
153 rm $PERL_SCRIPT_TEMP_NAME
154 }
# text_from_url URL
#
# Prints the page at URL as plain text (rendered by lynx -dump) with every
# http:// and https:// link removed, so that fragments of URLs are not
# counted as words.  The substitution is global: each line may hold several
# links.
text_from_url()
{
    lynx -dump "$1" | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
161 add_marks()
162 {
163 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
164 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
165 $file = $ARGV[0];
166 our $dict;
167 if (open(NOTES, $ENV{NOTES_FILE})) {
168 while(<NOTES>) {
169 chomp;
170 s/^\s+//;
171 my ($a,$b)=split /\s+/,$_,2;
172 $dict{$a}=$b;
173 }
174 }
175 if (open(F, $file)) {
176 @lines=<F>;
177 close(F);
179 if (open(F, ">$file")) {
180 for (@lines) {
181 m/\s+\S+\s+(\S+)/;
182 $name=$1;
183 if (not /^#/ and defined($dict{$name})) {
184 chomp;
185 $mark=$dict{$name};
186 $space=" "x(30-length($_));
187 print F "$_$space$mark\n";
188 }
189 else {
190 print F "$_";
191 }
192 }
193 close(F);
194 }
195 }
196 PERL_SCRIPT
197 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
198 export NOTES_FILE
199 perl $PERL_SCRIPT_TEMP_NAME "$1"
200 rm $PERL_SCRIPT_TEMP_NAME
201 }
203 remove_marks()
204 {
205 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
206 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
207 $file = $ARGV[0];
208 our %dict;
209 if (open(F, $file)) {
210 @lines=<F>;
211 close(F);
213 if (open(F, ">$file")) {
214 for (@lines) {
215 chomp;
216 if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
217 my $name=$4;
218 my $comment=$6;
219 $dict{$name}=$comment;
220 print F "$1$2$3$4\n";
221 }
222 else {
223 print F "$_\n";
224 }
225 }
226 }
227 }
228 if (open(NOTES, $ENV{NOTES_FILE})) {
229 @lines=<NOTES>;
230 close(NOTES);
232 if (open(NOTES, ">".$ENV{NOTES_FILE})) {
233 for (@lines) {
234 chomp;
235 s/^\s+//;
236 my ($a,$b)=split /\s+/,$_,2;
237 if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
238 print NOTES "$_\n";
239 if (defined($dict{$a})) { unset($dict{$a}); }
240 }
241 }
242 for (keys %dict) {
243 $mark=$dict{$_};
244 $space=" "x(30-length($_));
245 print NOTES "$_$space$mark\n";
246 }
247 }
248 }
249 PERL_SCRIPT
250 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
251 export NOTES_FILE
252 perl $PERL_SCRIPT_TEMP_NAME "$1"
253 rm $PERL_SCRIPT_TEMP_NAME
254 }
# --- main ---------------------------------------------------------------
# Vocabulary and notes files are addressed relative to WORK_DIR, so the
# script works from inside it; the caller's directory is remembered for
# relative input paths.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Input is a URL (http or https), a file given on the command line, or stdin.
if echo "$1" | grep -q 'https\{0,1\}:'
then
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    case "$1" in
        /*) cat "$1" ;;             # absolute path: use it as given
        *)  cat "$oldpwd/$1" ;;     # relative to where the user started
    esac
else
    cat
fi \
| tee "$ORIGINAL_TEXT" \
| get_words "${TEMP1}-full" \
| group_words \
| add_stat "${TEMP1}-full" \
| tee "$TEMP1" > "$TEMP2"

# TEMP1 stays untouched as the reference; TEMP2 is annotated and edited.
add_marks "$TEMP2"
if [ "$editor" = vim ]
then
    # stdin may be the piped text, so vim is attached to the terminal.
    vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty
else
    # Deliberately unquoted: $EDITOR may carry arguments.
    $editor "$TEMP2"
fi
remove_marks "$TEMP2"

# The third field of each diff line ("< COUNT WORD" / "> COUNT WORD") is the
# word.  Hunk headers such as "3d2" have no third field and are skipped, so
# no blank entries reach the vocabulary.
diff "$TEMP1" "$TEMP2" | awk 'NF >= 3 {print $3}' | sort -u >> "$VOCABULARY"
rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"