new-words

view new-words.sh @ 3:c703b8898696

Комментарий с мини-статистикой, дефис в словах, автоматический выбор языка для Википедии.

* Комментарий с процентом известных слов;
* Автоматическое определение языка по адресу страницы в Википедии (только двухбуквенные коды языков);
* Поддержка слов с дефисом.
author igor@book.xt.vpn
date Tue Mar 02 22:28:27 2010 +0200 (2010-03-02)
parents 68722cd6faff
children 9345cc05fdd1
line source
#!/bin/sh
#
# new-words: global configuration and language selection.
#
# The here-document below is sent to /dev/null and serves only as an embedded
# usage note. In English it reads:
#   Support for several languages:   new-words -l lang URL
#   For example, for German texts:   new-words -l de URL
#   Or, after creating a matching link to this script:   de-words URL
# The delimiter is quoted so the text is never subject to shell expansion.
cat <<'HELP' > /dev/null

Поддержка нескольких языков:

    new-words -l lang URL

Например, для немецких текстов:

    new-words -l de URL

Или, предварительно создав соответствующую ссылку:

    de-words URL

HELP

# Directory that holds the per-language vocabulary and notes files.
WORK_DIR=~/.new-words/

# Scratch files: TEMP1 keeps the generated word list, TEMP2 is the copy that
# gets edited; "${TEMP1}-full" receives every word of the input.
TEMP1=$(mktemp /tmp/news-words-XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/news-words-XXXXXXXXXX) || exit 1
# Clean the scratch files up on every exit path, not only on success.
trap 'rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full"' EXIT

editor=${EDITOR:-vim}

# Language selection, lowest to highest priority:
#   1. default "en";
#   2. two-letter host prefix of a Wikipedia article URL (http or https);
#   3. the script name up to the first "-" when it is not "new"
#      (e.g. a link named de-words selects "de");
#   4. an explicit "-l lang" option.
LANGUAGE=en
script_name=${0##*/}
my_name=${script_name%%-*}
if printf '%s\n' "$1" \
  | grep -q '^https\{0,1\}://[[:alpha:]][[:alpha:]]\.wikipedia\.org/wiki/'
then
  LANGUAGE=$(printf '%s\n' "$1" \
    | sed -e 's@^https\{0,1\}://@@' -e 's@\.wikipedia.*@@')
fi
[ "$my_name" = "new" ] || LANGUAGE=$my_name
if [ "$1" = "-l" ]
then
  # "shift 2" with fewer than two arguments is an error in POSIX shells,
  # so reject a bare "-l" with a readable message instead.
  if [ "$#" -lt 2 ]
  then
    echo "usage: ${script_name} [-l lang] [URL|FILE]" >&2
    exit 2
  fi
  LANGUAGE=$2
  shift 2
fi

# Both files are relative names; they are looked up in the current directory
# of whoever uses them.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt
# get_words FULL_LIST
#
# Reads text on stdin and prints a frequency table ("count word" lines, most
# frequent first) of the words that survive grep_v_english_perl.
# Every token that passes the word filter is also written, one per line, to
# the file named by $1 before the vocabulary filter is applied.
get_words() {
  # One token per line; "--" becomes a space and apostrophes become a marker
  # containing underscores, which the grep filter further down rejects.
  tr ' ' '\n' \
    | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
    | tr '—·-' '-----' \
    | tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' ' ' \
    | tr ' ' '\n' \
    | grep -x "[[:alpha:]'-]*" \
    | tee "$1" \
    | grep_v_english_perl \
    | sort \
    | uniq -c \
    | awk '$2 != ""' \
    | sort -rn
}
# add_stat FULL_LIST
#
# Copies the "count word" table from stdin to stdout, preceded by a header:
#
#   # LANGUAGE, PERCENT, <KNOWN/TOTAL>
#
# TOTAL is the number of words in the file $1, the unknown count is the sum
# of the first column of stdin, KNOWN is TOTAL minus that sum, and PERCENT is
# the known share truncated to one decimal place.
#
# Uses "${1}2" as a scratch file and removes it afterwards.
# Empty input yields "0.0, <0/0>" instead of a division by zero, and the
# arithmetic is done by the shell, so bc is no longer needed.
add_stat() {
  before=$1
  after=${before}2
  cat > "$after"

  total=$(wc -w < "$before" | awk '{print $1}')
  total=${total:-0}
  # "s + 0" makes awk print 0 rather than an empty string for empty input.
  total_unknown=$(awk '{s += $1} END {print s + 0}' "$after")
  total_unknown=${total_unknown:-0}
  total_known=$((total - total_unknown))

  if [ "$total" -gt 0 ]
  then
    # Work in tenths of a percent to get one truncated decimal digit
    # out of integer arithmetic.
    tenths=$((1000 * total_known / total))
    percentage=$((tenths / 10)).$((tenths % 10))
  else
    percentage=0.0
  fi

  printf '%s\n' "# $LANGUAGE, $percentage, <$total_known/$total>"
  cat "$after"
  rm -f "$after"
}
# grep_v_english
#
# Filter: copies stdin to stdout, dropping every line that is exactly equal
# to a word listed in the file "$VOCABULARY" (created empty if missing).
#
# As before, vocabulary lines are split on whitespace and apostrophes are
# removed from the vocabulary words before comparison. Unlike the previous
# implementation, the vocabulary is treated purely as data: nothing from the
# file is passed to eval or interpreted as a regular expression.
grep_v_english() {
  [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
  # The file name travels through the environment so that awk does not apply
  # backslash-escape processing to it (as it would for a -v assignment).
  NW_VOCABULARY_FILE=$VOCABULARY awk -v apostrophe="'" '
    BEGIN {
      vocabulary = ENVIRON["NW_VOCABULARY_FILE"]
      while ((getline entry < vocabulary) > 0) {
        gsub(apostrophe, "", entry)
        count = split(entry, words)
        for (i = 1; i <= count; i++)
          known[words[i]] = 1
      }
      close(vocabulary)
    }
    !($0 in known)
  '
}
72 grep_v_english_perl()
73 {
74 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
75 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
76 open(VOC, $ENV{VOCABULARY})
77 or die "Can't open VOCABULARY";
78 while (<VOC>){
79 chomp;
80 #s/'//g;
81 $voc{$_}="1";
82 }
83 while(<>) {
84 chomp;
85 if (not defined($voc{$_})) { print "$_\n"; }
86 }
87 PERL_SCRIPT
88 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
89 export VOCABULARY
90 perl $PERL_SCRIPT_TEMP_NAME
91 rm $PERL_SCRIPT_TEMP_NAME
92 }
94 text_from_url()
95 {
96 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
97 }
99 add_marks()
100 {
101 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
102 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
103 $file = $ARGV[0];
104 our $dict;
105 if (open(NOTES, $ENV{NOTES_FILE})) {
106 while(<NOTES>) {
107 chomp;
108 s/^\s+//;
109 my ($a,$b)=split /\s+/,$_,2;
110 $dict{$a}=$b;
111 }
112 }
113 if (open(F, $file)) {
114 @lines=<F>;
115 close(F);
117 if (open(F, ">$file")) {
118 for (@lines) {
119 m/\s+\S+\s+(\S+)/;
120 $name=$1;
121 if (not /^#/ and defined($dict{$name})) {
122 chomp;
123 $mark=$dict{$name};
124 $space=" "x(30-length($_));
125 print F "$_$space$mark\n";
126 }
127 else {
128 print F "$_";
129 }
130 }
131 close(F);
132 }
133 }
134 PERL_SCRIPT
135 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
136 export NOTES_FILE
137 perl $PERL_SCRIPT_TEMP_NAME "$1"
138 rm $PERL_SCRIPT_TEMP_NAME
139 }
141 remove_marks()
142 {
143 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
144 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
145 $file = $ARGV[0];
146 our %dict;
147 if (open(F, $file)) {
148 @lines=<F>;
149 close(F);
151 if (open(F, ">$file")) {
152 for (@lines) {
153 chomp;
154 if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
155 my $name=$4;
156 my $comment=$6;
157 $dict{$name}=$comment;
158 print F "$1$2$3$4\n";
159 }
160 else {
161 print F "$_\n";
162 }
163 }
164 }
165 }
166 if (open(NOTES, $ENV{NOTES_FILE})) {
167 @lines=<NOTES>;
168 close(NOTES);
170 if (open(NOTES, ">".$ENV{NOTES_FILE})) {
171 for (@lines) {
172 chomp;
173 s/^\s+//;
174 my ($a,$b)=split /\s+/,$_,2;
175 if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
176 print NOTES "$_\n";
177 if (defined($dict{$a})) { unset($dict{$a}); }
178 }
179 }
180 for (keys %dict) {
181 $mark=$dict{$_};
182 $space=" "x(30-length($_));
183 print NOTES "$_$space$mark\n";
184 }
185 }
186 }
187 PERL_SCRIPT
188 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
189 export NOTES_FILE
190 perl $PERL_SCRIPT_TEMP_NAME "$1"
191 rm $PERL_SCRIPT_TEMP_NAME
192 }
# Main flow:
#   1. build the unknown-word table from a URL, a file or stdin;
#   2. add stored notes and open the table in the editor;
#   3. strip and store the notes again;
#   4. append to the vocabulary every word whose line differs between the
#      generated table and the edited one (e.g. lines the user deleted).

mkdir -p "$WORK_DIR" || exit 1

# A relative FILE argument refers to the directory the user started in, so
# make it absolute before changing into the work directory.
input=$1
case $input in
  http:*|https:*|/*|-|'') ;;
  *) input=$PWD/$input ;;
esac

cd "$WORK_DIR" || exit 1

{
  if [ "$#" -eq 0 ]
  then
    cat
  else
    case $1 in
      http:*|https:*) text_from_url "$1" ;;
      *) cat "$input" ;;
    esac
  fi
} | get_words "${TEMP1}-full" \
  | add_stat "${TEMP1}-full" \
  | tee "$TEMP1" > "$TEMP2"

add_marks "$TEMP2"
if [ "$editor" = vim ]
then
  vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty
elif [ -t 0 ]
then
  # $editor is deliberately unquoted so that a value such as "emacs -nw"
  # is split into command and options.
  $editor "$TEMP2"
else
  # stdin was the text source; give the editor the terminal instead.
  $editor "$TEMP2" < /dev/tty
fi
remove_marks "$TEMP2"

# Only "<"/">" lines of the diff carry words; hunk headers, "---" separators,
# the "\ No newline" notice and the "#" statistics header are skipped so they
# no longer end up in the vocabulary as blank or bogus entries.
diff "$TEMP1" "$TEMP2" \
  | awk '($1 == "<" || $1 == ">") && $2 != "#" && $3 != "" {print $3}' \
  | sort -u >> "$VOCABULARY"
rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full"