new-words

view new-words-py.sh @ 42:3ec83a7cc544

minifix: psyco activated if installed
author Igor Chubin <igor@chub.in>
date Mon Jan 24 06:31:42 2011 +0100 (2011-01-24)
parents a598e0d25784
children d532e7b52ab2
line source
1 #!/bin/bash
# Print the command-line help text to standard error.
# Takes no arguments and does not exit; the caller picks the exit status.
show_usage()
{
    # ">&2" duplicates the already-open stderr descriptor.  Redirecting to
    # the /dev/stderr path instead re-opens whatever stderr points at, which
    # truncates a log file the caller opened for appending.
    cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}
# Show the help screen and leave before any temporary file is created.
if [ "$1" = "-h" ]; then
    show_usage
    exit 0
fi

# Backend script that does the word processing.  The historical location is
# kept as the default; a NEW_WORDS_PY value in the environment overrides it.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
WORK_DIR=~/.new-words/

# Remove every temporary file this script may have created.  Installed as
# an EXIT trap so the files also disappear on the early "exit" paths and
# when the script is interrupted.
cleanup_temp_files()
{
    [ -n "$TEMP1" ] && rm -f -- "$TEMP1" "${TEMP1}-full"
    [ -n "$TEMP2" ] && rm -f -- "$TEMP2"
    [ -n "$ORIGINAL_TEXT" ] && rm -f -- "$ORIGINAL_TEXT"
    return 0
}

TEMP1=''
TEMP2=''
ORIGINAL_TEXT=''
trap cleanup_temp_files EXIT

# Assignment and export are separate so that a mktemp failure is noticed.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

editor=${EDITOR:-vim}
# language detection

# Print the language implied by the name the program was started as:
# the base name of $1 with everything from the first "-" removed
# (so ".../de-words" gives "de" and "new-words" gives "new").
language_from_program_name()
{
    local name=${1##*/}
    printf '%s\n' "${name%%-*}"
}

# Print the two-character language prefix of a Wikipedia article URL
# (http or https) given as $1.  Returns 1 and prints nothing when $1 is
# not such a URL.
language_from_wikipedia_url()
{
    local re='^https?://([^./][^./])\.wikipedia\.org/wiki/'
    [[ $1 =~ $re ]] || return 1
    printf '%s\n' "${BASH_REMATCH[1]}"
}

LANGUAGE=en
my_name=$(language_from_program_name "$0")
for arg; do
    # The last Wikipedia URL among the arguments decides the language.
    if detected_language=$(language_from_wikipedia_url "$arg"); then
        LANGUAGE=$detected_language
    fi
done
# A name other than "new-..." (for example the link "de-words") wins over
# both the default and the URL-based guess.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing

# Defaults for every switch.  REMOVE_TAG is initialised here as well so
# that a value inherited from the environment cannot trigger tag removal.
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
REMOVE_TAG=NO

# Translate the switches in "$@" into the global settings above.
# OPTIND is left pointing at the first non-option argument so the caller
# can shift the processed switches away.  -h prints the help and exits 0;
# an unknown switch prints the help and exits 1.
parse_options()
{
    local opt
    while getopts cl:sSkanNp:t:Tm:Mr:23h opt; do
        case "$opt" in
            c) COMPRESSED_WORDLIST=YES ;;
            s) STAT_ONLY=YES ;;
            S) SHOW_VOC_STAT=YES ;;
            k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
            l) LANGUAGE="$OPTARG" ;;
            a) DONT_ADD_MARKS=YES ;;
            n) NON_INTERACTIVE_MODE=YES ;;
            N) FILTER_WORDS=NO ;;
            p) PART_TO_PROCESS="$OPTARG" ;;
            t) TAG_NAME="$OPTARG" ;;
            T) TAGS_LIST_ONLY="YES" ;;
            # The merge list is the current TAG_NAME followed by this tag.
            m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG" ;;
            M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES" ;;
            # Tags to remove accumulate in TAG_NAME, separated by spaces.
            r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG" ;;
            2) GROUP_WORDS_BY_TWO=YES ;;
            3) GROUP_WORDS_BY_THREE=YES ;;
            h)
                show_usage
                exit 0 ;;
            \?) # unknown flag
                show_usage
                exit 1 ;;
        esac
    done
}

parse_options "$@"
shift $((OPTIND - 1))

# "-l lang" is also accepted as the first words after the switches.
if [ "$1" = "-l" ]; then
    LANGUAGE="$2"
    shift 2
fi
# Per-language file names (used relative to the work directory).
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this script again on the vocabulary file itself, with known-words
# filtering off and without an editor, and print the fifth field of the
# first output line with any "<" and ">" characters removed.
if [ "${SHOW_VOC_STAT}" = "YES" ]; then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
        | head -n 1 \
        | awk '{print $5}' \
        | tr -d "<>"
    exit 0
fi
# Pipeline filter: copy standard input to standard output unchanged and,
# when GROUP_WORDS_BY_TWO and/or GROUP_WORDS_BY_THREE is YES, append the
# two- and three-word sequences found in the file named by ORIGINAL_TEXT,
# one per line, with the words joined by "_".
two_and_three_words()
{
    # The incoming text is always passed through first.
    cat

    if [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ]; then
        return 0
    fi

    # The Perl code reads the two switches from its environment.  It is
    # passed with -e, so no temporary script file is needed.
    GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
    GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
    perl -e '
        local $/;
        my $text = <>;
        # NOTE(review): the digit 0 is not in this class - confirm intent.
        $text =~ s@[!?;,:#1-9".]@ @g;
        $text =~ s@\s+@ @g;
        my @words = split /\s+/, $text;
        # Visit every position that still has a following word, so the
        # sequences at the end of the text are produced as well.
        for (my $i = 0; $i < $#words; $i++) {
            my ($a, $b) = ($words[$i], $words[$i + 1]);
            my $c = $i + 2 <= $#words ? $words[$i + 2] : "";
            if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
                print "${a}_${b}_${c}\n";
            }
            if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
                print "${a}_${b}\n";
            }
        }
    ' -- "$ORIGINAL_TEXT"
}
# Print the text of the page at URL $1 as rendered by "lynx -dump", with
# every http:// and https:// address removed from the output.
text_from_url()
{
    # /g removes all addresses on a line, not only the first one.
    lynx -dump "$1" \
        | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@g'
}
# Run the backend's "add_notes" function for the current LANGUAGE on the
# file named by $1.
add_marks()
{
    # Quoted so that a backend path containing spaces stays one word.
    "$NEW_WORDS_PY" -l "$LANGUAGE" -f add_notes "$1"
}
# Run the backend's "remove_notes" function for the current LANGUAGE on the
# file named by $1.
remove_marks()
{
    # Quoted so that a backend path containing spaces stays one word.
    "$NEW_WORDS_PY" -l "$LANGUAGE" -f remove_notes "$1"
}
# Run the backend's "get_words_group_words_add_stat" function for the
# current LANGUAGE.  $1 is forwarded as the last argument; when the
# function is called without arguments the backend receives an empty
# string there.
get_words_group_words_add_stat()
{
    # Quoted so that a backend path containing spaces stays one word.
    "$NEW_WORDS_PY" -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
}
# Pipeline filter: print only the requested part of standard input.
#
# $1 is the page range.  Empty (or "0") prints everything.  Otherwise it
# has the form start-stop/total, start+count/total or start/total: the
# input lines are divided into "total" equal pages and the pages from
# "start" up to, but not including, "stop" are printed.  start/total
# prints the single page "start".
# A range whose total is missing or not positive is rejected with an
# error message on stderr and a non-zero status.
part()
{
    # The Perl code is passed with -e, so no temporary script file is needed.
    perl -e '
        my @lines = <STDIN>;
        my $interval = $ARGV[0];
        if (not $interval) {
            print @lines;
            exit 0;
        }

        my ($start, $stop, $total) = ($interval, undef, 0);
        if ($interval =~ m@(.*)/(.*)@) {
            ($start, $total) = ($1, $2);
        }
        if ($start =~ m@(.*)-(.*)@) {
            ($start, $stop) = ($1, $2);
        }
        if ($start =~ m@(.*)\+(.*)@) {
            $start = $1;
            $stop = $start + $2;
        }
        # No explicit end: select exactly one page.
        $stop = $start + 1 unless defined $stop;

        die "part: page range must look like start-stop/total, got \"$interval\"\n"
            unless $total > 0;

        # Use the number of lines (not the last index) so the final line
        # is included, and multiply before dividing so exact page
        # boundaries are not lost to rounding.
        my $count = scalar @lines;
        my $first = int($count * $start / $total);
        my $last  = int($count * $stop / $total);
        $first = 0 if $first < 0;
        $last = $count if $last > $count;

        print @lines[$first .. $last - 1] if $last > $first;
    ' -- "$1"
}
# Print the tags that exist for LANGUAGE, one per line: for every file
# named "${LANGUAGE}_<tag>.txt" in WORK_DIR, the part after the first "_"
# with the ".txt" suffix removed.  Prints nothing when WORK_DIR does not
# exist.  Runs in a subshell, so the caller's working directory is kept.
list_tags()
{
    (
        [ -d "${WORK_DIR}" ] || exit 0
        # Never list files from some other directory if cd fails.
        cd "${WORK_DIR}" || exit 1
        for tag_file in "${LANGUAGE}"_*.txt; do
            # An unmatched glob stays literal; skip it.
            [ -e "$tag_file" ] || continue
            tag=${tag_file#*_}
            printf '%s\n' "${tag%.txt}"
        done
    )
}

# -T: list the active tags and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]; then
    list_tags
    exit 0
fi
# Print the name of the vocabulary file that holds the words of tag $1
# for the current LANGUAGE: "<LANGUAGE>_<tag>.txt".
tag_file_name()
{
    local tag=$1
    printf '%s_%s.txt\n' "$LANGUAGE" "$tag"
}
# Delete the tag vocabulary files of every tag listed (space separated) in
# TAG_NAME, inside WORK_DIR.  Prints "Tag '<tag>' removed" or
# "Unknown tag '<tag>'" per tag.  Tags containing "/", "*" or "?" are
# skipped silently so that a tag can never address a file outside the
# tag namespace.  Returns 1 without touching anything when WORK_DIR
# cannot be entered.
remove_tags()
{
    local tag file
    local -a tags
    # Split on whitespace without glob-expanding the words.
    read -r -a tags <<< "$TAG_NAME"

    # Never delete relative to some other directory.
    cd "${WORK_DIR}" || return 1

    for tag in "${tags[@]}"; do
        case "$tag" in
            *[/*?]*) continue ;;
        esac
        file=$(tag_file_name "$tag")
        if [ -e "$file" ]; then
            rm -f -- "$file" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
}

# -r: remove the requested tags and stop.
if [ "$REMOVE_TAG" = "YES" ]; then
    remove_tags || exit 1
    exit 0
fi
# Everything below works inside the work directory; remember where the
# user started so relative file arguments can still be resolved.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
if ! cd "$WORK_DIR"; then
    rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
    exit 1
fi

# Choose the input text, in this order of precedence:
#   -m tags   the vocabulary files of the listed tags
#   -M        the vocabulary files of all tags
#   URL       the page text (first argument contains "http:" or "https:")
#   FILE      the named file (relative names refer to the start directory)
#   nothing   standard input
# The text then goes through page selection, is saved to ORIGINAL_TEXT,
# gets the optional word sequences added, and is handed to the backend.
# The backend output is stored twice: TEMP1 stays untouched, TEMP2 is the
# copy that is edited later.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]; then
    VOC_FILES=()
    # Split the tag list on whitespace without glob expansion.
    read -r -a merge_tags <<< "$MERGE_THIS_TAGS"
    for i in "${merge_tags[@]}"; do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES+=("$f")
    done
    if [ "${#VOC_FILES[@]}" -eq 0 ]; then
        echo "Unknown tags: ${merge_tags[*]}" >&2
    else
        cat -- "${VOC_FILES[@]}"
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]; then
    cat "${LANGUAGE}"_*.txt
elif [[ $1 == *http:* || $1 == *https:* ]]; then
    text_from_url "$1"
elif [ "$#" != 0 ]; then
    if [[ $1 == /* ]]; then
        cat "$1"
    else
        cat "$oldpwd/$1"
    fi
else
    cat
fi \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | two_and_three_words \
    | get_words_group_words_add_stat \
    | tee "$TEMP1" > "$TEMP2"
# Present the result.  With -s or -n the backend output is printed as is.
# Otherwise, if there is any output, it is opened in the editor (with
# marks added unless -a was given).  After editing, the third field of
# every line that differs between the untouched and the edited copy is
# appended to the vocabulary: the tag vocabulary when -t was used, else
# the main one.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]; then
    cat "$TEMP1"
else
    line_count=$(wc -l < "$TEMP2")
    if [ "$((line_count))" -ne 0 ]; then
        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
        if [ "$editor" = vim ]; then
            # The terminal is attached explicitly for vim.
            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
        else
            # Deliberately unquoted: EDITOR may carry its own arguments.
            $editor "$TEMP2"
        fi
        remove_marks "$TEMP2"

        vocabulary="$VOCABULARY"
        [ -n "$TAG_NAME" ] && vocabulary=$(tag_file_name "$TAG_NAME")
        # diff's own header and "---" lines have no third field; skip them
        # so that no blank line is written to the vocabulary.
        diff "$TEMP1" "$TEMP2" \
            | awk '$3 != "" {print $3}' \
            | sort -u >> "$vocabulary"
    fi
fi

rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"