new-words

view new-words-py.sh @ 49:00286f6bfa85

experimental: when -c specified, use dictionary for compression
author Igor Chubin <igor@chub.in>
date Wed Feb 09 21:08:23 2011 +0200 (2011-02-09)
parents d708e2c1bad8
children 4e931db74618
line source
1 #!/bin/bash
#######################################
# Print the help screen.
# Arguments: none
# Outputs:   the usage text, written to stderr (nothing on stdout)
#######################################
show_usage()
{
# ">&2" duplicates the existing stderr descriptor. Redirecting to /dev/stderr
# would reopen the target and truncate it when stderr is a regular file.
# The delimiter is quoted so the help text is never subject to expansion.
cat <<'HELP' >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-c show compressed wordlist: one word per group
-G turn off word grouping
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-d tag delete subvocabulary for the "tag"
-r RANGE show only first RANGE words
-R RANGE show only words lower than RANGE percent
-2 -3 find 2 and 3 words' sequences
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# Show the help screen and stop when -h is the very first argument.
case "$1" in
  -h)
    show_usage
    exit 0
    ;;
esac
# Location of the Python helper; may be overridden from the environment,
# otherwise the historical default path is used.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
# Directory the script changes into to read and write vocabulary files.
WORK_DIR=~/.new-words/

# Scratch files. The names are cleared first so the cleanup handler never
# touches a value inherited from the environment.
TEMP1=''
TEMP2=''
ORIGINAL_TEXT=''

# Remove every scratch file; installed as an EXIT trap so that early exits
# elsewhere in the script do not leave files behind in /tmp.
cleanup_temp_files()
{
  rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
}
trap cleanup_temp_files EXIT

TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
# Assigned separately from "export" so a mktemp failure is not masked.
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

# Editor used in interactive mode; vim unless $EDITOR says otherwise.
editor=${EDITOR:-vim}
# language detection

#######################################
# Extract the language subdomain from a Wikipedia article URL.
# Arguments: $1 - any command-line argument
# Outputs:   the subdomain (e.g. "de") on stdout when $1 contains an
#            http:// or https:// <lang>.wikipedia.org/wiki/ URL
# Returns:   0 when a language was found, 1 otherwise
#######################################
_language_from_url()
{
  # Kept in a variable so the regex is not subject to shell quoting rules.
  local url_re='https?://([^./]+)\.wikipedia\.org/wiki/'
  [[ "$1" =~ $url_re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

LANGUAGE=en
# Program name without directory and without everything from the first "-":
# "de-words" -> "de", "new-words" -> "new".
my_name=${0##*/}
my_name=${my_name%%-*}
for arg
do
  # The last Wikipedia URL on the command line decides the language.
  if _detected_language=$(_language_from_url "$arg")
  then
    LANGUAGE="$_detected_language"
  fi
done
# A link name other than "new-words" overrides the URL-based guess.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing

#######################################
# Reset every option variable to its default and parse the switches.
# Globals:   writes the option variables below, LANGUAGE and OPTIND
# Arguments: the script's command line ("$@")
# Returns:   does not return on -h (exit 0) or an unknown switch (exit 1);
#            both call show_usage, which is defined elsewhere in the file
#######################################
parse_options()
{
  local opt

  STAT_ONLY=NO
  NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
  DONT_ADD_MARKS=NO
  NON_INTERACTIVE_MODE=NO
  PART_TO_PROCESS=''
  GROUP_WORDS_BY_THREE=NO
  GROUP_WORDS_BY_TWO=NO
  TAG_NAME=''
  MERGE_THIS_TAGS=''
  TAGS_LIST_ONLY=NO
  MERGE_TAGGED_WORDS=NO
  MERGE_ALL_TAGGED=NO
  DONT_ADD_MARKLINES=NO
  FILTER_WORDS=YES
  SHOW_VOC_STAT=NO
  COMPRESSED_WORDLIST=NO
  WORDS_GROUPING=YES
  # Initialised explicitly so values inherited from the caller's environment
  # cannot trigger tag removal or range filtering.
  REMOVE_TAG=NO
  SHOW_RANGE=''
  SHOW_RANGE_PERCENTAGE=''

  while getopts hGcl:sSkanNp:t:Tm:Md:r:R:23 opt
  do
    case "$opt" in
      h) show_usage
         exit 0;;
      c) COMPRESSED_WORDLIST=YES;;
      G) WORDS_GROUPING=NO;;
      s) STAT_ONLY=YES;;
      S) SHOW_VOC_STAT=YES;;
      k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
      l) LANGUAGE="$OPTARG";;
      a) DONT_ADD_MARKS=YES;;
      n) NON_INTERACTIVE_MODE=YES;;
      N) FILTER_WORDS=NO;;
      p) PART_TO_PROCESS="$OPTARG";;
      t) TAG_NAME="$OPTARG";;
      T) TAGS_LIST_ONLY="YES";;
      # NOTE(review): the merge list is built from TAG_NAME, so only the last
      # -m (plus any earlier -t/-d tags) is kept - confirm this is intended.
      m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
      M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
      # -d may be repeated; the tags accumulate in TAG_NAME, space separated.
      d) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
      r) SHOW_RANGE="$OPTARG";;
      R) SHOW_RANGE_PERCENTAGE="$OPTARG";;
      2) GROUP_WORDS_BY_TWO=YES;;
      3) GROUP_WORDS_BY_THREE=YES;;
      \?) # unknown flag
         show_usage
         exit 1;;
    esac
  done
}

parse_options "$@"
# Drop the parsed switches; what remains is the optional URL/file argument.
shift $((OPTIND - 1))
# A "-l lang" pair may still follow the switches handled by getopts;
# consume it here so the next positional argument is the text source.
if [[ $1 == "-l" ]]; then
  LANGUAGE=$2
  shift 2
fi

# Per-language file names (relative to the working directory).
NOTES_FILE="notes-${LANGUAGE}.txt"
VOCABULARY="${LANGUAGE}.txt"
# -S: re-run this script non-interactively (-n) with known-word filtering
# off (-N) on the vocabulary file itself, then print the fifth field of the
# first output line with any "<" and ">" characters removed.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
  # $0 and the path are quoted so directories with spaces keep working.
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
    | head -n 1 \
    | awk '{print $5}' \
    | tr -d "<>"
  exit 0
fi
133 text_from_url()
134 {
135 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
136 }
#######################################
# Run the helper's "add_notes" function on a file.
# Globals:   NEW_WORDS_PY, LANGUAGE (read)
# Arguments: $1 - file passed to the helper
#######################################
add_marks()
{
  local target_file="$1"
  local -a helper_args=(-l "$LANGUAGE" -f add_notes "$target_file")
  $NEW_WORDS_PY "${helper_args[@]}"
}
#######################################
# Run the helper's "remove_notes" function on a file.
# Globals:   NEW_WORDS_PY, LANGUAGE (read)
# Arguments: $1 - file passed to the helper
#######################################
remove_marks()
{
  local target_file="$1"
  local -a helper_args=(-l "$LANGUAGE" -f remove_notes "$target_file")
  $NEW_WORDS_PY "${helper_args[@]}"
}
#######################################
# Run the helper's "get_words_group_words_add_stat" function, handing the
# relevant option variables to it through the environment.
# Globals:   NEW_WORDS_PY, LANGUAGE and the eight option variables (read)
# Arguments: $1 - file argument for the helper (empty when used in a pipe)
#######################################
get_words_group_words_add_stat()
{
  local input_file="$1"
  # The exports live in a subshell, so the calling shell is left unchanged.
  (
    export SHOW_RANGE="$SHOW_RANGE"
    export SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE"
    export COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST"
    export GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO"
    export GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE"
    export STAT_ONLY="$STAT_ONLY"
    export WORDS_GROUPING="$WORDS_GROUPING"
    export FILTER_WORDS="$FILTER_WORDS"
    $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$input_file"
  )
}
159 part()
160 {
161 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
162 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
163 #!/usr/bin/perl
165 my @lines=<STDIN>;
166 my $lines=$#lines;
167 my $interval=$ARGV[0];
168 if (not $interval) {
169 print @lines;
170 }
171 else {
172 my ($start,$stop,$total);
173 if ($interval =~ m@(.*)/(.*)@) {
174 $start = $1;
175 $total = $2;
176 }
177 else {
178 $start=$interval;
179 $total=0;
180 }
181 if ($start =~ m@(.*)-(.*)@) {
182 $start = $1;
183 $stop = $2;
184 }
185 if ($start =~ m@(.*)\+(.*)@) {
186 $start = $1;
187 $stop = $start+$2;
188 }
190 $start=int($lines/$total*$start);
191 $stop=int($lines/$total*$stop);
193 for($i=$start;$i<$stop;$i++){
194 print $lines[$i];
195 }
196 }
197 PERL_SCRIPT
198 perl $PERL_SCRIPT_TEMP_NAME "$1"
199 rm $PERL_SCRIPT_TEMP_NAME
200 }
#######################################
# Print the tag of every "<LANGUAGE>_<tag>.txt" file in the current
# directory, one per line; prints nothing when there is no such file.
# Globals: LANGUAGE (read)
#######################################
_list_tags()
{
  local tag_file
  for tag_file in "${LANGUAGE}"_*.txt
  do
    # An unmatched glob stays literal; skip it.
    [ -e "$tag_file" ] || continue
    tag_file=${tag_file#"${LANGUAGE}_"}
    printf '%s\n' "${tag_file%.txt}"
  done
}

# -T: list the active tags and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
  # Do not list whatever happens to be in the current directory when the
  # working directory is missing.
  cd "${WORK_DIR}" || exit 1
  _list_tags
  exit 0
fi
#######################################
# Print the name of the vocabulary file that belongs to a tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
#######################################
tag_file_name()
{
  local tag="$1"
  printf '%s_%s.txt\n' "$LANGUAGE" "$tag"
}
#######################################
# Delete the vocabulary files of the given tags in the current directory.
# Arguments: $1 - whitespace-separated list of tag names
# Outputs:   one "Tag '...' removed" or "Unknown tag '...'" line per tag
# Tags containing "/", "*" or "?" are skipped silently.
#######################################
_remove_tags()
{
  local -a tags
  local tag f
  # Split without pathname expansion, so a tag such as "*" is never
  # replaced by the names of existing files.
  read -r -a tags <<< "$1"
  for tag in "${tags[@]}"
  do
    # Validate the tag currently being processed.
    case "$tag" in
      *[/*?]*) continue;;
    esac
    f="$(tag_file_name "$tag")"
    if [ -e "$f" ]
    then
      rm -f -- "$f" && echo Tag "'$tag'" removed
    else
      echo Unknown tag "'$tag'"
    fi
  done
}

# -d: delete the requested tag vocabularies and stop.
if [ "$REMOVE_TAG" = "YES" ]
then
  # Never delete relative to some other directory if cd fails.
  cd "${WORK_DIR}" || exit 1
  _remove_tags "$TAG_NAME"
  exit 0
fi
# Everything below works inside the vocabulary directory; remember where we
# came from so relative file arguments can still be resolved.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Choose the input text and feed it through:
#   part (optional page selection) -> copy to $ORIGINAL_TEXT ->
#   get_words_group_words_add_stat -> identical copies in $TEMP1 and $TEMP2.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
  # -m: concatenate the vocabulary files of the requested tags.
  VOC_FILES=()
  # Split without pathname expansion.
  read -r -a merge_tags <<< "$MERGE_THIS_TAGS"
  for i in "${merge_tags[@]}"
  do
    f=$(tag_file_name "$i")
    [ -e "$f" ] && VOC_FILES+=("$f")
  done
  if [ "${#VOC_FILES[@]}" -eq 0 ]
  then
    echo "Unknown tags: ${merge_tags[*]}" >&2
  else
    cat "${VOC_FILES[@]}"
  fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
  # -M: concatenate every tag vocabulary of the language.
  cat "${LANGUAGE}"_*.txt
elif [[ "$1" =~ https?: ]]
then
  # URL argument (http or https).
  text_from_url "$1"
elif [ "$#" != 0 ]
then
  # File argument; relative names refer to the directory we started in.
  if [[ "$1" == /* ]]
  then
    cat "$1"
  else
    cat "$oldpwd/$1"
  fi
else
  # No argument: read the text from stdin.
  cat
fi \
  | part "$PART_TO_PROCESS" \
  | tee "$ORIGINAL_TEXT" \
  | get_words_group_words_add_stat \
  | tee "$TEMP1" > "$TEMP2"
# Output phase.
# -s and -n both just print the generated list. Otherwise the list is opened
# in the editor; afterwards the third field of every "diff" output line
# between the untouched copy ($TEMP1) and the edited copy ($TEMP2) is
# appended, sorted and de-duplicated, to the vocabulary file.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
  cat "$TEMP1"
else
  list_line_count=$(wc -l "$TEMP2" | awk '{print $1}')
  if [ $list_line_count != 0 ]
  then
    if [ "$DONT_ADD_MARKS" != "YES" ]
    then
      add_marks "$TEMP2"
    fi

    case "$editor" in
      vim)
        # vim is attached to the terminal explicitly and gets per-language
        # keywordprg and iskeyword settings.
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
        ;;
      *)
        $editor "$TEMP2"
        ;;
    esac
    remove_marks "$TEMP2"

    # With -t the words go to the tag's own vocabulary file.
    vocabulary="$VOCABULARY"
    if [ -n "$TAG_NAME" ]
    then
      vocabulary="$(tag_file_name $TAG_NAME)"
    fi
    diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
  fi
fi

# Remove the scratch files.
rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"