new-words
view new-words-py.sh @ 51:74e05d4436ee
minifix: spanish + italian
| author | Igor Chubin <igor@chub.in> |
|---|---|
| date | Sun May 01 20:25:55 2011 +0200 (2011-05-01) |
| parents | 7194bdb56475 |
| children | e25de9ea9184 |
line source
1 #!/bin/bash
#######################################
# Print the command-line help screen.
# Arguments: none
# Outputs:   the help text, on stderr
#######################################
show_usage()
{
  # The delimiter is quoted so the help text is emitted literally.
  # Output is duplicated onto the already-open descriptor 2: reopening
  # /dev/stderr would truncate a file that stderr was redirected to.
  cat <<'HELP' >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-c show compressed wordlist: one word per group
-f file show only words related to the words in the file
-G turn off word grouping
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-d tag delete subvocabulary for the "tag"
-r RANGE show only first RANGE words
-R RANGE show only words lower than RANGE percent
-2 -3 find 2 and 3 words' sequences
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# A leading -h is answered immediately: print the help screen and leave
# with status 0 before anything else in the script runs.
case "$1" in
  -h)
    show_usage
    exit 0
    ;;
esac
# Path of the Python backend; a value already present in the environment
# takes precedence over the built-in default.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
# Directory that holds the vocabulary files.
WORK_DIR=~/.new-words/
# Scratch files: two copies of the generated word list and the input text.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX)
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX)
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX)
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# Remove the scratch files created above.
cleanup_temp_files()
{
  rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
}
# The script has several early "exit" paths; the EXIT trap makes sure the
# scratch files are removed on every one of them.
trap cleanup_temp_files EXIT
# language detection

#######################################
# Extract the language code from a Wikipedia article URL.
# Arguments: $1 - candidate string (any command-line argument)
# Outputs:   the two-character subdomain (e.g. "de") on stdout
# Returns:   0 when $1 is an http(s)://XX.wikipedia.org/wiki/ URL, 1 otherwise
#######################################
wikipedia_language()
{
  # Dots are escaped and both schemes are accepted.
  local url_re='^https?://([^/.][^/.])\.wikipedia\.org/wiki/'
  [[ "$1" =~ $url_re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

LANGUAGE=en
# Program name without its directory and without everything from the
# first "-" on: "de-words" gives "de", "new-words" gives "new".
my_name=${0##*/}
my_name=${my_name%%-*}
for arg
do
  if detected=$(wikipedia_language "$arg")
  then
    LANGUAGE=$detected
  fi
done
# Any program name other than new-* overrides the language found so far.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing

#######################################
# Reset every option variable to its default and parse the switches in "$@".
# Globals:   writes the option variables assigned below, LANGUAGE (on -l)
#            and OPTIND (left pointing at the first non-option argument)
# Arguments: the command-line arguments
# Returns:   0; exits 0 after -h and 1 on an unknown switch
#######################################
parse_options()
{
  local opt

  STAT_ONLY=NO
  NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
  DONT_ADD_MARKS=NO
  NON_INTERACTIVE_MODE=NO
  PART_TO_PROCESS=''
  GROUP_WORDS_BY_THREE=NO
  GROUP_WORDS_BY_TWO=NO
  TAG_NAME=''
  MERGE_THIS_TAGS=''
  TAGS_LIST_ONLY=NO
  MERGE_TAGGED_WORDS=NO
  MERGE_ALL_TAGGED=NO
  DONT_ADD_MARKLINES=NO
  FILTER_WORDS=YES
  SHOW_VOC_STAT=NO
  COMPRESSED_WORDLIST=NO
  WORDS_GROUPING=YES
  ALLOWED_WORDS_FILENAME=''
  # These three had no default, so a value inherited from the caller's
  # environment was used as if it had been given on the command line.
  REMOVE_TAG=NO
  SHOW_RANGE=''
  SHOW_RANGE_PERCENTAGE=''

  OPTIND=1
  while getopts Gcf:hl:sSkanNp:t:Tm:Md:r:R:23 opt
  do
    case "$opt" in
      c) COMPRESSED_WORDLIST=YES;;
      f) ALLOWED_WORDS_FILENAME="$OPTARG";;
      G) WORDS_GROUPING=NO;;
      # -h is documented in the help screen; accept it in any position.
      h) show_usage
         exit 0;;
      s) STAT_ONLY=YES;;
      S) SHOW_VOC_STAT=YES;;
      k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
      l) LANGUAGE="$OPTARG";;
      a) DONT_ADD_MARKS=YES;;
      n) NON_INTERACTIVE_MODE=YES;;
      N) FILTER_WORDS=NO;;
      p) PART_TO_PROCESS="$OPTARG";;
      t) TAG_NAME="$OPTARG";;
      T) TAGS_LIST_ONLY="YES";;
      # NOTE(review): this combines the -t value with the -m value, so a
      # second -m replaces the first - confirm this is intended.
      m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
      M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
      # -d may be repeated; the tags accumulate as a space-separated list.
      d) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
      r) SHOW_RANGE="$OPTARG";;
      R) SHOW_RANGE_PERCENTAGE="$OPTARG";;
      2) GROUP_WORDS_BY_TWO=YES;;
      3) GROUP_WORDS_BY_THREE=YES;;
      \?) # unknown flag
        show_usage
        exit 1;;
    esac
  done
}

parse_options "$@"
# Drop the parsed switches; "$@" now starts at the first non-option argument.
shift $((OPTIND - 1))
# If the first remaining argument is "-l", the argument after it becomes
# the language and both are removed from the argument list.
[ "$1" != "-l" ] || {
  LANGUAGE="$2"
  shift 2
}
# File names derived from the selected language.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this script again, non-interactively and with filtering off, on
# the vocabulary file itself, then print the fifth field of the first
# output line with any "<" and ">" removed.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
  # Quoted so that a script path or home directory containing whitespace
  # is passed as a single argument.
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -n 1 | awk '{print $5}' | tr -d "<>"
  exit 0
fi
136 text_from_url()
137 {
138 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
139 }
#######################################
# Invoke the backend ($NEW_WORDS_PY) with "-f add_notes" on a file.
# Globals:   NEW_WORDS_PY, LANGUAGE (read)
# Arguments: $1 - file passed to the backend
#######################################
add_marks()
{
  local -a backend_args
  backend_args=(-l "$LANGUAGE" -f add_notes "$1")
  $NEW_WORDS_PY "${backend_args[@]}"
}
#######################################
# Invoke the backend ($NEW_WORDS_PY) with "-f remove_notes" on a file.
# Globals:   NEW_WORDS_PY, LANGUAGE (read)
# Arguments: $1 - file passed to the backend
#######################################
remove_marks()
{
  local -a backend_args
  backend_args=(-l "$LANGUAGE" -f remove_notes "$1")
  $NEW_WORDS_PY "${backend_args[@]}"
}
#######################################
# Invoke the backend ($NEW_WORDS_PY) with "-f get_words_group_words_add_stat",
# handing it the current option values through its environment.
# Globals:   NEW_WORDS_PY, LANGUAGE and the option variables exported below
# Arguments: $1 - passed to the backend as its last argument (may be empty)
#######################################
get_words_group_words_add_stat()
{
  # The subshell keeps the export attribute from leaking into the caller.
  (
    export SHOW_RANGE SHOW_RANGE_PERCENTAGE COMPRESSED_WORDLIST
    export GROUP_WORDS_BY_TWO GROUP_WORDS_BY_THREE STAT_ONLY
    export WORDS_GROUPING FILTER_WORDS ALLOWED_WORDS_FILENAME
    $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
  )
}
163 part()
164 {
165 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
166 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
167 #!/usr/bin/perl
169 my @lines=<STDIN>;
170 my $lines=$#lines;
171 my $interval=$ARGV[0];
172 if (not $interval) {
173 print @lines;
174 }
175 else {
176 my ($start,$stop,$total);
177 if ($interval =~ m@(.*)/(.*)@) {
178 $start = $1;
179 $total = $2;
180 }
181 else {
182 $start=$interval;
183 $total=0;
184 }
185 if ($start =~ m@(.*)-(.*)@) {
186 $start = $1;
187 $stop = $2;
188 }
189 if ($start =~ m@(.*)\+(.*)@) {
190 $start = $1;
191 $stop = $start+$2;
192 }
194 $start=int($lines/$total*$start);
195 $stop=int($lines/$total*$stop);
197 for($i=$start;$i<$stop;$i++){
198 print $lines[$i];
199 }
200 }
201 PERL_SCRIPT
202 perl $PERL_SCRIPT_TEMP_NAME "$1"
203 rm $PERL_SCRIPT_TEMP_NAME
204 }
#######################################
# Print the tags that have a file named <LANGUAGE>_<tag>.txt in the
# current directory, one tag per line.
# Globals:   LANGUAGE (read)
# Outputs:   tag names, in glob (sorted) order
#######################################
list_tags()
{
  local tag_file tag
  for tag_file in "${LANGUAGE}"_*.txt
  do
    # An unmatched glob stays literal; skip it.
    [ -e "$tag_file" ] || continue
    tag=${tag_file#"${LANGUAGE}_"}
    printf '%s\n' "${tag%.txt}"
  done
}

# -T: list the tags found in the work directory and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
  # No work directory yet means there are no tags to list.
  [ -d "${WORK_DIR}" ] || exit 0
  # Never list whatever directory the caller happens to be in.
  cd "${WORK_DIR}" || exit 1
  list_tags
  exit 0
fi
#######################################
# Print the file name used for a tag: "<LANGUAGE>_<tag>.txt".
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
#######################################
tag_file_name()
{
  local tag="$1"
  printf '%s_%s.txt\n' "$LANGUAGE" "$tag"
}
#######################################
# Delete the tag files of the given tags in the current directory.
# Tags containing "/", "*" or "?" are skipped without a message, so a tag
# can neither point outside the directory nor act as a glob.
# Arguments: tag names
# Outputs:   one status line per processed tag
#######################################
remove_tags()
{
  local tag tag_file
  for tag in "$@"
  do
    # The check must look at the tag being processed.
    case "$tag" in
      *[/*?]*) continue ;;
    esac
    tag_file=$(tag_file_name "$tag")
    if [ -e "$tag_file" ]
    then
      rm -f -- "$tag_file" && echo Tag "'$tag'" removed
    else
      echo Unknown tag "'$tag'"
    fi
  done
}

# -d: delete the requested tag files and stop.
if [ "$REMOVE_TAG" = "YES" ]
then
  # Never delete files in whatever directory the caller happens to be in.
  cd "${WORK_DIR}" || exit 1
  # TAG_NAME is a space-separated list; split it into words without
  # letting the shell expand glob characters in it.
  read -r -a tags_to_remove <<< "$TAG_NAME"
  remove_tags "${tags_to_remove[@]}"
  exit 0
fi
mkdir -p "$WORK_DIR"
# Remembered so that relative file arguments can still be resolved after
# the change of directory below.
oldpwd="$PWD"
cd "$WORK_DIR" || {
  # Without the work directory the tag files below cannot be found.
  rm -f -- "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
  exit 1
}

# Pick the input text (merged tag files, a URL, a file or stdin) and send
# it through part and the backend. The selected text is kept in
# $ORIGINAL_TEXT and the result is stored twice, in $TEMP1 and $TEMP2.
# The if-block is a pipeline stage, so its variables stay in a subshell.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
  VOC_FILES=()
  # MERGE_THIS_TAGS is a space-separated list; split it without globbing.
  read -r -a merge_tags <<< "$MERGE_THIS_TAGS"
  for i in "${merge_tags[@]}"
  do
    f=$(tag_file_name "$i")
    [ -e "$f" ] && VOC_FILES+=("$f")
  done
  if [ "${#VOC_FILES[@]}" -eq 0 ]
  then
    # >&2 instead of > /dev/stderr: do not truncate a redirected stderr.
    echo "Unknown tags: ${merge_tags[*]}" >&2
  else
    cat -- "${VOC_FILES[@]}"
  fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
  cat "${LANGUAGE}"_*.txt
elif [[ "$1" == http://* || "$1" == https://* ]]
then
  text_from_url "$1"
elif [ "$#" != 0 ]
then
  if [[ "$1" == /* ]]
  then
    cat "$1"
  else
    cat "$oldpwd/$1"
  fi
else
  cat
fi \
  | part "$PART_TO_PROCESS" \
  | tee "$ORIGINAL_TEXT" \
  | get_words_group_words_add_stat \
  | tee "$TEMP1" > "$TEMP2"
# With -s or -n the generated list is only printed. Otherwise a non-empty
# list is opened in the editor, and the third field of every line of the
# diff between the untouched copy ($TEMP1) and the edited copy ($TEMP2)
# is appended to the vocabulary file.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
  cat "$TEMP1"
else
  wordlist_lines=$(wc -l "$TEMP2" | awk '{print $1}')
  if [ -n "$wordlist_lines" ] && [ "$wordlist_lines" != 0 ]
  then
    if [ "$DONT_ADD_MARKS" != "YES" ]
    then
      add_marks "$TEMP2"
    fi

    case "$editor" in
      vim)
        # vim gets the terminal explicitly for its input and output.
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
        ;;
      *)
        $editor "$TEMP2"
        ;;
    esac
    remove_marks "$TEMP2"

    # With -t the words go to the tag file, not the main vocabulary.
    if [ -n "$TAG_NAME" ]
    then
      vocabulary="$(tag_file_name $TAG_NAME)"
    else
      vocabulary="$VOCABULARY"
    fi
    diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
  fi
fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"