new-words
view new-words-py.sh @ 45:5f90e44eecfc
new-words.py: turn words filtering and grouping on and off
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri Feb 04 06:18:50 2011 +0100 (2011-02-04) |
parents | 7eb1a8c3eade |
children | d708e2c1bad8 |
line source
1 #!/bin/bash
# Print the command-line help text to standard error.
# Arguments: none
# Outputs:   the usage screen on stderr; nothing on stdout
show_usage()
{
  # Write through the already-open stderr descriptor (>&2) rather than
  # reopening /dev/stderr, which can fail when stderr is not a reopenable
  # file. The delimiter is quoted so the help text is never subject to
  # shell expansion.
  cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-G turn off word grouping
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}
# Answer "-h" given as the first argument before any temporary file exists.
if [ "$1" = "-h" ]; then
  show_usage
  exit 0
fi

# Path of the Python helper that does the actual word processing.
# It may be overridden from the environment; the value must be the path of a
# single executable. The default is the original hard-coded location.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
WORK_DIR=~/.new-words/

# Remove every temporary file this script created (skips names never set).
cleanup_temp_files()
{
  local temp_file
  for temp_file in "$TEMP1" "$TEMP2" "${TEMP1:+${TEMP1}-full}" "$ORIGINAL_TEXT"; do
    [ -n "$temp_file" ] && rm -f "$temp_file"
  done
  return 0
}

TEMP1=''
TEMP2=''
ORIGINAL_TEXT=''
# The script has several early "exit" paths; the EXIT trap makes sure the
# temporary files are removed on all of them.
trap cleanup_temp_files EXIT

TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
# Assigned separately from "export" so a mktemp failure is not masked.
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}
# language detection

# Work out the language of the text.
# Arguments: $1 - the name this program was invoked as ($0)
#            $2.. - the command-line arguments
# Outputs:   the language code on stdout
# Rules, in increasing priority:
#   1. the default is "en";
#   2. an argument containing a two-letter Wikipedia article URL
#      (http or https) selects that subdomain; the last such argument wins;
#   3. if the program name, cut at its first "-", is not "new"
#      (e.g. a "de-words" link), that prefix is the language.
detect_language()
{
  local program=${1##*/}
  local name_prefix=${program%%-*}
  shift
  local detected=en
  local candidate
  # Kept in a variable so the regex metacharacters need no shell quoting.
  local wiki_re='https?://([A-Za-z]{2})\.wikipedia\.org/wiki/'
  for candidate in "$@"; do
    if [[ $candidate =~ $wiki_re ]]; then
      detected=${BASH_REMATCH[1]}
    fi
  done
  [ "$name_prefix" = "new" ] || detected=$name_prefix
  printf '%s\n' "$detected"
}

my_name=${0##*/}
my_name=${my_name%%-*}
LANGUAGE=$(detect_language "$0" "$@")
#----------------------------------------------------
# command line options processing

# Defaults for every switch. All of them are assigned here so that a value
# inherited from the environment can never be mistaken for a switch given on
# the command line (REMOVE_TAG in particular guards a destructive action).
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
WORDS_GROUPING=YES
REMOVE_TAG=NO

# Parse the switches in "$@" into the global option variables above.
# Arguments: the command-line arguments
# Globals:   sets the option variables and LANGUAGE; leaves OPTIND pointing
#            at the first non-option argument so the caller can shift
# Exits:     0 after printing help for -h, 1 after printing help for an
#            unknown switch
parse_options()
{
  local opt
  OPTIND=1
  while getopts Gcl:sSkanNp:t:Tm:Mr:23h opt; do
    case "$opt" in
      c) COMPRESSED_WORDLIST=YES;;
      G) WORDS_GROUPING=NO;;
      s) STAT_ONLY=YES;;
      S) SHOW_VOC_STAT=YES;;
      k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
      l) LANGUAGE="$OPTARG";;
      a) DONT_ADD_MARKS=YES;;
      n) NON_INTERACTIVE_MODE=YES;;
      N) FILTER_WORDS=NO;;
      p) PART_TO_PROCESS="$OPTARG";;
      t) TAG_NAME="$OPTARG";;
      T) TAGS_LIST_ONLY="YES";;
      m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
      M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
      # Tags to remove accumulate as a space-separated list.
      r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
      2) GROUP_WORDS_BY_TWO=YES;;
      3) GROUP_WORDS_BY_THREE=YES;;
      h) # documented switch; accepted in any position
        show_usage
        rm -f "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
        exit 0;;
      \?) # unknown flag
        show_usage
        rm -f "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
        exit 1;;
    esac
  done
}

parse_options "$@"
shift $((OPTIND - 1))
# Accept a "-l lang" pair that is still at the front of the remaining
# arguments after option parsing. Only consume it when the value is really
# there, so "shift 2" is never attempted with fewer than two arguments.
if [ "$1" = "-l" ] && [ "$#" -ge 2 ]; then
  LANGUAGE="$2"
  shift 2
fi

# File names (relative to the work directory) derived from the language.
VOCABULARY="${LANGUAGE}.txt"
NOTES_FILE="notes-${LANGUAGE}.txt"

# -S: run this script on the vocabulary file itself, with known-word
# filtering off and non-interactively, then print the 5th field of the first
# output line with any "<" and ">" characters removed.
if [ "${SHOW_VOC_STAT}" = "YES" ]; then
  # "$0" and the path are quoted so locations containing spaces still work.
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -1 | awk '{print $5}' | tr -d "<>"
  exit 0
fi
129 text_from_url()
130 {
131 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
132 }
# Run the Python helper's "add_notes" function on a file.
# Globals:   NEW_WORDS_PY (helper path), LANGUAGE (passed with -l)
# Arguments: $1 - file to process
add_marks()
{
  # The helper path is quoted so a location containing spaces still works.
  "$NEW_WORDS_PY" -l "$LANGUAGE" -f add_notes "$1"
}
# Run the Python helper's "remove_notes" function on a file.
# Globals:   NEW_WORDS_PY (helper path), LANGUAGE (passed with -l)
# Arguments: $1 - file to process
remove_marks()
{
  # The helper path is quoted so a location containing spaces still works.
  "$NEW_WORDS_PY" -l "$LANGUAGE" -f remove_notes "$1"
}
# Run the Python helper's "get_words_group_words_add_stat" function.
# The five switches listed below are handed to the helper through its
# environment for the duration of the call.
# Globals:   NEW_WORDS_PY, LANGUAGE, STAT_ONLY, GROUP_WORDS_BY_TWO,
#            GROUP_WORDS_BY_THREE, WORDS_GROUPING, FILTER_WORDS
# Arguments: $1 - passed through as the helper's last argument (an empty
#            string when this function is called without arguments)
get_words_group_words_add_stat()
{
  # The helper path is quoted so a location containing spaces still works.
  STAT_ONLY="$STAT_ONLY" \
  GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
  GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
  WORDS_GROUPING="$WORDS_GROUPING" \
  FILTER_WORDS="$FILTER_WORDS" \
  "$NEW_WORDS_PY" -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
}
152 part()
153 {
154 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
155 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
156 #!/usr/bin/perl
158 my @lines=<STDIN>;
159 my $lines=$#lines;
160 my $interval=$ARGV[0];
161 if (not $interval) {
162 print @lines;
163 }
164 else {
165 my ($start,$stop,$total);
166 if ($interval =~ m@(.*)/(.*)@) {
167 $start = $1;
168 $total = $2;
169 }
170 else {
171 $start=$interval;
172 $total=0;
173 }
174 if ($start =~ m@(.*)-(.*)@) {
175 $start = $1;
176 $stop = $2;
177 }
178 if ($start =~ m@(.*)\+(.*)@) {
179 $start = $1;
180 $stop = $start+$2;
181 }
183 $start=int($lines/$total*$start);
184 $stop=int($lines/$total*$stop);
186 for($i=$start;$i<$stop;$i++){
187 print $lines[$i];
188 }
189 }
190 PERL_SCRIPT
191 perl $PERL_SCRIPT_TEMP_NAME "$1"
192 rm $PERL_SCRIPT_TEMP_NAME
193 }
# Print the name of every tag of the current language, one per line.
# A tag is a file "<LANGUAGE>_<tag>.txt" in the work directory.
# Globals:   WORK_DIR, LANGUAGE
# Outputs:   tag names on stdout (nothing when there are no tag files)
# Returns:   1 if the work directory cannot be entered
# Note: changes the current directory of the calling shell.
list_tags()
{
  local tag_file tag
  cd "${WORK_DIR}" || return 1
  for tag_file in "${LANGUAGE}"_*.txt; do
    # An unmatched glob stays as the literal pattern; skip it.
    [ -e "$tag_file" ] || continue
    tag=${tag_file#*_}
    printf '%s\n' "${tag%.txt}"
  done
}

if [ "$TAGS_LIST_ONLY" = "YES" ]; then
  list_tags || exit 1
  exit 0
fi
# Build the vocabulary file name that belongs to a tag.
# Globals:   LANGUAGE
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
tag_file_name() {
  local tag=$1
  printf '%s_%s.txt\n' "${LANGUAGE}" "${tag}"
}
# Delete the sub-vocabulary files of the given tags.
# Globals:   WORK_DIR; uses tag_file_name
# Arguments: $1 - whitespace-separated list of tag names
# Outputs:   one line per tag: "Tag '<tag>' removed" or "Unknown tag '<tag>'"
# Returns:   1 if the work directory cannot be entered (nothing is deleted)
# Note: changes the current directory of the calling shell.
remove_tags()
{
  local tag file
  local -a tags
  # Never delete relative to some other directory if the cd fails.
  cd "${WORK_DIR}" || return 1
  # Split on whitespace without glob expansion, so a tag such as "*" is not
  # replaced by file names before it can be rejected.
  read -r -a tags <<< "$1"
  for tag in "${tags[@]}"; do
    # Skip tags containing path or glob characters. The check is made on the
    # loop variable itself; it used to test an undefined variable and so
    # never rejected anything.
    case "$tag" in
      *[/*?]*) continue;;
    esac
    file=$(tag_file_name "$tag")
    if [ -e "$file" ]; then
      rm -f "$file" && echo Tag "'$tag'" removed
    else
      echo Unknown tag "'$tag'"
    fi
  done
}

if [ "$REMOVE_TAG" = "YES" ]; then
  remove_tags "$TAG_NAME" || exit 1
  exit 0
fi
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
# Everything below reads and writes vocabulary files by relative name, so a
# failed cd must stop the script instead of silently using another directory.
if ! cd "$WORK_DIR"; then
  echo "Cannot change to working directory: $WORK_DIR" >&2
  rm -f "$TEMP1" "$TEMP2" "$ORIGINAL_TEXT"
  exit 1
fi

# Choose the input text (first matching rule wins):
#   -m tags   : the vocabulary files of the given tags
#   -M        : all tag files of the current language
#   URL       : the page, converted to text (http: and https:)
#   file name : that file; relative names are resolved against the directory
#               the script was started from
#   otherwise : standard input
# The text then goes through the page selection, is saved to ORIGINAL_TEXT,
# is processed by the Python helper, and the result is stored twice:
# TEMP1 (reference copy) and TEMP2 (copy opened in the editor).
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
  VOC_FILES=()
  # Split the tag list on whitespace without glob expansion.
  read -r -a merge_tags <<< "$MERGE_THIS_TAGS"
  for i in "${merge_tags[@]}"
  do
    f=$(tag_file_name "$i")
    [ -e "$f" ] && VOC_FILES+=("$f")
  done
  if [ "${#VOC_FILES[@]}" -eq 0 ]
  then
    echo Unknown tags: "${merge_tags[*]}" >&2
  else
    cat "${VOC_FILES[@]}"
  fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
  cat "${LANGUAGE}"_*.txt
elif [[ "$1" == *http:* || "$1" == *https:* ]]
then
  text_from_url "$1"
elif [ "$#" != 0 ]
then
  if [[ "$1" == /* ]]
  then
    cat "$1"
  else
    cat "$oldpwd/$1"
  fi
else
  cat
fi \
  | part "$PART_TO_PROCESS" \
  | tee "$ORIGINAL_TEXT" \
  | get_words_group_words_add_stat \
  | tee "$TEMP1" > "$TEMP2"
# Present the result.
#   -s / -n : print the processed list and stop.
#   default : if the list is not empty, optionally add marks, open it in the
#             editor, strip the marks again, and append the third field of
#             every line that differs between the reference copy (TEMP1) and
#             the edited copy (TEMP2) to the vocabulary (or to the tag's
#             vocabulary when -t was given).
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
  cat "$TEMP1"
elif [ "$(wc -l < "$TEMP2")" -ne 0 ]
then
  [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
  if [ "$editor" = vim ]
  then
    # The script's stdin/stdout may be part of a pipe, so vim is attached to
    # the terminal explicitly.
    vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
  else
    # Left unquoted so an EDITOR value that includes arguments still works.
    $editor "$TEMP2"
  fi
  remove_marks "$TEMP2"

  vocabulary="$VOCABULARY"
  [ -n "$TAG_NAME" ] && vocabulary=$(tag_file_name $TAG_NAME)
  # diff also prints control lines ("3d2", "---") that have no third field;
  # skip them so no empty entry is appended to the vocabulary.
  diff "$TEMP1" "$TEMP2" | awk '$3 != "" {print $3}' | sort -u >> "$vocabulary"
fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"