#!/bin/bash

#######################################
# Print the command-line help for new-words.
# Arguments: none
# Outputs:   the help text on stderr; nothing is written to stdout
#######################################
show_usage()
{
    # ">&2" reuses the already-open stderr descriptor.  Re-opening the
    # /dev/stderr path with ">" can fail when that path cannot be opened and
    # truncates the target when stderr points at a regular file.
    # The delimiter is quoted so the help text is taken literally.
    cat <<'HELP' >&2

USAGE:

    new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

    -h          print this screen
    -c          show compressed wordlist: one word per group
    -G          turn off word grouping
    -k          put higher words that are similar to the known words (only for English)
    -l lang     override language settings
    -n          non-interactive mode (don't run vi)
    -N          turn off known words filtering
    -a          don't add marks (and don't save marks added by user)
    -p pages    work with specified pages only (pages = start-stop/total )
    -s          show the text statistics (percentage of known words and so on) and exit
    -S          show your vocabulary statistics (number of words and word groups)
    -t tag      tag known words with tag
    -T          show list of active tags
    -m tag      merge the words tagged with "tag" into the main vocabulary
    -M          merge the words tagged with any tag into the main vocabulary
    -r tag      remove subvocabulary for the "tag"
    -2 -3       find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

    de-words URL
    new-words -l de URL

HELP
}

# Help requested as the very first argument: print it and stop before any
# temporary file is created.
if [ "$1" = "-h" ]
then
    show_usage
    exit 0
fi

# Backend script that the helper functions below invoke.  The historical
# location stays the default; a NEW_WORDS_PY value found in the environment
# takes precedence.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
# Directory holding the vocabulary files; the script changes into it later.
WORK_DIR=~/.new-words/

# Scratch files.  The cleanup trap is installed as soon as the first one
# exists so that every exit path (including the early exits further down)
# removes them; names that are still empty are harmless to "rm -f --".
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
trap 'rm -f -- "$TEMP1" "${TEMP1}-full" "$TEMP2" "$ORIGINAL_TEXT"' EXIT
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
# Assigned and exported in two steps so a mktemp failure is not masked by
# the exit status of "export".
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# language detection

#######################################
# Print the language prefix carried by a program name such as "de-words".
# Arguments: $1 - program name or path (normally "$0")
# Outputs:   the text before the first "-" of the base name, or nothing
#            when the name has no "-" or is the generic "new-..." name
#######################################
language_from_program_name()
{
    local prog=${1##*/}
    case "$prog" in
        new-*) ;;                                # generic name: no language
        ?*-*)  printf '%s\n' "${prog%%-*}" ;;
    esac
}

#######################################
# Print the language subdomain of a Wikipedia article URL.
# Arguments: $1 - any command-line argument
# Outputs:   e.g. "de" for http://de.wikipedia.org/wiki/..., nothing for
#            arguments that are not such a URL
#######################################
language_from_url()
{
    local re='^https?://([A-Za-z][A-Za-z-]*)\.wikipedia\.org/wiki/'
    if [[ $1 =~ $re ]]
    then
        printf '%s\n' "${BASH_REMATCH[1]}"
    fi
}

# Precedence, lowest to highest: the "en" default, a Wikipedia URL among
# the arguments, the program name.  (-l is handled by the option parser.)
LANGUAGE=en
for arg
do
    url_language=$(language_from_url "$arg")
    [ -z "$url_language" ] || LANGUAGE="$url_language"
done
# my_name keeps its historical meaning: the base name up to the first "-".
my_name=${0##*/}
my_name=${my_name%%-*}
name_language=$(language_from_program_name "$0")
[ -z "$name_language" ] || LANGUAGE="$name_language"

#----------------------------------------------------
# command line options processing

# Defaults; each one is changed by the switch named in the comment.
STAT_ONLY=NO                            # -s
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO     # -k
DONT_ADD_MARKS=NO                       # -a
NON_INTERACTIVE_MODE=NO                 # -n
PART_TO_PROCESS=''                      # -p pages
GROUP_WORDS_BY_THREE=NO                 # -3
GROUP_WORDS_BY_TWO=NO                   # -2
TAG_NAME=''                             # -t tag (also extended by -r tag)
MERGE_THIS_TAGS=''                      # -m tag
TAGS_LIST_ONLY=NO                       # -T
MERGE_TAGGED_WORDS=NO                   # -m tag
MERGE_ALL_TAGGED=NO                     # -M
DONT_ADD_MARKLINES=NO                   # -m / -M
FILTER_WORDS=YES                        # -N
SHOW_VOC_STAT=NO                        # -S
COMPRESSED_WORDLIST=NO                  # -c
WORDS_GROUPING=YES                      # -G
# Explicit default so that a REMOVE_TAG value inherited from the
# environment can never trigger tag removal.
REMOVE_TAG=NO                           # -r tag

#######################################
# Parse the switches in "$@" into the option variables above.
# Globals:   writes the option variables and LANGUAGE; leaves OPTIND at the
#            index of the first non-option argument
# Arguments: the script's command line
# Outputs:   the help text (via show_usage) for -h or an unknown switch
# Returns:   does not return for -h (exit 0) or an unknown switch (exit 1)
#######################################
parse_options()
{
    local opt
    while getopts hGcl:sSkanNp:t:Tm:Mr:23 opt
    do
        case "$opt" in
            h) show_usage
               exit 0;;
            c) COMPRESSED_WORDLIST=YES;;
            G) WORDS_GROUPING=NO;;
            s) STAT_ONLY=YES;;
            S) SHOW_VOC_STAT=YES;;
            k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
            l) LANGUAGE="$OPTARG";;
            a) DONT_ADD_MARKS=YES;;
            n) NON_INTERACTIVE_MODE=YES;;
            N) FILTER_WORDS=NO;;
            p) PART_TO_PROCESS="$OPTARG";;
            t) TAG_NAME="$OPTARG";;
            T) TAGS_LIST_ONLY="YES";;
            # NOTE(review): this joins the -t tag with the -m tag, so a
            # second -m replaces the first instead of adding to it -
            # confirm that this is intended.
            m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
            M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
            r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
            2) GROUP_WORDS_BY_TWO=YES;;
            3) GROUP_WORDS_BY_THREE=YES;;
            \?) # unknown flag
               show_usage
               exit 1;;
        esac
    done
}

parse_options "$@"
shift $((OPTIND - 1))

# A "-l lang" pair that is still first after option parsing (for example
# behind "--") is honoured as well; it is only consumed when the language
# value is actually present.
if [ "$1" = "-l" ] && [ "$#" -ge 2 ]
then
    LANGUAGE="$2"
    shift 2
fi

# Per-language file names (used relative to the working directory).
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: re-run this script on the vocabulary file itself, non-interactively
# (-n) and with known-word filtering off (-N), then print the fifth field of
# the first output line with angle brackets removed.
# NOTE(review): the layout of that line is produced by the Python backend,
# which is not visible here - presumably it is the statistics header.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
        | head -n 1 \
        | awk '{print $5}' \
        | tr -d "<>"
    exit 0
fi

igor@38
|
129 text_from_url()
|
igor@38
|
130 {
|
igor@38
|
131 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
|
igor@38
|
132 }
|
igor@38
|
133
|
#######################################
# Run the backend's "add_notes" function on a word-list file.
# Globals:   NEW_WORDS_PY (backend command), LANGUAGE
# Arguments: $1 - path of the file to process
#######################################
add_marks()
{
    # Quoted so a backend path containing spaces stays a single word.
    "$NEW_WORDS_PY" -l "$LANGUAGE" -f add_notes "$1"
}
#######################################
# Run the backend's "remove_notes" function on a word-list file.
# Globals:   NEW_WORDS_PY (backend command), LANGUAGE
# Arguments: $1 - path of the file to process
#######################################
remove_marks()
{
    # Quoted so a backend path containing spaces stays a single word.
    "$NEW_WORDS_PY" -l "$LANGUAGE" -f remove_notes "$1"
}
#######################################
# Run the backend's "get_words_group_words_add_stat" function, handing it
# the relevant option flags through its environment.
# Globals:   NEW_WORDS_PY, LANGUAGE, COMPRESSED_WORDLIST,
#            GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE, STAT_ONLY,
#            WORDS_GROUPING, FILTER_WORDS (all read)
# Arguments: $1 - forwarded as the last backend argument; an empty string
#            is passed when the function is called without arguments
# Outputs:   whatever the backend writes; stdin is left for the backend
#######################################
get_words_group_words_add_stat()
{
    # Quoted so a backend path containing spaces stays a single word.
    COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
    GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
    GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
    STAT_ONLY="$STAT_ONLY" \
    WORDS_GROUPING="$WORDS_GROUPING" \
    FILTER_WORDS="$FILTER_WORDS" \
        "$NEW_WORDS_PY" -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
}

igor@38
|
153 part()
|
igor@38
|
154 {
|
igor@38
|
155 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
|
igor@38
|
156 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
|
igor@38
|
157 #!/usr/bin/perl
|
igor@38
|
158
|
igor@38
|
159 my @lines=<STDIN>;
|
igor@38
|
160 my $lines=$#lines;
|
igor@38
|
161 my $interval=$ARGV[0];
|
igor@38
|
162 if (not $interval) {
|
igor@38
|
163 print @lines;
|
igor@38
|
164 }
|
igor@38
|
165 else {
|
igor@38
|
166 my ($start,$stop,$total);
|
igor@38
|
167 if ($interval =~ m@(.*)/(.*)@) {
|
igor@38
|
168 $start = $1;
|
igor@38
|
169 $total = $2;
|
igor@38
|
170 }
|
igor@38
|
171 else {
|
igor@38
|
172 $start=$interval;
|
igor@38
|
173 $total=0;
|
igor@38
|
174 }
|
igor@38
|
175 if ($start =~ m@(.*)-(.*)@) {
|
igor@38
|
176 $start = $1;
|
igor@38
|
177 $stop = $2;
|
igor@38
|
178 }
|
igor@38
|
179 if ($start =~ m@(.*)\+(.*)@) {
|
igor@38
|
180 $start = $1;
|
igor@38
|
181 $stop = $start+$2;
|
igor@38
|
182 }
|
igor@38
|
183
|
igor@38
|
184 $start=int($lines/$total*$start);
|
igor@38
|
185 $stop=int($lines/$total*$stop);
|
igor@38
|
186
|
igor@38
|
187 for($i=$start;$i<$stop;$i++){
|
igor@38
|
188 print $lines[$i];
|
igor@38
|
189 }
|
igor@38
|
190 }
|
igor@38
|
191 PERL_SCRIPT
|
igor@38
|
192 perl $PERL_SCRIPT_TEMP_NAME "$1"
|
igor@38
|
193 rm $PERL_SCRIPT_TEMP_NAME
|
igor@38
|
194 }
|
igor@38
|
195
|
#######################################
# Print the tags that have a sub-vocabulary for the current language, one
# per line: for every "<LANGUAGE>_<tag>.txt" file in the working directory
# the <tag> part is printed.
# Globals:   WORK_DIR, LANGUAGE (read)
# Outputs:   tag names on stdout; nothing when the working directory does
#            not exist or holds no such file
#######################################
list_active_tags()
{
    local tag_file tag
    [ -d "$WORK_DIR" ] || return 0
    (
        cd "$WORK_DIR" || exit 1
        # The shell expands the pattern itself, so names with spaces stay
        # intact; an unmatched pattern is skipped by the -e test.
        for tag_file in "${LANGUAGE}"_*.txt
        do
            [ -e "$tag_file" ] || continue
            tag=${tag_file#"${LANGUAGE}"_}
            printf '%s\n' "${tag%.txt}"
        done
    )
}

# -T: show the list of active tags and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
    list_active_tags
    exit 0
fi

#######################################
# Print the name of the sub-vocabulary file that belongs to a tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
#######################################
tag_file_name()
{
    # printf prints the caller-supplied tag verbatim, whatever it contains.
    printf '%s_%s.txt\n' "$LANGUAGE" "$1"
}

#######################################
# Delete the sub-vocabulary files of the given tags in the current
# directory.
# Arguments: tag names
# Outputs:   "Tag '<tag>' removed" or "Unknown tag '<tag>'" per tag on
#            stdout; a note on stderr for every rejected name
#######################################
remove_tags()
{
    local tag f
    for tag in "$@"
    do
        # Names with "/", "*" or "?" could address files other than a tag
        # file, so they are skipped.  The tag itself is what gets tested.
        case "$tag" in
            *[/*?]*)
                printf "Invalid tag name '%s' skipped\n" "$tag" >&2
                continue;;
        esac
        f=$(tag_file_name "$tag")
        if [ -e "$f" ]
        then
            rm -f -- "$f" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
}

# -r: remove the listed sub-vocabularies and stop.
if [ "$REMOVE_TAG" = "YES" ]
then
    cd "${WORK_DIR}" || exit 1
    # TAG_NAME is a space-separated list; "read -a" splits it without
    # expanding wildcard characters.
    read -r -a tags_to_remove <<< "$TAG_NAME"
    remove_tags "${tags_to_remove[@]}"
    exit 0
fi

# All vocabulary files are addressed relative to the working directory, so
# failing to create or enter it is fatal.
mkdir -p "$WORK_DIR" || exit 1
oldpwd="$PWD"
cd "$WORK_DIR" || exit 1

# Choose the input text and run it through the processing chain:
#   -m tag   the sub-vocabularies of the listed tags
#   -M       every sub-vocabulary of the current language
#   URL      the page text fetched by text_from_url
#   file     the named file (relative names refer to the starting directory)
#   nothing  standard input
# The text is cut down by part, a copy is kept in $ORIGINAL_TEXT, and the
# backend output is stored twice: $TEMP1 is the reference copy, $TEMP2 the
# copy that is edited later.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]
then
    # "read -a" splits the space-separated tag list without expanding
    # wildcard characters.
    read -r -a merge_tags <<< "$MERGE_THIS_TAGS"
    VOC_FILES=()
    for i in "${merge_tags[@]}"
    do
        f=$(tag_file_name "$i")
        [ -e "$f" ] && VOC_FILES+=("$f")
    done
    if [ "${#VOC_FILES[@]}" -eq 0 ]
    then
        echo "Unknown tags: ${merge_tags[*]}" >&2
    else
        cat -- "${VOC_FILES[@]}"
    fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]
then
    cat -- "${LANGUAGE}"_*.txt
elif [[ "$1" == http://* || "$1" == https://* ]]
then
    text_from_url "$1"
elif [ "$#" != 0 ]
then
    case "$1" in
        /*) cat -- "$1";;
        *)  cat -- "$oldpwd/$1";;
    esac
else
    cat
fi \
    | part "$PART_TO_PROCESS" \
    | tee "$ORIGINAL_TEXT" \
    | get_words_group_words_add_stat \
    | tee "$TEMP1" > "$TEMP2"

#######################################
# Print, sorted and without duplicates, the third whitespace-separated
# field of every line that "diff" reports between two files.  Diff lines
# with fewer than three fields (hunk headers, "---") are ignored so that no
# empty entry is produced.
# NOTE(review): the word-list rows come from the Python backend, which is
# not visible here - presumably "<count> <word>", which makes the third
# field of a "< ..." / "> ..." diff line the word.
# Arguments: $1 - reference list, $2 - edited list
#######################################
changed_words()
{
    diff -- "$1" "$2" | awk 'NF >= 3 {print $3}' | sort -u
}

if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]
then
    # -s and -n: just print the backend output.
    cat "$TEMP1"
elif [ -s "$TEMP2" ] && [ "$(wc -l < "$TEMP2")" -ne 0 ]
then
    # Interactive mode: let the user edit the list, then append the words
    # from the lines that differ from the reference copy to the vocabulary.
    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
    if [ "$editor" = vim ]
    then
        # The terminal is attached explicitly because stdin may be the
        # text that was piped into the script.
        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
    else
        # Deliberately unquoted: $EDITOR may carry its own arguments.
        $editor "$TEMP2"
    fi
    remove_marks "$TEMP2"

    # With -t the words go to the tag's sub-vocabulary instead of the main
    # vocabulary.
    vocabulary="$VOCABULARY"
    [ -n "$TAG_NAME" ] && vocabulary="$(tag_file_name "$TAG_NAME")"
    changed_words "$TEMP1" "$TEMP2" >> "$vocabulary"
fi

rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"