new-words

view new-words-py.sh @ 54:e25de9ea9184

new-words.py is almost ready
author Igor Chubin <igor@chub.in>
date Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents 4e931db74618
children 2a1a25e61872
line source
1 #!/bin/bash
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage text, written to standard error
#######################################
show_usage() {
  # Quoted delimiter: the help text is emitted literally, with no parameter
  # or command expansion.  ">&2" duplicates the already-open descriptor
  # instead of re-opening the /dev/stderr device node.
  cat <<'HELP' >&2
USAGE:
new-words [ -l lang ] [ -s ] [ ARG ]
SWITCHES:
-h print this screen
-c show compressed wordlist: one word per group
-f file show only words related to the words in the file
-G turn off word grouping
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-d tag delete subvocabulary for the "tag"
-r RANGE show only first RANGE words
-R RANGE show only words lower than RANGE percent
-2 -3 find 2 and 3 words' sequences
The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:
de-words URL
new-words -l de URL
HELP
}
# Fast path: a leading -h prints the help text and exits successfully
# before any configuration or option parsing takes place.
case "$1" in
  -h)
    show_usage
    exit 0
    ;;
esac
# Location of the Python backend.  The default is the historical hard-coded
# path; exporting NEW_WORDS_PY allows the script to be used from any checkout.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
# Directory holding the vocabulary and tag files.
WORK_DIR=~/.new-words/
# Editor command: $EDITOR when set and non-empty, otherwise vim.
editor=${EDITOR:-vim}
# language detection
#
# Precedence, lowest to highest: the default "en", the language code of a
# Wikipedia URL found among the arguments, and finally the name the script
# was invoked under (e.g. a "de-words" link selects "de").

LANGUAGE=en

# Invocation name: basename of $0 with everything from the first "-" removed.
my_name=${0##*/}
my_name=${my_name%%-*}

# Matches http and https Wikipedia article URLs; group 1 is the language code.
wikipedia_url_re='^https?://([A-Za-z][A-Za-z-]*)\.wikipedia\.org/wiki/'
for arg; do
  if [[ "$arg" =~ $wikipedia_url_re ]]; then
    LANGUAGE=${BASH_REMATCH[1]}
  fi
done

# Any invocation name other than "new" (from "new-words") names the language.
[[ "${my_name}" == "new" ]] || LANGUAGE="$my_name"
#----------------------------------------------------
# command line options processing
#
# Each switch is recorded in a global variable that later parts of the
# script consult.  Defaults are assigned first so that stray values in the
# environment cannot switch a mode on.  SHOW_RANGE and SHOW_RANGE_PERCENTAGE
# are deliberately not reset: they keep whatever value they already had
# unless -r / -R is given.

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
WORDS_GROUPING=YES
ALLOWED_WORDS_FILENAME=''
REMOVE_TAG=NO

while getopts Gcf:hl:sSkanNp:t:Tm:Md:r:R:23 opt; do
  case "$opt" in
    c) COMPRESSED_WORDLIST=YES ;;
    f) ALLOWED_WORDS_FILENAME="$OPTARG" ;;
    G) WORDS_GROUPING=NO ;;
    h)
      # Documented switch; honoured at any position, not only as first word.
      show_usage
      exit 0
      ;;
    s) STAT_ONLY=YES ;;
    S) SHOW_VOC_STAT=YES ;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
    l) LANGUAGE="$OPTARG" ;;
    a) DONT_ADD_MARKS=YES ;;
    n) NON_INTERACTIVE_MODE=YES ;;
    N) FILTER_WORDS=NO ;;
    p) PART_TO_PROCESS="$OPTARG" ;;
    t) TAG_NAME="$OPTARG" ;;
    T) TAGS_LIST_ONLY="YES" ;;
    m)
      # The merge list starts from the TAG_NAME collected so far, so an
      # earlier -t/-d value is included while a later one is not.
      DONT_ADD_MARKLINES="YES"
      MERGE_TAGGED_WORDS="YES"
      MERGE_THIS_TAGS="$TAG_NAME $OPTARG"
      ;;
    M)
      DONT_ADD_MARKLINES="YES"
      MERGE_ALL_TAGGED="YES"
      ;;
    d)
      # -d may be repeated; tags accumulate as a space-separated list.
      REMOVE_TAG="YES"
      TAG_NAME="$TAG_NAME $OPTARG"
      ;;
    r) SHOW_RANGE="$OPTARG" ;;
    R) SHOW_RANGE_PERCENTAGE="$OPTARG" ;;
    2) GROUP_WORDS_BY_TWO=YES ;;
    3) GROUP_WORDS_BY_THREE=YES ;;
    \?) # unknown flag
      show_usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

# A "-l lang" pair is also accepted as the first words left over after
# option parsing.
if [[ "$1" == "-l" ]]; then
  LANGUAGE="$2"
  shift 2
fi
# Per-language file names (relative to WORK_DIR).
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this script again on the vocabulary file itself, non-interactively
# (-n) and with known-word filtering off (-N), then print the fifth field of
# the first output line with any "<" and ">" characters removed.
if [[ "${SHOW_VOC_STAT}" == "YES" ]]; then
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
    | head -1 \
    | awk '{print $5}' \
    | tr -d "<>"
  exit 0
fi
#######################################
# Run the Python backend's "get_words_group_words_add_stat" action.
# Globals (read only):
#   NEW_WORDS_PY, LANGUAGE, SHOW_RANGE, WORDS_GROUPING,
#   SHOW_RANGE_PERCENTAGE, PART_TO_PROCESS, ALLOWED_WORDS_FILENAME,
#   NON_INTERACTIVE_MODE, STAT_ONLY, COMPRESSED_WORDLIST, FILTER_WORDS,
#   GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE
# Arguments:
#   $1 - passed through unchanged as the backend's last argument
# Outputs:
#   whatever the backend writes
# Returns:
#   the backend's exit status
#######################################
get_words_group_words_add_stat() {
  # Optional switches are collected in an array so that values containing
  # spaces or glob characters reach the backend as single arguments, and so
  # that the global settings are left untouched (the function can be called
  # more than once).
  local -a backend_opts=()

  [[ -z "${SHOW_RANGE_PERCENTAGE:-}" ]] \
    || backend_opts+=(-R "$SHOW_RANGE_PERCENTAGE")
  [[ -z "${PART_TO_PROCESS:-}" ]] \
    || backend_opts+=(-p "$PART_TO_PROCESS")
  [[ -z "${ALLOWED_WORDS_FILENAME:-}" ]] \
    || backend_opts+=(-f "$ALLOWED_WORDS_FILENAME")

  if [[ "${NON_INTERACTIVE_MODE:-}" == YES ]]; then backend_opts+=(-n); fi
  if [[ "${STAT_ONLY:-}" == YES ]]; then backend_opts+=(-s); fi
  if [[ "${COMPRESSED_WORDLIST:-}" == YES ]]; then backend_opts+=(-c); fi
  if [[ "${FILTER_WORDS:-}" == NO ]]; then backend_opts+=(-N); fi
  if [[ "${GROUP_WORDS_BY_TWO:-}" == YES ]]; then backend_opts+=(-2); fi
  if [[ "${GROUP_WORDS_BY_THREE:-}" == YES ]]; then backend_opts+=(-3); fi

  # SHOW_RANGE and WORDS_GROUPING are handed over through the environment
  # of the backend process rather than as switches.
  SHOW_RANGE="${SHOW_RANGE:-}" \
  WORDS_GROUPING="${WORDS_GROUPING:-}" \
    "$NEW_WORDS_PY" -l "$LANGUAGE" \
    "${backend_opts[@]}" \
    -X get_words_group_words_add_stat "$1"
}
#######################################
# Print the active tags of the current language, one per line.
# A tag is the <tag> part of a file named "${LANGUAGE}_<tag>.txt" located
# in WORK_DIR.
# Globals:   WORK_DIR, LANGUAGE (read)
# Arguments: none
# Outputs:   tag names on stdout; nothing when no tag file exists
#######################################
list_active_tags() {
  local tag_file tag
  for tag_file in "${WORK_DIR}"/"${LANGUAGE}"_*.txt; do
    # An unmatched glob stays literal; skip it.
    [[ -e "$tag_file" ]] || continue
    tag=${tag_file##*/}
    # Strip the exact language prefix, so a language code that itself
    # contains "_" does not truncate the tag.
    tag=${tag#"${LANGUAGE}_"}
    printf '%s\n' "${tag%.txt}"
  done
}

# -T: list the active tags and stop.
if [[ "$TAGS_LIST_ONLY" == "YES" ]]; then
  list_active_tags
  exit 0
fi
#######################################
# Build the name of the vocabulary file that stores a tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
#######################################
tag_file_name() {
  local tag=$1
  echo "${LANGUAGE}_${tag}.txt"
}
#######################################
# Delete the tag vocabulary files named in TAG_NAME.
# Globals:
#   TAG_NAME (read) - whitespace-separated list of tags
#   WORK_DIR (read) - directory containing the tag files
# Arguments: none
# Outputs:
#   one "Tag '<tag>' removed" or "Unknown tag '<tag>'" line per tag on
#   stdout; rejected tag names are reported on stderr
#######################################
remove_tags() {
  local -a tags=()
  local tag tag_file

  # Split the list on whitespace WITHOUT filename expansion, so a tag such
  # as "*" is seen verbatim instead of being replaced by matching files.
  # read returns non-zero at end of input even though the array is filled.
  read -r -d '' -a tags <<< "$TAG_NAME" || true

  for tag in "${tags[@]}"; do
    # Refuse names that could address files other than a tag file.
    if [[ "$tag" == *[/*?]* ]]; then
      printf "Invalid tag '%s' skipped\n" "$tag" >&2
      continue
    fi
    # Address the file by full path: nothing is ever removed relative to
    # the caller's working directory.
    tag_file="${WORK_DIR}/$(tag_file_name "$tag")"
    if [[ -e "$tag_file" ]]; then
      rm -f -- "$tag_file" && echo "Tag '$tag' removed"
    else
      echo "Unknown tag '$tag'"
    fi
  done
}

# -d: remove the requested tags and stop.
if [[ "$REMOVE_TAG" == "YES" ]]; then
  remove_tags
  exit 0
fi
# Default action: hand the first remaining command-line argument to the
# Python backend.
get_words_group_words_add_stat "${1}"
191 #mkdir -p $WORK_DIR
192 #oldpwd="$PWD"
193 #cd $WORK_DIR
194 #if [ "$MERGE_TAGGED_WORDS" = "YES" ]
195 #then
196 # VOC_FILES=''
197 # for i in $MERGE_THIS_TAGS
198 # do
199 # f=`tag_file_name $i`
200 # [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
201 # done
202 # if [ -z "$VOC_FILES" ]
203 # then
204 # echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
205 # else
206 # cat $VOC_FILES
207 # fi
208 #elif [ "$MERGE_ALL_TAGGED" = "YES" ]
209 #then
210 # cat ${LANGUAGE}_*.txt
211 #else
212 # cat
213 #fi