rev |
line source |
igor@37
|
1 #!/usr/bin/env python
|
igor@38
|
2 # -*- coding: utf-8 -*-
|
igor@37
|
3
|
igor@40
|
4 from __future__ import with_statement
|
igor@38
|
5 import codecs
|
igor@38
|
6 import logging
|
igor@38
|
7 import os
|
igor@37
|
8 import optparse
|
igor@38
|
9 import re
|
igor@38
|
10 import subprocess
|
igor@38
|
11 import sys
|
igor@38
|
12 import Stemmer
|
igor@42
|
# psyco is optional: when it cannot be imported the script simply runs
# without it.  Only ImportError is swallowed so that real failures surface.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Settings shared by the functions below.  'language' may be overridden from
# the command line (-l).
config = {
    # expanduser() honours $HOME when it is set and still works when it is not.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
|
igor@38
|
25
|
igor@38
|
class Normalizator(object):
    """Reduce words to the key that identifies their word group.

    A word is first followed through the ``linked_words`` mapping
    (word -> main word) and the result is lower-cased and stemmed.
    """

    # Language code -> algorithm name handed to Stemmer.Stemmer().
    STEMMER_ALGORITHMS = {
        'de': 'german',
        'en': 'english',
        'ru': 'russian',
        'uk': 'ukrainian',
    }

    def __init__(self, language, linked_words=None):
        """Create a normalizer for *language*.

        language     -- key of STEMMER_ALGORITHMS; any other value raises
                        KeyError.
        linked_words -- optional dict mapping a word to its main word.  The
                        dict is used as given (not copied).
        """
        self.stemmer = Stemmer.Stemmer(self.STEMMER_ALGORITHMS[language])
        # A fresh dict per instance instead of one shared default object.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of the lower-cased main word that *word* links to."""
        visited = set()
        # Follow the chain of links; `visited` stops us on cyclic links.
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that is the most relevant to the wordpairs_group.

        wordpairs_group is a sequence of (count, word) pairs.  At the moment
        the shortest word wins; among equally short words the one with the
        highest count wins, and the earliest one on a full tie.
        Raises ValueError when the group is empty.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() keeps the first of several equal maxima, matching a stable
        # descending sort followed by taking element 0.
        return max(shortest, key=lambda pair: pair[0])[1]
|
igor@47
|
54
|
igor@37
|
# Command line interface.  Options are declared in one table so that a
# flag, its destination attribute and its help text can be checked at a glance.
parser = optparse.OptionParser()

# (short flag, long flag, optparse action, dest attribute, help text)
_OPTION_SPECS = [
    ("-a", "--no-marks", "store_true", "no_marks",
     "don't add marks (and don't save marks added by user)"),
    ("-c", "--compressed", "store_true", "compressed",
     "show compressed wordlist: one word per group"),
    # Previously stored into "compressed", which made -k behave like -c.
    ("-k", "--known-words", "store_true", "known_words",
     "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "store", "language",
     "specify language of text"),
    ("-f", "--function", "store", "function",
     "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "store", "merge_tag",
     "merge words tagged with specified tag into the main vocabulary"),
    ("-M", "--merge-tagged", "store_true", "merge_tagged",
     "merge words tagged with ANY tag into the main vocabulary"),
    ("-n", "--non-interactive", "store_true", "non_interactive",
     "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "store_true", "no_filter",
     "switch off known words filtering"),
    ("-p", "--pages", "store", "pages",
     "work with specified pages only (pages = start-stop/total )"),
    ("-d", "--delete-tag", "store", "delete_tag",
     "delete subvocabulary of specified tag"),
    ("-s", "--text-stats", "store_true", "text_stats",
     "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "store_true", "voc_stats",
     "show your vocabulary statistics (number of words and word groups)"),
    ("-t", "--tag", "store", "tag",
     "tag known words with tag"),
    # Help text used to be a copy of the -t description.
    ("-T", "--show-tags", "store_true", "show_tags",
     "show tags"),
    ("-2", "--two-words", "store_true", "two_words",
     "find 2 words' sequences"),
    ("-3", "--three-words", "store_true", "three_words",
     "find 3 words' sequences"),
]

for (_short, _long, _action, _dest, _help) in _OPTION_SPECS:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)
|
igor@37
|
158
|
igor@38
|
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    Line endings are kept.  Errors from opening or decoding the file
    propagate to the caller.
    """
    with codecs.open(filename, "r", "utf-8") as f:
        # readlines() already returns a new list; no need to copy it
        # element by element.
        return f.readlines()
|
igor@38
|
165
|
igor@38
|
def readlines_from_stdin():
    """Read standard input to its end, decode it as UTF-8 and return the lines."""
    utf8_reader = codecs.getreader("utf-8")
    decoded_stdin = utf8_reader(sys.stdin)
    return decoded_stdin.readlines()
|
igor@38
|
168
|
igor@38
|
# A separator is a run of non-word characters that does not begin with an
# apostrophe, so "don't" stays in one piece.  Compiled once instead of on
# every call.
_WORD_SEPARATOR_RE = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Matches tokens that are empty or consist of ASCII digits only.
_NOT_A_WORD_RE = re.compile(r'[0-9]*$')


def words_from_line(line):
    """Split *line* into tokens on runs of non-word characters.

    Trailing newlines are removed first.  The result may contain empty
    strings (for example when the line ends with punctuation).
    """
    line = line.rstrip('\n')
    return _WORD_SEPARATOR_RE.split(line)


def get_words(lines, group_by=None):
    """
    Returns hash of words in a file
    word => number

    lines    -- iterable of text lines.
    group_by -- list of sequence lengths to count; defaults to [1].  Single
                words are always counted.  When it contains 2 and/or 3,
                sequences of two/three consecutive words are counted as
                well, under the keys "w1_w2" and "w1_w2_w3".

    Empty tokens and tokens made of digits only are ignored; word sequences
    continue across such tokens and across line boundaries.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The two words seen immediately before the current one ("" = none yet).
    before_previous, previous = "", ""
    for line in lines:
        for word in words_from_line(line):
            if _NOT_A_WORD_RE.match(word):
                continue
            result[word] = result.get(word, 0) + 1
            # Sequences are counted as soon as their last word is read, so
            # the ones ending on the final word of the text are included.
            if count_pairs and previous != "":
                pair = "%s_%s" % (previous, word)
                result[pair] = result.get(pair, 0) + 1
            if count_triples and before_previous != "":
                triple = "%s_%s_%s" % (before_previous, previous, word)
                result[triple] = result.get(triple, 0) + 1
            before_previous, previous = previous, word

    logging.debug(result)
    return result
|
igor@38
|
201
|
igor@38
|
def load_vocabulary():
    """Count the words of the vocabulary file of the configured language.

    The file is <config_directory>/<language>.txt; the result is the
    word => number dict produced by get_words().
    """
    vocabulary_path = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    vocabulary_lines = readlines_from_file(vocabulary_path)
    return get_words(vocabulary_lines)
|
igor@38
|
204
|
igor@38
|
def notes_filenames():
    """Return the list of notes file paths for the configured language.

    Currently the list holds a single entry:
    <config_directory>/notes-<language>.txt
    """
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
|
igor@38
|
207
|
igor@38
|
def load_notes(files):
    """Load the notes stored in the given UTF-8 files.

    Each line has the form "<word><whitespace><note>".  Returns a dict
    word -> {filename: note}.  Blank lines are skipped and a line that
    holds only a word yields an empty note (both used to raise ValueError).
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                if not line.strip():
                    continue
                parts = re.split(r'\s+', line, maxsplit=1)
                word = parts[0]
                note = parts[1] if len(parts) > 1 else ''
                notes.setdefault(word, {})[filename] = note
    return notes
|
igor@38
|
217
|
igor@39
|
def add_notes(lines, notes):
    """Append the saved note to every word line that has one.

    lines -- text lines; lines starting with '#' are passed through.
    notes -- dict word -> {filename: note} as returned by load_notes().

    The word is the second whitespace-separated token of a line.  When the
    first notes file has a note for it, the line is padded to 30 columns and
    the note is appended.  Every input line appears in the result.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes and notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
        result.append(line)
    return result
|
igor@39
|
238
|
igor@39
|
def remove_notes(lines, notes_group):
    """Strip the notes from word lines and save them to the notes file.

    lines       -- text lines, possibly carrying a note after the word.
    notes_group -- dict word -> {filename: note}; the notes of the first
                   notes file are the starting point for what gets saved.

    A line of the form "<ws><token><ws><word><ws><note>" is cut back to
    "<ws><token><ws><word>" and its note is recorded for <word>.  All other
    lines are returned unchanged (with exactly one trailing newline).
    The collected notes are written through save_notes().
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    result = []
    for line in lines:
        line = line.rstrip('\n')
        match_object = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            # Keep everything up to and including the word (groups 1-4).
            result.append("".join(match_object.group(1, 2, 3, 4)) + "\n")
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line + "\n")

    save_notes(notes_filename, notes)
    return result
|
igor@39
|
264
|
igor@39
|
def save_notes(filename, notes):
    """Merge *notes* (dict word -> note) into the UTF-8 notes file *filename*.

    Existing lines keep their position; a line whose word is in *notes* is
    rewritten with the new note.  Words not yet present are appended.
    A missing file is treated as empty, so the first save creates it, and
    existing lines without a note no longer abort the save.
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                # Only the word is needed here; the old note is either
                # replaced or kept together with the untouched line.
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                lines.append(line)

    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
|
igor@39
|
282
|
igor@39
|
283
|
igor@38
|
def substract_dictionary(dict1, dict2):
    """
    Return a new dict with the items of dict1 whose keys are not in dict2.
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)
|
igor@38
|
293
|
igor@38
|
def dump_words(words, filename):
    """Write every word of the word => number dict *words* to *filename*.

    Each word is written on its own line, repeated as many times as its
    number says.  The file is UTF-8 encoded and overwritten.
    """
    with codecs.open(filename, "w+", "utf-8") as output:
        for (word, count) in words.items():
            repeated = ("%s\n" % word) * count
            output.write(repeated)
|
igor@38
|
298
|
igor@38
|
def error_message(text):
    """Print *text* followed by a newline on standard output."""
    # With a single argument the parenthesised form behaves the same as the
    # Python 2 print statement and is also valid Python 3.
    print(text)
|
igor@38
|
301
|
igor@40
|
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum the counts of all words that share a normalized form.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) gives the group key.

    Returns a dict group key -> total count.
    """
    weight = {}
    for pair in word_pairs:
        count, word = pair
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + count
    return weight
|
igor@38
|
309
|
igor@38
|
def find_linked_words(notes):
    """Extract the word links written as "@main_word" inside the notes.

    notes -- dict word -> {filename: note}.

    Returns a dict word -> main_word for every note that contains an '@'
    directly followed by at least one non-space character.
    """
    linked_words = {}
    for (word, notes_by_file) in notes.items():
        for note in notes_by_file.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            if not found:
                continue
            main_word = found.group(1)
            # A bare "@" links to nothing.
            if main_word:
                linked_words[word] = main_word
    return linked_words
|
igor@38
|
321
|
igor@40
|
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs for sorting.

    Pairs are ordered by, in turn:
      1. the weight of the word's group (wgw[normalized word]),
      2. the normalized word itself,
      3. the count of the pair.

    Returns -1, 0 or 1.  *linked_words* is accepted for compatibility and
    is not used.  Raises KeyError when a normalized word is missing in wgw.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Tuples compare element by element, which is exactly the chain above.
    key1 = (wgw[normalized_word1], normalized_word1, int(num1))
    key2 = (wgw[normalized_word2], normalized_word2, int(num2))
    # Equivalent of cmp(key1, key2) without the Python-2-only builtin.
    return (key1 > key2) - (key1 < key2)
|
igor@38
|
338
|
igor@47
|
339
|
igor@48
|
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Write the word list (or only the statistics) to standard output.

    word_pairs   -- sequence of (count, word) pairs in output order; words
                    of one group are expected to be adjacent.
    stats        -- dict with the keys used below ('language', 'percentage',
                    'total_known', 'total', ...).
    normalizator -- object providing normalize() and best_word_from_group().
    print_stats  -- write a "# ..." summary line before the words.
    stats_only   -- write a two-line statistics table and return.
    compressed_wordlist -- write one line per word group (summed count and
                    best word) instead of one line per word.
    show_range   -- stop after this many printed lines (0 = no limit).
    show_range_percentage -- stop once the running share of known words
                    reaches this percentage (0 = no limit).

    While printing, "# <level>" marker lines are written when the running
    share of known words passes the next level line.
    """
    out = _utf8_stdout()

    if stats_only:
        header = " ".join([
            "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
            ]])
        out.write(header + "\n")
        # The last four fields are concatenated without a separator; their
        # width of 11 equals the 10+1 columns produced by the header join.
        row_format = " ".join([
            "%(language)-10s",
            "%(percentage)-10.2f",
            "%(percentage_unknown)-10.2f",
            "%(total_known)-11d"
            "%(total)-11d"
            "%(wps)-11d"
            "%(uwps)-11d"
        ])
        out.write(row_format % stats + "\n")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Level lines: every 5% above the current percentage up to 90, then
    # every 1% from 90 to 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word is not None and old_normalized_word != normalized_word:
            # A new group starts: emit the one that has just ended.
            if compressed_wordlist:
                _write_compressed_group(out, normalizator, words_of_this_group)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            out.write("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            out.write("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # The input ended without hitting a limit: the last group is still
        # pending because no following group triggered its output.
        if compressed_wordlist and words_of_this_group:
            _write_compressed_group(out, normalizator, words_of_this_group)


def _utf8_stdout():
    """Return a writer that encodes text as UTF-8 onto standard output."""
    # Python 3 exposes the underlying byte stream as sys.stdout.buffer;
    # where that attribute is missing sys.stdout itself is used.
    return codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))


def _write_compressed_group(out, normalizator, group):
    """Write one line for *group*: its summed count and its best word."""
    compressed_word_pair = (
        sum(x[0] for x in group),
        normalizator.best_word_from_group(group)
    )
    out.write("%10s %s\n" % compressed_word_pair)
|
igor@48
|
420
|
igor@39
|
def filter_add_notes(args):
    """Rewrite the file args[0] in place with the saved notes added.

    The file is read and written as UTF-8; the notes come from
    notes_filenames().
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    saved_notes = load_notes(notes_filenames())
    annotated_lines = add_notes(original_lines, saved_notes)
    with codecs.open(target, "w", "utf-8") as output:
        for annotated in annotated_lines:
            output.write(annotated)
|
igor@39
|
428
|
igor@39
|
def filter_remove_notes(args):
    """Rewrite the file args[0] in place with the notes stripped.

    The file is read and written as UTF-8; remove_notes() also saves the
    stripped notes to the notes file.
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    saved_notes = load_notes(notes_filenames())
    stripped_lines = remove_notes(original_lines, saved_notes)
    with codecs.open(target, "w", "utf-8") as output:
        for stripped in stripped_lines:
            output.write(stripped)
|
igor@39
|
436
|
igor@40
|
def filter_get_words_group_words_add_stat(args):
    """Read text from stdin and print its word list with statistics.

    Behaviour is controlled through environment variables; flags are on
    when their value is exactly 'YES':
      GROUP_WORDS_BY_TWO / GROUP_WORDS_BY_THREE -- also count 2/3 word
          sequences,
      STAT_ONLY            -- print only the statistics table,
      COMPRESSED_WORDLIST  -- print one line per word group,
      FILTER_WORDS         -- drop the words found in the vocabulary,
      WORDS_GROUPING       -- sort so that the words of a group are adjacent,
      SHOW_RANGE / SHOW_RANGE_PERCENTAGE -- integer output limits.

    *args* is unused.  An empty input yields percentages of 0.0 instead of a
    ZeroDivisionError.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if _env_flag('GROUP_WORDS_BY_TWO'):
        group_by.append(2)
    if _env_flag('GROUP_WORDS_BY_THREE'):
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = _env_flag('STAT_ONLY')
    compressed_wordlist = _env_flag('COMPRESSED_WORDLIST')
    show_range = _env_int('SHOW_RANGE')
    show_range_percentage = _env_int('SHOW_RANGE_PERCENTAGE')

    stats = {}
    stats['total'] = sum(words.values())
    if _env_flag('FILTER_WORDS'):
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    else:
        # No words at all: nothing is known and nothing is unknown.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0  #FIXME
    stats['uwps'] = 0  #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # (count, word) pairs, most frequent first.
    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if _env_flag('WORDS_GROUPING'):
        # Same ordering as compare_word_pairs(): group weight, normalized
        # word, count -- but each word is normalized only once.
        def grouping_key(pair):
            normalized = normalizator.normalize(pair[1])
            return (wgw[normalized], normalized, int(pair[0]))

        words_with_freq = sorted(
            words_with_freq,
            key=grouping_key,
            reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )


def _env_flag(name):
    """Return True when the environment variable *name* is set to 'YES'."""
    return os.environ.get(name) == 'YES'


def _env_int(name):
    """Return the environment variable *name* as int; 0 when unset or empty."""
    value = os.environ.get(name, '')
    if value != '':
        return int(value)
    return 0
|
igor@40
|
507
|
igor@37
|
# Script entry: parse the command line and, when -f/--function is given,
# run the requested internal filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Names accepted by -f and the filter each one runs.
    function_names = {
        'add_notes': filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
|
igor@37
|
524
|
igor@37
|
525
|
igor@37
|
526
|
igor@37
|
527
|
igor@38
|
528 #os.system("vim")
|
igor@37
|
529
|