#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement
import codecs
import difflib
import logging
import os
import optparse
import re
import subprocess
import sys
import Stemmer
try:
    import psyco
    psyco.full()
except:
    pass

config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

class Normalizator:
    def __init__(self, language, linked_words={}):
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        self.linked_words = linked_words

    def normalize(self, word):
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Returns the word that is the most relevant to the wordpairs_group.

        At the moment: returns the word with minimal length"""

        def f(x, y):
            return difflib.SequenceMatcher(
                None,
                #(x[-2:] == 'en' and x[:-2].lower() or x.lower()),
                x.lower(),
                y.lower()).ratio()

        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        best_match = list(x[1] for x in sorted(
            (x for x in wordpairs_group if len(x[1]) == minimal_length),
            key=lambda x:x[0],
            reverse=True))[0]

        # NOTE: this early return makes the dictionary-suggestion heuristics
        # below unreachable; they are kept for reference.
        return best_match

        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        verb = False
        corrected_best_match = best_match
        if best_match[-2:] == 'et':
            word = best_match[:-1]+"n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = word
            corrected_best_match = best_match[:-2]
            verb = True

        if best_match[-1] == 't':
            word = best_match[:-1]+"en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]
            verb = True

        if corrected_best_match[0].lower() == corrected_best_match[0]:
            suggestions = [ x for x in suggestions
                if x[0].lower() == x[0] ]

        if suggestions == []:
            return best_match+"_"
        return best_match+" "+(" ".join(
            sorted(
                suggestions,
                key = lambda x: f(x, corrected_best_match),
                reverse = True
            )
        )
        )

    def dictionary_suggestions(self, word):
        return [
            x.decode('utf-8').rstrip('\n')
            for x
            in subprocess.Popen(
                ["de-variants", word],
                stdout=subprocess.PIPE
            ).stdout.readlines() ]


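# Illustrative sketch (not part of the original script): how Normalizator is
# typically used, assuming the PyStemmer module is installed and that
# linked_words maps inflected forms to their head words.
#
#   norm = Normalizator('en', linked_words={'went': 'go'})
#   norm.normalize('went')     # follows the link went -> go, then stems: 'go'
#   norm.normalize('Running')  # no link; lowercases and stems: 'run'
#
# dictionary_suggestions() shells out to an external "de-variants" helper and
# is only useful when that command is available on PATH.
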
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="rank higher the words that are similar to known words (English only)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    help="show the list of tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find two-word sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find three-word sequences",
    action="store_true",
    dest="three_words")

def readlines_from_file(filename):
    res = []
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            res += [line]
    return res

def readlines_from_stdin():
    return codecs.getreader("utf-8")(sys.stdin).readlines()

def words_from_line(line):
    line = line.rstrip('\n')
    #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
    #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
    return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)

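# Illustrative sketch (not part of the original script): how words_from_line()
# splits a line, given the Unicode-aware pattern above. A separator run may not
# start at an apostrophe, so contractions like "don't" survive intact; a
# trailing separator leaves a final empty string (skipped later by get_words()).
#
#   words_from_line(u"Hello, world - don't panic!\n")
#   # -> [u'Hello', u'world', u"don't", u'panic', u'']
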
def get_words(lines, group_by=[1]):
    """
    Returns hash of words in a file
    word => number
    """
    result = {}
    (a, b, c) = ("", "", "")
    for line in lines:
        words = words_from_line(line)
        for word in words:
            if re.match('[0-9]*$', word):
                continue
            result.setdefault(word, 0)
            result[word] += 1
            if 2 in group_by and a != "" and b != "":
                w = "%s_%s" % (a,b)
                result.setdefault(w, 0)
                result[w] += 1
            if 3 in group_by and not "" in [a,b,c]:
                w = "%s_%s_%s" % (a,b,c)
                result.setdefault(w, 0)
                result[w] += 1
            (a,b,c) = (b, c, word)

    logging.debug(result)
    return result

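# Illustrative sketch (not part of the original script): plain word counting
# with the default group_by=[1]. Purely numeric tokens (and empty strings) are
# skipped by the [0-9]*$ check above.
#
#   get_words([u"2 cats, 2 cats\n"])
#   # -> {u'cats': 2}
#
# With group_by=[1, 2] the same counter also accumulates two-word sequences
# under keys of the form u'word1_word2'.
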
def load_vocabulary():
    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))

def notes_filenames():
    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]

def load_notes(files):
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes

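# Illustrative sketch (not part of the original script): the notes file
# ($HOME/.new-words/notes-<lang>.txt) is assumed to hold one word per line,
# followed by whitespace and a free-form note, e.g.
#
#   gehen      to go
#   ging       @gehen
#
# load_notes() turns this into {word: {filename: note}}, and a note of the form
# "@mainword" is later picked up by find_linked_words() to link inflected forms
# to their head word.
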
def add_notes(lines, notes):
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if line.startswith('#'):
            result += [line]
        else:
            match_object = re.search('^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes:
                    if notes_filename in notes[word]:
                        line = line.rstrip('\n')
                        line = "%-30s %s\n" % (line, notes[word][notes_filename])
                        result += [line]
                    else:
                        result += [line]
                else:
                    result += [line]
            else:
                result += [line]
    return result

def remove_notes(lines, notes_group):
    notes_filename = notes_filenames()[0]
    notes = {}
    for k in notes_group.keys():
        if notes_filename in notes_group[k]:
            notes[k] = notes_group[k][notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            result.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n"
                ]))
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line+"\n")

    save_notes(notes_filename, notes)
    return result

def save_notes(filename, notes):
    lines = []
    saved_words = []
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.append(word)
            lines.append(line)
    for word in [x for x in notes.keys() if not x in saved_words]:
        line = "%-29s %s\n" % (word, notes[word])
        lines.append(line)

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)


def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2
    """
    result = {}
    for (k,v) in dict1.items():
        if not k in dict2:
            result[k] = v
    return result

def dump_words(words, filename):
    with codecs.open(filename, "w+", "utf-8") as f:
        for word in words.keys():
            f.write(("%s\n"%word)*words[word])

def error_message(text):
    print text

def find_wordgroups_weights(word_pairs, normalizator):
    weight = {}
    for (num, word) in word_pairs:
        normalized = normalizator.normalize(word)
        weight.setdefault(normalized, 0)
        weight[normalized] += num
    return weight

def find_linked_words(notes):
    linked_words = {}
    for word in notes.keys():
        for note in notes[word].values():
            if "@" in note:
                result = re.search(r'\@(\S*)', note)
                if result:
                    main_word = result.group(1)
                    if main_word:
                        linked_words[word] = main_word
    return linked_words

def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
    if cmp_res != 0:
        return cmp_res
    else:
        cmp_res = cmp(normalized_word1, normalized_word2)
        if cmp_res != 0:
            return cmp_res
        else:
            return cmp(int(num1), int(num2))


def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    if stats_only:
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
            ]) % stats + "\n")
        return

    if print_stats:
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            #codecs.getwriter("utf-8")(sys.stdout).write(
            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
            if compressed_wordlist:
                compressed_word_pair = (
                    sum(x[0] for x in words_of_this_group),
                    normalizator.best_word_from_group(words_of_this_group)
                )
                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break

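# Illustrative sketch (not part of the original script): in the default mode
# print_words_sorted() writes a stats header, then one "<count> <word>" line
# per word pair, with "# <level>" coverage markers interleaved, roughly:
#
#   # en, 85.00  , <1700/2000>, <0/235>
#           12 whatever
#           11 something
#   # 90
#   ...
#
# With stats_only=True it prints only the LANG / KNOWN% / UNKNOWN% / ... table,
# and with compressed_wordlist=True it prints one line per word group instead.
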
def filter_add_notes(args):
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = add_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)

def filter_remove_notes(args):
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = remove_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)

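# The filter below is configured through environment variables rather than
# command-line options (presumably set by the shell front-end that invokes this
# script via -f). It honours GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE,
# STAT_ONLY, COMPRESSED_WORDLIST, FILTER_WORDS and WORDS_GROUPING (each
# compared against the literal string 'YES'), plus SHOW_RANGE and
# SHOW_RANGE_PERCENTAGE (integers) and ALLOWED_WORDS_FILENAME (a path).
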
def filter_get_words_group_words_add_stat(args):
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    group_by = [1]

    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
        group_by.append(2)
    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = False
    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
        stats_only = True

    compressed_wordlist = False
    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
        compressed_wordlist = True

    show_range = os.environ.get('SHOW_RANGE', '')
    if show_range != '':
        show_range = int(show_range)
    else:
        show_range = 0
    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
    if show_range_percentage != '':
        show_range_percentage = int(show_range_percentage)
    else:
        show_range_percentage = 0


    stats = {}
    stats['total'] = sum(words[x] for x in words.keys())
    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = 100.0*stats['total_known']/stats['total']
    stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['wps'] = 0 #FIXME
    stats['uwps'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # filter words by allowed_words_filter
    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
        normalized_allowed_words = [
            normalizator.normalize(w.rstrip('\n'))
            for w in readlines_from_file(allowed_words_filename)
        ]

        result = {}
        for w, wn in words.iteritems():
            if normalizator.normalize(w) in normalized_allowed_words:
                result[w] = wn
        words = result

    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
        words_with_freq = sorted(
            words_with_freq,
            cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
            reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )

(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
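
# Illustrative sketch (not part of the original script): this file acts as an
# internal filter, normally driven through the -f/--function switch, e.g.
#
#   cat text.txt | ./new-words.py -l en -f get_words_group_words_add_stat
#
# which reads the text from stdin and prints the frequency-sorted word list.
# The exact driver invocation is assumed here; the shell front-end may differ.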



#os.system("vim")
