rev |
line source |
igor@37
|
1 #!/usr/bin/env python
|
igor@38
|
2 # -*- coding: utf-8 -*-
|
igor@37
|
3
|
igor@40
|
4 from __future__ import with_statement
|
igor@38
|
5 import codecs
|
igor@49
|
6 import difflib
|
igor@38
|
7 import logging
|
igor@38
|
8 import os
|
igor@37
|
9 import optparse
|
igor@38
|
10 import re
|
igor@38
|
11 import subprocess
|
igor@38
|
12 import sys
|
igor@38
|
13 import Stemmer
|
igor@54
|
14 import tempfile
|
igor@42
|
# psyco is an optional accelerator: use it when it is installed and run
# without it otherwise.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Settings shared by the functions of this module.  The option handling at
# the bottom of the file overrides 'language' and adds further keys.
config = {
    # expanduser() falls back to the password database when $HOME is unset,
    # whereas os.environ['HOME'] would raise KeyError.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
|
igor@38
|
27
|
igor@38
|
class Normalizator:
    """Map words to a normalized (stemmed) form used to group related words."""

    def __init__(self, language, linked_words=None):
        """Create a normalizer for *language*.

        language     -- two-letter code; must be a key of the table below,
                        otherwise KeyError is raised.
        linked_words -- optional dict mapping a word to the word it should
                        be treated as (followed before stemming).
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a shared mutable default would be
        # visible to every Normalizator created without linked_words.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the stem of *word* after following its linked-word chain.

        The chain is followed until a word has no link or a word repeats,
        so cyclic links terminate.  The final word is lower-cased and
        passed to the stemmer.
        """
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that best represents *wordpairs_group*.

        wordpairs_group is a sequence of (count, word) pairs.  Among the
        shortest words the one with the highest count is returned; on a
        tie the earliest pair in the group wins.  An empty group raises
        ValueError.

        The dictionary-based refinement that used to follow this selection
        was unreachable (it sat after an unconditional return) and has
        been removed.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest_pairs = [
            pair for pair in wordpairs_group
            if len(pair[1]) == minimal_length
        ]
        # max() returns the first maximal element, which matches taking the
        # head of a stable descending sort.
        return max(shortest_pairs, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external `de-variants` command.

        The command is run with *word* as its only argument; its standard
        output is decoded as UTF-8 and returned as a list of lines without
        the trailing newline.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE
            )
        # communicate() reads all output and waits for the child, so no
        # zombie process or open pipe is left behind.
        text = process.communicate()[0].decode('utf-8')
        lines = text.split('\n')
        # A final newline produces one empty trailing element that is not
        # a line of output.
        if lines[-1] == '':
            lines.pop()
        return lines
|
igor@49
|
116
|
igor@49
|
117
|
igor@37
|
# Command-line interface.  Every option stores its value under `dest`;
# switches (store_true) are None unless given.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# This option used to share dest="compressed" with -c, so passing -k
# silently switched on the compressed word list.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# NOTE(review): the help text below duplicates the one of -t and looks
# copy-pasted; confirm the intended wording before changing it.
parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
|
igor@37
|
233
|
igor@38
|
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    Line endings are kept on the returned strings.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return source.readlines()
|
igor@38
|
240
|
igor@54
|
def readlines_from_url(url):
    """Return the text of the page at *url*, rendered by `lynx -dump`.

    The combined stdout/stderr of lynx is decoded as UTF-8 and split on
    newlines; the returned lines carry no line terminator (output ending
    in a newline therefore yields a final empty string).  The first
    http:// link on each line is removed.

    Raises OSError when the lynx executable cannot be started.
    """
    # lynx receives the URL as a separate argument instead of through a
    # shell command line, so quotes or shell metacharacters in *url*
    # cannot run other commands.
    process = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
        )
    text = process.communicate()[0].decode('utf-8')

    # Same expression the former `perl -p -e 's@...@@'` filter applied:
    # without the /g modifier only the first match per line is removed.
    link_pattern = re.compile(r"http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*")
    return [link_pattern.sub('', line, 1) for line in text.split('\n')]
|
igor@54
|
250
|
igor@38
|
def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    make_utf8_reader = codecs.getreader("utf-8")
    stdin_reader = make_utf8_reader(sys.stdin)
    return stdin_reader.readlines()
|
igor@38
|
253
|
igor@38
|
# Word separator: a run of non-word characters that does not begin with an
# apostrophe, so that "don't" stays a single word.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Tokens that are empty or consist of digits only are not counted as words.
_NOT_A_WORD = re.compile(r'[0-9]*$')


def words_from_line(line):
    """Split *line* into words.

    Trailing newlines are removed first.  The result can contain empty
    strings where the line starts or ends with a separator.
    """
    line = line.rstrip('\n')
    return _WORD_SEPARATOR.split(line)


def get_words(lines, group_by=None):
    """
    Returns hash of words in a file
    word => number

    Empty tokens and tokens made only of digits are skipped.  When
    group_by (default [1]) contains 2 and/or 3, sequences of two and/or
    three consecutive words are counted as well, under the keys "a_b" and
    "a_b_c".  Sequences continue across line boundaries and across
    skipped tokens.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The three most recent words, oldest first; "" marks "no word yet".
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            if _NOT_A_WORD.match(word):
                continue
            result[word] = result.get(word, 0) + 1

            # Shift the current word in BEFORE counting.  Counting the old
            # window first lagged behind the text and never counted the
            # last two pairs and the last triple.
            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                w = "%s_%s" % (b, c)
                result[w] = result.get(w, 0) + 1
            if count_triples and a != "" and b != "":
                w = "%s_%s_%s" % (a, b, c)
                result[w] = result.get(w, 0) + 1

    logging.debug(result)
    return result
|
igor@38
|
286
|
igor@54
|
def voc_filename():
    """Return the path of the vocabulary file of the configured language."""
    directory = config['config_directory']
    language = config['language']
    return "%s/%s.txt" % (directory, language)
|
igor@54
|
289
|
igor@38
|
def load_vocabulary():
    """Return the word => count hash of the words in the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
|
igor@38
|
292
|
igor@38
|
def notes_filenames():
    """Return the list of notes files of the configured language.

    The list currently holds exactly one path.
    """
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
|
igor@38
|
295
|
igor@38
|
def load_notes(files):
    """Read the notes stored in *files*.

    Each non-blank line holds a word, whitespace and the note for that
    word.  Returns a dict word => {filename: note}.  A line that holds
    only a word yields an empty note; blank lines are ignored (both used
    to abort loading with an unpacking error).
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                if not line.strip():
                    continue
                fields = re.split(r'\s+', line, maxsplit=1)
                word = fields[0]
                note = fields[1] if len(fields) > 1 else ''
                notes.setdefault(word, {})[filename] = note
    return notes
|
igor@38
|
305
|
igor@39
|
def add_notes(lines, notes):
    """Return *lines* with the stored note appended to each word line.

    Lines starting with '#' are passed through unchanged.  On every other
    line the token captured by the pattern below is looked up in *notes*
    (word => {filename: note}); when it has a note in the first notes
    file, the line is padded to 30 columns and the note is appended.
    """
    notes_filename = notes_filenames()[0]
    word_pattern = re.compile(r'^\s*\S+\s*(\S+)')

    annotated = []
    for line in lines:
        if not line.startswith('#'):
            found = word_pattern.search(line)
            if found is not None:
                word = found.group(1)
                if word in notes and notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
        annotated.append(line)
    return annotated
|
igor@39
|
326
|
igor@39
|
def remove_notes(lines, notes_group):
    """Strip the notes from *lines* and save them to the first notes file.

    notes_group is a dict word => {filename: note}; its entries for the
    first notes file are the starting point.  Every line of the form
    "<space><count><space><word><space><note>" is cut after the word and
    its note replaces the stored one.  All notes are then written with
    save_notes().  Returns the stripped lines, each ending in a newline.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file
    )

    noted_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    stripped = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = noted_line.match(text)
        if found:
            (indent, count, gap, word, _, note) = found.groups()
            stripped.append(indent + count + gap + word + "\n")
            notes[word] = note
        else:
            stripped.append(text + "\n")

    save_notes(notes_filename, notes)
    return stripped
|
igor@39
|
352
|
igor@39
|
def save_notes(filename, notes):
    """Merge *notes* (word => note) into the notes file *filename*.

    Lines of the file whose first field is a key of *notes* are rewritten
    with the new note; all other lines are kept as they are.  Words of
    *notes* that are not in the file yet are appended.  The file must
    already exist.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the first field is needed; taking it by index also
            # accepts lines without a note, which used to raise an
            # unpacking error.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)

    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
|
igor@39
|
370
|
igor@39
|
371
|
igor@38
|
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2

    i.e. a new dict with the items of dict1 whose key is not in dict2.
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2
    )
|
igor@38
|
381
|
igor@38
|
def dump_words(words, filename):
    """Write every word of *words* (word => count) to *filename*.

    Each word is written on its own line, repeated count times.  The file
    is UTF-8 encoded and overwritten if it exists.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(("%s\n" % word) * count)
|
igor@38
|
def error_message(text):
    """Print *text* on standard output, followed by a newline."""
    print(text)
|
igor@38
|
389
|
igor@40
|
def find_wordgroups_weights(word_pairs, normalizator):
    """Return the total count of every word group.

    word_pairs is a sequence of (count, word) pairs.  The words are
    grouped by normalizator.normalize(word); the result maps each
    normalized form to the sum of the counts of its words.
    """
    weight = {}
    for (num, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + num
    return weight
|
igor@38
|
397
|
igor@38
|
def find_linked_words(notes):
    """Return the word links declared in *notes*.

    notes maps word => {filename: note}.  A note containing "@other"
    links its word to "other"; the result maps word => other.  A bare
    "@" followed by whitespace or the end of the note declares nothing.
    """
    link_pattern = re.compile(r'\@(\S*)')
    linked_words = {}
    for (word, notes_of_word) in notes.items():
        for note in notes_of_word.values():
            if "@" not in note:
                continue
            found = link_pattern.search(note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
|
igor@38
|
409
|
igor@40
|
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs for sorting.

    The pairs are ordered by the weight of their word group (looked up in
    wgw by normalized word), then by the normalized word itself, then by
    their own count.  Returns a negative number, zero or a positive
    number.  linked_words is accepted but not used.
    """
    def three_way(left, right):
        # -1, 0 or 1, like the built-in cmp() gives for numbers and strings.
        return (left > right) - (left < right)

    (num1, word1) = pair1
    (num2, word2) = pair2
    group1 = normalizator.normalize(word1)
    group2 = normalizator.normalize(word2)

    # `or` moves on to the next criterion only while the pairs are tied.
    return (three_way(wgw[group1], wgw[group2])
            or three_way(group1, group2)
            or three_way(int(num1), int(num2)))
|
igor@38
|
426
|
igor@47
|
427
|
igor@48
|
def _compressed_group_line(words_of_group, normalizator):
    """Format one output line for a whole group of (count, word) pairs."""
    compressed_word_pair = (
        sum(x[0] for x in words_of_group),
        normalizator.best_word_from_group(words_of_group)
        )
    return "%10s %s\n" % compressed_word_pair


def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list (despite the name, nothing is printed here).

    word_pairs            -- sequence of (count, word) pairs in output order
    stats                 -- dict with the text statistics used by the
                             format strings below
    normalizator          -- object providing normalize() and
                             best_word_from_group()
    print_stats           -- start the list with a "# ..." summary line
    stats_only            -- return only a two-line statistics table
    compressed_wordlist   -- emit one line per run of consecutive words
                             with the same normalized form
    show_range            -- if > 0, stop after this many emitted words
    show_range_percentage -- if > 0, stop once this percentage of the text
                             is covered

    Returns a single string when stats_only is true, otherwise a list of
    lines.  Lines of the form "# <level>" mark the coverage reached so far.
    """
    result = []
    if stats_only:
        result.append(
            " ".join([
                "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
                ]]) + "\n")
        # The last four fields are concatenated without commas: at 11
        # columns each they line up with the 10-column-plus-space header.
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Coverage levels still to be announced: multiples of 5 above the
    # current percentage up to 90, then every percent from 90 to 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            if compressed_wordlist:
                result.append(
                    _compressed_group_line(words_of_this_group, normalizator))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break

    # The loop only emits a group when the NEXT group starts, so the group
    # collected last is still pending here; it used to be dropped.  It is
    # held back only when the show_range limit has already been reached.
    range_exhausted = show_range > 0 and printed_words >= show_range
    if compressed_wordlist and words_of_this_group and not range_exhausted:
        result.append(
            _compressed_group_line(words_of_this_group, normalizator))

    return result
|
igor@39
|
510
|
igor@53
|
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.
    from-to/step
    from+delta/step
    from/step          (stop is from + 1)

    Raises ValueError when parts_description is not in one of these
    formats.
    """

    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            # "from+delta": the stop is relative to the start.  The delta
            # used to be returned as the stop itself.
            (start, delta) = a.split("+", 1)
            start = int(start)
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except (AttributeError, TypeError, ValueError):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
|
igor@53
|
539
|
igor@53
|
540
|
igor@53
|
def take_part(lines, part_description = None):
    """Return the lines of the part of the text selected by part_description.

    part_description has the format accepted by
    parse_parts_description(); the text is divided into `step` equal
    parts and the lines from part `start` up to and including the
    boundary of part `stop` are returned.  Without a description (None or
    an empty string) all lines are returned.
    """
    # An empty string is what the configuration holds when no pages were
    # requested; it used to reach the parser and raise ValueError.
    if not part_description:
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    first = start * part_size
    last = stop * part_size
    return [line for (i, line) in enumerate(lines) if first <= i <= last]
|
igor@53
|
552
|
igor@40
|
def _new_tempfile():
    """Create an empty temporary file and return its path.

    mkstemp() also returns an open descriptor; it is closed here because
    the callers reopen the file by name.
    """
    (fd, path) = tempfile.mkstemp(prefix='new-word')
    os.close(fd)
    return path


def _deleted_words(lines_before, lines_after):
    """Return the words of the lines of lines_before missing in lines_after."""
    remaining = set(lines_after)
    deleted_words = []
    for line in lines_before:
        if line not in remaining:
            line = line.strip()
            if ' ' in line:
                # "<count> <word> ..." -> second field
                word = re.split(r'\s+', line, 1)[1]
                if ' ' in word:
                    word = re.split(r'\s+', word, 1)[0]
                deleted_words.append(word)
    return deleted_words


def _edit_wordlist_in_vim(output, notes, original_text_tempfile):
    """Let the user edit the word list in vim and record the result.

    The notes found in the edited list are saved by remove_notes(); the
    words of the lines the user deleted are appended to the vocabulary
    file.
    """
    temp1 = _new_tempfile()
    temp2 = _new_tempfile()
    try:
        with codecs.open(temp1, "w", "utf-8") as f:
            f.write("".join(output))
        with codecs.open(temp2, "w", "utf-8") as f:
            f.write("".join(add_notes(output, notes)))

        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=temp2))

        lines = remove_notes(readlines_from_file(temp2), notes)

        with codecs.open(voc_filename(), "a", "utf-8") as f:
            f.write("\n".join(_deleted_words(output, lines) + ['']))
    finally:
        os.unlink(temp1)
        os.unlink(temp2)


def filter_get_words_group_words_add_stat(args):
    """Build the list of unknown words of a text and present it.

    The text is read from the URL or file named by args[0], or from
    standard input when args is empty.  Words found in the vocabulary are
    removed (unless 'no_filter' is configured), the remaining ones are
    counted, optionally restricted to the allowed words and sorted by
    frequency (grouped by stem when the environment has
    WORDS_GROUPING=YES).  With 'non_interactive' or 'text_stats'
    configured the result is written to standard output; otherwise it is
    opened in vim and the words the user deletes there are added to the
    vocabulary.

    Exits with status 1 when the input is empty.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    if len(args) > 0:
        if 'http://' in args[0]:
            input_lines = readlines_from_url(args[0])
        else:
            input_lines = readlines_from_file(args[0])
    else:
        input_lines = readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    # An unset or empty 'pages' setting means "the whole text"; take_part()
    # expects None for that, an empty string is not a valid description.
    lines = take_part(input_lines, config.get('pages') or None)

    original_text_tempfile = _new_tempfile()
    try:
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(lines))

        group_by = [1]
        if 'two_words' in config:
            group_by.append(2)
        if 'three_words' in config:
            group_by.append(3)
        words = get_words(lines, group_by)

        stats_only = 'text_stats' in config
        compressed_wordlist = 'compressed' in config

        show_range = os.environ.get('SHOW_RANGE', '')
        if show_range != '':
            show_range = int(show_range)
        else:
            show_range = 0

        if 'show_range_percentage' in config:
            show_range_percentage = int(config['show_range_percentage'])
        else:
            show_range_percentage = 0

        stats = {}
        stats['total'] = sum(words.values())
        if not 'no_filter' in config:
            words = substract_dictionary(words, vocabulary)

        stats['total_unknown'] = sum(words.values())
        stats['total_known'] = stats['total'] - stats['total_unknown']
        # NOTE(review): raises ZeroDivisionError when the text contains no
        # countable word (stats['total'] == 0).
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
        stats['groups'] = 0
        stats['words'] = len(words)
        stats['sentences'] = 0  #FIXME
        stats['wps'] = 0  #FIXME
        stats['uwps'] = 0  #FIXME
        stats['language'] = config['language']

        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)

        # filter words by allowed_words_filter
        if 'allowed_words' in config:
            allowed_words_filename = config['allowed_words']
            # a set: the membership test below runs once per word
            normalized_allowed_words = set(
                normalizator.normalize(w.rstrip('\n'))
                for w in readlines_from_file(allowed_words_filename)
            )

            result = {}
            for w, wn in words.items():
                if normalizator.normalize(w) in normalized_allowed_words:
                    result[w] = wn
            words = result

        words_with_freq = [
            (words[k], k)
            for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)
        ]

        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
            words_with_freq = sorted(
                words_with_freq,
                cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
                reverse=True)

        output = print_words_sorted(
            words_with_freq,
            stats,
            normalizator,
            stats_only=stats_only,
            compressed_wordlist=compressed_wordlist,
            show_range=show_range,
            show_range_percentage=show_range_percentage,
            )

        if ('non_interactive' in config or 'text_stats' in config):
            codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
        else:
            _edit_wordlist_in_vim(output, notes, original_text_tempfile)
    finally:
        # remove the copy of the text even when a step above failed
        os.unlink(original_text_tempfile)
|
igor@54
|
701
|
igor@37
|
# Script body: copy the command-line options into `config` and run the
# function requested with -X.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.pages:
    config['pages'] = options.pages
else:
    config['pages'] = ""

if options.allowed_words:
    config['allowed_words'] = options.allowed_words

if options.show_range_percentage:
    config['show_range_percentage'] = options.show_range_percentage

# Switches are stored only when given: the rest of the program tests for
# the presence of the key, not for its value.
for switch in ('non_interactive', 'text_stats', 'compressed',
               'no_filter', 'two_words', 'three_words'):
    if getattr(options, switch):
        config[switch] = True

if options.function:
    function_names = {
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
|
igor@37
|
745
|
igor@37
|
746
|
igor@37
|
747
|
igor@37
|
748
|
igor@38
|
749 #os.system("vim")
|
igor@37
|
750
|