rev |
line source |
igor@37
|
1 #!/usr/bin/env python
|
igor@38
|
2 # -*- coding: utf-8 -*-
|
igor@37
|
3
|
igor@40
|
4 from __future__ import with_statement
|
igor@38
|
5 import codecs
|
igor@38
|
6 import logging
|
igor@38
|
7 import os
|
igor@37
|
8 import optparse
|
igor@38
|
9 import re
|
igor@38
|
10 import subprocess
|
igor@38
|
11 import sys
|
igor@38
|
12 import Stemmer
|
igor@38
|
13
|
igor@38
|
# Runtime configuration: directory holding the per-language vocabulary
# and notes files, and the active language (overridden by -l below).
config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

# Debug log for the internal filter subcommands.
# NOTE(review): hard-coded path in shared /tmp — confirm this is acceptable.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
|
igor@38
|
20
|
igor@38
|
class Normalizator:
    """Reduce words to a canonical stem for grouping.

    A word may first be remapped through ``linked_words`` (word -> main
    word links extracted from the notes files) and is then stemmed with
    a ``Stemmer.Stemmer`` for the given language.
    """

    def __init__(self, language, linked_words=None):
        """
        language     -- two-letter code: 'de', 'en', 'ru' or 'uk'.
        linked_words -- optional mapping word -> main word; an empty
                        mapping is used when omitted.

        Raises KeyError for an unsupported language code.
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # Bug fix: the default used to be a shared mutable {} — all
        # instances created without the argument shared one dict.  Use
        # None as the sentinel instead.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Follow linked-word chains (cycle-safe), then stem the lowercased word."""
        word_chain = []
        while word in self.linked_words and word not in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
|
igor@37
|
38
|
igor@37
|
# Command-line interface.  Each option stores into its own dest on the
# options object returned by parser.parse_args().
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    # Bug fix: this option used dest="compressed", silently clobbering
    # the -c flag; give it its own destination.
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    # Help text fix: this used to repeat -t's description verbatim.
    help="show tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
|
igor@37
|
142
|
igor@38
|
def readlines_from_file(filename):
    """Return all lines of a UTF-8 encoded file, newlines preserved."""
    with codecs.open(filename, "r", "utf-8") as source:
        return source.readlines()
|
igor@38
|
149
|
igor@38
|
def readlines_from_stdin():
    """Read every line from standard input, decoded as UTF-8."""
    reader = codecs.getreader("utf-8")(sys.stdin)
    return reader.readlines()
|
igor@38
|
152
|
igor@38
|
# Splits on runs of non-word characters, except runs that begin with an
# apostrophe, so contractions like "don't" survive as single tokens.
# Compiled once at module level instead of on every call.
_WORD_SPLIT_RE = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)

def words_from_line(line):
    """Split a text line into word tokens.

    The trailing newline is stripped first.  Splitting may produce
    empty-string tokens at the line edges; callers filter those out.
    """
    return _WORD_SPLIT_RE.split(line.rstrip('\n'))
|
igor@38
|
158
|
igor@38
|
def get_words(lines):
    """
    Count word occurrences over an iterable of text lines.

    Returns a dict mapping word -> number of occurrences.  Purely
    numeric tokens are skipped; '[0-9]*$' also matches the empty
    string, so empty tokens from the splitter are skipped here too.
    """
    counts = {}
    for line in lines:
        for token in words_from_line(line):
            if re.match('[0-9]*$', token):
                continue
            counts[token] = counts.get(token, 0) + 1
    return counts
|
igor@38
|
173
|
igor@38
|
def load_vocabulary():
    """Load the known-word counts for the configured language."""
    filename = "%s/%s.txt" % (config['config_directory'], config['language'])
    return get_words(readlines_from_file(filename))
|
igor@38
|
176
|
igor@38
|
def notes_filenames():
    """Return the list of notes-file paths for the configured language."""
    filename = "%s/notes-%s.txt" % (config['config_directory'], config['language'])
    return [filename]
|
igor@38
|
179
|
igor@38
|
def load_notes(files):
    """Load notes files into a nested dict: word -> {filename: note}.

    Each meaningful line has the form "word<whitespace>note text".
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                # Robustness fix: blank lines and note-less words used
                # to raise ValueError on tuple unpacking; skip them.
                parts = re.split('\s+', line, maxsplit=1)
                if len(parts) < 2 or not parts[0]:
                    continue
                (word, note) = parts
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
|
igor@38
|
189
|
igor@39
|
def add_notes(lines, notes):
    """Attach saved notes to vocabulary lines.

    For each non-comment line whose second field has a note in the
    primary notes file, the line is padded and the note appended.
    Comment lines starting with '#' pass through untouched.
    """
    notes_filename = notes_filenames()[0]
    annotated = []
    for line in lines:
        match_object = None if line.startswith('#') \
            else re.search('^\s*\S+\s*(\S+)', line)
        if match_object:
            word = match_object.group(1)
            if word in notes:
                logging.debug(word)
                logging.debug(line)
                if notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
                    logging.debug(line)
        annotated.append(line)
    return annotated
|
igor@39
|
213
|
igor@39
|
def remove_notes(lines, notes_group):
    """Strip inline notes from vocabulary lines and persist them.

    Removed note text (plus any notes already in notes_group for the
    primary file) is written back via save_notes(); the returned lines
    are newline-terminated with the note columns dropped.
    """
    notes_filename = notes_filenames()[0]
    notes = {}
    for word, per_file in notes_group.items():
        if notes_filename in per_file:
            notes[word] = per_file[notes_filename]

    stripped = []
    for line in lines:
        match_object = re.match(
            '(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line.rstrip('\n'))
        if match_object:
            # Keep the first four groups (spacing, count, spacing, word)
            # and drop the trailing note, remembering it for saving.
            stripped.append("".join(match_object.group(1, 2, 3, 4)) + "\n")
            notes[match_object.group(4)] = match_object.group(6)
        else:
            stripped.append(line.rstrip('\n') + "\n")

    save_notes(notes_filename, notes)
    return stripped
|
igor@39
|
239
|
igor@39
|
def save_notes(filename, notes):
    """Rewrite a notes file with updated and newly added notes.

    Existing lines keep their order (note text replaced from ``notes``
    where present); words not yet in the file are appended at the end.
    """
    lines = []
    saved_words = []
    # Robustness fix: a missing notes file used to raise IOError on the
    # read pass; treat it as empty so new notes can still be written.
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.append(word)
                lines.append(line)
    for word in [x for x in notes.keys() if not x in saved_words]:
        lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
|
igor@39
|
257
|
igor@39
|
258
|
igor@38
|
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2

    I.e. the entries of dict1 whose keys do not appear in dict2.
    """
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)
|
igor@38
|
268
|
igor@38
|
def dump_words(words, filename):
    """Write each word to the UTF-8 file, repeated once per its count."""
    with codecs.open(filename, "w+", "utf-8") as f:
        for word, count in words.items():
            f.write(("%s\n" % word) * count)
|
igor@38
|
273
|
igor@38
|
def error_message(text):
    """Print an error message to stdout."""
    # Fix: the bare Python 2 `print text` statement is a syntax error on
    # Python 3; the parenthesized single-argument form behaves the same
    # on both versions.
    print(text)
|
igor@38
|
276
|
igor@40
|
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum pair counts per normalized word: stem -> total count."""
    weights = {}
    for (count, word) in word_pairs:
        stem = normalizator.normalize(word)
        weights[stem] = weights.get(stem, 0) + count
    return weights
|
igor@38
|
284
|
igor@38
|
def find_linked_words(notes):
    """Extract word links from notes.

    A note containing "@main" links its word to "main"; a bare "@" with
    nothing after it is ignored.  Returns a dict word -> main word.
    """
    linked_words = {}
    for word, per_file in notes.items():
        for note in per_file.values():
            if "@" not in note:
                continue
            result = re.search(r'\@(\S*)', note)
            if result and result.group(1):
                linked_words[word] = result.group(1)
    return linked_words
|
igor@38
|
296
|
igor@40
|
def _cmp(a, b):
    """Three-way compare; replaces Python 2's built-in cmp()."""
    return (a > b) - (a < b)

def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of (count, word) pairs for sorting.

    Orders primarily by word-group weight (``wgw`` maps normalized word
    -> weight), then by the normalized word itself, then by the count.
    Returns -1, 0 or 1.

    linked_words is accepted for interface compatibility but unused
    here; links are applied inside normalizator.normalize().
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Fix: cmp() does not exist on Python 3 — use the local _cmp helper
    # (identical results on Python 2).
    cmp_res = _cmp(wgw[normalized_word1], wgw[normalized_word2])
    if cmp_res != 0:
        return cmp_res
    cmp_res = _cmp(normalized_word1, normalized_word2)
    if cmp_res != 0:
        return cmp_res
    return _cmp(int(num1), int(num2))
|
igor@38
|
313
|
igor@40
|
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """Print (count, word) pairs with running percentage level markers.

    word_pairs  -- list of (count, word) tuples, already sorted.
    stats       -- dict with 'language', 'percentage', 'total_known',
                   'total', 'groups' and 'words' entries.
    print_stats -- when True, print the "# language, ..." header first.
    stats_only  -- when True, print only the marker "stat_only" and return.
    """
    # The codecs wrapper is needed (and only valid) on Python 2, where
    # stdout takes encoded bytes; Python 3's stdout accepts text directly.
    if sys.version_info[0] < 3:
        out = codecs.getwriter("utf-8")(sys.stdout)
    else:
        out = sys.stdout

    if stats_only:
        out.write("stat_only")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Percentage thresholds at which "# <level>" markers are emitted:
    # every 5%% from just above the starting percentage up to 90, then
    # every 1%% up to 101.  Fixes for Python 3: ranges are materialized
    # with list() before concatenation and // replaces the Python 2
    # integer division.
    level_lines = list(range(int(float(stats['percentage'])) // 5 * 5 + 5, 95, 5)) \
        + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        out.write("%10s %s\n" % word_pair)
        known += word_pair[0]
        # Guard against an exhausted threshold list (IndexError in the
        # original if the ratio ever exceeded the last threshold).
        if level_lines and 100.0 * known / total >= level_lines[0]:
            current_level = level_lines[0]
            while level_lines and 100.0 * known / total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            out.write("# %s\n" % current_level)
|
igor@38
|
336
|
igor@39
|
def filter_add_notes(args):
    """Rewrite the file args[0] in place, attaching notes to its lines."""
    filename = args[0]
    lines = add_notes(readlines_from_file(filename),
                      load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
|
igor@39
|
344
|
igor@39
|
def filter_remove_notes(args):
    """Rewrite the file args[0] in place, stripping notes from its lines."""
    filename = args[0]
    lines = remove_notes(readlines_from_file(filename),
                         load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
|
igor@39
|
352
|
igor@40
|
def filter_get_words_group_words_add_stat(args):
    """Read text from stdin, drop known words, group and print the rest.

    Computes text statistics (known/unknown totals, percentage), sorts
    the unknown words by word-group weight and prints everything via
    print_words_sorted().
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    text_words = get_words(readlines_from_stdin())

    stats = {}
    stats['total'] = sum(text_words.values())
    unknown_words = substract_dictionary(text_words, vocabulary)

    stats['total_unknown'] = sum(unknown_words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = "%7.2f" % (100.0 * stats['total_known'] / stats['total'])
    stats['groups'] = 0
    stats['words'] = len(unknown_words)
    stats['sentences'] = 0  # FIXME: sentence counting not implemented
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    word_pairs = [(unknown_words[w], w)
                  for w in sorted(unknown_words.keys(),
                                  key=unknown_words.get, reverse=True)]

    wgw = find_wordgroups_weights(word_pairs, normalizator)
    # NOTE: the cmp= keyword of sorted() is Python 2-only.
    word_pairs = sorted(
        word_pairs,
        cmp=lambda x, y: compare_word_pairs(x, y, wgw, normalizator, linked_words),
        reverse=True)

    print_words_sorted(word_pairs, stats)
|
igor@40
|
385
|
igor@37
|
# --- command-line entry point ---
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Dispatch table for the internal -f/--function filter subcommands.
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        # Typo fix in the user-facing message: "Unkown" -> "Unknown".
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
|
igor@37
|
402
|
igor@37
|
403
|
igor@37
|
404
|
igor@37
|
405
|
igor@38
|
406 #os.system("vim")
|
igor@37
|
407
|