new-words

view new-words.py @ 49:00286f6bfa85

experimental: when -c specified, use dictionary for compression
author Igor Chubin <igor@chub.in>
date Wed Feb 09 21:08:23 2011 +0200 (2011-02-09)
parents 7194bdb56475
children 4e931db74618
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 from __future__ import with_statement
5 import codecs
6 import difflib
7 import logging
8 import os
9 import optparse
10 import re
11 import subprocess
12 import sys
13 import Stemmer
# psyco is an optional accelerator: the script runs the same without it, so
# only a failed import is ignored.  (A bare ``except`` would also swallow
# KeyboardInterrupt and genuine errors raised while enabling psyco.)
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Settings shared by the functions below.
config = {
    # expanduser() uses $HOME when it is set and otherwise falls back to the
    # platform's own lookup, so an unset $HOME no longer raises KeyError.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

# All messages down to DEBUG level go to this fixed log file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
class Normalizator(object):
    """Reduces words to a group key (stem) and picks a label for a group.

    Two words belong to the same group when normalize() returns the same
    value for both of them.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizator for the given language.

        language     -- one of 'de', 'en', 'ru', 'uk'; any other value
                        raises KeyError
        linked_words -- optional dict  word => main word;  a linked word is
                        replaced by its main word before stemming
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a ``{}`` default argument would be one
        # object shared by every instance created without explicit links.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the group key of word.

        Links in self.linked_words are followed first (word -> main word ->
        ...); the resulting word is lower-cased and stemmed.
        """
        # Words already visited, so that a cycle of links terminates.
        word_chain = []
        while word in self.linked_words and word not in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the text that represents a group of (count, word) pairs.

        The candidate is the shortest word of the group; among equally short
        words the one with the highest count is taken.  It is then checked
        with dictionary_suggestions():

        * exactly one suggestion for the candidate: the candidate is returned;
        * the candidate ends in "et" / "t" and the derived "...en" form has
          exactly one suggestion: that form is returned;
        * otherwise the candidate is returned followed by all collected
          suggestions (space separated, most similar first), or followed
          by "_" when no suggestion is left.

        Raises ValueError when wordpairs_group is empty.
        """

        def similarity(candidate, reference):
            return difflib.SequenceMatcher(
                None, candidate.lower(), reference.lower()).ratio()

        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest_pairs = [
            pair for pair in wordpairs_group if len(pair[1]) == minimal_length]
        best_match = sorted(
            shortest_pairs, key=lambda pair: pair[0], reverse=True)[0][1]

        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        # Stem used below to rank the suggestions by similarity.
        corrected_best_match = best_match
        if best_match[-2:] == 'et':
            word = best_match[:-1] + "n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-2]

        # A word ending in "et" also ends in "t", so for such a word this
        # branch runs as well and its stem replaces the one set above.
        if best_match[-1:] == 't':
            word = best_match[:-1] + "en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]

        # For a candidate that does not start with a capital letter only
        # such suggestions are kept.  Slices instead of [0] keep a one-letter
        # candidate like "t" (empty stem) or an empty suggestion line from
        # raising IndexError.
        case_reference = corrected_best_match or best_match
        if case_reference[:1].lower() == case_reference[:1]:
            suggestions = [x for x in suggestions if x[:1].lower() == x[:1]]

        if not suggestions:
            return best_match + "_"
        return best_match + " " + " ".join(sorted(
            suggestions,
            key=lambda x: similarity(x, corrected_best_match),
            reverse=True))

    def dictionary_suggestions(self, word):
        """Return the output lines of the external command `de-variants word`.

        The lines are decoded from UTF-8 and stripped of the trailing newline.
        Raises OSError when the command cannot be started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        try:
            raw_lines = process.stdout.readlines()
        finally:
            # Close the pipe and reap the child; otherwise every call leaves
            # an open descriptor and a zombie process behind.
            process.stdout.close()
            process.wait()
        return [x.decode('utf-8').rstrip('\n') for x in raw_lines]
# Command line options of the script.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    # Was dest="compressed" (copied from -c): -k silently switched on the
    # compressed word list and had no flag of its own.
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    # The help text used to be a copy of the one for -t.
    help="show tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    """Return the lines of a UTF-8 encoded text file as a list.

    Line endings are kept.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())
def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    make_reader = codecs.getreader("utf-8")
    stdin_reader = make_reader(sys.stdin)
    return stdin_reader.readlines()
# Separator between two words: a run of non-word characters.  The lookahead
# is checked only where the run starts, so a run cannot begin with an
# apostrophe (which keeps "don't" in one piece).  Compiled once, and written
# as a raw string so that "\W" is not an invalid string escape.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split a line of text into a list of words.

    Trailing newlines are removed before splitting.  The list contains an
    empty string where the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))
def get_words(lines, group_by=None):
    """
    Count the words (and, on request, word sequences) found in lines.

    lines    -- iterable of text lines
    group_by -- list of sequence lengths to count; 2 adds pairs of
                consecutive words, 3 adds triples.  Single words are
                always counted.  Defaults to [1].

    Returns a dict
        word => number
    where a sequence is stored under the key "first_second" or
    "first_second_third".  Empty tokens and tokens that consist of digits
    only are ignored; they do not interrupt a sequence.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The last three accepted words, oldest first ("" = not yet filled).
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            if re.match('[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1
            # Shift the window BEFORE counting, so that every sequence ends
            # with the current word.  Counting first made the sequences lag
            # behind and the last two pairs and the last triple of the input
            # were never counted.
            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                w = "%s_%s" % (b, c)
                result[w] = result.get(w, 0) + 1
            if count_triples and a != "":
                w = "%s_%s_%s" % (a, b, c)
                result[w] = result.get(w, 0) + 1

    logging.debug(result)
    return result
def load_vocabulary():
    """Return the word counts of the vocabulary file of the current language.

    The file is <config_directory>/<language>.txt, both values taken
    from config.
    """
    vocabulary_path = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    vocabulary_lines = readlines_from_file(vocabulary_path)
    return get_words(vocabulary_lines)
def notes_filenames():
    """Return the list of notes files of the current language.

    The list has one entry: <config_directory>/notes-<language>.txt
    """
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
def load_notes(files):
    """Load the notes stored in the given UTF-8 files.

    Every line consists of a word, whitespace and the note for this word.
    Empty lines are skipped, and a word without a note gets the note ''
    (both used to abort loading with ValueError).

    Returns a dict
        word => {filename => note}
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                if not line:
                    continue
                parts = re.split(r'\s+', line, maxsplit=1)
                word = parts[0]
                if len(parts) > 1:
                    note = parts[1]
                else:
                    note = ''
                notes.setdefault(word, {})[filename] = note
    return notes
def add_notes(lines, notes):
    """Append the stored note to every word line that has one.

    lines -- lines of the form "<count> <word>"; lines starting with '#'
             are comments and are passed through
    notes -- dict  word => {filename => note}

    Returns a new list with exactly one entry per input line.  A line whose
    word has a note in the notes file of the current language becomes
    "<line padded to 30 characters> <note>"; any other line is unchanged.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            # \s+ (not \s*) between count and word: with \s* a line that
            # holds a single token matched its own last character as the
            # "word".
            match_object = re.search(r'^\s*\S+\s+(\S+)', line)
            if match_object:
                word_notes = notes.get(match_object.group(1), {})
                if notes_filename in word_notes:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), word_notes[notes_filename])
        # One append for every branch, so no line can get lost.
        result.append(line)
    return result
def remove_notes(lines, notes_group):
    """Strip the notes from word lines and store them in the notes file.

    lines       -- lines of the form "<spaces><count> <word> <note>"
    notes_group -- dict  word => {filename => note}  of the notes known so far

    A line that has text after the word is cut after the word and that text
    becomes the note of the word.  Other lines are kept; every returned line
    ends with a newline.  The notes known for the notes file of the current
    language, updated with the notes found in lines, are written back with
    save_notes().

    Returns the list of lines without notes.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (known_word, per_file[notes_filename])
        for (known_word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        (indent, count, gap, word) = found.group(1, 2, 3, 4)
        result.append(indent + count + gap + word + "\n")
        notes[word] = found.group(6)

    save_notes(notes_filename, notes)
    return result
def save_notes(filename, notes):
    """Merge notes into the UTF-8 notes file `filename`.

    notes -- dict  word => note

    The line of a word that is present in notes is rewritten as
    "<word padded to 29 characters> <note>"; all other lines are kept as
    they are, including lines without a note, which used to abort saving
    with ValueError.  Words of notes that are not in the file yet are
    appended.  The file has to exist.
    """
    lines = []
    # A set: membership is tested once per word of notes.
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
def substract_dictionary(dict1, dict2):
    """
    Returns a new dict with the items of dict1 whose keys are not in dict2
    """
    return dict(
        (key, value) for (key, value) in dict1.items() if key not in dict2)
def dump_words(words, filename):
    """Write every word of words to the UTF-8 file `filename`.

    words -- dict  word => number;  the word is written on a line of its
             own, `number` times
    """
    with codecs.open(filename, "w+", "utf-8") as sink:
        for (word, count) in words.items():
            sink.write(("%s\n" % word) * count)
def error_message(text):
    """Print text, followed by a newline, on standard output."""
    # With a single argument the parenthesised form does the same as the
    # Python 2 print statement, and it is also valid Python 3.
    print(text)
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum up the counts of all words of every word group.

    word_pairs   -- iterable of (count, word)
    normalizator -- object whose normalize(word) returns the group of a word

    Returns a dict  group => sum of the counts of its words.
    """
    weight = {}
    for (num, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + num
    return weight
def find_linked_words(notes):
    """Collect the links between words that are written in the notes.

    notes -- dict  word => {filename => note}

    A note containing "@other" links its word to "other"; only the first
    "@" of a note is looked at.

    Returns a dict  word => main word.
    """
    linked_words = {}
    for (word, notes_by_file) in notes.items():
        for note in notes_by_file.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            if not found:
                continue
            main_word = found.group(1)
            if main_word:
                linked_words[word] = main_word
    return linked_words
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Compare two (count, word) pairs; returns -1, 0 or 1.

    The pairs are compared by
      1. the weight of their word groups (wgw[group]),
      2. the group itself,
      3. the count of the word (as an integer).

    wgw          -- dict  group => weight
    normalizator -- object whose normalize(word) returns the group of a word
    linked_words -- not used
    """
    def compare(left, right):
        # Does what the builtin cmp() did; Python 3 has no cmp() any more.
        return (left > right) - (left < right)

    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # "or" goes on to the next criterion only while the result is 0.
    return (
        compare(wgw[normalized_word1], wgw[normalized_word2])
        or compare(normalized_word1, normalized_word2)
        or compare(int(num1), int(num2)))
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Write the word list (or only the statistics) to stdout as UTF-8.

    word_pairs            -- list of (count, word); words of one group are
                             expected to follow each other
    stats                 -- dict with the statistics of the text
    normalizator          -- object providing normalize(word) and
                             best_word_from_group(pairs)
    print_stats           -- write a "# ..." header line with the statistics
    stats_only            -- write a table with the statistics and return
    compressed_wordlist   -- write one line per word group instead of one
                             line per word
    show_range            -- stop after that many lines of words (0 = no limit)
    show_range_percentage -- stop as soon as that percentage of the text is
                             covered (0 = no limit)

    After a word a line "# <level>" is written when the covered percentage
    of the text has reached the next level.
    """
    out = codecs.getwriter("utf-8")(sys.stdout)

    if stats_only:
        out.write(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        out.write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # No commas below: the four literals are concatenated into
                # one item.  Each column is 11 characters wide, as wide as
                # a 10 character header cell plus the joining space.
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Levels that are still to be reached: multiples of 5 above the
    # percentage known at the start, then every percent from 90 on.
    # "//" and list() give the same result with Python 2 and Python 3.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0

    def write_group(group):
        # One line for a whole group: the sum of the counts and the label.
        out.write("%10s %s\n" % (
            sum(x[0] for x in group),
            normalizator.best_word_from_group(group)))

    for word_pair in word_pairs:
        normalized_word = normalizator.normalize(word_pair[1])
        # "is not None": a group whose key is an empty string is a group too.
        if old_normalized_word is not None and old_normalized_word != normalized_word:
            if compressed_wordlist:
                write_group(words_of_this_group)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            out.write("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if level_lines and 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while level_lines and 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            out.write("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break

    # Inside the loop a group is written only when the next group begins, so
    # the last group was never written.  It is written here, unless the
    # requested number of lines is already complete.
    range_exhausted = show_range > 0 and printed_words >= show_range
    if compressed_wordlist and words_of_this_group and not range_exhausted:
        write_group(words_of_this_group)
def filter_add_notes(args):
    """Add the stored notes to the word list file args[0], in place."""
    target = args[0]
    word_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    annotated_lines = add_notes(word_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as sink:
        sink.writelines(annotated_lines)
def filter_remove_notes(args):
    """Remove the notes from the word list file args[0], in place.

    The removed notes are stored by remove_notes().
    """
    target = args[0]
    annotated_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    bare_lines = remove_notes(annotated_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as sink:
        sink.writelines(bare_lines)
def _env_flag(name):
    """Return True when the environment variable `name` has the value 'YES'."""
    return os.environ.get(name) == 'YES'


def _env_int(name):
    """Return the environment variable `name` as int; 0 when unset or empty."""
    value = os.environ.get(name, '')
    if value != '':
        return int(value)
    return 0


def filter_get_words_group_words_add_stat(args):
    """Read a text from stdin and print its word list with statistics.

    args is not used.  The behaviour is controlled by environment variables:

    GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE
                           = YES: count sequences of 2 / 3 words as well
    STAT_ONLY              = YES: print the statistics only
    COMPRESSED_WORDLIST    = YES: print one line per word group
    SHOW_RANGE             number of lines of words to print
    SHOW_RANGE_PERCENTAGE  stop when this percentage of the text is covered
    FILTER_WORDS           = YES: leave out the words of the vocabulary
    WORDS_GROUPING         = YES: sort the words by word group
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if _env_flag('GROUP_WORDS_BY_TWO'):
        group_by.append(2)
    if _env_flag('GROUP_WORDS_BY_THREE'):
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = _env_flag('STAT_ONLY')
    compressed_wordlist = _env_flag('COMPRESSED_WORDLIST')
    show_range = _env_int('SHOW_RANGE')
    show_range_percentage = _env_int('SHOW_RANGE_PERCENTAGE')

    stats = {}
    stats['total'] = sum(words.values())
    if _env_flag('FILTER_WORDS'):
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0 - stats['percentage']
    else:
        # Input without any word: report 0 for both percentages instead of
        # failing with ZeroDivisionError.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['wps'] = 0 #FIXME
    stats['uwps'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # (count, word) pairs, most frequent word first.
    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if _env_flag('WORDS_GROUPING'):
        def grouping_key(pair):
            # Orders by group weight, then group, then count.  A key is
            # computed once per word, a cmp function stems both words again
            # for every comparison; and sorted() has no cmp= in Python 3.
            normalized = normalizator.normalize(pair[1])
            return (wgw[normalized], normalized, int(pair[0]))

        words_with_freq = sorted(
            words_with_freq,
            key=grouping_key,
            reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )
# Entry point of the script.  The guard keeps the command line from being
# parsed when the file is loaded as a module.
if __name__ == "__main__":
    (options, args) = parser.parse_args()
    if options.language:
        config['language'] = options.language

    if options.function:
        # Values accepted by -f / --function.
        function_names = {
            'add_notes': filter_add_notes,
            'remove_notes': filter_remove_notes,
            'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
        }
        if options.function in function_names:
            function_names[options.function](args)
        else:
            error_message("Unknown function %s.\nAvailable functions:\n%s" % (
                options.function, "".join([" "+x for x in sorted(function_names.keys())])))
            sys.exit(1)