new-words

view new-words.py @ 53:f583256b7ab1

-p key support in new-words.py
author Igor Chubin <igor@chub.in>
date Mon Oct 31 20:21:20 2011 +0200 (2011-10-31)
parents 74e05d4436ee
children e25de9ea9184
line source
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement
import codecs
import difflib
import logging
import os
import optparse
import re
import subprocess
import sys
import Stemmer
try:
    # psyco is an optional Python 2 JIT accelerator; run without it when
    # it is not installed.
    # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
    # and any error raised by psyco.full() itself.
    import psyco
    psyco.full()
except ImportError:
    pass

# Global configuration; per-user data lives under ~/.new-words.
config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

# Debug log goes to a fixed temp file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
class Normalizator:
    """Normalizes words for grouping: follows manual word links
    (the '@mainword' markers from the notes files) and then lower-cases
    and stems the word with a Snowball stemmer."""

    def __init__(self, language, linked_words=None):
        """language: ISO code ('de', 'en', ...); linked_words: word -> main word."""
        # Map ISO language codes to Snowball stemmer algorithm names.
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'es': 'spanish',
            'ru': 'russian',
            'it': 'italian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # BUG FIX: was a mutable default argument (linked_words={}), i.e. a
        # single dict shared by every instance created without the argument.
        self.linked_words = linked_words if linked_words is not None else {}

    def normalize(self, word):
        """Follow word links (cycle-safe), then stem the lower-cased word."""
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Returns the word that is the most relevant to the wordpairs_group.

        At the moment: returns the shortest word; among equally short ones,
        the most frequent (highest pair[0])."""
        def f(x, y):
            # Similarity of two words, used only by the disabled code below.
            return difflib.SequenceMatcher(
                None,
                x.lower(),
                y.lower()).ratio()

        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        best_match = list(x[1] for x in sorted(
            (x for x in wordpairs_group if len(x[1]) == minimal_length),
            key=lambda x: x[0],
            reverse=True))[0]

        return best_match

        # NOTE(review): everything below is unreachable because of the early
        # return above — apparently a disabled German dictionary-based
        # heuristic ('et'/'t' verb endings). Kept for reference.
        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        verb = False
        corrected_best_match = best_match
        if best_match[-2:] == 'et':
            word = best_match[:-1] + "n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = word
            corrected_best_match = best_match[:-2]
            verb = True

        if best_match[-1] == 't':
            word = best_match[:-1] + "en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]
            verb = True

        if corrected_best_match[0].lower() == corrected_best_match[0]:
            suggestions = [x for x in suggestions
                           if x[0].lower() == x[0]]

        if suggestions == []:
            return best_match + "_"
        return best_match + " " + (" ".join(
            sorted(
                suggestions,
                key=lambda x: f(x, corrected_best_match),
                reverse=True
            )
        ))

    def dictionary_suggestions(self, word):
        """Ask the external 'de-variants' helper for dictionary variants of word."""
        return [
            x.decode('utf-8').rstrip('\n')
            for x
            in subprocess.Popen(
                ["de-variants", word],
                stdout=subprocess.PIPE
            ).stdout.readlines()]
# Command-line interface of the script (also used in INTERNAL filter mode,
# see the -f option and the dispatch at the bottom of the file).
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    # BUG FIX: dest was copy-pasted as "compressed", so -k silently behaved
    # like -c and the known-words feature could never be detected.
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    # BUG FIX: help text was copy-pasted from -t; -T lists tags, it does
    # not tag anything.
    help="show tags used for known words",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    """Return every line of *filename* decoded as UTF-8 (newlines kept)."""
    with codecs.open(filename, "r", "utf-8") as input_file:
        return input_file.readlines()
def readlines_from_stdin():
    """Read all lines from standard input, decoding them as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
def words_from_line(line):
    """Split one text line into word tokens.

    Splits on runs of non-word characters, but a run starting with an
    apostrophe or underscore does not split (so "don't" stays one token).
    May yield empty tokens at the line edges."""
    stripped = line.rstrip('\n')
    separator = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)
    return separator.split(stripped)
def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    group_by: iterable of n-gram sizes to count. 1 counts single words;
    2 and 3 additionally count two-/three-word sequences joined by '_'.
    BUG FIX: default was a mutable list ([1]); a tuple is safe and
    behaves identically since group_by is only tested with `in`.
    """
    result = {}
    (a, b, c) = ("", "", "")
    for line in lines:
        words = words_from_line(line)
        for word in words:
            # Skip empty tokens and pure numbers.
            if re.match('[0-9]*$', word):
                continue
            result.setdefault(word, 0)
            result[word] += 1
            # NOTE(review): the n-grams below are built from the words
            # *preceding* the current one (a, b, c lag one shift behind),
            # so the last sequences of a text are never counted — confirm
            # whether that lag is intended.
            if 2 in group_by and a != "" and b != "":
                w = "%s_%s" % (a, b)
                result.setdefault(w, 0)
                result[w] += 1
            if 3 in group_by and not "" in [a, b, c]:
                w = "%s_%s_%s" % (a, b, c)
                result.setdefault(w, 0)
                result[w] += 1
            (a, b, c) = (b, c, word)

    logging.debug(result)
    return result
def load_vocabulary():
    """Load the known-words vocabulary for the current language as a
    word -> count dictionary."""
    vocabulary_path = "%s/%s.txt" % (config['config_directory'], config['language'])
    return get_words(readlines_from_file(vocabulary_path))
def notes_filenames():
    """Return the list of notes files for the current language
    (currently always a single file)."""
    filename = "%s/notes-%s.txt" % (config['config_directory'], config['language'])
    return [filename]
def load_notes(files):
    """Read notes files ('word<whitespace>note' per line) into a
    word -> {filename: note} mapping."""
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as note_file:
            for raw_line in note_file:
                (word, note) = re.split(r'\s+', raw_line.rstrip('\n'), maxsplit=1)
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
def add_notes(lines, notes):
    """Append the stored note (from the first notes file) to each
    vocabulary line whose second column has one; '#' lines pass through."""
    notes_filename = notes_filenames()[0]
    annotated = []
    for line in lines:
        if line.startswith('#'):
            annotated.append(line)
            continue
        # The word is the second whitespace-separated column of the line.
        match_object = re.search('^\s*\S+\s*(\S+)', line)
        if not match_object:
            annotated.append(line)
            continue
        word = match_object.group(1)
        if word in notes and notes_filename in notes[word]:
            line = "%-30s %s\n" % (line.rstrip('\n'), notes[word][notes_filename])
        annotated.append(line)
    return annotated
def remove_notes(lines, notes_group):
    """Strip the trailing notes column from vocabulary lines, collect the
    stripped notes, and persist them with save_notes.  Returns the
    stripped lines (each with a trailing newline)."""
    notes_filename = notes_filenames()[0]

    # Start from the notes already known for this notes file.
    notes = {}
    for word in notes_group.keys():
        if notes_filename in notes_group[word]:
            notes[word] = notes_group[word][notes_filename]

    stripped_lines = []
    for line in lines:
        line = line.rstrip('\n')
        # Columns: indent, count, space, word, space, note-text.
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            stripped_lines.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n",
            ]))
            notes[match_object.group(4)] = match_object.group(6)
        else:
            stripped_lines.append(line + "\n")

    save_notes(notes_filename, notes)
    return stripped_lines
def save_notes(filename, notes):
    """Rewrite the notes file *filename*, replacing the note of every word
    present in *notes* and appending entries for new words."""
    updated_lines = []
    seen_words = []
    # First pass: update notes for words already present in the file.
    with codecs.open(filename, "r", "utf-8") as note_file:
        for line in note_file:
            (word, old_note) = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                seen_words.append(word)
            updated_lines.append(line)
    # Second pass: append words that were not in the file yet.
    for word in notes.keys():
        if word not in seen_words:
            updated_lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as note_file:
        for line in updated_lines:
            note_file.write(line)
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2 (the entries of dict1 whose keys are not in dict2)
    """
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)
def dump_words(words, filename):
    """Write each word to *filename* (UTF-8), repeated once per its count,
    one occurrence per line."""
    with codecs.open(filename, "w+", "utf-8") as output_file:
        for word, count in words.items():
            output_file.write(("%s\n" % word) * count)
def error_message(text):
    # Print an error message to stdout (Python 2 print statement).
    # NOTE(review): arguably this should go to stderr — confirm callers.
    print text
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum the frequencies of all words that normalize to the same stem.

    word_pairs: iterable of (count, word); returns stem -> total count."""
    weight = {}
    for (num, word) in word_pairs:
        stem = normalizator.normalize(word)
        weight[stem] = weight.get(stem, 0) + num
    return weight
def find_linked_words(notes):
    """Extract manual word links from notes: a note containing '@mainword'
    maps its word to mainword.  Returns word -> main word."""
    linked_words = {}
    for (word, per_file_notes) in notes.items():
        for note in per_file_notes.values():
            if "@" not in note:
                continue
            result = re.search(r'\@(\S*)', note)
            # Only record a link when something actually follows the '@'.
            if result and result.group(1):
                linked_words[word] = result.group(1)
    return linked_words
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-level comparison of (count, word) pairs for sorting grouped
    wordlists: by word-group weight, then by normalized form, then by
    count.  Returns -1/0/1 like Python 2's cmp().  linked_words is
    currently unused (kept for interface compatibility)."""
    def _cmp(x, y):
        # Equivalent to Python 2's built-in cmp() for comparable values.
        return (x > y) - (x < y)

    (num1, word1) = pair1
    (num2, word2) = pair2
    normalized1 = normalizator.normalize(word1)
    normalized2 = normalizator.normalize(word2)

    by_group_weight = _cmp(wgw[normalized1], wgw[normalized2])
    if by_group_weight != 0:
        return by_group_weight
    by_normalized = _cmp(normalized1, normalized2)
    if by_normalized != 0:
        return by_normalized
    return _cmp(int(num1), int(num2))
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Print the (count, word) pairs as a UTF-8 wordlist to stdout.

    word_pairs must already be sorted; consecutive pairs with the same
    normalized form are treated as one group.  stats_only prints a
    one-line statistics table and returns; compressed_wordlist prints one
    best word per group; show_range / show_range_percentage stop the
    listing early.  Python 2 only (list-returning range, integer '/').
    """
    if stats_only:
        # Header row of the statistics table.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        # Data row.  NOTE(review): the last four format strings have no
        # commas between them, so they concatenate into a single element;
        # the %-11d field widths keep the columns aligned anyway — confirm
        # this is intentional before "fixing" it.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
            ]) % stats + "\n")
        return

    if print_stats:
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Coverage thresholds at which a "# <level>" marker line is printed:
    # every 5% from just above the starting percentage up to 90%, then
    # every 1% up to 101.  Relies on Python 2 semantics: integer '/' and
    # range() returning a list (the '+' concatenates the two lists).
    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: flush the previous group.
            #codecs.getwriter("utf-8")(sys.stdout).write(
            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
            if compressed_wordlist:
                # One line per group: summed count + representative word.
                compressed_word_pair = (
                    sum(x[0] for x in words_of_this_group),
                    normalizator.best_word_from_group(words_of_this_group)
                )
                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
            printed_words += 1

        # Track cumulative coverage and emit "# <level>" marker lines when
        # a threshold is crossed.  NOTE(review): indexes level_lines[0]
        # without an emptiness check — could raise IndexError if coverage
        # exceeds the last threshold; confirm inputs make that impossible.
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)

        # Early-stop conditions.
        if show_range >0 and printed_words >= show_range:
            break
        if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
            break
def filter_add_notes(args):
    """INTERNAL filter: rewrite the file args[0] in place, appending the
    stored notes to its vocabulary lines."""
    filename = args[0]
    annotated = add_notes(
        readlines_from_file(filename),
        load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as output_file:
        for line in annotated:
            output_file.write(line)
def filter_remove_notes(args):
    """INTERNAL filter: rewrite the file args[0] in place, stripping the
    notes column and saving the stripped notes back to the notes file."""
    filename = args[0]
    stripped = remove_notes(
        readlines_from_file(filename),
        load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as output_file:
        for line in stripped:
            output_file.write(line)
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.
    from-to/step
    from+delta/step
    num/step

    Raises ValueError for any malformed description.
    NOTE(review): for 'from+delta/step' the code currently treats delta as
    an absolute stop value, not start+delta — confirm intent.
    """
    def incorrect_parts_description(pd):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)

    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        start = 0
        stop = 0
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            (start, stop) = a.split("+", 1)
            start = int(start)
            stop = int(stop)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except Exception:
        # BUG FIX: the original except clause referenced the undefined name
        # `pd`, so malformed input raised NameError instead of the intended
        # ValueError; route through the helper with the real argument.
        # (Narrowed from a bare `except:` as well.)
        incorrect_parts_description(parts_description)
def take_part(lines, part_description = None):
    """Return the subset of *lines* selected by *part_description*
    (see parse_parts_description); a missing/empty description selects
    everything.

    BUG FIXES:
    - `result += lines[i]` extended the result with the individual
      CHARACTERS of each selected line; it must append the whole line.
    - an empty description ('') — which the only caller passes when no
      pages are configured — crashed in parse_parts_description; treat
      any falsy description as "all lines".
    """
    if not part_description:
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    n = len(lines)
    part_size = (1.0 * n) / step
    result = []
    for i in range(n):
        # NOTE(review): this condition looks dimensionally odd — comparing
        # part_size * i (a line-scaled value) against part indices; it may
        # have been meant as start * part_size <= i <= stop * part_size.
        # Preserved as-is; confirm against the -p option's intent.
        if part_size * i >= start and part_size * i <= stop:
            result.append(lines[i])
    return result
def filter_get_words_group_words_add_stat(args):
    """INTERNAL filter: read a text from stdin, compute word frequencies
    and statistics, and print the (optionally grouped/compressed) wordlist.

    Behavior is controlled through environment variables set by the shell
    wrapper: GROUP_WORDS_BY_TWO/THREE, STAT_ONLY, COMPRESSED_WORDLIST,
    SHOW_RANGE, SHOW_RANGE_PERCENTAGE, FILTER_WORDS,
    ALLOWED_WORDS_FILENAME, WORDS_GROUPING.  Python 2 only (iteritems,
    sorted(cmp=...)).
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    # NOTE(review): config['pages'] is never set anywhere in this file, so
    # take_part always receives '' here — confirm the -p option is wired up.
    lines = take_part(readlines_from_stdin(), config.get('pages', ''))
    group_by = [1]

    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
        group_by.append(2)
    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = False
    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
        stats_only = True

    compressed_wordlist = False
    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
        compressed_wordlist = True

    # Optional early-stop limits for the printed list.
    show_range = os.environ.get('SHOW_RANGE', '')
    if show_range != '':
        show_range = int(show_range)
    else:
        show_range = 0
    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
    if show_range_percentage != '':
        show_range_percentage = int(show_range_percentage)
    else:
        show_range_percentage = 0

    # Totals are computed before filtering so known/unknown can be derived.
    # NOTE(review): empty input gives stats['total'] == 0 and the
    # percentage lines below raise ZeroDivisionError — confirm acceptable.
    stats = {}
    stats['total'] = sum(words[x] for x in words.keys())
    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = 100.0*stats['total_known']/stats['total']
    stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['wps'] = 0 #FIXME
    stats['uwps'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # filter words by allowed_words_filter
    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
        # Keep only words whose normalized form appears in the allow-list.
        normalized_allowed_words = [
            normalizator.normalize(w.rstrip('\n'))
            for w in readlines_from_file(allowed_words_filename)
        ]

        result = {}
        for w, wn in words.iteritems():
            if normalizator.normalize(w) in normalized_allowed_words:
                result[w] = wn
        words = result

    # Build (count, word) pairs sorted by descending frequency.
    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    # Optionally regroup so words sharing a normalized stem sort together.
    if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
        words_with_freq = sorted(
            words_with_freq,
            cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
            reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )
# Top-level driver: parse the command line and, in INTERNAL filter mode
# (-f), dispatch to the corresponding filter function.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Names accepted by the -f/--function option.
    function_names = {
        'add_notes': filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        # BUG FIX: user-facing message said "Unkown".
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" " + x for x in sorted(function_names.keys())])))
        sys.exit(1)

#os.system("vim")