new-words

view new-words.py @ 63:1b8b30ad7c95

vocabulary filename option
author Igor Chubin <igor@chub.in>
date Sat Nov 12 14:03:20 2011 +0100 (2011-11-12)
parents 3682038403ad
children 5a003076eb11
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 from __future__ import with_statement
5 import codecs
6 import difflib
7 import logging
8 import os
9 import optparse
10 import re
11 import subprocess
12 import sys
13 import Stemmer
14 import tempfile
# psyco is an optional JIT accelerator: the script works without it, so any
# failure to load or enable it is ignored. Only Exception is caught so that
# KeyboardInterrupt/SystemExit still propagate.
try:
    import psyco
    psyco.full()
except Exception:
    pass

# Runtime settings; command-line options are merged into this dict later.
config = {
    # expanduser() honours $HOME when it is set and still resolves the home
    # directory when it is not (os.environ['HOME'] would raise KeyError).
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
class Normalizator:
    """Reduce word forms to a shared normalized form.

    A word is first redirected through ``linked_words`` (a mapping from a
    word to its "main" word), then lower-cased and passed to the stemmer
    selected by the language code.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- two-letter code; must be a key of the table below,
                        otherwise KeyError is raised.
        linked_words -- optional dict mapping a word to its main word.
        """
        stemmer_algorithm = {
            'de': 'german',
            'fr': 'french',
            'en': 'english',
            'es': 'spanish',
            'ru': 'russian',
            'it': 'italian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a mutable default argument would be
        # shared between every instance created without linked_words.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the stem of ``word`` after following its chain of links."""
        visited = []
        # Follow word -> main word links; stopping at a word that was already
        # visited guarantees termination when the links form a cycle.
        while word in self.linked_words and word not in visited:
            visited.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that best represents a group of (count, word) pairs.

        The shortest word wins; among equally short words the one with the
        highest count is chosen, and the earliest pair wins remaining ties.
        Raises ValueError when the group is empty.

        A former refinement based on dictionary_suggestions() sat behind an
        unconditional ``return`` and could never run; it has been removed.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [
            pair for pair in wordpairs_group
            if len(pair[1]) == minimal_length
        ]
        # max() returns the first maximal item, which matches the previous
        # stable descending sort followed by [0].
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` command.

        Each line is decoded as UTF-8 and stripped of its trailing newline.
        Raises OSError when the command cannot be started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        try:
            raw_lines = process.stdout.readlines()
        finally:
            # Close the pipe and reap the child so neither is leaked.
            process.stdout.close()
            process.wait()
        return [x.decode('utf-8').rstrip('\n') for x in raw_lines]
# Command-line interface. Options are registered from a table so that each
# flag, its destination and its help text can be checked at a glance.
parser = optparse.OptionParser()

# (short flag, long flag, action, dest, help) -- order defines --help order.
_OPTION_TABLE = [
    ("-a", "--no-marks", "store_true", "no_marks",
     "don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]"),
    ("-c", "--compressed", "store_true", "compressed",
     "show compressed wordlist: one word per group"),
    # Fixed: this option used to write to dest="compressed" (copy-paste), so
    # -k silently switched on the compressed word list.
    ("-k", "--known-words", "store_true", "known_words",
     "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "store", "language",
     "specify language of text"),
    ("-f", "--allowed-words", "store", "allowed_words",
     "file with list of allowed words (words that will be shown in the output)"),
    ("-G", "--words-grouping", "store_true", "no_words_grouping",
     "turn off word grouping"),
    ("-X", "--function", "store", "function",
     "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "store", "merge_tag",
     "merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-M", "--merge-tagged", "store_true", "merge_tagged",
     "merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-n", "--non-interactive", "store_true", "non_interactive",
     "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "store_true", "no_filter",
     "switch off known words filtering"),
    ("-p", "--pages", "store", "pages",
     "work with specified pages only (pages = start-stop/total )"),
    ("-d", "--delete-tag", "store", "delete_tag",
     "delete subvocabulary of specified tag"),
    ("-r", "--show-range", "store", "show_range",
     "show only the specified number of words"),
    ("-R", "--show-range-percentage", "store", "show_range_percentage",
     "show only words that cover specified percentage of the text, skip the rest"),
    ("-s", "--text-stats", "store_true", "text_stats",
     "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "store_true", "voc_stats",
     "show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]"),
    ("-t", "--tag", "store", "tag",
     "tag known words with tag"),
    # Fixed: the help text was a copy of the one for -t.
    ("-T", "--show-tags", "store_true", "show_tags",
     "show tags"),
    ("-v", "--vocabulary-filename", "store", "vocabulary_filename",
     "use specified file as a vocabulary"),
    ("-2", "--two-words", "store_true", "two_words",
     "find 2 words' sequences"),
    ("-3", "--three-words", "store_true", "three_words",
     "find 3 words' sequences"),
]

for (_short, _long, _action, _dest, _help) in _OPTION_TABLE:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)
def readlines_from_file(filename):
    """Return the lines of a UTF-8 encoded text file as a list.

    Line endings are kept. Raises IOError/OSError if the file cannot be
    opened and UnicodeDecodeError if it is not valid UTF-8.
    """
    with codecs.open(filename, "r", "utf-8") as f:
        # readlines() already builds the list; no need to copy it item by item.
        return f.readlines()
# First http:// link on a line, as previously removed by the perl one-liner
# `s@http://[...]*@@` (no /g flag, hence one substitution per line).
_HTTP_LINK_RE = re.compile(r'http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*')


def readlines_from_url(url):
    """Return the text rendering of a web page as a list of lines.

    The page is rendered with ``lynx -dump`` (stderr is merged into the
    output, as before), decoded as UTF-8 and split on newlines; the returned
    lines carry no line terminator. The first http:// link of every line is
    removed.

    The URL is passed to lynx as a separate argument instead of being pasted
    into a shell command, so quotes or shell metacharacters in it can neither
    break the command nor be executed.

    Raises OSError when lynx cannot be started.
    """
    process = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    dump = process.communicate()[0].decode('utf-8')
    return [_HTTP_LINK_RE.sub('', line, 1) for line in dump.split('\n')]
def readlines_from_stdin():
    """Return everything on standard input, decoded as UTF-8, as a list of lines."""
    # The UTF-8 reader needs a byte stream. On Python 3 that is
    # sys.stdin.buffer; where no such attribute exists sys.stdin itself is
    # used, exactly as before.
    byte_stream = getattr(sys.stdin, 'buffer', sys.stdin)
    return codecs.getreader("utf-8")(byte_stream).readlines()
# Word separator: a run of non-word characters that does not *start* with an
# apostrophe, so that forms such as "don't" stay in one piece.
_WORD_SEPARATOR_RE = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Tokens that are empty or consist of digits only; they are not counted.
_NUMBER_RE = re.compile(r'[0-9]*$')


def words_from_line(line):
    """Split a line of text into words.

    The trailing newline is dropped first. Leading or trailing separators
    produce empty strings in the result, as re.split() does.
    """
    line = line.rstrip('\n')
    return _WORD_SEPARATOR_RE.split(line)


def get_words(lines, group_by=(1,)):
    """Count the words (and optionally word sequences) in ``lines``.

    Returns a dict  word => number of occurrences.  Single words are always
    counted. When ``group_by`` contains 2 and/or 3, every sequence of two
    and/or three consecutive words is counted as well, under the key
    "w1_w2" or "w1_w2_w3". Sequences continue across line boundaries and
    across skipped numeric tokens.
    """
    result = {}
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    # The two most recently counted words; "" means "not seen yet".
    # Counted words are never empty, so "" is a safe marker.
    before_previous, previous = "", ""
    for line in lines:
        for word in words_from_line(line):
            if _NUMBER_RE.match(word):
                continue
            result[word] = result.get(word, 0) + 1
            # Sequences are formed with the *current* word as their last
            # element. The earlier code built them only from words seen
            # before it, so the trailing sequences were never counted.
            if count_pairs and previous != "":
                pair = "%s_%s" % (previous, word)
                result[pair] = result.get(pair, 0) + 1
            if count_triples and before_previous != "":
                triple = "%s_%s_%s" % (before_previous, previous, word)
                result[triple] = result.get(triple, 0) + 1
            before_previous, previous = previous, word

    logging.debug(result)
    return result
def voc_filename():
    """Return the path of the vocabulary file.

    An explicit 'vocabulary_filename' entry in config takes precedence;
    otherwise the path is <config_directory>/<language>.txt.
    """
    if 'vocabulary_filename' not in config:
        directory = config['config_directory']
        language = config['language']
        return "%s/%s.txt" % (directory, language)
    return config['vocabulary_filename']
def load_vocabulary():
    """Return the word counts (see get_words) of the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
def notes_filenames():
    """Return the list of notes files: <config_directory>/notes-<language>.txt."""
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
def load_notes(files):
    """Read notes files into a dict  word => {filename => note}.

    Every line is expected to hold a word, whitespace and the note text.
    Lines without both parts (blank lines, a lone word) carry no note and
    are skipped; previously they aborted loading with a ValueError.
    Raises IOError/OSError if a file cannot be opened.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) != 2:
                    continue
                (word, note) = fields
                notes.setdefault(word, {})[filename] = note
    return notes
def add_notes(lines, notes):
    """Return ``lines`` with the stored note appended to each word line.

    lines -- output lines; lines starting with '#' are passed through.
    notes -- dict  word => {filename => note}  as built by load_notes().

    For a line consisting of two whitespace-separated tokens (by the look
    of the formats used elsewhere in this file: a count and a word), the
    note of the second token from the first notes file, if any, is appended
    after the line padded to 30 columns. All other lines are returned
    unchanged.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            # \s+ (not \s*) between the tokens: with \s* the pattern could
            # backtrack inside a single token and report its last character
            # as the word.
            match_object = re.search(r'^\s*\S+\s+(\S+)', line)
            if match_object:
                notes_of_word = notes.get(match_object.group(1), {})
                if notes_filename in notes_of_word:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes_of_word[notes_filename])
        result.append(line)
    return result
def remove_notes(lines, notes_group):
    """Strip the notes from edited lines and store them in the notes file.

    lines       -- lines as read back from the editor.
    notes_group -- dict  word => {filename => note}.

    A line of the form <space><token><space><token><space><rest> is cut
    after the second token; <rest> becomes the note of that token. Every
    other line is returned as is (with exactly one trailing newline). The
    notes of the first notes file, updated this way, are written back with
    save_notes(). Returns the list of stripped lines.
    """
    notes_filename = notes_filenames()[0]

    # Start from the notes already known for this notes file.
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', text)
        if found is None:
            result.append(text + "\n")
            continue
        (indent, first, gap, word, _, note) = found.groups()
        result.append(indent + first + gap + word + "\n")
        notes[word] = note

    save_notes(notes_filename, notes)
    return result
def save_notes(filename, notes):
    """Merge ``notes`` (dict word => note) into the notes file ``filename``.

    Existing lines keep their position: a line whose first token is a key of
    ``notes`` is rewritten with the new note, any other line (including
    blank or single-token lines, which used to raise ValueError) is kept
    unchanged. Words not yet present in the file are appended.
    Raises IOError/OSError if the file cannot be read or written.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the first token is needed; the old note is replaced.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)

    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
def substract_dictionary(dict1, dict2):
    """Return a new dict with the items of dict1 whose keys are not in dict2.

    Neither argument is modified.
    """
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)
def dump_words(words, filename):
    """Write every word of ``words`` (dict word => count) to ``filename``.

    Each word is written on its own line, repeated ``count`` times; the
    file is created or truncated and encoded as UTF-8.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(("%s\n" % word) * count)
def error_message(text):
    """Print ``text`` on standard output."""
    print(text)
def find_wordgroups_weights(word_pairs, normalizator):
    """Return a dict  normalized word => total count of its group.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) gives the group key.
    """
    weight = {}
    for (num, word) in word_pairs:
        normalized = normalizator.normalize(word)
        weight[normalized] = weight.get(normalized, 0) + num
    return weight
def find_linked_words(notes):
    """Return a dict  word => main word  extracted from the notes.

    notes -- dict  word => {filename => note}.

    A note links its word to a main word with the marker "@main_word".
    Only the first "@" of a note is examined; if no text follows it
    directly, the note defines no link. When several notes of a word
    contain a link, the last one examined wins.
    """
    linked_words = {}
    for (word, notes_by_file) in notes.items():
        for note in notes_by_file.values():
            # A note without "@" simply does not match, so no separate
            # substring test is needed.
            found = re.search(r'@(\S*)', note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs; returns -1, 0 or 1.

    Pairs are ordered by, in turn: the weight of their word group
    (``wgw[normalized word]``), the normalized word itself, and their own
    count. ``linked_words`` is unused and kept only so that existing callers
    keep working.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Tuples compare element by element, which is exactly the chain of
    # comparisons described above and does not need the cmp() builtin
    # (removed in Python 3).
    key1 = (wgw[normalized_word1], normalized_word1, int(num1))
    key2 = (wgw[normalized_word2], normalized_word2, int(num2))
    return (key1 > key2) - (key1 < key2)
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list (or only the text statistics) for output.

    word_pairs   -- (count, word) pairs, already in output order; words of
                    one group are expected to be adjacent.
    stats        -- dict with the keys used by the format strings below.
    normalizator -- provides normalize() and best_word_from_group().
    print_stats  -- prepend a "# ..." summary line.
    stats_only   -- return just a two-line statistics table.
    compressed_wordlist -- emit one line per word group instead of one per
                    word.
    show_range   -- stop after this many printed lines (0 = no limit).
    show_range_percentage -- stop once this share of the text is covered
                    (0 = no limit).

    Returns a single string when ``stats_only`` is set and a list of lines
    otherwise; both work with "".join().  Lines "# <n>" mark the coverage
    level reached so far.
    """
    if stats_only:
        header = " ".join([
            "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
            ]]) + "\n"
        row = " ".join([
            "%(language)-10s",
            "%(percentage)-10.2f",
            "%(percentage_unknown)-10.2f",
            # The last four columns are one fused string: each is 11 wide,
            # i.e. a 10-wide column plus the separating space.
            "%(total_known)-11d"
            "%(total)-11d"
            "%(wps)-11d"
            "%(uwps)-11d"
            ]) % stats + "\n"
        return header + row

    result = []
    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Coverage levels still to be announced: steps of 5 above the current
    # percentage up to 90, then every percent from 90 to 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0

    def compressed_line(group):
        # One line for a whole group: summed count and representative word.
        return "%10s %s\n" % (
            sum(x[0] for x in group),
            normalizator.best_word_from_group(group))

    for word_pair in word_pairs:
        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: emit the finished one.
            if compressed_wordlist:
                result.append(compressed_line(words_of_this_group))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        coverage = 100.0 * known / total
        if coverage >= level_lines[0]:
            current_level = level_lines[0]
            # Skip every level that has been passed; only the highest one
            # gets a marker line.
            while coverage > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and coverage >= show_range_percentage:
            break
    else:
        # The loop ran to the end: the last group is still pending.
        # Previously it was never emitted in compressed mode.
        if compressed_wordlist and words_of_this_group:
            result.append(compressed_line(words_of_this_group))

    return result
def parse_parts_description(parts_description):
    """Parse a parts description into the triad (start, stop, step).

    Accepted forms, all numbers being integers:

        from-to/step      -> (from, to, step)
        from+delta/step   -> (from, from + delta, step)
        from/step         -> (from, from + 1, step)

    ``step`` is the number of parts the text is divided into and must be
    positive. Raises ValueError for anything else.
    """
    try:
        (span, step) = parts_description.split("/", 1)
        step = int(step)
        if step <= 0:
            raise ValueError("step must be positive")
        if '-' in span:
            (start, stop) = span.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in span:
            # "from+delta": the second number is a length, not an end.
            # It used to be taken as the end, like in "from-to".
            (start, delta) = span.split("+", 1)
            start = int(start)
            stop = start + int(delta)
        else:
            start = int(span)
            stop = start + 1
        return (start, stop, step)
    except (AttributeError, TypeError, ValueError):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)


def take_part(lines, part_description=None):
    """Return the slice of ``lines`` selected by ``part_description``.

    With no description (None or '') all lines are returned. Otherwise the
    lines are divided into ``step`` equal parts and the parts ``start`` up
    to, but not including, ``stop`` are returned (see
    parse_parts_description). Raises ValueError for a malformed description.
    """
    if part_description is None or part_description == '':
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    first = start * part_size
    end = stop * part_size
    # The upper bound is exclusive: with "<=" the line sitting exactly on a
    # part boundary was returned for two adjacent parts.
    return [line for (i, line) in enumerate(lines) if first <= i < end]
def filter_get_words_group_words_add_stat(args):
    """Extract the unknown words of a text, rank them and present them.

    args -- positional command-line arguments; args[0], if present, is a
            file name or (when it contains 'http://') a URL. Without
            arguments the text is read from standard input.

    The words of the text (restricted to config['pages'] if set) are counted,
    known vocabulary words are removed unless 'no_filter' is configured, and
    the rest is sorted by word group. In non-interactive or statistics mode
    the result is written to standard output. Otherwise it is opened in vim
    together with the stored notes; afterwards notes are saved and every
    word whose line was deleted is appended to the vocabulary file.

    Exits with status 1 when there is no input or the input has no words.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    if len(args) > 0:
        if 'http://' in args[0]:
            input_lines = readlines_from_url(args[0])
        else:
            input_lines = readlines_from_file(args[0])
    else:
        input_lines = readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    # mkstemp() returns an already open descriptor; close it right away so
    # it is not leaked (the file is reopened by name below).
    (fd, original_text_tempfile) = tempfile.mkstemp(prefix='new-word')
    os.close(fd)
    edited_tempfile = None
    try:
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(lines))

        group_by = [1]
        if 'two_words' in config:
            group_by.append(2)
        if 'three_words' in config:
            group_by.append(3)
        words = get_words(lines, group_by)

        stats_only = 'text_stats' in config
        compressed_wordlist = 'compressed' in config
        show_range = int(config.get('show_range', 0))
        # float() accepts everything int() did and also values like "97.5".
        show_range_percentage = float(config.get('show_range_percentage', 0))

        stats = {}
        stats['total'] = sum(words.values())
        if stats['total'] == 0:
            # Every percentage below divides by the total.
            sys.stderr.write("Nothing to do, no words found in the input, exiting.\n")
            sys.exit(1)
        if 'no_filter' not in config:
            words = substract_dictionary(words, vocabulary)

        stats['total_unknown'] = sum(words.values())
        stats['total_known'] = stats['total'] - stats['total_unknown']
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
        stats['groups'] = 0
        stats['words'] = len(words)
        stats['sentences'] = 0  #FIXME
        stats['wps'] = 0  #FIXME
        stats['uwps'] = 0  #FIXME
        stats['language'] = config['language']

        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)

        # Keep only the words whose normalized form is in the allowed list.
        if 'allowed_words' in config:
            normalized_allowed_words = set(
                normalizator.normalize(w.rstrip('\n'))
                for w in readlines_from_file(config['allowed_words']))
            words = dict(
                (w, wn) for (w, wn) in words.items()
                if normalizator.normalize(w) in normalized_allowed_words)

        words_with_freq = [
            (words[k], k)
            for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)
        ]

        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        if not 'no_words_grouping' in config or not config['no_words_grouping']:
            # Same ordering as compare_word_pairs(): group weight, then
            # normalized word, then own count -- expressed as a sort key so
            # each word is normalized once and no cmp= argument is needed.
            def grouping_key(pair):
                normalized = normalizator.normalize(pair[1])
                return (wgw[normalized], normalized, int(pair[0]))

            words_with_freq = sorted(
                words_with_freq, key=grouping_key, reverse=True)

        output = print_words_sorted(
            words_with_freq,
            stats,
            normalizator,
            stats_only=stats_only,
            compressed_wordlist=compressed_wordlist,
            show_range=show_range,
            show_range_percentage=show_range_percentage,
            )

        if 'non_interactive' in config or 'text_stats' in config:
            # The UTF-8 writer needs a byte stream (sys.stdout.buffer where
            # it exists, sys.stdout itself otherwise).
            byte_stream = getattr(sys.stdout, 'buffer', sys.stdout)
            codecs.getwriter("utf-8")(byte_stream).write("".join(output))
            return

        (fd, edited_tempfile) = tempfile.mkstemp(prefix='new-word')
        os.close(fd)
        with codecs.open(edited_tempfile, "w", "utf-8") as f:
            f.write("".join(add_notes(output, notes)))

        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=edited_tempfile))

        lines_after = set(
            remove_notes(readlines_from_file(edited_tempfile), notes))

        # Every word line that the user deleted in the editor is a word
        # the user knows.
        deleted_words = []
        for line in output:
            if line in lines_after:
                continue
            if line.startswith('#'):
                # Summary and level markers are not words; deleting them
                # must not add anything to the vocabulary.
                continue
            line = line.strip()
            if ' ' in line:
                word = re.split(r'\s+', line, 1)[1]
                if ' ' in word:
                    word = re.split(r'\s+', word, 1)[0]
                deleted_words.append(word)

        with codecs.open(voc_filename(), "a", "utf-8") as f:
            f.write("\n".join(deleted_words + ['']))
    finally:
        # Remove the temporary files on every exit path, errors included.
        if edited_tempfile is not None:
            os.unlink(edited_tempfile)
        os.unlink(original_text_tempfile)
# Script entry: copy the given command-line options into config and run.
(options, args) = parser.parse_args()

# 'pages' is always present in config, as an empty string when not given.
config['pages'] = options.pages or ""

# Every other option is stored under its own name, and only when it was
# given (flags are stored as True). 'vocabulary_filename' is included here:
# it was parsed before but never reached config, so -v had no effect.
for _name in (
        'language',
        'allowed_words',
        'show_range',
        'show_range_percentage',
        'vocabulary_filename',
        'non_interactive',
        'text_stats',
        'compressed',
        'no_filter',
        'two_words',
        'three_words',
        'no_words_grouping',
        ):
    _value = getattr(options, _name)
    if _value:
        config[_name] = _value

filter_get_words_group_words_add_stat(args)

# TODO: dispatch on options.function (-X) once more than one subsystem
# exists; for now the only function is always run.