new-words

view new-words.py @ 59:7a7a88277c08

experimental script oneliners.sh moved to misc/
author Igor Chubin <igor@chub.in>
date Thu Nov 03 16:10:58 2011 +0100 (2011-11-03)
parents e25de9ea9184
children 3682038403ad
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 from __future__ import with_statement
5 import codecs
6 import difflib
7 import logging
8 import os
9 import optparse
10 import re
11 import subprocess
12 import sys
13 import Stemmer
14 import tempfile
# psyco is an optional accelerator: the script must keep working without it,
# so a failure to import or enable it is deliberately ignored.  Catching
# Exception (rather than everything) lets Ctrl-C and sys.exit() through.
try:
    import psyco
    psyco.full()
except Exception:
    pass

# Settings shared by the whole script.  The command-line handling at the
# bottom of the file adds further keys to this dictionary.
config = {
    # expanduser() also works when the HOME variable is not set.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
class Normalizator(object):
    """Maps words to the normal form shared by all words of a word group.

    The normal form of a word is the stem (as produced by the ``Stemmer``
    module) of the lower-cased word, after following the user-defined
    links between words.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- language code; one of 'de', 'en', 'es', 'ru', 'it',
                        'uk' (anything else raises KeyError)
        linked_words -- optional dict mapping a word to the word it should
                        be treated as
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A dict literal as default argument would be one object shared by
        # every instance created without explicit links.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the normal form of ``word``.

        Links are followed until a word without a link is reached or a
        word repeats (a cycle); the word reached is lower-cased and
        stemmed.  The link lookup itself is case-sensitive.
        """
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents ``wordpairs_group`` best.

        ``wordpairs_group`` is a non-empty sequence of (count, word) pairs.
        The shortest word wins; among equally short words the one with the
        highest count, and among those the one that comes first.

        NOTE: an experimental refinement based on dictionary_suggestions()
        used to follow the return statement here and could never run; it
        has been removed.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest_pairs = [
            pair for pair in wordpairs_group
            if len(pair[1]) == minimal_length
        ]
        # max() returns the first of several maximal items, which matches
        # taking the head of a stable descending sort.
        return max(shortest_pairs, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` command.

        The command is run with ``word`` as its only argument; its standard
        output is decoded as UTF-8 and returned as a list of lines without
        the trailing newline.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        # communicate() reads all output and waits for the child, so neither
        # the pipe nor the process is left behind.
        output = process.communicate()[0]
        lines = output.decode('utf-8').split(u'\n')
        if lines and lines[-1] == u'':
            # split() yields one extra empty item after a final newline
            lines.pop()
        return lines
# Command-line interface.  Each option is stored on the parsed options
# object under the attribute named by ``dest``.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# This option used to share dest="compressed" with -c, so that -k silently
# switched on the compressed word list.  It now has its own destination.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-G", "--words-grouping",
    help="turn off word grouping",
    action="store_true",
    dest="no_words_grouping")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-r", "--show-range",
    help="show only words specified number of words",
    action="store",
    dest="show_range")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# NOTE(review): the help text below is identical to the one of -t and
# presumably a copy-paste leftover; confirm the intended wording.
parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file ``filename`` as a list.

    The line terminators are kept.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())
def readlines_from_url(url):
    """Return the text of the page at ``url`` as a list of unicode lines.

    The page is rendered by ``lynx -dump`` and piped through a perl
    one-liner that removes the first http:// address of every line.  The
    error output of both programs ends up in the returned text.

    Both programs are started directly, without a shell, so that the URL is
    passed as one argument and no character in it is interpreted as shell
    syntax.

    Raises OSError when lynx or perl cannot be started.
    """
    lynx = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    perl = subprocess.Popen(
        ["perl", "-p", "-e",
         r"s@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@"],
        stdin=lynx.stdout,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    # Only perl must hold the read end of the pipe; otherwise lynx would
    # not notice if perl exited early.
    lynx.stdout.close()
    output = perl.communicate()[0]
    lynx.wait()
    return output.decode('utf-8').split(u'\n')
def readlines_from_stdin():
    """Return everything on standard input, decoded as UTF-8, as a list of lines."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
def words_from_line(line):
    """Split ``line`` into words and return them as a list.

    A separator is a run of non-word characters that does not start with
    an apostrophe, so an apostrophe directly after a letter stays inside
    the word.  Separators at the ends of the line produce empty strings in
    the result.
    """
    separator = re.compile(r"(?!['_])(?:\W)+", re.UNICODE)
    text = line.rstrip('\n')
    return separator.split(text)
def get_words(lines, group_by=None):
    """
    Returns hash of words in a file
    word => number

    lines    -- iterable of text lines
    group_by -- sequence of sequence lengths to count in addition to the
                single words; 2 adds pairs of consecutive words (stored as
                "first_second"), 3 adds triples ("first_second_third").
                Defaults to [1].

    Words consisting only of digits are ignored and are not part of any
    pair or triple.  Sequences may span line boundaries.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The last three accepted words, oldest first; "" means "no word yet".
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            # The pattern also matches the empty string, so this skips the
            # empty fields produced by the splitter as well as numbers.
            if re.match('[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1

            # Advance the window BEFORE counting, so that the sequences
            # ending with the current word are counted.  Counting first
            # made every sequence lag behind and lost the final ones.
            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                w = "%s_%s" % (b, c)
                result[w] = result.get(w, 0) + 1
            if count_triples and "" not in (a, b, c):
                w = "%s_%s_%s" % (a, b, c)
                result[w] = result.get(w, 0) + 1

    logging.debug(result)
    return result
def voc_filename():
    """Return the path of the vocabulary file of the configured language."""
    directory = config['config_directory']
    language = config['language']
    return "%s/%s.txt" % (directory, language)
def load_vocabulary():
    """Return the word counts (word => number) of the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
def notes_filenames():
    """Return the list of notes files of the configured language.

    The list always has exactly one element.
    """
    path = "%s/notes-%s.txt" % (config['config_directory'], config['language'])
    return [path]
def load_notes(files):
    """Read the notes files and return word => {filename => note}.

    Each line of a notes file consists of a word, whitespace, and the note
    (the rest of the line).  A line holding only a word yields an empty
    note; empty lines are skipped.  Both used to abort the whole run with
    a ValueError.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if fields == ['']:
                    continue
                word = fields[0]
                note = fields[1] if len(fields) > 1 else ''
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
def add_notes(lines, notes):
    """Return ``lines`` with the user's notes appended to the word lines.

    lines -- output lines; lines starting with '#' are passed through
    notes -- word => {filename => note}, as returned by load_notes()

    The word of a line is its second whitespace-separated field.  When the
    first notes file has a note for that word, the line is padded to 30
    characters and the note is appended.  Every input line produces exactly
    one output line: a word that has notes only from other files is passed
    through unchanged instead of being lost.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                note = notes.get(word, {}).get(notes_filename)
                if note is not None:
                    line = "%-30s %s\n" % (line.rstrip('\n'), note)
        result.append(line)
    return result
def remove_notes(lines, notes_group):
    """Strip the notes from ``lines``, save them, and return the bare lines.

    A line made of leading whitespace, two fields and further text is cut
    after the second field; the further text becomes the note of the second
    field.  All other lines are returned unchanged (with exactly one
    trailing newline).  The notes already known for the first notes file,
    updated with the ones found here, are written back with save_notes().
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        result.append("".join(found.group(1, 2, 3, 4)) + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(notes_filename, notes)
    return result
def save_notes(filename, notes):
    """Write ``notes`` (word => note) into the notes file ``filename``.

    The file is read first: a line whose word is in ``notes`` is replaced
    by the new note, every other line (including empty or otherwise
    unparseable ones, which used to raise ValueError) is kept as it is.
    Words not yet in the file are appended.  The file must already exist.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f:
            # Only the word is needed; the old note is replaced or kept
            # together with its line.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
def substract_dictionary(dict1, dict2):
    """
    Returns a new dictionary with the items of dict1
    whose keys are not in dict2
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)
def dump_words(words, filename):
    """Write the words of ``words`` (word => number) into ``filename``.

    Every word is written on a line of its own, as many times as its
    number says.  The file is encoded as UTF-8 and overwritten.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(("%s\n" % word) * count)
def error_message(text):
    """Print ``text`` on standard output."""
    print(text)
def find_wordgroups_weights(word_pairs, normalizator):
    """Return normalized word => sum of the numbers of all its words.

    ``word_pairs`` is an iterable of (number, word) pairs; words with the
    same normal form (according to ``normalizator``) form one group.
    """
    weight = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + count
    return weight
def find_linked_words(notes):
    """Return word => main word for all words linked in their notes.

    ``notes`` maps a word to {filename => note}.  A note links its word to
    the text that directly follows the first '@' of the note, up to the
    next whitespace; a first '@' followed by nothing links nothing.
    """
    linked_words = {}
    for (word, notes_by_file) in notes.items():
        for note in notes_by_file.values():
            found = re.search(r'\@(\S*)', note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Compare two (number, word) pairs; return -1, 0 or 1.

    The pairs are ordered by the weight of their word group (looked up in
    ``wgw`` by normalized word), then by the normalized word itself, then
    by their number.  ``linked_words`` is not used.
    """
    def three_way(left, right):
        # -1, 0 or 1 for less, equal, greater
        return (left > right) - (left < right)

    (num1, word1) = pair1
    (num2, word2) = pair2
    group1 = normalizator.normalize(word1)
    group2 = normalizator.normalize(word2)

    return (three_way(wgw[group1], wgw[group2])
            or three_way(group1, group2)
            or three_way(int(num1), int(num2)))
def _compressed_line(words_of_group, normalizator):
    """Format a group of (number, word) pairs as one output line."""
    return "%10s %s\n" % (
        sum(x[0] for x in words_of_group),
        normalizator.best_word_from_group(words_of_group))


def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list; nothing is printed here.

    word_pairs  -- (number, word) pairs; words of one group must be adjacent
    stats       -- dict with the keys language, percentage,
                   percentage_unknown, total_known, total, groups, words,
                   wps, uwps
    normalizator -- object providing normalize() and best_word_from_group()
    print_stats -- start the list with a '#' line summarizing ``stats``
    stats_only  -- return only a two-line statistics table
    compressed_wordlist -- one line per word group instead of per word
    show_range  -- stop after that many word lines (0: no limit)
    show_range_percentage -- stop when that percentage of the text is
                   covered (0: no limit)

    Returns a list of lines, or ONE STRING when ``stats_only`` is set.
    Between the word lines, '# <number>' marker lines are inserted as the
    covered percentage of the text grows.

    In compressed mode the last group is now part of the result; it used
    to be dropped because a group was only written when the next started.
    """
    result = []
    if stats_only:
        result.append(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # The last four fields are a single string (no commas):
                # each is padded to 11 characters instead of 10 + space.
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Percentages at which a marker line is due: steps of 5 from the next
    # multiple of 5 above the known percentage, steps of 1 from 90 on.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # a new group starts: the previous one is complete
            if compressed_wordlist:
                result.append(
                    _compressed_line(words_of_this_group, normalizator))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            # skip all the levels passed with this word
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # All pairs were processed (no limit was hit): the group still
        # being collected has to be written, too.
        if compressed_wordlist and words_of_this_group:
            result.append(_compressed_line(words_of_this_group, normalizator))

    return result
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.

    from-to/step      -> (from, to, step)
    from+delta/step   -> (from, from + delta, step)
    from/step         -> (from, from + 1, step)

    step must be a positive number.
    Raises ValueError if the description does not have one of these forms.
    """

    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if step <= 0:
            # the text is divided into ``step`` parts
            raise ValueError("step must be positive")
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            # the number after '+' is a length, not a position
            (start, delta) = a.split("+", 1)
            start = int(start)
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except (AttributeError, TypeError, ValueError):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
def take_part(lines, part_description = None):
    """Return the part of ``lines`` selected by ``part_description``.

    Without a description (None or '') ``lines`` itself is returned.
    Otherwise the description is parsed by parse_parts_description() into
    (start, stop, step); the lines are divided into ``step`` parts of equal
    size and the lines whose index lies between the beginning of part
    ``start`` and the beginning of part ``stop`` (both inclusive) are
    returned.
    """
    if part_description in (None, ''):
        return lines
    (first, last, parts) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / parts
    lower = first * part_size
    upper = last * part_size
    return [line for (i, line) in enumerate(lines) if lower <= i <= upper]
def _new_tempfile():
    """Create an empty temporary file and return its path.

    mkstemp() also returns an open file descriptor; it is closed here
    because all users reopen the file by name.
    """
    (fd, path) = tempfile.mkstemp(prefix='new-word')
    os.close(fd)
    return path


def _deleted_words(lines_before, lines_after):
    """Return the words of the lines that are in lines_before only.

    '#' lines (statistics and level markers) are not word lines and are
    ignored; deleting one of them used to add its text to the result.
    """
    remaining = set(lines_after)
    deleted_words = []
    for line in lines_before:
        if line in remaining or line.startswith('#'):
            continue
        line = line.strip()
        if ' ' in line:
            # drop the number in front of the word ...
            word = re.split(r'\s+', line, 1)[1]
            if ' ' in word:
                # ... and anything that follows the word
                word = re.split(r'\s+', word, 1)[0]
            deleted_words.append(word)
    return deleted_words


def filter_get_words_group_words_add_stat(args):
    """Run the whole program as configured in ``config``.

    args -- positional command-line arguments; the first one, if any, is
            the input: a URL when it contains 'http://', a file name
            otherwise.  Without arguments standard input is read.

    The words of the input are counted, the words of the vocabulary are
    removed (unless 'no_filter' is set) and the rest is sorted and
    formatted by print_words_sorted().  In non-interactive mode, or when
    only the statistics are wanted, the result is written to standard
    output.  Otherwise it is opened in vim; every word whose line the user
    deleted there is appended to the vocabulary file, and the notes the
    user wrote next to the words are saved.

    Exits with status 1 when the input is empty.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    if len(args) > 0:
        if 'http://' in args[0]:
            input_lines = readlines_from_url(args[0])
        else:
            input_lines = readlines_from_file(args[0])
    else:
        input_lines = readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    group_by = [1]
    if 'two_words' in config:
        group_by.append(2)
    if 'three_words' in config:
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = 'text_stats' in config
    compressed_wordlist = 'compressed' in config
    show_range = int(config.get('show_range', 0))
    show_range_percentage = int(config.get('show_range_percentage', 0))

    stats = {}
    stats['total'] = sum(words.values())
    if not 'no_filter' in config:
        words = substract_dictionary(words, vocabulary)

    # NOTE(review): an input without a single countable word makes
    # stats['total'] zero and the divisions below fail.
    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = 100.0*stats['total_known']/stats['total']
    stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0  #FIXME
    stats['uwps'] = 0  #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # filter words by allowed_words_filter
    if 'allowed_words' in config:
        normalized_allowed_words = set(
            normalizator.normalize(w.rstrip('\n'))
            for w in readlines_from_file(config['allowed_words']))
        words = dict(
            (w, wn) for (w, wn) in words.items()
            if normalizator.normalize(w) in normalized_allowed_words)

    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)
    ]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if not 'no_words_grouping' in config or not config['no_words_grouping']:
        def group_order(pair):
            # Same order as compare_word_pairs(): group weight, normalized
            # word, number.  As a sort key each word is normalized once
            # instead of once per comparison.
            (num, word) = pair
            normalized = normalizator.normalize(word)
            return (wgw[normalized], normalized, int(num))

        words_with_freq = sorted(
            words_with_freq,
            key=group_order,
            reverse=True)

    output = print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )

    if ('non_interactive' in config or 'text_stats' in config):
        codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
        return

    # Interactive mode.  The temporary files are removed even when
    # something goes wrong in between.
    original_text_tempfile = _new_tempfile()
    temp1 = _new_tempfile()
    temp2 = _new_tempfile()
    try:
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(lines))
        with codecs.open(temp1, "w", "utf-8") as f:
            f.write("".join(output))
        with codecs.open(temp2, "w", "utf-8") as f:
            f.write("".join(add_notes(output, notes)))

        # the editor session finds the analyzed text through this variable
        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=temp2))

        lines_after = remove_notes(readlines_from_file(temp2), notes)

        # the words whose lines were deleted in the editor are known now
        deleted_words = _deleted_words(output, lines_after)
        with codecs.open(voc_filename(), "a", "utf-8") as f:
            f.write("\n".join(deleted_words + ['']))
    finally:
        for path in (temp1, temp2, original_text_tempfile):
            os.unlink(path)
# Transfer the command-line options into ``config`` and run the program.
(options, args) = parser.parse_args()

if options.language:
    config['language'] = options.language

# 'pages' is always set; the empty string stands for "no page selection".
config['pages'] = options.pages or ""

# Options with a value are copied only if they were given.
for option_name in ('allowed_words', 'show_range', 'show_range_percentage'):
    option_value = getattr(options, option_name)
    if option_value:
        config[option_name] = option_value

# Switches are recorded only if they are on: several of them are tested
# later by the mere presence of their key in ``config``.
for option_name in (
        'non_interactive',
        'text_stats',
        'compressed',
        'no_filter',
        'two_words',
        'three_words',
        'no_words_grouping',
        ):
    if getattr(options, option_name):
        config[option_name] = True

filter_get_words_group_words_add_stat(args)
754 #if options.function:
755 # function_names = {
756 # 'get_words_group_words_add_stat': ,
757 # }
758 # if options.function in function_names:
759 # function_names[options.function](args)
760 # else:
761 # error_message("Unkown function %s.\nAvailable functions:\n%s" % (
762 # options.function, "".join([" "+x for x in sorted(function_names.keys())])))
763 # sys.exit(1)
764 #
768 #os.system("vim")