new-words

view new-words.py @ 48:7194bdb56475

new feature: -r and -R can specify number of words (or percentage) to show
author Igor Chubin <igor@chub.in>
date Tue Feb 08 20:35:38 2011 +0200 (2011-02-08)
parents d708e2c1bad8
children 00286f6bfa85
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 from __future__ import with_statement
5 import codecs
6 import logging
7 import os
8 import optparse
9 import re
10 import subprocess
11 import sys
12 import Stemmer
# psyco is an optional JIT accelerator; the script runs fine without it.
# Only a failed import is tolerated, so Ctrl-C and SystemExit are not swallowed.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Settings shared by the functions of this module.
config = {
    # Directory that holds the vocabulary and notes files.
    # expanduser() honours $HOME but does not fail when it is unset.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    # Language of the processed text; replaced by the -l/--language option.
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
class Normalizator(object):
    """Reduce words to a canonical form so that related forms share one key.

    A word is first followed through the user's ``linked_words`` mapping
    (word -> main word) and the result is lower-cased and stemmed.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer for *language*.

        language     -- one of 'de', 'en', 'ru', 'uk'; any other value
                        raises KeyError.
        linked_words -- optional dict mapping a word to its main word.
                        Defaults to no links (a fresh dict per instance,
                        never a shared mutable default).
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the stem of *word* after resolving its chain of links."""
        # Remember the words already followed so a cycle in linked_words
        # (a -> b -> a) terminates instead of looping forever.
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that is the most relevant to the wordpairs_group.

        wordpairs_group is a sequence of (count, word) pairs.  At the
        moment the shortest word wins; among equally short words the one
        with the highest count is returned (the earliest on a tie).
        Raises ValueError for an empty group.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() returns the first maximal element, matching a stable
        # descending sort by count.
        return max(shortest, key=lambda pair: pair[0])[1]
# Command line interface.  Every option stores into its own destination;
# options given without a value are simple boolean switches.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# This option used to store into "compressed", so -k silently behaved
# like -c; it now has a destination of its own.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# The help text here used to be a copy of the -t description.
parser.add_option(
    "-T", "--show-tags",
    help="show tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    """Return all lines of the UTF-8 file *filename*, line endings kept."""
    with codecs.open(filename, "r", "utf-8") as source:
        return source.readlines()
def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
# A separator is a run of non-word characters that does not *start* with
# an apostrophe, so "it's" stays one word ('_' is a word character anyway).
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Tokens that are empty or consist of digits only are not counted as words.
_EMPTY_OR_NUMBER = re.compile(r'[0-9]*$')


def words_from_line(line):
    """Split *line* into words, dropping trailing newlines first.

    The result may contain empty strings (for example when the line ends
    with punctuation); callers are expected to skip them.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))


def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    lines    -- iterable of text lines.
    group_by -- collection of n-gram sizes to count besides single words:
                2 adds "w1_w2" keys, 3 adds "w1_w2_w3" keys.  Single
                words are always counted.

    Empty tokens and pure numbers are ignored and do not take part in
    n-grams.  N-grams continue across line boundaries.
    """
    result = {}
    # Sliding window over the last three accepted words; c is the newest.
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            if _EMPTY_OR_NUMBER.match(word):
                continue
            result[word] = result.get(word, 0) + 1

            # Shift the window *before* building n-grams so that they end
            # with the current word.  Building them from the old window
            # lost the last two bigrams and the last trigram of the text.
            (a, b, c) = (b, c, word)
            if 2 in group_by and b != "":
                bigram = "%s_%s" % (b, c)
                result[bigram] = result.get(bigram, 0) + 1
            if 3 in group_by and a != "" and b != "":
                trigram = "%s_%s_%s" % (a, b, c)
                result[trigram] = result.get(trigram, 0) + 1

    logging.debug(result)
    return result
def load_vocabulary():
    """Return the word counts of the vocabulary file of the current language.

    The file is <config_directory>/<language>.txt.
    """
    vocabulary_path = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    return get_words(readlines_from_file(vocabulary_path))
def notes_filenames():
    """Return the list of notes files for the current language.

    Currently a single file: <config_directory>/notes-<language>.txt.
    """
    notes_path = "%s/notes-%s.txt" % (
        config['config_directory'], config['language'])
    return [notes_path]
def load_notes(files):
    """Read notes files and return {word: {filename: note}}.

    Each line holds a word, whitespace, and the note (the rest of the
    line).  A line with a word but no note yields an empty note; blank
    lines are skipped.  Both used to abort the whole run with ValueError.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                word = fields[0]
                if len(fields) > 1:
                    note = fields[1]
                else:
                    note = ''
                if not word and not note:
                    # Blank (or whitespace-only) line: nothing to record.
                    continue
                notes.setdefault(word, {})[filename] = note
    return notes
def add_notes(lines, notes):
    """Return *lines* with the user's note appended to every annotated word.

    lines -- word list lines; the word is the second whitespace-separated
             token of a line (the first one is the counter).
    notes -- {word: {filename: note}} as built by load_notes().

    Lines starting with '#', lines without a recognisable word and lines
    whose word has no note in the primary notes file are passed through
    unchanged.  Every input line appears in the result exactly once.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word_notes = notes.get(match_object.group(1), {})
                if notes_filename in word_notes:
                    # Pad the original line to a fixed width so the notes
                    # line up in a column.
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), word_notes[notes_filename])
        result.append(line)
    return result
def remove_notes(lines, notes_group):
    """Strip notes from word list lines and save them to the notes file.

    lines       -- lines of the form "<ws><count><ws><word><ws><note>".
    notes_group -- {word: {filename: note}}; the entries of the primary
                   notes file are the starting point, notes found in
                   *lines* override them.

    Returns the lines without their notes.  Lines that do not have the
    annotated form are returned as they are (with exactly one trailing
    newline).  As a side effect the merged notes are written with
    save_notes().
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        # Keep indentation, counter, spacing and word; drop the note.
        result.append("".join(found.group(1, 2, 3, 4)) + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(notes_filename, notes)
    return result
def save_notes(filename, notes):
    """Merge *notes* ({word: note}) into the notes file *filename*.

    Lines of the existing file keep their order.  A line whose word is in
    *notes* is rewritten with the new note, every other line (including
    blank or word-only lines, which used to raise ValueError) is kept
    verbatim.  Words that are not in the file yet are appended.  The file
    must already exist.
    """
    lines = []
    # A set keeps the "already written" check O(1) per word.
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the word is needed here; the old note is replaced.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)

    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2

    That is, a new dict with the items of dict1 whose keys are not in dict2.
    """
    return dict((key, dict1[key]) for key in dict1 if key not in dict2)
def dump_words(words, filename):
    """Write every word of *words* ({word: count}) to *filename*.

    Each word is written on its own line, repeated as many times as its
    count.  The file is created or truncated and encoded as UTF-8.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(count * ("%s\n" % word))
def error_message(text):
    """Report *text* to the user on standard error.

    The script is used as a filter, so standard output carries data;
    diagnostics must not be mixed into it.
    """
    sys.stderr.write("%s\n" % text)
def find_wordgroups_weights(word_pairs, normalizator):
    """Return {normalized word: total count} for (count, word) pairs.

    Words that normalize to the same form make up one group; the weight
    of a group is the sum of the counts of its words.
    """
    totals = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        totals[group] = totals.get(group, 0) + count
    return totals
def find_linked_words(notes):
    """Return {word: main word} for the links found in *notes*.

    notes is {word: {filename: note}}.  A note links its word to a main
    word with "@main_word"; only the first '@' of a note is looked at and
    a bare '@' (nothing attached to it) is ignored.
    """
    link_pattern = re.compile(r'\@(\S*)')
    linked_words = {}
    for (word, per_file_notes) in notes.items():
        for note in per_file_notes.values():
            found = link_pattern.search(note)
            if found is not None and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs; returns -1, 0 or 1.

    Pairs are ordered by the weight of their word group (wgw, keyed by
    normalized word), then by the normalized word itself, then by their
    own count.  linked_words is accepted but not used.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Tuples compare element by element, which is exactly the
    # "first criterion that differs decides" rule.
    rank1 = (wgw[normalized_word1], normalized_word1, int(num1))
    rank2 = (wgw[normalized_word2], normalized_word2, int(num2))
    return (rank1 > rank2) - (rank1 < rank2)
def _write_compressed_group(out, normalizator, words_of_this_group):
    """Write one line for a whole word group: summed count and best word."""
    compressed_word_pair = (
        sum(x[0] for x in words_of_this_group),
        normalizator.best_word_from_group(words_of_this_group)
    )
    out.write("%10s %s\n" % compressed_word_pair)


def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Write the word list (or only the statistics) to standard output.

    word_pairs  -- sequence of (count, word) tuples in output order; words
                   of one group must be adjacent for compressed output.
    stats       -- dict with the text statistics; 'percentage',
                   'total_known' and 'total' are always read, further keys
                   are read by the header line or the statistics table.
    normalizator -- object with normalize(word) and
                   best_word_from_group(pairs).
    print_stats -- write a "# ..." header line before the words.
    stats_only  -- write a two line statistics table and nothing else.
    compressed_wordlist -- write one line per word group instead of one
                   line per word.
    show_range  -- stop after this many printed lines (0 = no limit).
    show_range_percentage -- stop as soon as this percentage of the text
                   is covered (0 = no limit).

    While words are written, "# <level>" marker lines show the coverage
    of the text that is reached with the words printed so far.
    """
    # One UTF-8 writer for the whole function.  On Python 3 it must wrap
    # the byte stream behind sys.stdout; Python 2 has no such attribute
    # and sys.stdout itself accepts bytes.
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, 'buffer', sys.stdout))

    if stats_only:
        out.write(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        out.write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # The integer columns are 11 wide and therefore carry
                # their own separator; they are one format string.
                "%(total_known)-11d%(total)-11d%(wps)-11d%(uwps)-11d"
                ]) % stats + "\n")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Coverage levels still to be announced: steps of 5 from the next
    # multiple of 5 above the current percentage, then every percent
    # from 90 on.
    level_lines = (
        list(range(int(float(stats['percentage'])) // 5 * 5 + 5, 95, 5))
        + list(range(90, 102)))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: emit the finished one.
            if compressed_wordlist:
                _write_compressed_group(out, normalizator, words_of_this_group)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            out.write("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            out.write("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # The input is exhausted (no limit cut the output short): the
        # group that was being collected is complete and has to be
        # written too, otherwise the last group is silently lost.
        if compressed_wordlist and words_of_this_group:
            _write_compressed_group(out, normalizator, words_of_this_group)
def filter_add_notes(args):
    """Rewrite the file args[0] in place with the user's notes added."""
    target = args[0]
    original_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    annotated_lines = add_notes(original_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as out:
        out.write("".join(annotated_lines))
def filter_remove_notes(args):
    """Rewrite the file args[0] in place with the notes stripped.

    The notes found in the file are saved by remove_notes().
    """
    target = args[0]
    annotated_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    plain_lines = remove_notes(annotated_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as out:
        out.write("".join(plain_lines))
def _env_flag(name):
    """Return True if the environment variable *name* is set to 'YES'."""
    return os.environ.get(name) == 'YES'


def _env_int(name):
    """Return the environment variable *name* as int; 0 if unset or empty."""
    value = os.environ.get(name, '')
    if value != '':
        return int(value)
    return 0


def filter_get_words_group_words_add_stat(args):
    """Read text from stdin and print its word list with statistics.

    args is not used.  The behaviour is controlled by environment
    variables:

    GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE -- 'YES' also counts
        sequences of two / three words.
    STAT_ONLY -- 'YES' prints only the statistics table.
    COMPRESSED_WORDLIST -- 'YES' prints one line per word group.
    SHOW_RANGE -- maximal number of lines to print.
    SHOW_RANGE_PERCENTAGE -- stop when this text coverage is reached.
    FILTER_WORDS -- 'YES' removes the words of the vocabulary.
    WORDS_GROUPING -- 'YES' sorts related words next to each other.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if _env_flag('GROUP_WORDS_BY_TWO'):
        group_by.append(2)
    if _env_flag('GROUP_WORDS_BY_THREE'):
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = _env_flag('STAT_ONLY')
    compressed_wordlist = _env_flag('COMPRESSED_WORDLIST')
    show_range = _env_int('SHOW_RANGE')
    show_range_percentage = _env_int('SHOW_RANGE_PERCENTAGE')

    stats = {}
    stats['total'] = sum(words.values())
    if _env_flag('FILTER_WORDS'):
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    else:
        # Empty input: the ratios are undefined.  Report zeros instead of
        # failing with ZeroDivisionError.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0  #FIXME
    stats['uwps'] = 0  #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # (count, word) pairs, most frequent first.
    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if _env_flag('WORDS_GROUPING'):
        def group_rank(pair):
            # Same ordering as compare_word_pairs(): group weight, then
            # normalized word, then own count.  As a sort key every word
            # is normalized once instead of once per comparison.
            (count, word) = pair
            normalized = normalizator.normalize(word)
            return (wgw[normalized], normalized, int(count))

        words_with_freq = sorted(words_with_freq, key=group_rank, reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )
# Script entry: parse the command line and run the requested internal filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Filters selectable with -f/--function; each one takes the list of
    # positional arguments.
    function_names = {
        'add_notes': filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
528 #os.system("vim")