new-words

view new-words.py @ 45:5f90e44eecfc

new-words.py: turn words filtering and grouping on and off
author Igor Chubin <igor@chub.in>
date Fri Feb 04 06:18:50 2011 +0100 (2011-02-04)
parents 7eb1a8c3eade
children d708e2c1bad8
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 from __future__ import with_statement
5 import codecs
6 import logging
7 import os
8 import optparse
9 import re
10 import subprocess
11 import sys
12 import Stemmer
# psyco is an optional speed-up: use it when it can be imported and
# enabled, otherwise carry on without it.
try:
    import psyco
    psyco.full()
except Exception:
    # Still best-effort, but unlike a bare "except:" this lets
    # KeyboardInterrupt and SystemExit propagate.
    pass
# Settings shared by the functions of this file.  Reading HOME raises
# KeyError when that environment variable is not set.
config = dict(
    config_directory="%s/.new-words" % os.environ['HOME'],
    language='en',
)

# Debug messages go to a fixed log file.
logging.basicConfig(level=logging.DEBUG, filename='/tmp/new-words-py.log')
class Normalizator:
    """
    Maps a word to the key of its word group.

    The word is first replaced by the word it is linked to (following
    the chain of links), then lower-cased and stemmed.
    """

    def __init__(self, language, linked_words=None):
        """
        language     -- language code; must be a key of stemmer_algorithm
                        below, otherwise KeyError is raised
        linked_words -- optional dict  word => word it is linked to
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'ru' : 'russian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a mutable default argument would be
        # shared by every instance created without linked_words.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """
        Returns the stem of the lower-cased word that `word` is linked to,
        or of `word` itself when it has no link.

        The link lookup is done on the word as given (case-sensitive).
        """
        # Words already followed, so that a cycle of links terminates.
        seen = set()
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
# Command line interface.  Flags are stored as True when given and are
# None otherwise (optparse default).
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# Stored in its own attribute; it used to share dest="compressed" with -c,
# so the two flags could not be told apart.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# The help text used to be a copy of the one of -t.
parser.add_option(
    "-T", "--show-tags",
    help="show tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    """
    Returns the lines of a UTF-8 encoded file as a list
    (line endings are kept).
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return source.readlines()
def readlines_from_stdin():
    """
    Returns all lines of standard input, decoded as UTF-8.
    """
    utf8_reader = codecs.getreader("utf-8")
    decoded_stdin = utf8_reader(sys.stdin)
    return decoded_stdin.readlines()
# Word separator: a run of non-word characters that does not START with
# an apostrophe.  Compiled once instead of once per line.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

def words_from_line(line):
    """
    Splits a line into words.

    Trailing newlines are removed first.  The result may contain empty
    strings (for instance when the line starts or ends with a separator).
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))
def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    Single words are always counted.  When group_by contains 2 (3), every
    sequence of two (three) consecutive counted words is counted as well,
    under the key "w1_w2" ("w1_w2_w3").  Sequences continue across line
    boundaries and across skipped items.

    Empty strings and items that consist of digits only are skipped.
    """
    result = {}
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    # The two most recently counted words, oldest first ("" = none yet).
    (before_last, last) = ("", "")
    for line in lines:
        for word in words_from_line(line):
            if re.match('[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1
            # The sequences that END with the current word are counted
            # here, so the last sequences of the input are not lost.
            if count_pairs and last != "":
                w = "%s_%s" % (last, word)
                result[w] = result.get(w, 0) + 1
            # before_last is only set once last is set as well.
            if count_triples and before_last != "":
                w = "%s_%s_%s" % (before_last, last, word)
                result[w] = result.get(w, 0) + 1
            (before_last, last) = (last, word)

    logging.debug(result)
    return result
def load_vocabulary():
    """
    Returns the word counts (as computed by get_words) of the file
    <config_directory>/<language>.txt.
    """
    vocabulary_file = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    vocabulary_lines = readlines_from_file(vocabulary_file)
    return get_words(vocabulary_lines)
def notes_filenames():
    """
    Returns the list of notes files for the configured language:
    a single file, <config_directory>/notes-<language>.txt.
    """
    path = "%s/notes-%s.txt" % (
        config['config_directory'], config['language'])
    return [path]
def load_notes(files):
    """
    Reads notes files and returns
    word => {filename => note}

    Every line of a notes file is "<word><whitespace><note>".  Lines that
    have no whitespace at all (blank lines, a word without a note) are
    skipped.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) != 2:
                    # Nothing to split on: unpacking into (word, note)
                    # would raise ValueError.
                    continue
                (word, note) = fields
                notes.setdefault(word, {})[filename] = note
    return notes
def add_notes(lines, notes):
    """
    Returns a copy of lines in which the note of a word is appended to
    the line of that word.

    lines -- list of lines; lines starting with '#' are copied unchanged
    notes -- word => {filename => note}; only the notes stored under the
             first notes file are used

    The word of a line is what the regular expression below captures
    after the first run of non-blank characters.
    """
    own_notes_file = notes_filenames()[0]
    annotated = []
    for line in lines:
        if not line.startswith('#'):
            found = re.search(r'^\s*\S+\s*(\S+)', line)
            if found:
                word = found.group(1)
                if word in notes and own_notes_file in notes[word]:
                    # Pad the line to 30 columns and put the note after it.
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][own_notes_file])
        annotated.append(line)
    return annotated
def remove_notes(lines, notes_group):
    """
    Strips the notes from lines, stores them in the first notes file and
    returns the stripped lines.

    lines       -- list of lines; a line of the form
                   <blanks><field><blanks><word><blanks><note>
                   is cut after <word>, and <note> becomes the note of
                   <word>; any other line is kept
    notes_group -- word => {filename => note}; the notes already stored
                   under the first notes file are the starting point

    Every returned line ends with exactly one newline.
    """
    own_notes_file = notes_filenames()[0]
    notes = dict(
        (word, per_file[own_notes_file])
        for (word, per_file) in notes_group.items()
        if own_notes_file in per_file)

    stripped = []
    for line in lines:
        line = line.rstrip('\n')
        found = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if found is None:
            stripped.append(line + "\n")
            continue
        (lead, first, gap, word, _separator, note) = found.groups()
        stripped.append(lead + first + gap + word + "\n")
        notes[word] = note

    save_notes(own_notes_file, notes)
    return stripped
def save_notes(filename, notes):
    """
    Updates the notes file `filename` with notes (word => note).

    A line whose first field is a word of notes is rewritten with the new
    note; all other lines (blank or note-less lines included) are kept as
    they are.  Words of notes that are not in the file yet are appended.
    The file must already exist.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the word is needed, so a line without a note must not
            # make the parsing fail.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2:
    a new dict with the items of dict1 whose key is not in dict2
    """
    return dict((key, dict1[key]) for key in dict1 if key not in dict2)
def dump_words(words, filename):
    """
    Writes every word of words (word => number) to the UTF-8 file
    `filename`, one per line, repeated `number` times.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            repeated = ("%s\n" % word) * count
            out.write(repeated)
def error_message(text):
    """
    Prints text, followed by a newline, to standard output.
    """
    # Parenthesised form: same output as the print statement on Python 2,
    # and valid syntax on Python 3 as well.
    print(text)
def find_wordgroups_weights(word_pairs, normalizator):
    """
    Returns
    normalized word => sum of the numbers of all words of that group

    word_pairs   -- iterable of (number, word)
    normalizator -- object whose normalize(word) gives the group key
    """
    weight = {}
    for (num, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + num
    return weight
def find_linked_words(notes):
    """
    Returns
    word => main word

    notes -- word => {filename => note}

    A note links its word to a main word with "@mainword"; only the first
    "@" of a note is looked at, and an "@" that is not followed by a
    non-blank character links nothing.
    """
    linked_words = {}
    for (word, per_file) in notes.items():
        for note in per_file.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """
    Three-way comparison (-1, 0 or 1) of two (number, word) pairs.

    The pairs are compared by the weight of their word group first, then
    by the normalized word, then by their number.

    wgw          -- normalized word => weight of its group
    normalizator -- object whose normalize(word) gives the group key
    linked_words -- not used
    """
    def three_way(left, right):
        # Same result as cmp(), which exists on Python 2 only.
        return (left > right) - (left < right)

    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # "or" gives the first comparison that is not 0.
    return (three_way(wgw[normalized_word1], wgw[normalized_word2])
            or three_way(normalized_word1, normalized_word2)
            or three_way(int(num1), int(num2)))
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """
    Writes the word list, UTF-8 encoded, to standard output.

    word_pairs  -- list of (number, word) tuples, printed in the given order
    stats       -- dict with the values used by the format strings below
    print_stats -- write a "# ..." summary line before the words
    stats_only  -- write a two-line statistics table and nothing else

    After a word that makes the share of covered words reach the next
    level, a "# <level>" line is written.
    """
    out = codecs.getwriter("utf-8")(sys.stdout)

    if stats_only:
        titles = ("LANG", "KNOWN%", "UNKNOWN%", "KNOWN", "TOTAL", "WPS", "UWPS*10")
        out.write(" ".join(["%-10s" % title for title in titles]) + "\n")
        # The last four columns are 11 wide and not separated by a blank.
        row = ("%(language)-10s"
               " %(percentage)-10.2f"
               " %(percentage_unknown)-10.2f"
               " %(total_known)-11d%(total)-11d%(wps)-11d%(uwps)-11d")
        out.write(row % stats + "\n")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Levels: multiples of 5 above the known percentage up to 90,
    # then 90, 91, ... 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    for word_pair in word_pairs:
        out.write("%10s %s\n" % word_pair)
        known += word_pair[0]
        coverage = 100.0 * known / total
        if coverage < level_lines[0]:
            continue
        current_level = level_lines[0]
        # Drop every level that is strictly below the coverage; the last
        # dropped one is the level that gets printed.
        while coverage > level_lines[0]:
            current_level = level_lines.pop(0)
        out.write("# %s\n" % current_level)
def filter_add_notes(args):
    """
    Rewrites the file args[0] in place with the notes added to its lines
    (see add_notes).
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    annotated = add_notes(original_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as out:
        for line in annotated:
            out.write(line)
def filter_remove_notes(args):
    """
    Rewrites the file args[0] in place with the notes removed from its
    lines (see remove_notes, which also saves the removed notes).
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    stripped = remove_notes(original_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as out:
        for line in stripped:
            out.write(line)
def _env_is_yes(name):
    """
    Tells whether the environment variable `name` is set to exactly 'YES'.
    """
    return os.environ.get(name) == 'YES'

def filter_get_words_group_words_add_stat(args):
    """
    Reads a text from standard input and prints its words, most frequent
    first, together with statistics (see print_words_sorted).

    args is not used.  The behaviour is controlled by environment
    variables, each of which is on when it is set to 'YES':

    GROUP_WORDS_BY_TWO   -- also count sequences of two words
    GROUP_WORDS_BY_THREE -- also count sequences of three words
    FILTER_WORDS         -- leave out the words of the vocabulary
    WORDS_GROUPING       -- order the words by word group
    STAT_ONLY            -- print the statistics table only
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if _env_is_yes('GROUP_WORDS_BY_TWO'):
        group_by.append(2)
    if _env_is_yes('GROUP_WORDS_BY_THREE'):
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = _env_is_yes('STAT_ONLY')

    stats = {}
    stats['total'] = sum(words.values())
    if _env_is_yes('FILTER_WORDS'):
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0 - stats['percentage']
    else:
        # Empty input: there is nothing to take a percentage of.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0  #FIXME
    stats['uwps'] = 0  #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if _env_is_yes('WORDS_GROUPING'):
        def group_order(pair):
            # Same order as compare_word_pairs: group weight, normalized
            # word, number.  As a sort key it is computed once per word
            # instead of once per comparison.
            (num, word) = pair
            normalized = normalizator.normalize(word)
            return (wgw[normalized], normalized, int(num))
        words_with_freq = sorted(words_with_freq, key=group_order, reverse=True)

    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
# Entry point: parse the command line and run the requested filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Filters that can be selected with -f/--function.
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
457 #os.system("vim")