new-words

view new-words.py @ 40:c3a50c0d2400

Functions for adding/removing notes + statistics now implemented in Python.

Option -O (old-style) is no longer supported. If you need old-style new-words, use new-words.sh.
author Igor Chubin <igor@chub.in>
date Sun Jan 23 17:09:44 2011 +0100 (2011-01-23)
parents a598e0d25784
children 4629e08b0d87
line source
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement
import codecs
import logging
import os
import optparse
import re
import subprocess
import sys
import Stemmer

config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

class Normalizator:
    def __init__(self, language, linked_words={}):
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'ru' : 'russian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        self.linked_words = linked_words

    def normalize(self, word):
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
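# Example (hypothetical data): with linked_words = {u'went': u'go'},
# Normalizator('en', {u'went': u'go'}).normalize(u'went') follows the
# link chain to u'go' before stemming; word_chain guards against cycles
# in the linked_words mapping.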
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="rank words that are similar to already known words higher (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of the text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with the specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total)",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of the specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with the specified tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    help="show all tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find two-word sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find three-word sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    res = []
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            res += [line]
    return res

def readlines_from_stdin():
    return codecs.getreader("utf-8")(sys.stdin).readlines()

def words_from_line(line):
    line = line.rstrip('\n')
    #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
    #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)

def get_words(lines):
    """
    Return a dict mapping each word in the lines to its number of occurrences.
    """
    result = {}
    for line in lines:
        words = words_from_line(line)
        for word in words:
            result.setdefault(word, 0)
            result[word] += 1
    return result
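# Example: get_words([u'a a b']) returns {u'a': 2, u'b': 1}; an empty-string
# entry can also appear when a line starts or ends with a non-word character,
# since re.split() keeps the empty leading/trailing field.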
def load_vocabulary():
    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))

def notes_filenames():
    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]

def load_notes(files):
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
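# A notes file holds one entry per line: the word, whitespace, then a
# free-form note; a note may link a word to its main word with "@", e.g.
# (hypothetical content):
#   gehen     to go
#   ging      @gehen
# load_notes() returns {word: {filename: note}}, keyed by file so notes
# from several files can coexist.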
def add_notes(lines, notes):
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if line.startswith('#'):
            result += [line]
        else:
            match_object = re.search('^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes:
                    logging.debug(word)
                    logging.debug(line)
                    if notes_filename in notes[word]:
                        line = line.rstrip('\n')
                        line = "%-30s %s\n" % (line, notes[word][notes_filename])
                        logging.debug(line)
                    result += [line]
                else:
                    result += [line]
            else:
                result += [line]
    return result
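# add_notes() expects wordlist lines of the form "<count> <word>" (as
# produced by print_words_sorted below); if the word has a note in the
# current notes file, the line is padded to 30 characters and the note
# is appended.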
def remove_notes(lines, notes_group):
    notes_filename = notes_filenames()[0]
    notes = {}
    for k in notes_group.keys():
        if notes_filename in notes_group[k]:
            notes[k] = notes_group[k][notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            result.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n"
                ]))
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line+"\n")

    save_notes(notes_filename, notes)
    return result
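# remove_notes() is the inverse of add_notes(): it strips the note column
# from lines of the form "<count> <word> <note>", keeps only
# "<count> <word>", and writes the (possibly user-edited) notes back to
# the notes file via save_notes().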
def save_notes(filename, notes):
    lines = []
    saved_words = []
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.append(word)
            lines.append(line)
    for word in [x for x in notes.keys() if not x in saved_words]:
        line = "%-29s %s\n" % (word, notes[word])
        lines.append(line)

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2
    """
    result = {}
    for (k,v) in dict1.items():
        if not k in dict2:
            result[k] = v
    return result
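# Example: substract_dictionary({u'a': 1, u'b': 2}, {u'b': 0}) returns
# {u'a': 1} -- only keys absent from dict2 survive.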
def dump_words(words, filename):
    with codecs.open(filename, "w+", "utf-8") as f:
        for word in words.keys():
            f.write(("%s\n"%word)*words[word])
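# dump_words() writes each word on its own line, repeated once per
# occurrence, so the result can be fed back through get_words().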
def error_message(text):
    print text
def find_wordgroups_weights(word_pairs, normalizator):
    weight = {}
    for (num, word) in word_pairs:
        normalized = normalizator.normalize(word)
        weight.setdefault(normalized, 0)
        weight[normalized] += num
    return weight
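# The weight of a word group is the summed frequency of all words that
# normalize to the same stem; it drives the primary sort order below.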
def find_linked_words(notes):
    linked_words = {}
    for word in notes.keys():
        for note in notes[word].values():
            if "@" in note:
                result = re.search(r'\@(\S*)', note)
                if result:
                    main_word = result.group(1)
                    if main_word:
                        linked_words[word] = main_word
    return linked_words
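# A note such as u'@gehen' marks its word as a form of the main word
# u'gehen'; the resulting {word: main_word} mapping is what
# Normalizator.normalize() follows before stemming.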
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
    if cmp_res != 0:
        return cmp_res
    else:
        cmp_res = cmp(normalized_word1, normalized_word2)
        if cmp_res != 0:
            return cmp_res
        else:
            return cmp(int(num1), int(num2))
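# Pairs are compared by group weight first, then by normalized form, then
# by their own frequency; sorted with reverse=True this lists the heaviest
# word groups first and keeps all members of a group adjacent, with the
# most frequent member on top.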
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    if stats_only:
        codecs.getwriter("utf-8")(sys.stdout).write("stat_only")
        return

    if print_stats:
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
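# level_lines holds coverage thresholds: 5%-steps from just above the
# current known-word percentage up to 90%, then 1%-steps to 101%. A "# N"
# marker line is printed whenever learning the words listed so far would
# push text coverage past threshold N.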
def filter_add_notes(args):
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = add_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)

def filter_remove_notes(args):
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = remove_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)
def filter_get_words_group_words_add_stat(args):
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    words = get_words(lines)

    stats = {}
    stats['total'] = sum(words[x] for x in words.keys())
    words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = "%7.2f"%(100.0*stats['total_known']/stats['total'])
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    word_pairs = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        word_pairs.append((words[k], k))

    wgw = find_wordgroups_weights(word_pairs, normalizator)
    word_pairs = sorted(
        word_pairs,
        cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
        reverse=True)

    print_words_sorted(word_pairs, stats)
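# This is the main filter: it reads text from stdin, subtracts the known
# vocabulary, computes the statistics header, and prints the remaining
# words grouped by stem. A hypothetical invocation through the -f
# interface:
#   new-words.py -l en -f get_words_group_words_add_stat < text.txt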
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)

#os.system("vim")