new-words
annotate new-words.py @ 40:c3a50c0d2400
Functions for adding/removing notes + statistics now implemented in Python.
Option -O (old-style) is not supported anymore. If you need old-style new-words use new-words.sh
Option -O (old-style) is not supported anymore. If you need old-style new-words use new-words.sh
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Jan 23 17:09:44 2011 +0100 (2011-01-23) |
parents | a598e0d25784 |
children | 4629e08b0d87 |
rev | line source |
---|---|
igor@37 | 1 #!/usr/bin/env python |
igor@38 | 2 # -*- coding: utf-8 -*- |
igor@37 | 3 |
igor@40 | 4 from __future__ import with_statement |
igor@38 | 5 import codecs |
igor@38 | 6 import logging |
igor@38 | 7 import os |
igor@37 | 8 import optparse |
igor@38 | 9 import re |
igor@38 | 10 import subprocess |
igor@38 | 11 import sys |
igor@38 | 12 import Stemmer |
igor@38 | 13 |
# Runtime settings read by the helper functions in this file.
# 'language' is replaced by the value of -l/--language when that option is given.
# expanduser('~') is used instead of os.environ['HOME'] so that a missing
# HOME variable does not abort the script with a KeyError at import time.
config = {
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

# Everything from DEBUG level upwards is appended to a fixed log file in /tmp.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 | 20 |
class Normalizator:
    """
    Maps a word to the stem of its "main" word.

    A word is first followed through the `linked_words` mapping
    (word => main word) and the final word is lowercased and stemmed
    with the stemmer chosen for `language`.
    """

    def __init__(self, language, linked_words=None):
        """
        language     -- one of 'de', 'en', 'ru', 'uk'; any other value
                        raises KeyError
        linked_words -- optional dict word => main word; when omitted every
                        instance gets its own empty dict
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'ru' : 'russian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A `{}` default would be created once and shared by all instances.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """
        Follow the chain of links starting at `word`, then return the stem
        of the lowercased final word.
        """
        # Remember visited words so that a cycle of links (a => b => a)
        # terminates instead of looping forever.
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
igor@37 | 38 |
parser = optparse.OptionParser()

# One row per command-line option, in the order shown by --help:
# (short flag, long flag, dest, action, help text).
_OPTIONS = (
    ("-a", "--no-marks", "no_marks", "store_true",
     "don't add marks (and don't save marks added by user)"),
    ("-c", "--compressed", "compressed", "store_true",
     "show compressed wordlist: one word per group"),
    # Stored under its own name; it used to overwrite the value of -c.
    ("-k", "--known-words", "known_words", "store_true",
     "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "language", "store",
     "specify language of text"),
    ("-f", "--function", "function", "store",
     "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "merge_tag", "store",
     "merge words tagged with specified tag into the main vocabulary"),
    ("-M", "--merge-tagged", "merge_tagged", "store_true",
     "merge words tagged with ANY tag into the main vocabulary"),
    ("-n", "--non-interactive", "non_interactive", "store_true",
     "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "no_filter", "store_true",
     "switch off known words filtering"),
    ("-p", "--pages", "pages", "store",
     "work with specified pages only (pages = start-stop/total )"),
    ("-r", "--remove-tag", "remove_tag", "store",
     "remove subvocabulary of specified tag"),
    ("-s", "--text-stats", "text_stats", "store_true",
     "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "voc_stats", "store_true",
     "show your vocabulary statistics (number of words and word groups)"),
    ("-t", "--tag", "tag", "store",
     "tag known words with tag"),
    # Help text used to be a copy of the -t description.
    ("-T", "--show-tags", "show_tags", "store_true",
     "show the list of tags"),
    ("-2", "--two-words", "two_words", "store_true",
     "find 2 words' sequences"),
    ("-3", "--three-words", "three_words", "store_true",
     "find 3 words' sequences"),
)

for (_short, _long, _dest, _action, _help) in _OPTIONS:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)
igor@37 | 142 |
def readlines_from_file(filename):
    """
    Returns the lines of a UTF-8 encoded file as a list
    (line endings are kept)
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())
igor@38 | 149 |
def readlines_from_stdin():
    """
    Returns all lines of standard input, decoded as UTF-8
    """
    utf8_reader = codecs.getreader("utf-8")
    stdin_reader = utf8_reader(sys.stdin)
    return stdin_reader.readlines()
igor@38 | 152 |
# A separator is a run of non-word characters that does not START with an
# apostrophe, so "don't" stays one word. Compiled once, as a raw string so
# that \W is not treated as a (invalid) string escape.
_WORD_SEPARATOR = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)

def words_from_line(line):
    """
    Splits a line into words.

    Trailing newlines are stripped first. A separator at the very beginning
    or end of the line yields an empty string in the result, and an empty
    line yields [''] (regular re.split behaviour).
    """
    line = line.rstrip('\n')
    return _WORD_SEPARATOR.split(line)
igor@38 | 158 |
def get_words(lines):
    """
    Counts the words found in the given lines
    Returns a hash: word => number of occurrences
    """
    counts = {}
    for line in lines:
        for token in words_from_line(line):
            counts[token] = counts.get(token, 0) + 1
    return counts
igor@38 | 171 |
def load_vocabulary():
    """
    Returns the word counts of <config_directory>/<language>.txt
    """
    vocabulary_path = "%s/%s.txt" % (config['config_directory'], config['language'])
    vocabulary_lines = readlines_from_file(vocabulary_path)
    return get_words(vocabulary_lines)
igor@38 | 174 |
def notes_filenames():
    """
    Returns the list of notes files for the configured language
    (currently a single file: <config_directory>/notes-<language>.txt)
    """
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
igor@38 | 177 |
def load_notes(files):
    """
    Reads notes from the given UTF-8 files.

    Every line has the form "<word><whitespace><note>".
    Returns a hash: word => {filename => note}

    Blank lines are skipped and a word without a note gets the note ''
    (both used to abort the whole run with a ValueError).
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                word = fields[0]
                if len(fields) == 1:
                    if not word:
                        continue  # blank line
                    note = ''
                else:
                    note = fields[1]
                notes.setdefault(word, {})[filename] = note
    return notes
igor@38 | 187 |
def add_notes(lines, notes):
    """
    Appends the stored note to every word line that has one.

    lines -- lines of the word list; lines starting with '#' are passed
             through untouched
    notes -- hash word => {filename => note}, as built by load_notes

    The word of a line is its second whitespace-separated token. Only notes
    coming from the first notes file are used. Returns the new list of
    lines; every input line appears exactly once in the result.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            # \s+ between the two tokens: with \s* a single-token line such
            # as "abc" matched through backtracking and produced word "c".
            match_object = re.search(r'^\s*\S+\s+(\S+)', line)
            word = match_object.group(1) if match_object else None
            if word is not None and word in notes:
                logging.debug(word)
                logging.debug(line)
                if notes_filename in notes[word]:
                    # Pad the line to 30 columns, then put the note after it.
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
                    logging.debug(line)
        result.append(line)
    return result
igor@39 | 211 |
def remove_notes(lines, notes_group):
    """
    Strips the notes from word lines and saves them to the notes file.

    lines       -- lines of the form "<ws><count><ws><word><ws><note>";
                   lines of any other shape are kept as they are
    notes_group -- hash word => {filename => note}, as built by load_notes

    The notes already known for the first notes file are updated with the
    notes found in `lines` and written back with save_notes.
    Returns the lines without notes, each terminated by a newline.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    # groups: 1 indent, 2 count, 3 gap, 4 word, 5 gap, 6 note
    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')

    result = []
    for line in lines:
        line = line.rstrip('\n')
        match_object = annotated_line.match(line)
        if match_object:
            # Keep everything up to and including the word, drop the rest.
            result.append("".join(match_object.group(1, 2, 3, 4)) + "\n")
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line + "\n")

    save_notes(notes_filename, notes)
    return result
igor@39 | 237 |
def save_notes(filename, notes):
    """
    Merges `notes` (hash word => note) into the notes file `filename`.

    Lines of the existing file keep their order; a line whose word is in
    `notes` is rewritten with the new note, all other lines (including
    blank or note-less ones) are kept. Words that were not in the file yet
    are appended. A missing file is treated as empty and gets created.
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                # Only the first token matters here; taking [0] never fails,
                # unlike unpacking into (word, note) on note-less lines.
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                elif not line.endswith('\n'):
                    # Last line without newline: terminate it so appended
                    # entries do not get glued onto it.
                    line += '\n'
                lines.append(line)

    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
igor@39 | 255 |
igor@39 | 256 |
def substract_dictionary(dict1, dict2):
    """
    returns a new dict with the items of dict1
    whose keys do not occur in dict2
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)
igor@38 | 266 |
def dump_words(words, filename):
    """
    Writes every word to the UTF-8 file `filename`, one per line,
    repeated as many times as its count in `words` (word => number)
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            repeated = ("%s\n" % word) * count
            out.write(repeated)
igor@38 | 271 |
def error_message(text):
    """
    Prints `text` followed by a newline on standard output
    """
    # Parenthesised form: same output as the old `print text` statement on
    # Python 2, and valid syntax on Python 3 as well.
    print(text)
igor@38 | 274 |
def find_wordgroups_weights(word_pairs, normalizator):
    """
    Sums the counts of all words that share a normalized form
    word_pairs -- sequence of (number, word)
    Returns a hash: normalized word => total number
    """
    totals = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        totals[group] = totals.get(group, 0) + count
    return totals
igor@38 | 282 |
def find_linked_words(notes):
    """
    Collects the links written as "@main_word" inside notes
    notes -- hash word => {filename => note}
    Returns a hash: word => main word (an empty "@" is ignored)
    """
    links = {}
    for (word, per_file_notes) in notes.items():
        for note in per_file_notes.values():
            found = re.search(r'\@(\S*)', note)
            if found is None:
                continue
            target = found.group(1)
            if target:
                links[word] = target
    return links
igor@38 | 294 |
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """
    Three-way comparison of two (number, word) pairs.

    Pairs are ordered by, in this order:
      1. the weight of their word group (wgw[normalized word]),
      2. the normalized word itself,
      3. their own number.

    wgw          -- hash normalized word => weight
    normalizator -- object with a normalize(word) method
    linked_words -- not used; kept so that existing callers keep working

    Returns -1, 0 or 1.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Tuples compare lexicographically, which is exactly the chain of
    # comparisons described above; no need for the Python-2-only cmp().
    key1 = (wgw[normalized_word1], normalized_word1, int(num1))
    key2 = (wgw[normalized_word2], normalized_word2, int(num2))
    return (key1 > key2) - (key1 < key2)
igor@38 | 311 |
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """
    Writes the word list to standard output, encoded as UTF-8.

    word_pairs  -- sequence of (number, word), printed in the given order
    stats       -- hash with the keys language, percentage, total_known,
                   total, groups and words
    print_stats -- write the "# ..." header line with the statistics first
    stats_only  -- write only the text "stat_only" and return

    After a word, a "# <level>" line is written whenever the share of known
    words (assuming all words printed so far became known) has reached the
    next level.
    """
    out = codecs.getwriter("utf-8")(sys.stdout)

    if stats_only:
        out.write("stat_only")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Levels: multiples of 5 above the current percentage up to 90,
    # then every single percent from 90 to 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        out.write("%10s %s\n" % word_pair)
        known += word_pair[0]
        coverage = 100.0 * known / total
        if coverage < level_lines[0]:
            continue
        current_level = level_lines[0]
        # Skip every level that has been passed with this word.
        while coverage > level_lines[0]:
            current_level = level_lines[0]
            level_lines = level_lines[1:]
        out.write("# %s\n" % current_level)
igor@38 | 334 |
def filter_add_notes(args):
    """
    Rewrites the file args[0] in place with the stored notes added
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    stored_notes = load_notes(notes_filenames())
    annotated_lines = add_notes(original_lines, stored_notes)
    with codecs.open(target, "w", "utf-8") as out:
        out.writelines(annotated_lines)
igor@39 | 342 |
def filter_remove_notes(args):
    """
    Rewrites the file args[0] in place with the notes removed
    (remove_notes saves them to the notes file)
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    stored_notes = load_notes(notes_filenames())
    stripped_lines = remove_notes(original_lines, stored_notes)
    with codecs.open(target, "w", "utf-8") as out:
        out.writelines(stripped_lines)
igor@39 | 350 |
def filter_get_words_group_words_add_stat(args):
    """
    Reads a text from standard input and prints its unknown words.

    Words found in the vocabulary are dropped; the remaining ones are
    printed together with their counts, grouped by normalized form, the
    heaviest group first. A header line with the text statistics is
    printed before the words.

    args -- not used
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    words = get_words(lines)

    stats = {}
    stats['total'] = sum(words.values())
    words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    # Empty input gives total == 0; report 0% instead of dividing by zero.
    if stats['total']:
        known_share = 100.0 * stats['total_known'] / stats['total']
    else:
        known_share = 0.0
    stats['percentage'] = "%7.2f" % known_share
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # Most frequent words first; the stable sort below keeps this order
    # for pairs that compare equal.
    word_pairs = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(word_pairs, normalizator)

    def group_order(pair):
        # Same criteria as compare_word_pairs (group weight, normalized
        # word, own number), but each word is normalized only once.
        (num, word) = pair
        normalized = normalizator.normalize(word)
        return (wgw[normalized], normalized, int(num))

    word_pairs = sorted(word_pairs, key=group_order, reverse=True)

    print_words_sorted(word_pairs, stats)
igor@40 | 383 |
# Script entry: parse the command line and run the requested filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Name given with -f/--function => function that implements it.
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        # Spelling of the message fixed ("Unkown" -> "Unknown").
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
igor@37 | 400 |
igor@37 | 401 |
igor@37 | 402 |
igor@37 | 403 |
igor@38 | 404 #os.system("vim") |
igor@37 | 405 |