new-words
annotate new-words.py @ 43:d532e7b52ab2
-s option support in new-words.py
Now new-words-py.sh -s works in the same way as new-words.sh.
(WPS and UWPS fields are not calculated correctly yet).
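As a rough illustration (not part of this changeset), here is a minimal sketch of the stats-only output that the -s mode is expected to produce, modelled on the header and format strings in print_words_sorted() below. The numbers are invented, and the WPS and UWPS columns are placeholders, matching the FIXME notes in the source.

# Hedged sketch: roughly what "new-words-py.sh -s" prints for one text.
# The stats values are invented; wps/uwps stay 0 (FIXME in the source).
stats = {
    'language': 'en',
    'percentage': 78.50,          # share of known words, percent
    'percentage_unknown': 21.50,  # share of unknown words, percent
    'total_known': 1570,
    'total': 2000,
    'wps': 0,                     # words per sentence -- not calculated yet
    'uwps': 0,                    # unique words per sentence -- not calculated yet
}
header = " ".join(["%-10s" % x for x in
    ["LANG", "KNOWN%", "UNKNOWN%", "KNOWN", "TOTAL", "WPS", "UWPS*10"]])
row = " ".join([
    "%(language)-10s",
    "%(percentage)-10.2f",
    "%(percentage_unknown)-10.2f",
    "%(total_known)-11d%(total)-11d%(wps)-11d%(uwps)-11d",
]) % stats
print header
print row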
author | Igor Chubin <igor@chub.in>
date | Fri Jan 28 12:40:58 2011 +0200 (2011-01-28)
parents | 3ec83a7cc544
children | 7eb1a8c3eade
rev | line source |
---|---|
igor@37 | 1 #!/usr/bin/env python |
igor@38 | 2 # -*- coding: utf-8 -*- |
igor@37 | 3 |
igor@40 | 4 from __future__ import with_statement |
igor@38 | 5 import codecs |
igor@38 | 6 import logging |
igor@38 | 7 import os |
igor@37 | 8 import optparse |
igor@38 | 9 import re |
igor@38 | 10 import subprocess |
igor@38 | 11 import sys |
igor@38 | 12 import Stemmer |
igor@42 | 13 try: |
igor@42 | 14 import psyco |
igor@42 | 15 psyco.full() |
igor@42 | 16 except: |
igor@42 | 17 pass |
igor@38 | 18 |
igor@38 | 19 config = { |
igor@38 | 20 'config_directory': os.environ['HOME'] + '/.new-words', |
igor@38 | 21 'language': 'en', |
igor@38 | 22 } |
igor@38 | 23 |
igor@38 | 24 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG) |
igor@38 | 25 |
igor@38 | 26 class Normalizator: |
igor@38 | 27 def __init__(self, language, linked_words={}): |
igor@38 | 28 stemmer_algorithm = { |
igor@38 | 29 'de' : 'german', |
igor@38 | 30 'en' : 'english', |
igor@38 | 31 'ru' : 'russian', |
igor@38 | 32 'uk' : 'ukrainian', |
igor@38 | 33 } |
igor@38 | 34 self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language]) |
igor@38 | 35 self.linked_words = linked_words |
igor@38 | 36 |
igor@38 | 37 def normalize(self, word): |
igor@38 | 38 word_chain = [] |
igor@38 | 39 while word in self.linked_words and not word in word_chain: |
igor@38 | 40 word_chain.append(word) |
igor@38 | 41 word = self.linked_words[word] |
igor@38 | 42 return self.stemmer.stemWord(word.lower()) |
igor@37 | 43 |
igor@37 | 44 parser = optparse.OptionParser() |
igor@37 | 45 |
igor@37 | 46 parser.add_option( |
igor@37 | 47 "-a", "--no-marks", |
igor@37 | 48 help="don't add marks (and don't save marks added by user)", |
igor@37 | 49 action="store_true", |
igor@37 | 50 dest="no_marks") |
igor@37 | 51 |
igor@37 | 52 parser.add_option( |
igor@37 | 53 "-c", "--compressed", |
igor@37 | 54 help="show compressed wordlist: one word per group", |
igor@37 | 55 action="store_true", |
igor@37 | 56 dest="compressed") |
igor@37 | 57 |
igor@37 | 58 parser.add_option( |
igor@37 | 59 "-k", "--known-words", |
igor@37 | 60 help="rank words that are similar to already known words higher (English only)", |
igor@37 | 61 action="store_true", |
igor@37 | 62 dest="known_words") |
igor@37 | 63 |
igor@37 | 64 parser.add_option( |
igor@37 | 65 "-l", "--language", |
igor@37 | 66 help="specify language of text", |
igor@37 | 67 action="store", |
igor@37 | 68 dest="language") |
igor@37 | 69 |
igor@37 | 70 parser.add_option( |
igor@38 | 71 "-f", "--function", |
igor@38 | 72 help="filter through subsystem [INTERNAL]", |
igor@38 | 73 action="store", |
igor@38 | 74 dest="function") |
igor@38 | 75 |
igor@38 | 76 parser.add_option( |
igor@37 | 77 "-m", "--merge-tag", |
igor@37 | 78 help="merge words tagged with specified tag into the main vocabulary", |
igor@37 | 79 action="store", |
igor@37 | 80 dest="merge_tag") |
igor@37 | 81 |
igor@37 | 82 parser.add_option( |
igor@37 | 83 "-M", "--merge-tagged", |
igor@37 | 84 help="merge words tagged with ANY tag into the main vocabulary", |
igor@37 | 85 action="store_true", |
igor@37 | 86 dest="merge_tagged") |
igor@37 | 87 |
igor@37 | 88 parser.add_option( |
igor@37 | 89 "-n", "--non-interactive", |
igor@37 | 90 help="non-interactive mode (don't run vi)", |
igor@37 | 91 action="store_true", |
igor@37 | 92 dest="non_interactive") |
igor@37 | 93 |
igor@37 | 94 parser.add_option( |
igor@37 | 95 "-N", "--no-filter", |
igor@37 | 96 help="switch off known words filtering", |
igor@37 | 97 action="store_true", |
igor@37 | 98 dest="no_filter") |
igor@37 | 99 |
igor@37 | 100 parser.add_option( |
igor@37 | 101 "-p", "--pages", |
igor@37 | 102 help="work with specified pages only (pages = start-stop/total)", |
igor@37 | 103 action="store", |
igor@37 | 104 dest="pages") |
igor@37 | 105 |
igor@37 | 106 parser.add_option( |
igor@37 | 107 "-r", "--remove-tag", |
igor@37 | 108 help="remove subvocabulary of specified tag", |
igor@37 | 109 action="store", |
igor@37 | 110 dest="remove_tag") |
igor@37 | 111 |
igor@37 | 112 parser.add_option( |
igor@37 | 113 "-s", "--text-stats", |
igor@37 | 114 help="show the text statistics (percentage of known words and so on) and exit", |
igor@37 | 115 action="store_true", |
igor@37 | 116 dest="text_stats") |
igor@37 | 117 |
igor@37 | 118 parser.add_option( |
igor@37 | 119 "-S", "--voc-stats", |
igor@37 | 120 help="show your vocabulary statistics (number of words and word groups)", |
igor@37 | 121 action="store_true", |
igor@37 | 122 dest="voc_stats") |
igor@37 | 123 |
igor@37 | 124 parser.add_option( |
igor@37 | 125 "-t", "--tag", |
igor@37 | 126 help="tag known words with tag", |
igor@37 | 127 action="store", |
igor@37 | 128 dest="tag") |
igor@37 | 129 |
igor@37 | 130 parser.add_option( |
igor@37 | 131 "-T", "--show-tags", |
igor@37 | 132 help="show the list of tags", |
igor@37 | 133 action="store_true", |
igor@37 | 134 dest="show_tags") |
igor@37 | 135 |
igor@37 | 136 parser.add_option( |
igor@37 | 137 "-2", "--two-words", |
igor@37 | 138 help="find two-word sequences", |
igor@37 | 139 action="store_true", |
igor@37 | 140 dest="two_words") |
igor@37 | 141 |
igor@37 | 142 parser.add_option( |
igor@37 | 143 "-3", "--three-words", |
igor@37 | 144 help="find three-word sequences", |
igor@37 | 145 action="store_true", |
igor@37 | 146 dest="three_words") |
igor@37 | 147 |
igor@38 | 148 def readlines_from_file(filename): |
igor@38 | 149 res = [] |
igor@38 | 150 with codecs.open(filename, "r", "utf-8") as f: |
igor@38 | 151 for line in f.readlines(): |
igor@38 | 152 res += [line] |
igor@38 | 153 return res |
igor@38 | 154 |
igor@38 | 155 def readlines_from_stdin(): |
igor@38 | 156 return codecs.getreader("utf-8")(sys.stdin).readlines() |
igor@38 | 157 |
igor@38 | 158 def words_from_line(line): |
igor@38 | 159 line = line.rstrip('\n') |
igor@38 | 160 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line) |
igor@38 | 161 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line) |
igor@38 | 162 return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line) |
igor@38 | 163 |
igor@38 | 164 def get_words(lines): |
igor@38 | 165 """ |
igor@38 | 166 Returns a dict of the words in the given lines: |
igor@38 | 167 word => number of occurrences |
igor@38 | 168 """ |
igor@38 | 169 result = {} |
igor@38 | 170 for line in lines: |
igor@38 | 171 words = words_from_line(line) |
igor@38 | 172 for word in words: |
igor@41 | 173 if re.match('[0-9]*$', word): |
igor@41 | 174 continue |
igor@38 | 175 result.setdefault(word, 0) |
igor@38 | 176 result[word] += 1 |
igor@38 | 177 return result |
igor@38 | 178 |
igor@38 | 179 def load_vocabulary(): |
igor@38 | 180 return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language']))) |
igor@38 | 181 |
igor@38 | 182 def notes_filenames(): |
igor@38 | 183 return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])] |
igor@38 | 184 |
igor@38 | 185 def load_notes(files): |
igor@38 | 186 notes = {} |
igor@38 | 187 for filename in files: |
igor@39 | 188 with codecs.open(filename, "r", "utf-8") as f: |
igor@38 | 189 for line in f.readlines(): |
igor@38 | 190 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1) |
igor@38 | 191 notes.setdefault(word, {}) |
igor@38 | 192 notes[word][filename] = note |
igor@38 | 193 return notes |
igor@38 | 194 |
igor@39 | 195 def add_notes(lines, notes): |
igor@39 | 196 notes_filename = notes_filenames()[0] |
igor@39 | 197 result = [] |
igor@39 | 198 for line in lines: |
igor@39 | 199 if line.startswith('#'): |
igor@39 | 200 result += [line] |
igor@39 | 201 else: |
igor@39 | 202 match_object = re.search('^\s*\S+\s*(\S+)', line) |
igor@39 | 203 if match_object: |
igor@39 | 204 word = match_object.group(1) |
igor@39 | 205 if word in notes: |
igor@39 | 206 logging.debug(word) |
igor@39 | 207 logging.debug(line) |
igor@39 | 208 if notes_filename in notes[word]: |
igor@39 | 209 line = line.rstrip('\n') |
igor@39 | 210 line = "%-30s %s\n" % (line, notes[word][notes_filename]) |
igor@39 | 211 logging.debug(line) |
igor@39 | 212 result += [line] |
igor@39 | 213 else: |
igor@39 | 214 result += [line] |
igor@39 | 215 else: |
igor@39 | 216 result += [line] |
igor@39 | 217 return result |
igor@39 | 218 |
igor@39 | 219 def remove_notes(lines, notes_group): |
igor@39 | 220 notes_filename = notes_filenames()[0] |
igor@39 | 221 notes = {} |
igor@39 | 222 for k in notes_group.keys(): |
igor@39 | 223 if notes_filename in notes_group[k]: |
igor@39 | 224 notes[k] = notes_group[k][notes_filename] |
igor@39 | 225 |
igor@39 | 226 result = [] |
igor@39 | 227 for line in lines: |
igor@39 | 228 line = line.rstrip('\n') |
igor@39 | 229 match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line) |
igor@39 | 230 if match_object: |
igor@39 | 231 result.append("".join([ |
igor@39 | 232 match_object.group(1), |
igor@39 | 233 match_object.group(2), |
igor@39 | 234 match_object.group(3), |
igor@39 | 235 match_object.group(4), |
igor@39 | 236 "\n" |
igor@39 | 237 ])) |
igor@39 | 238 notes[match_object.group(4)] = match_object.group(6) |
igor@39 | 239 else: |
igor@39 | 240 result.append(line+"\n") |
igor@39 | 241 |
igor@39 | 242 save_notes(notes_filename, notes) |
igor@39 | 243 return result |
igor@39 | 244 |
igor@39 | 245 def save_notes(filename, notes): |
igor@39 | 246 lines = [] |
igor@39 | 247 saved_words = [] |
igor@39 | 248 with codecs.open(filename, "r", "utf-8") as f: |
igor@39 | 249 for line in f.readlines(): |
igor@39 | 250 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1) |
igor@39 | 251 if word in notes: |
igor@39 | 252 line = "%-29s %s\n" % (word, notes[word]) |
igor@39 | 253 saved_words.append(word) |
igor@39 | 254 lines.append(line) |
igor@39 | 255 for word in [x for x in notes.keys() if not x in saved_words]: |
igor@39 | 256 line = "%-29s %s\n" % (word, notes[word]) |
igor@39 | 257 lines.append(line) |
igor@39 | 258 |
igor@39 | 259 with codecs.open(filename, "w", "utf-8") as f: |
igor@39 | 260 for line in lines: |
igor@39 | 261 f.write(line) |
igor@39 | 262 |
igor@39 | 263 |
igor@38 | 264 def substract_dictionary(dict1, dict2): |
igor@38 | 265 """ |
igor@38 | 266 returns dict1 - dict2 |
igor@38 | 267 """ |
igor@38 | 268 result = {} |
igor@38 | 269 for (k,v) in dict1.items(): |
igor@38 | 270 if not k in dict2: |
igor@38 | 271 result[k] = v |
igor@38 | 272 return result |
igor@38 | 273 |
igor@38 | 274 def dump_words(words, filename): |
igor@38 | 275 with codecs.open(filename, "w+", "utf-8") as f: |
igor@38 | 276 for word in words.keys(): |
igor@38 | 277 f.write(("%s\n"%word)*words[word]) |
igor@38 | 278 |
igor@38 | 279 def error_message(text): |
igor@38 | 280 print text |
igor@38 | 281 |
igor@40 | 282 def find_wordgroups_weights(word_pairs, normalizator): |
igor@38 | 283 weight = {} |
igor@40 | 284 for (num, word) in word_pairs: |
igor@38 | 285 normalized = normalizator.normalize(word) |
igor@38 | 286 weight.setdefault(normalized, 0) |
igor@40 | 287 weight[normalized] += num |
igor@38 | 288 return weight |
igor@38 | 289 |
igor@38 | 290 def find_linked_words(notes): |
igor@38 | 291 linked_words = {} |
igor@38 | 292 for word in notes.keys(): |
igor@38 | 293 for note in notes[word].values(): |
igor@38 | 294 if "@" in note: |
igor@38 | 295 result = re.search(r'\@(\S*)', note) |
igor@38 | 296 if result: |
igor@38 | 297 main_word = result.group(1) |
igor@38 | 298 if main_word: |
igor@38 | 299 linked_words[word] = main_word |
igor@38 | 300 return linked_words |
igor@38 | 301 |
igor@40 | 302 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words): |
igor@40 | 303 (num1, word1) = pair1 |
igor@40 | 304 (num2, word2) = pair2 |
igor@38 | 305 |
igor@38 | 306 normalized_word1 = normalizator.normalize(word1) |
igor@38 | 307 normalized_word2 = normalizator.normalize(word2) |
igor@38 | 308 |
igor@38 | 309 cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2]) |
igor@38 | 310 if cmp_res != 0: |
igor@38 | 311 return cmp_res |
igor@38 | 312 else: |
igor@38 | 313 cmp_res = cmp(normalized_word1, normalized_word2) |
igor@38 | 314 if cmp_res != 0: |
igor@38 | 315 return cmp_res |
igor@38 | 316 else: |
igor@38 | 317 return cmp(int(num1), int(num2)) |
igor@38 | 318 |
igor@40 | 319 def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False): |
igor@40 | 320 if stats_only: |
igor@43 | 321 codecs.getwriter("utf-8")(sys.stdout).write( |
igor@43 | 322 " ".join([ |
igor@43 | 323 "%-10s" % x for x in [ |
igor@43 | 324 "LANG", |
igor@43 | 325 "KNOWN%", |
igor@43 | 326 "UNKNOWN%", |
igor@43 | 327 "KNOWN", |
igor@43 | 328 "TOTAL", |
igor@43 | 329 "WPS", |
igor@43 | 330 "UWPS*10" |
igor@43 | 331 ]]) + "\n") |
igor@43 | 332 codecs.getwriter("utf-8")(sys.stdout).write( |
igor@43 | 333 " ".join([ |
igor@43 | 334 "%(language)-10s", |
igor@43 | 335 "%(percentage)-10.2f", |
igor@43 | 336 "%(percentage_unknown)-10.2f", |
igor@43 | 337 "%(total_known)-11d" |
igor@43 | 338 "%(total)-11d" |
igor@43 | 339 "%(wps)-11d" |
igor@43 | 340 "%(uwps)-11d" |
igor@43 | 341 ]) % stats + "\n") |
igor@40 | 342 return |
igor@38 | 343 |
igor@40 | 344 if print_stats: |
igor@40 | 345 codecs.getwriter("utf-8")(sys.stdout).write( |
igor@43 | 346 "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats) |
igor@38 | 347 |
igor@40 | 348 level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102) |
igor@40 | 349 known = int(stats['total_known']) |
igor@40 | 350 total = int(stats['total']) |
igor@40 | 351 current_level = 0 |
igor@40 | 352 for word_pair in word_pairs: |
igor@40 | 353 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair) |
igor@40 | 354 known += word_pair[0] |
igor@40 | 355 if 100.0*known/total >= level_lines[0]: |
igor@40 | 356 current_level = level_lines[0] |
igor@40 | 357 while 100.0*known/total > level_lines[0]: |
igor@40 | 358 current_level = level_lines[0] |
igor@40 | 359 level_lines = level_lines[1:] |
igor@40 | 360 codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level) |
igor@38 | 361 |
igor@39 | 362 def filter_add_notes(args): |
igor@39 | 363 lines = readlines_from_file(args[0]) |
igor@39 | 364 notes = load_notes(notes_filenames()) |
igor@39 | 365 lines = add_notes(lines, notes) |
igor@39 | 366 with codecs.open(args[0], "w", "utf-8") as f: |
igor@39 | 367 for line in lines: |
igor@39 | 368 f.write(line) |
igor@39 | 369 |
igor@39 | 370 def filter_remove_notes(args): |
igor@39 | 371 lines = readlines_from_file(args[0]) |
igor@39 | 372 notes = load_notes(notes_filenames()) |
igor@39 | 373 lines = remove_notes(lines, notes) |
igor@39 | 374 with codecs.open(args[0], "w", "utf-8") as f: |
igor@39 | 375 for line in lines: |
igor@39 | 376 f.write(line) |
igor@39 | 377 |
igor@40 | 378 def filter_get_words_group_words_add_stat(args): |
igor@40 | 379 vocabulary = load_vocabulary() |
igor@40 | 380 notes = load_notes(notes_filenames()) |
igor@40 | 381 lines = readlines_from_stdin() |
igor@40 | 382 words = get_words(lines) |
igor@43 | 383 stats_only = False |
igor@43 | 384 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES': |
igor@43 | 385 stats_only = True |
igor@40 | 386 |
igor@40 | 387 stats = {} |
igor@40 | 388 stats['total'] = sum(words[x] for x in words.keys()) |
igor@40 | 389 words = substract_dictionary(words, vocabulary) |
igor@40 | 390 |
igor@40 | 391 stats['total_unknown'] = sum(words[x] for x in words.keys()) |
igor@40 | 392 stats['total_known'] = stats['total'] - stats['total_unknown'] |
igor@43 | 393 stats['percentage'] = 100.0*stats['total_known']/stats['total'] |
igor@43 | 394 stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total'] |
igor@40 | 395 stats['groups'] = 0 |
igor@40 | 396 stats['words'] = len(words) |
igor@43 | 397 stats['sentences'] = 0 #FIXME |
igor@43 | 398 stats['wps'] = 0 #FIXME |
igor@43 | 399 stats['uwps'] = 0 #FIXME |
igor@40 | 400 stats['language'] = config['language'] |
igor@40 | 401 |
igor@40 | 402 linked_words = find_linked_words(notes) |
igor@40 | 403 normalizator = Normalizator(config['language'], linked_words) |
igor@40 | 404 |
igor@40 | 405 word_pairs = [] |
igor@40 | 406 for k in sorted(words.keys(), key=lambda k: words[k], reverse=True): |
igor@40 | 407 word_pairs.append((words[k], k)) |
igor@40 | 408 |
igor@40 | 409 wgw = find_wordgroups_weights(word_pairs, normalizator) |
igor@40 | 410 word_pairs = sorted( |
igor@40 | 411 word_pairs, |
igor@40 | 412 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words), |
igor@40 | 413 reverse=True) |
igor@40 | 414 |
igor@43 | 415 print_words_sorted(word_pairs, stats, stats_only=stats_only) |
igor@40 | 416 |
igor@37 | 417 (options, args) = parser.parse_args() |
igor@38 | 418 if options.language: |
igor@38 | 419 config['language'] = options.language |
igor@37 | 420 |
igor@38 | 421 if options.function: |
igor@38 | 422 function_names = { |
igor@39 | 423 'add_notes' : filter_add_notes, |
igor@39 | 424 'remove_notes': filter_remove_notes, |
igor@40 | 425 'get_words_group_words_add_stat': filter_get_words_group_words_add_stat, |
igor@38 | 426 } |
igor@38 | 427 if options.function in function_names: |
igor@38 | 428 function_names[options.function](args) |
igor@38 | 429 else: |
igor@38 | 430 error_message("Unknown function %s.\nAvailable functions:\n%s" % ( |
igor@38 | 431 options.function, "".join([" "+x for x in sorted(function_names.keys())]))) |
igor@38 | 432 sys.exit(1) |
igor@37 | 433 |
igor@37 | 434 |
igor@37 | 435 |
igor@37 | 436 |
igor@38 | 437 #os.system("vim") |
igor@37 | 438 |
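For reference, a hedged example (not part of the revision) of the notes file that load_notes() and save_notes() work with, by default ~/.new-words/notes-en.txt per notes_filenames(). Each line is a word, whitespace, then a free-form note; save_notes() pads the word to 29 characters, and a note containing "@mainword" is picked up by find_linked_words() and links the word to a group head word used by Normalizator. The sample words below are invented:

dog                           a domestic animal
dogs                          @dog
undoubtedly                   @doubt without any doubt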