new-words

annotate new-words.py @ 48:7194bdb56475

new feature: -r and -R can specify number of words (or percentage) to show
author Igor Chubin <igor@chub.in>
date Tue Feb 08 20:35:38 2011 +0200 (2011-02-08)
parents d708e2c1bad8
children 00286f6bfa85
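
The -r/-R options themselves are not parsed in this file: the code below only reads the SHOW_RANGE and SHOW_RANGE_PERCENTAGE environment variables inside filter_get_words_group_words_add_stat, so the two flags are presumably translated into those variables by the calling wrapper script. A rough sketch of driving the filter directly, assuming the vocabulary and notes files under ~/.new-words already exist and text.txt is UTF-8:

    # roughly what -r 20 should do: stop after printing 20 words
    FILTER_WORDS=YES SHOW_RANGE=20 \
        python new-words.py -l en -f get_words_group_words_add_stat < text.txt

    # roughly what -R 95 should do: stop once cumulative coverage reaches 95%
    FILTER_WORDS=YES SHOW_RANGE_PERCENTAGE=95 \
        python new-words.py -l en -f get_words_group_words_add_stat < text.txt
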
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@38 6 import logging
igor@38 7 import os
igor@37 8 import optparse
igor@38 9 import re
igor@38 10 import subprocess
igor@38 11 import sys
igor@38 12 import Stemmer
igor@42 13 try:
igor@42 14 import psyco
igor@42 15 psyco.full()
igor@42 16 except ImportError:
igor@42 17 pass
igor@38 18
igor@38 19 config = {
igor@38 20 'config_directory': os.environ['HOME'] + '/.new-words',
igor@38 21 'language': 'en',
igor@38 22 }
igor@38 23
igor@38 24 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 25
igor@38 26 class Normalizator:
igor@38 27 def __init__(self, language, linked_words={}):
igor@38 28 stemmer_algorithm = {
igor@38 29 'de' : 'german',
igor@38 30 'en' : 'english',
igor@38 31 'ru' : 'russian',
igor@38 32 'uk' : 'ukrainian',
igor@38 33 }
igor@38 34 self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
igor@38 35 self.linked_words = linked_words
igor@38 36
igor@38 37 def normalize(self, word):
igor@38 38 word_chain = []
igor@38 39 while word in self.linked_words and not word in word_chain:
igor@38 40 word_chain.append(word)
igor@38 41 word = self.linked_words[word]
igor@38 42 return self.stemmer.stemWord(word.lower())
igor@37 43
igor@47 44 def best_word_from_group(self, wordpairs_group):
igor@47 45 """Returns the word that is the most relevant to the wordpairs_group.
igor@47 46
igor@47 47 At the moment: returns the word with minimal length"""
igor@47 48
igor@47 49 minimal_length = min(len(pair[1]) for pair in wordpairs_group)
igor@47 50 return list(x[1] for x in sorted(
igor@47 51 (x for x in wordpairs_group if len(x[1]) == minimal_length),
igor@47 52 key=lambda x:x[0],
igor@47 53 reverse=True))[0]
igor@47 54
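# A minimal usage sketch of Normalizator.normalize, showing how linked words
# are resolved before stemming. Assumes the PyStemmer package providing the
# Stemmer module is installed; the words and links below are made up.
#
# norm = Normalizator('en', linked_words={'ran': 'run'})
# norm.normalize('ran')      # follows the link 'ran' -> 'run', stems to 'run'
# norm.normalize('Walking')  # no link; lowercased and stemmed to 'walk'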
igor@37 55 parser = optparse.OptionParser()
igor@37 56
igor@37 57 parser.add_option(
igor@37 58 "-a", "--no-marks",
igor@37 59 help="don't add marks (and don't save marks added by user)",
igor@37 60 action="store_true",
igor@37 61 dest="no_marks")
igor@37 62
igor@37 63 parser.add_option(
igor@37 64 "-c", "--compressed",
igor@37 65 help="show compressed wordlist: one word per group",
igor@37 66 action="store_true",
igor@37 67 dest="compressed")
igor@37 68
igor@37 69 parser.add_option(
igor@37 70 "-k", "--known-words",
igor@37 71 help="put higher words that are similar to the known words (only for English)",
igor@37 72 action="store_true",
igor@37 73 dest="compressed")
igor@37 74
igor@37 75 parser.add_option(
igor@37 76 "-l", "--language",
igor@37 77 help="specify language of text",
igor@37 78 action="store",
igor@37 79 dest="language")
igor@37 80
igor@37 81 parser.add_option(
igor@38 82 "-f", "--function",
igor@38 83 help="filter through subsystem [INTERNAL]",
igor@38 84 action="store",
igor@38 85 dest="function")
igor@38 86
igor@38 87 parser.add_option(
igor@37 88 "-m", "--merge-tag",
igor@37 89 help="merge words tagged with specified tag into the main vocabulary",
igor@37 90 action="store",
igor@37 91 dest="merge_tag")
igor@37 92
igor@37 93 parser.add_option(
igor@37 94 "-M", "--merge-tagged",
igor@37 95 help="merge words tagged with ANY tag into the main vocabulary",
igor@37 96 action="store_true",
igor@37 97 dest="merge_tagged")
igor@37 98
igor@37 99 parser.add_option(
igor@37 100 "-n", "--non-interactive",
igor@37 101 help="non-interactive mode (don't run vi)",
igor@37 102 action="store_true",
igor@37 103 dest="non_interactive")
igor@37 104
igor@37 105 parser.add_option(
igor@37 106 "-N", "--no-filter",
igor@37 107 help="switch off known words filtering",
igor@37 108 action="store_true",
igor@37 109 dest="no_filter")
igor@37 110
igor@37 111 parser.add_option(
igor@37 112 "-p", "--pages",
igor@37 113 help="work with specified pages only (pages = start-stop/total )",
igor@37 114 action="store",
igor@37 115 dest="pages")
igor@37 116
igor@37 117 parser.add_option(
igor@48 118 "-d", "--delete-tag",
igor@48 119 help="delete subvocabulary of specified tag",
igor@37 120 action="store",
igor@48 121 dest="delete_tag")
igor@37 122
igor@37 123 parser.add_option(
igor@37 124 "-s", "--text-stats",
igor@37 125 help="show the text statistics (percentage of known words and so on) and exit",
igor@37 126 action="store_true",
igor@37 127 dest="text_stats")
igor@37 128
igor@37 129 parser.add_option(
igor@37 130 "-S", "--voc-stats",
igor@37 131 help="show your vocabulary statistics (number of words and word groups)",
igor@37 132 action="store_true",
igor@37 133 dest="voc_stats")
igor@37 134
igor@37 135 parser.add_option(
igor@37 136 "-t", "--tag",
igor@37 137 help="tag known words with tag",
igor@37 138 action="store",
igor@37 139 dest="tag")
igor@37 140
igor@37 141 parser.add_option(
igor@37 142 "-T", "--show-tags",
igor@37 143 help="tag known words with tag",
igor@37 144 action="store_true",
igor@37 145 dest="show_tags")
igor@37 146
igor@37 147 parser.add_option(
igor@37 148 "-2", "--two-words",
igor@37 149 help="find 2 words' sequences",
igor@37 150 action="store_true",
igor@37 151 dest="two_words")
igor@37 152
igor@37 153 parser.add_option(
igor@37 154 "-3", "--three-words",
igor@37 155 help="find 3 words' sequences",
igor@37 156 action="store_true",
igor@37 157 dest="three_words")
igor@37 158
igor@38 159 def readlines_from_file(filename):
igor@38 160 res = []
igor@38 161 with codecs.open(filename, "r", "utf-8") as f:
igor@38 162 for line in f.readlines():
igor@38 163 res += [line]
igor@38 164 return res
igor@38 165
igor@38 166 def readlines_from_stdin():
igor@38 167 return codecs.getreader("utf-8")(sys.stdin).readlines()
igor@38 168
igor@38 169 def words_from_line(line):
igor@38 170 line = line.rstrip('\n')
igor@38 171 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
igor@38 172 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
igor@44 173 return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
igor@38 174
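# A quick sketch of the tokenization above: apostrophes (and underscores) are
# kept inside tokens, other runs of non-word characters split, and a trailing
# separator leaves an empty token that get_words() filters out together with
# purely numeric tokens:
#
# words_from_line(u"don't stop, it's 2011!")
# -> [u"don't", u'stop', u"it's", u'2011', u'']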
igor@44 175 def get_words(lines, group_by=[1]):
igor@38 176 """
igor@38 177 Returns a dict mapping each word (and word group) to the number
igor@38 178 of times it occurs in the given lines.
igor@38 179 """
igor@38 180 result = {}
igor@44 181 (a, b, c) = ("", "", "")
igor@38 182 for line in lines:
igor@38 183 words = words_from_line(line)
igor@38 184 for word in words:
igor@41 185 if re.match('[0-9]*$', word):
igor@41 186 continue
igor@38 187 result.setdefault(word, 0)
igor@38 188 result[word] += 1
igor@44 189 if 2 in group_by and a != "" and b != "":
igor@44 190 w = "%s_%s" % (a,b)
igor@44 191 result.setdefault(w, 0)
igor@44 192 result[w] += 1
igor@44 193 if 3 in group_by and not "" in [a,b,c]:
igor@44 194 w = "%s_%s_%s" % (a,b,c)
igor@44 195 result.setdefault(w, 0)
igor@44 196 result[w] += 1
igor@44 197 (a,b,c) = (b, c, word)
igor@44 198
igor@44 199 logging.debug(result)
igor@38 200 return result
igor@38 201
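# A worked sketch of get_words() with two-word grouping enabled (made-up
# input): pairs are joined with '_' and, because of the (a, b, c) sliding
# window above, a pair is only counted once two further words have been read.
#
# get_words([u"the cat saw the cat"], group_by=[1, 2])
# -> {u'the': 2, u'cat': 2, u'saw': 1, u'the_cat': 1, u'cat_saw': 1}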
igor@38 202 def load_vocabulary():
igor@38 203 return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
igor@38 204
igor@38 205 def notes_filenames():
igor@38 206 return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
igor@38 207
igor@38 208 def load_notes(files):
igor@38 209 notes = {}
igor@38 210 for filename in files:
igor@39 211 with codecs.open(filename, "r", "utf-8") as f:
igor@38 212 for line in f.readlines():
igor@38 213 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@38 214 notes.setdefault(word, {})
igor@38 215 notes[word][filename] = note
igor@38 216 return notes
igor@38 217
igor@39 218 def add_notes(lines, notes):
igor@39 219 notes_filename = notes_filenames()[0]
igor@39 220 result = []
igor@39 221 for line in lines:
igor@39 222 if line.startswith('#'):
igor@39 223 result += [line]
igor@39 224 else:
igor@39 225 match_object = re.search('^\s*\S+\s*(\S+)', line)
igor@39 226 if match_object:
igor@39 227 word = match_object.group(1)
igor@39 228 if word in notes:
igor@39 229 if notes_filename in notes[word]:
igor@39 230 line = line.rstrip('\n')
igor@39 231 line = "%-30s %s\n" % (line, notes[word][notes_filename])
igor@39 232 result += [line]
igor@39 233 else:
igor@39 234 result += [line]
igor@39 235 else:
igor@39 236 result += [line]
igor@39 237 return result
igor@39 238
igor@39 239 def remove_notes(lines, notes_group):
igor@39 240 notes_filename = notes_filenames()[0]
igor@39 241 notes = {}
igor@39 242 for k in notes_group.keys():
igor@39 243 if notes_filename in notes_group[k]:
igor@39 244 notes[k] = notes_group[k][notes_filename]
igor@39 245
igor@39 246 result = []
igor@39 247 for line in lines:
igor@39 248 line = line.rstrip('\n')
igor@39 249 match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
igor@39 250 if match_object:
igor@39 251 result.append("".join([
igor@39 252 match_object.group(1),
igor@39 253 match_object.group(2),
igor@39 254 match_object.group(3),
igor@39 255 match_object.group(4),
igor@39 256 "\n"
igor@39 257 ]))
igor@39 258 notes[match_object.group(4)] = match_object.group(6)
igor@39 259 else:
igor@39 260 result.append(line+"\n")
igor@39 261
igor@39 262 save_notes(notes_filename, notes)
igor@39 263 return result
igor@39 264
igor@39 265 def save_notes(filename, notes):
igor@39 266 lines = []
igor@39 267 saved_words = []
igor@39 268 with codecs.open(filename, "r", "utf-8") as f:
igor@39 269 for line in f.readlines():
igor@39 270 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@39 271 if word in notes:
igor@39 272 line = "%-29s %s\n" % (word, notes[word])
igor@39 273 saved_words.append(word)
igor@39 274 lines.append(line)
igor@39 275 for word in [x for x in notes.keys() if not x in saved_words]:
igor@39 276 line = "%-29s %s\n" % (word, notes[word])
igor@39 277 lines.append(line)
igor@39 278
igor@39 279 with codecs.open(filename, "w", "utf-8") as f:
igor@39 280 for line in lines:
igor@39 281 f.write(line)
igor@39 282
igor@39 283
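# A sketch of the notes round trip with made-up data: given a word-list line
# u"      4 cat\n" and notes == {u'cat': {<notes filename>: u'Katze'}},
# add_notes() pads the line to 30 columns and appends the note
# ("      4 cat ... Katze"), while remove_notes() later strips that extra
# column again and hands the word -> note pairs to save_notes(), which
# rewrites the notes file.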
igor@38 284 def substract_dictionary(dict1, dict2):
igor@38 285 """
igor@38 286 returns dict1 - dict2
igor@38 287 """
igor@38 288 result = {}
igor@38 289 for (k,v) in dict1.items():
igor@38 290 if not k in dict2:
igor@38 291 result[k] = v
igor@38 292 return result
igor@38 293
igor@38 294 def dump_words(words, filename):
igor@38 295 with codecs.open(filename, "w+", "utf-8") as f:
igor@38 296 for word in words.keys():
igor@38 297 f.write(("%s\n"%word)*words[word])
igor@38 298
igor@38 299 def error_message(text):
igor@38 300 print text
igor@38 301
igor@40 302 def find_wordgroups_weights(word_pairs, normalizator):
igor@38 303 weight = {}
igor@40 304 for (num, word) in word_pairs:
igor@38 305 normalized = normalizator.normalize(word)
igor@38 306 weight.setdefault(normalized, 0)
igor@40 307 weight[normalized] += num
igor@38 308 return weight
igor@38 309
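# For example (sketch, made-up pairs): word_pairs of
# [(3, u'walking'), (2, u'walked'), (5, u'cat')] yield the group weights
# {u'walk': 5, u'cat': 5}; compare_word_pairs() below uses these weights to
# sort whole stem groups by their combined frequency.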
igor@38 310 def find_linked_words(notes):
igor@38 311 linked_words = {}
igor@38 312 for word in notes.keys():
igor@38 313 for note in notes[word].values():
igor@38 314 if "@" in note:
igor@38 315 result = re.search(r'\@(\S*)', note)
igor@38 316 if result:
igor@38 317 main_word = result.group(1)
igor@38 318 if main_word:
igor@38 319 linked_words[word] = main_word
igor@38 320 return linked_words
igor@38 321
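# For example (sketch): a notes line such as "ran   past tense @run" is loaded
# by load_notes() as {u'ran': {<notes filename>: u'past tense @run'}}, and
# find_linked_words() turns it into {u'ran': u'run'}, so that 'ran' is grouped
# together with 'run' during normalization.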
igor@40 322 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
igor@40 323 (num1, word1) = pair1
igor@40 324 (num2, word2) = pair2
igor@38 325
igor@38 326 normalized_word1 = normalizator.normalize(word1)
igor@38 327 normalized_word2 = normalizator.normalize(word2)
igor@38 328
igor@38 329 cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
igor@38 330 if cmp_res != 0:
igor@38 331 return cmp_res
igor@38 332 else:
igor@38 333 cmp_res = cmp(normalized_word1, normalized_word2)
igor@38 334 if cmp_res != 0:
igor@38 335 return cmp_res
igor@38 336 else:
igor@38 337 return cmp(int(num1), int(num2))
igor@38 338
igor@47 339
igor@48 340 def print_words_sorted(
igor@48 341 word_pairs,
igor@48 342 stats,
igor@48 343 normalizator,
igor@48 344 print_stats=True,
igor@48 345 stats_only=False,
igor@48 346 compressed_wordlist=False,
igor@48 347 show_range=0,
igor@48 348 show_range_percentage=0,
igor@48 349 ):
igor@40 350 if stats_only:
igor@43 351 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 352 " ".join([
igor@43 353 "%-10s" % x for x in [
igor@43 354 "LANG",
igor@43 355 "KNOWN%",
igor@43 356 "UNKNOWN%",
igor@43 357 "KNOWN",
igor@43 358 "TOTAL",
igor@43 359 "WPS",
igor@43 360 "UWPS*10"
igor@43 361 ]]) + "\n")
igor@43 362 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 363 " ".join([
igor@43 364 "%(language)-10s",
igor@43 365 "%(percentage)-10.2f",
igor@43 366 "%(percentage_unknown)-10.2f",
igor@43 367 "%(total_known)-11d"
igor@43 368 "%(total)-11d"
igor@43 369 "%(wps)-11d"
igor@43 370 "%(uwps)-11d"
igor@43 371 ]) % stats + "\n")
igor@40 372 return
igor@38 373
igor@40 374 if print_stats:
igor@40 375 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 376 "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
igor@38 377
igor@40 378 level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
igor@40 379 known = int(stats['total_known'])
igor@40 380 total = int(stats['total'])
igor@40 381 current_level = 0
igor@47 382 old_normalized_word = None
igor@47 383 words_of_this_group = []
igor@48 384 printed_words = 0
igor@40 385 for word_pair in word_pairs:
igor@47 386
igor@47 387 normalized_word = normalizator.normalize(word_pair[1])
igor@47 388 if old_normalized_word and old_normalized_word != normalized_word:
igor@47 389 #codecs.getwriter("utf-8")(sys.stdout).write(
igor@47 390 # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
igor@47 391 compressed_word_pair = (
igor@47 392 sum(x[0] for x in words_of_this_group),
igor@47 393 normalizator.best_word_from_group(words_of_this_group)
igor@47 394 )
igor@47 395 if compressed_wordlist:
igor@47 396 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
igor@48 397 printed_words += 1
igor@47 398 words_of_this_group = []
igor@47 399
igor@47 400 old_normalized_word = normalized_word
igor@47 401 words_of_this_group.append(word_pair)
igor@47 402
igor@47 403 if not compressed_wordlist:
igor@47 404 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
igor@48 405 printed_words += 1
igor@47 406
igor@47 407
igor@40 408 known += word_pair[0]
igor@40 409 if 100.0*known/total >= level_lines[0]:
igor@40 410 current_level = level_lines[0]
igor@40 411 while 100.0*known/total > level_lines[0]:
igor@40 412 current_level = level_lines[0]
igor@40 413 level_lines = level_lines[1:]
igor@40 414 codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
igor@38 415
igor@48 416 if show_range > 0 and printed_words >= show_range:
igor@48 417 break
igor@48 418 if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
igor@48 419 break
igor@48 420
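# A small worked example of the progress markers above: with
# stats['percentage'] == 72.4 the thresholds are
# range(75, 95, 5) + range(90, 102) == [75, 80, 85, 90, 90, 91, ..., 101],
# and a "# <level>" line is printed whenever the cumulative coverage
# (already known words plus the words printed so far) crosses a threshold.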
igor@39 421 def filter_add_notes(args):
igor@39 422 lines = readlines_from_file(args[0])
igor@39 423 notes = load_notes(notes_filenames())
igor@39 424 lines = add_notes(lines, notes)
igor@39 425 with codecs.open(args[0], "w", "utf-8") as f:
igor@39 426 for line in lines:
igor@39 427 f.write(line)
igor@39 428
igor@39 429 def filter_remove_notes(args):
igor@39 430 lines = readlines_from_file(args[0])
igor@39 431 notes = load_notes(notes_filenames())
igor@39 432 lines = remove_notes(lines, notes)
igor@39 433 with codecs.open(args[0], "w", "utf-8") as f:
igor@39 434 for line in lines:
igor@39 435 f.write(line)
igor@39 436
igor@40 437 def filter_get_words_group_words_add_stat(args):
igor@40 438 vocabulary = load_vocabulary()
igor@40 439 notes = load_notes(notes_filenames())
igor@40 440 lines = readlines_from_stdin()
igor@44 441 group_by = [1]
igor@48 442
igor@44 443 if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
igor@44 444 group_by.append(2)
igor@44 445 if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
igor@44 446 group_by.append(3)
igor@44 447 words = get_words(lines, group_by)
igor@43 448 stats_only = False
igor@43 449 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
igor@43 450 stats_only = True
igor@40 451
igor@47 452 compressed_wordlist = False
igor@47 453 if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
igor@47 454 compressed_wordlist = True
igor@47 455
igor@48 456 show_range = os.environ.get('SHOW_RANGE', '')
igor@48 457 if show_range != '':
igor@48 458 show_range = int(show_range)
igor@48 459 else:
igor@48 460 show_range = 0
igor@48 461 show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
igor@48 462 if show_range_percentage != '':
igor@48 463 show_range_percentage = int(show_range_percentage)
igor@48 464 else:
igor@48 465 show_range_percentage = 0
igor@48 466
igor@44 467
igor@40 468 stats = {}
igor@40 469 stats['total'] = sum(words[x] for x in words.keys())
igor@45 470 if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
igor@45 471 words = substract_dictionary(words, vocabulary)
igor@40 472
igor@40 473 stats['total_unknown'] = sum(words[x] for x in words.keys())
igor@40 474 stats['total_known'] = stats['total'] - stats['total_unknown']
igor@43 475 stats['percentage'] = 100.0*stats['total_known']/stats['total']
igor@43 476 stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
igor@40 477 stats['groups'] = 0
igor@40 478 stats['words'] = len(words)
igor@43 479 stats['sentences'] = 0 #FIXME
igor@43 480 stats['wps'] = 0 #FIXME
igor@43 481 stats['uwps'] = 0 #FIXME
igor@40 482 stats['language'] = config['language']
igor@40 483
igor@40 484 linked_words = find_linked_words(notes)
igor@40 485 normalizator = Normalizator(config['language'], linked_words)
igor@40 486
igor@44 487 words_with_freq = []
igor@40 488 for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
igor@44 489 words_with_freq.append((words[k], k))
igor@40 490
igor@44 491 wgw = find_wordgroups_weights(words_with_freq, normalizator)
igor@45 492 if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
igor@45 493 words_with_freq = sorted(
igor@44 494 words_with_freq,
igor@40 495 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
igor@40 496 reverse=True)
igor@40 497
igor@47 498 print_words_sorted(
igor@47 499 words_with_freq,
igor@47 500 stats,
igor@47 501 normalizator,
igor@47 502 stats_only=stats_only,
igor@48 503 compressed_wordlist=compressed_wordlist,
igor@48 504 show_range=show_range,
igor@48 505 show_range_percentage=show_range_percentage,
igor@47 506 )
igor@40 507
igor@37 508 (options, args) = parser.parse_args()
igor@38 509 if options.language:
igor@38 510 config['language'] = options.language
igor@37 511
igor@38 512 if options.function:
igor@38 513 function_names = {
igor@39 514 'add_notes' : filter_add_notes,
igor@39 515 'remove_notes': filter_remove_notes,
igor@40 516 'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
igor@38 517 }
igor@38 518 if options.function in function_names:
igor@38 519 function_names[options.function](args)
igor@38 520 else:
igor@38 521 error_message("Unkown function %s.\nAvailable functions:\n%s" % (
igor@38 522 options.function, "".join([" "+x for x in sorted(function_names.keys())])))
igor@38 523 sys.exit(1)
igor@37 524
igor@37 525
igor@37 526
igor@37 527
igor@38 528 #os.system("vim")
igor@37 529
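
A sketch of the other entry points reachable through -f/--function (see the dispatch table above), again assuming ~/.new-words/en.txt and ~/.new-words/notes-en.txt exist:

    # print only the statistics line for a text
    STAT_ONLY=YES FILTER_WORDS=YES \
        python new-words.py -l en -f get_words_group_words_add_stat < text.txt

    # append saved notes to a word-list file, or strip them out again
    python new-words.py -l en -f add_notes wordlist.txt
    python new-words.py -l en -f remove_notes wordlist.txt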