new-words

annotate new-words.py @ 45:5f90e44eecfc

new-words.py: turn words filtering and grouping on and off
author Igor Chubin <igor@chub.in>
date Fri Feb 04 06:18:50 2011 +0100 (2011-02-04)
parents 7eb1a8c3eade
children d708e2c1bad8
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@38 6 import logging
igor@38 7 import os
igor@37 8 import optparse
igor@38 9 import re
igor@38 10 import subprocess
igor@38 11 import sys
igor@38 12 import Stemmer
# Best-effort JIT acceleration: psyco is optional, so fall back silently
# to plain CPython when it is missing or fails to initialise.
try:
    import psyco
    psyco.full()
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate instead of being swallowed here.
    pass
igor@38 18
# Runtime configuration; `language` may be overridden by the -l option below.
config = dict(
    config_directory=os.environ['HOME'] + '/.new-words',
    language='en',
)

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 25
class Normalizator:
    """Reduce words to a canonical stem, following user-defined word links.

    A "linked word" chain (built from @-references in the notes files) is
    resolved first, then the final word is stemmed with PyStemmer.
    """

    def __init__(self, language, linked_words=None):
        # BUG FIX: the default used to be a shared mutable dict ({});
        # a None sentinel prevents instances from sharing state.
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the stem of `word` after following its link chain.

        Cycle-safe: stops as soon as a word repeats in the chain.
        """
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
igor@37 43
# Command-line interface.  Each option stores into its own dest; note the
# fix for -k, which previously reused dest="compressed" by copy-paste and
# silently clobbered the -c flag.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")  # BUG FIX: was dest="compressed"

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    # BUG FIX: help text was copy-pasted from -t ("tag known words with tag").
    help="show all tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
igor@37 147
def readlines_from_file(filename):
    """Return all lines of a UTF-8 text file as a list (newlines kept)."""
    with codecs.open(filename, "r", "utf-8") as f:
        return f.readlines()
igor@38 154
def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    reader = codecs.getreader("utf-8")(sys.stdin)
    return reader.readlines()
igor@38 157
# Split on runs of non-word characters, except runs that start with an
# apostrophe or underscore (keeps "don't" and "a_b" group keys intact).
# Compiled once instead of on every call.
_WORD_SPLIT = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

def words_from_line(line):
    """Split one text line into raw word tokens (may contain '' entries)."""
    return _WORD_SPLIT.split(line.rstrip('\n'))

def get_words(lines, group_by=(1,)):
    """
    Count word frequencies over `lines`.

    Returns a dict: word -> occurrence count.  Empty tokens and purely
    numeric tokens are skipped ('[0-9]*$' matches "" too).  When 2 (resp. 3)
    is in `group_by`, 2-word (3-word) sequences are also counted under keys
    "a_b" ("a_b_c").

    BUG FIX: the default for `group_by` used to be the mutable list [1],
    shared across calls; an immutable tuple is backward compatible because
    only membership tests (`in`) are performed on it.

    NOTE(review): n-gram keys are registered one word later than their last
    member (a, b hold older words when the check runs) — preserved from the
    original implementation; confirm whether the lag is intended.
    """
    result = {}
    (a, b, c) = ("", "", "")
    for line in lines:
        words = words_from_line(line)
        for word in words:
            if re.match('[0-9]*$', word):
                continue
            result.setdefault(word, 0)
            result[word] += 1
            if 2 in group_by and a != "" and b != "":
                w = "%s_%s" % (a, b)
                result.setdefault(w, 0)
                result[w] += 1
            if 3 in group_by and not "" in [a, b, c]:
                w = "%s_%s_%s" % (a, b, c)
                result.setdefault(w, 0)
                result[w] += 1
            (a, b, c) = (b, c, word)

    logging.debug(result)
    return result
igor@38 190
def load_vocabulary():
    """Load the known-words vocabulary file for the configured language."""
    path = "%s/%s.txt" % (config['config_directory'], config['language'])
    return get_words(readlines_from_file(path))
igor@38 193
def notes_filenames():
    """Return the list of notes files for the configured language."""
    filename = "%s/notes-%s.txt" % (config['config_directory'], config['language'])
    return [filename]
igor@38 196
def load_notes(files):
    """Read per-word notes from every file in `files`.

    Returns word -> {filename -> note text}.  Each line is "word  note...",
    split on the first whitespace run.
    """
    notes = {}
    for path in files:
        with codecs.open(path, "r", "utf-8") as fh:
            for raw in fh.readlines():
                (word, note) = re.split(r'\s+', raw.rstrip('\n'), maxsplit=1)
                notes.setdefault(word, {})[path] = note
    return notes
igor@38 206
def add_notes(lines, notes):
    """Append the saved note (if any) to each word line.

    Comment lines (starting with '#') and lines whose second column cannot
    be found pass through unchanged.  The note looked up is the one stored
    for the current language's notes file.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        out = line
        if not line.startswith('#'):
            # Second whitespace-separated column is the word.
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                per_file = notes.get(word, {})
                if notes_filename in per_file:
                    out = "%-30s %s\n" % (line.rstrip('\n'), per_file[notes_filename])
        result.append(out)
    return result
igor@39 230
def remove_notes(lines, notes_group):
    """Strip the trailing note column from `lines`, persisting the notes.

    Seeds the note set from `notes_group` (entries for the current notes
    file), harvests the note text of every matching line, writes everything
    back via save_notes, and returns the lines without the note column.
    """
    notes_filename = notes_filenames()[0]

    # Start from notes already stored for this file.
    notes = {}
    for word in notes_group.keys():
        per_file = notes_group[word]
        if notes_filename in per_file:
            notes[word] = per_file[notes_filename]

    result = []
    for line in lines:
        stripped = line.rstrip('\n')
        m = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', stripped)
        if m:
            # Keep the first four columns, drop the note text (group 6).
            result.append("%s%s%s%s\n" % (m.group(1), m.group(2),
                                          m.group(3), m.group(4)))
            notes[m.group(4)] = m.group(6)
        else:
            result.append(stripped + "\n")

    save_notes(notes_filename, notes)
    return result
igor@39 256
def save_notes(filename, notes):
    """Rewrite the notes file: update lines for words present in `notes`,
    keep other lines untouched, and append entries for new words."""
    lines = []
    saved_words = []
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            (word, note) = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.append(word)
            lines.append(line)
    for word in [w for w in notes.keys() if w not in saved_words]:
        lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
igor@39 274
igor@39 275
def substract_dictionary(dict1, dict2):
    """Return a copy of dict1 without the keys that appear in dict2."""
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)
igor@38 285
def dump_words(words, filename):
    """Write every word to `filename`, one line per occurrence count."""
    with codecs.open(filename, "w+", "utf-8") as f:
        for (word, count) in words.items():
            f.write(("%s\n" % word) * count)
igor@38 290
def error_message(text):
    """Print an error message.

    Uses the parenthesised form: with a single argument it behaves
    identically under Python 2 (where the file's `print text` statement
    lived) and is valid Python 3 syntax.
    """
    print(text)
igor@38 293
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum the counts of (count, word) pairs per normalized word group.

    Returns normalized form -> total count.
    """
    weight = {}
    for (num, word) in word_pairs:
        key = normalizator.normalize(word)
        weight[key] = weight.get(key, 0) + num
    return weight
igor@38 301
def find_linked_words(notes):
    """Scan note texts for "@mainword" references.

    Returns a dict mapping each annotated word to the word it links to;
    notes without an @-reference (or with an empty one) are ignored.
    """
    linked_words = {}
    for (word, per_file) in notes.items():
        for note in per_file.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
igor@38 313
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """cmp-style ordering for (count, word) pairs.

    Compares by word-group weight first, then by the normalized form,
    then by raw count.  Returns -1/0/1 (Python 2 `cmp` semantics).
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    norm1 = normalizator.normalize(word1)
    norm2 = normalizator.normalize(word2)

    by_weight = cmp(wgw[norm1], wgw[norm2])
    if by_weight:
        return by_weight
    by_norm = cmp(norm1, norm2)
    if by_norm:
        return by_norm
    return cmp(int(num1), int(num2))
igor@38 330
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """Print (count, word) pairs to stdout as UTF-8.

    With stats_only, print a one-line statistics table instead.  Otherwise
    optionally print a summary header, then the word list interleaved with
    "# N" level markers as the cumulative known-word percentage grows.
    """
    out = codecs.getwriter("utf-8")(sys.stdout)
    if stats_only:
        header_fields = [
            "LANG",
            "KNOWN%",
            "UNKNOWN%",
            "KNOWN",
            "TOTAL",
            "WPS",
            "UWPS*10"
        ]
        out.write(" ".join(["%-10s" % x for x in header_fields]) + "\n")
        # NOTE: the last four format fields have no separating commas, so the
        # adjacent literals concatenate; each %-11d column then lines up under
        # the space-joined %-10s headers.
        out.write(" ".join([
            "%(language)-10s",
            "%(percentage)-10.2f",
            "%(percentage_unknown)-10.2f",
            "%(total_known)-11d"
            "%(total)-11d"
            "%(wps)-11d"
            "%(uwps)-11d"
        ]) % stats + "\n")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Percentage thresholds for "# N" marker lines: every 5% up to 90, then
    # every 1% up to 101.  (Python 2: `/` floor-divides ints, `range` is a list.)
    level_lines = range(int(float(stats['percentage']))/5*5+5, 95, 5) + range(90, 102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        out.write("%10s %s\n" % word_pair)
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            out.write("# %s\n" % current_level)
igor@38 373
def filter_add_notes(args):
    """Filter: rewrite the file args[0] in place, appending saved notes."""
    filename = args[0]
    annotated = add_notes(readlines_from_file(filename),
                          load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(annotated)
igor@39 381
def filter_remove_notes(args):
    """Filter: rewrite the file args[0] in place, stripping note columns."""
    filename = args[0]
    cleaned = remove_notes(readlines_from_file(filename),
                           load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(cleaned)
igor@39 389
def filter_get_words_group_words_add_stat(args):
    """Filter: read text from stdin, print the word list with statistics.

    Behaviour toggles come from the environment:
      GROUP_WORDS_BY_TWO / GROUP_WORDS_BY_THREE = YES  also count n-grams
      STAT_ONLY = YES       print only the statistics table
      FILTER_WORDS = YES    drop words already in the vocabulary
      WORDS_GROUPING = YES  order words by word-group weight
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if os.environ.get('GROUP_WORDS_BY_TWO') == 'YES':
        group_by.append(2)
    if os.environ.get('GROUP_WORDS_BY_THREE') == 'YES':
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = os.environ.get('STAT_ONLY') == 'YES'

    stats = {}
    # Total is computed before filtering so known/unknown can be derived.
    stats['total'] = sum(words[x] for x in words.keys())
    if os.environ.get('FILTER_WORDS') == 'YES':
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = 100.0*stats['total_known']/stats['total']
    stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  # FIXME
    stats['wps'] = 0  # FIXME
    stats['uwps'] = 0  # FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # Most frequent first; sorted() is stable, so ties keep dict order.
    words_with_freq = [(words[k], k)
                       for k in sorted(words.keys(), key=lambda k: words[k],
                                       reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if os.environ.get('WORDS_GROUPING') == 'YES':
        words_with_freq = sorted(
            words_with_freq,
            cmp=lambda x, y: compare_word_pairs(x, y, wgw, normalizator, linked_words),
            reverse=True)

    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
igor@40 436
# Entry point: parse the command line, apply the language override, and
# dispatch internal filter functions selected with -f/--function.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        # BUG FIX: "Unkown" -> "Unknown" in the user-visible error message.
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
igor@37 453
igor@37 454
igor@37 455
igor@37 456
igor@38 457 #os.system("vim")
igor@37 458