new-words

annotate new-words.py @ 48:7194bdb56475

new feature: -r and -R can specify number of words (or percentage) to show
author Igor Chubin <igor@chub.in>
date Tue Feb 08 20:35:38 2011 +0200 (2011-02-08)
parents d708e2c1bad8
children 00286f6bfa85
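
The -r/-R options themselves are not parsed in this file: the code below only reads the SHOW_RANGE and SHOW_RANGE_PERCENTAGE environment variables inside filter_get_words_group_words_add_stat, so the two flags are presumably translated into those variables by the calling wrapper script. A rough sketch of driving the filter directly, assuming the vocabulary and notes files under ~/.new-words already exist and text.txt is UTF-8:

    # roughly what -r 20 should do: stop after printing 20 words
    FILTER_WORDS=YES SHOW_RANGE=20 \
        python new-words.py -l en -f get_words_group_words_add_stat < text.txt

    # roughly what -R 95 should do: stop once cumulative coverage reaches 95%
    FILTER_WORDS=YES SHOW_RANGE_PERCENTAGE=95 \
        python new-words.py -l en -f get_words_group_words_add_stat < text.txt
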
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@38 6 import logging
igor@38 7 import os
igor@37 8 import optparse
igor@38 9 import re
igor@38 10 import subprocess
igor@38 11 import sys
igor@38 12 import Stemmer
igor@42 13 try:
igor@42 14 import psyco
igor@42 15 psyco.full()
igor@42 16 except ImportError:
igor@42 17 pass
igor@38 18
igor@38 19 config = {
igor@38 20 'config_directory': os.environ['HOME'] + '/.new-words',
igor@38 21 'language': 'en',
igor@38 22 }
igor@38 23
igor@38 24 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 25
igor@38 26 class Normalizator:
igor@38 27 def __init__(self, language, linked_words={}):
igor@38 28 stemmer_algorithm = {
igor@38 29 'de' : 'german',
igor@38 30 'en' : 'english',
igor@38 31 'ru' : 'russian',
igor@38 32 'uk' : 'ukrainian',
igor@38 33 }
igor@38 34 self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
igor@38 35 self.linked_words = linked_words
igor@38 36
igor@38 37 def normalize(self, word):
igor@38 38 word_chain = []
igor@38 39 while word in self.linked_words and not word in word_chain:
igor@38 40 word_chain.append(word)
igor@38 41 word = self.linked_words[word]
igor@38 42 return self.stemmer.stemWord(word.lower())
igor@37 43
igor@47 44 def best_word_from_group(self, wordpairs_group):
igor@47 45 """Returns the word that is the most relevant to the wordpairs_group.
igor@47 46
igor@47 47 At the moment: returns the word with minimal length"""
igor@47 48
igor@47 49 minimal_length = min(len(pair[1]) for pair in wordpairs_group)
igor@47 50 return list(x[1] for x in sorted(
igor@47 51 (x for x in wordpairs_group if len(x[1]) == minimal_length),
igor@47 52 key=lambda x:x[0],
igor@47 53 reverse=True))[0]
igor@47 54
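# A minimal usage sketch of Normalizator.normalize, showing how linked words
# are resolved before stemming. Assumes the PyStemmer package providing the
# Stemmer module is installed; the words and links below are made up.
#
# norm = Normalizator('en', linked_words={'ran': 'run'})
# norm.normalize('ran')      # follows the link 'ran' -> 'run', stems to 'run'
# norm.normalize('Walking')  # no link; lowercased and stemmed to 'walk'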
igor@37 55 parser = optparse.OptionParser()
igor@37 56
igor@37 57 parser.add_option(
igor@37 58 "-a", "--no-marks",
igor@37 59 help="don't add marks (and don't save marks added by user)",
igor@37 60 action="store_true",
igor@37 61 dest="no_marks")
igor@37 62
igor@37 63 parser.add_option(
igor@37 64 "-c", "--compressed",
igor@37 65 help="show compressed wordlist: one word per group",
igor@37 66 action="store_true",
igor@37 67 dest="compressed")
igor@37 68
igor@37 69 parser.add_option(
igor@37 70 "-k", "--known-words",
igor@37 71 help="put higher words that are similar to the known words (only for English)",
igor@37 72 action="store_true",
igor@37 73 dest="compressed")
igor@37 74
igor@37 75 parser.add_option(
igor@37 76 "-l", "--language",
igor@37 77 help="specify language of text",
igor@37 78 action="store",
igor@37 79 dest="language")
igor@37 80
igor@37 81 parser.add_option(
igor@38 82 "-f", "--function",
igor@38 83 help="filter through subsystem [INTERNAL]",
igor@38 84 action="store",
igor@38 85 dest="function")
igor@38 86
igor@38 87 parser.add_option(
igor@37 88 "-m", "--merge-tag",
igor@37 89 help="merge words tagged with specified tag into the main vocabulary",
igor@37 90 action="store",
igor@37 91 dest="merge_tag")
igor@37 92
igor@37 93 parser.add_option(
igor@37 94 "-M", "--merge-tagged",
igor@37 95 help="merge words tagged with ANY tag into the main vocabulary",
igor@37 96 action="store_true",
igor@37 97 dest="merge_tagged")
igor@37 98
igor@37 99 parser.add_option(
igor@37 100 "-n", "--non-interactive",
igor@37 101 help="non-interactive mode (don't run vi)",
igor@37 102 action="store_true",
igor@37 103 dest="non_interactive")
igor@37 104
igor@37 105 parser.add_option(
igor@37 106 "-N", "--no-filter",
igor@37 107 help="switch off known words filtering",
igor@37 108 action="store_true",
igor@37 109 dest="no_filter")
igor@37 110
igor@37 111 parser.add_option(
igor@37 112 "-p", "--pages",
igor@37 113 help="work with specified pages only (pages = start-stop/total )",
igor@37 114 action="store",
igor@37 115 dest="pages")
igor@37 116
igor@37 117 parser.add_option(
igor@48 118 "-d", "--delete-tag",
igor@48 119 help="delete subvocabulary of specified tag",
igor@37 120 action="store",
igor@48 121 dest="delete_tag")
igor@37 122
igor@37 123 parser.add_option(
igor@37 124 "-s", "--text-stats",
igor@37 125 help="show the text statistics (percentage of known words and so on) and exit",
igor@37 126 action="store_true",
igor@37 127 dest="text_stats")
igor@37 128
igor@37 129 parser.add_option(
igor@37 130 "-S", "--voc-stats",
igor@37 131 help="show your vocabulary statistics (number of words and word groups)",
igor@37 132 action="store_true",
igor@37 133 dest="voc_stats")
igor@37 134
igor@37 135 parser.add_option(
igor@37 136 "-t", "--tag",
igor@37 137 help="tag known words with tag",
igor@37 138 action="store",
igor@37 139 dest="tag")
igor@37 140
igor@37 141 parser.add_option(
igor@37 142 "-T", "--show-tags",
igor@37 143 help="tag known words with tag",
igor@37 144 action="store_true",
igor@37 145 dest="show_tags")
igor@37 146
igor@37 147 parser.add_option(
igor@37 148 "-2", "--two-words",
igor@37 149 help="find 2 words' sequences",
igor@37 150 action="store_true",
igor@37 151 dest="two_words")
igor@37 152
igor@37 153 parser.add_option(
igor@37 154 "-3", "--three-words",
igor@37 155 help="find 3 words' sequences",
igor@37 156 action="store_true",
igor@37 157 dest="three_words")
igor@37 158
igor@38 159 def readlines_from_file(filename):
igor@38 160 res = []
igor@38 161 with codecs.open(filename, "r", "utf-8") as f:
igor@38 162 for line in f.readlines():
igor@38 163 res += [line]
igor@38 164 return res
igor@38 165
igor@38 166 def readlines_from_stdin():
igor@38 167 return codecs.getreader("utf-8")(sys.stdin).readlines()
igor@38 168
igor@38 169 def words_from_line(line):
igor@38 170 line = line.rstrip('\n')
igor@38 171 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
igor@38 172 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
igor@44 173 return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
igor@38 174
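# A quick sketch of the tokenization above: apostrophes (and underscores) are
# kept inside tokens, other runs of non-word characters split, and a trailing
# separator leaves an empty token that get_words() filters out together with
# purely numeric tokens:
#
# words_from_line(u"don't stop, it's 2011!")
# -> [u"don't", u'stop', u"it's", u'2011', u'']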
igor@44 175 def get_words(lines, group_by=[1]):
igor@38 176 """
igor@38 177 Returns a dict mapping each word (and word group) to the number
igor@38 178 of times it occurs in the given lines.
igor@38 179 """
igor@38 180 result = {}
igor@44 181 (a, b, c) = ("", "", "")
igor@38 182 for line in lines:
igor@38 183 words = words_from_line(line)
igor@38 184 for word in words:
igor@41 185 if re.match('[0-9]*$', word):
igor@41 186 continue
igor@38 187 result.setdefault(word, 0)
igor@38 188 result[word] += 1
igor@44 189 if 2 in group_by and a != "" and b != "":
igor@44 190 w = "%s_%s" % (a,b)
igor@44 191 result.setdefault(w, 0)
igor@44 192 result[w] += 1
igor@44 193 if 3 in group_by and not "" in [a,b,c]:
igor@44 194 w = "%s_%s_%s" % (a,b,c)
igor@44 195 result.setdefault(w, 0)
igor@44 196 result[w] += 1
igor@44 197 (a,b,c) = (b, c, word)
igor@44 198
igor@44 199 logging.debug(result)
igor@38 200 return result
igor@38 201
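# A worked sketch of get_words() with two-word grouping enabled (made-up
# input): pairs are joined with '_' and, because of the (a, b, c) sliding
# window above, a pair is only counted once two further words have been read.
#
# get_words([u"the cat saw the cat"], group_by=[1, 2])
# -> {u'the': 2, u'cat': 2, u'saw': 1, u'the_cat': 1, u'cat_saw': 1}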
igor@38 202 def load_vocabulary():
igor@38 203 return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
igor@38 204
igor@38 205 def notes_filenames():
igor@38 206 return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
igor@38 207
igor@38 208 def load_notes(files):
igor@38 209 notes = {}
igor@38 210 for filename in files:
igor@39 211 with codecs.open(filename, "r", "utf-8") as f:
igor@38 212 for line in f.readlines():
igor@38 213 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@38 214 notes.setdefault(word, {})
igor@38 215 notes[word][filename] = note
igor@38 216 return notes
igor@38 217
igor@39 218 def add_notes(lines, notes):
igor@39 219 notes_filename = notes_filenames()[0]
igor@39 220 result = []
igor@39 221 for line in lines:
igor@39 222 if line.startswith('#'):
igor@39 223 result += [line]
igor@39 224 else:
igor@39 225 match_object = re.search('^\s*\S+\s*(\S+)', line)
igor@39 226 if match_object:
igor@39 227 word = match_object.group(1)
igor@39 228 if word in notes:
igor@39 229 if notes_filename in notes[word]:
igor@39 230 line = line.rstrip('\n')
igor@39 231 line = "%-30s %s\n" % (line, notes[word][notes_filename])
igor@39 232 result += [line]
igor@39 233 else:
igor@39 234 result += [line]
igor@39 235 else:
igor@39 236 result += [line]
igor@39 237 return result
igor@39 238
igor@39 239 def remove_notes(lines, notes_group):
igor@39 240 notes_filename = notes_filenames()[0]
igor@39 241 notes = {}
igor@39 242 for k in notes_group.keys():
igor@39 243 if notes_filename in notes_group[k]:
igor@39 244 notes[k] = notes_group[k][notes_filename]
igor@39 245
igor@39 246 result = []
igor@39 247 for line in lines:
igor@39 248 line = line.rstrip('\n')
igor@39 249 match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
igor@39 250 if match_object:
igor@39 251 result.append("".join([
igor@39 252 match_object.group(1),
igor@39 253 match_object.group(2),
igor@39 254 match_object.group(3),
igor@39 255 match_object.group(4),
igor@39 256 "\n"
igor@39 257 ]))
igor@39 258 notes[match_object.group(4)] = match_object.group(6)
igor@39 259 else:
igor@39 260 result.append(line+"\n")
igor@39 261
igor@39 262 save_notes(notes_filename, notes)
igor@39 263 return result
igor@39 264
igor@39 265 def save_notes(filename, notes):
igor@39 266 lines = []
igor@39 267 saved_words = []
igor@39 268 with codecs.open(filename, "r", "utf-8") as f:
igor@39 269 for line in f.readlines():
igor@39 270 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@39 271 if word in notes:
igor@39 272 line = "%-29s %s\n" % (word, notes[word])
igor@39 273 saved_words.append(word)
igor@39 274 lines.append(line)
igor@39 275 for word in [x for x in notes.keys() if not x in saved_words]:
igor@39 276 line = "%-29s %s\n" % (word, notes[word])
igor@39 277 lines.append(line)
igor@39 278
igor@39 279 with codecs.open(filename, "w", "utf-8") as f:
igor@39 280 for line in lines:
igor@39 281 f.write(line)
igor@39 282
igor@39 283
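# A sketch of the notes round trip with made-up data: given a word-list line
# u"      4 cat\n" and notes == {u'cat': {<notes filename>: u'Katze'}},
# add_notes() pads the line to 30 columns and appends the note
# ("      4 cat ... Katze"), while remove_notes() later strips that extra
# column again and hands the word -> note pairs to save_notes(), which
# rewrites the notes file.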
igor@38 284 def substract_dictionary(dict1, dict2):
igor@38 285 """
igor@38 286 returns dict1 - dict2
igor@38 287 """
igor@38 288 result = {}
igor@38 289 for (k,v) in dict1.items():
igor@38 290 if not k in dict2:
igor@38 291 result[k] = v
igor@38 292 return result
igor@38 293
igor@38 294 def dump_words(words, filename):
igor@38 295 with codecs.open(filename, "w+", "utf-8") as f:
igor@38 296 for word in words.keys():
igor@38 297 f.write(("%s\n"%word)*words[word])
igor@38 298
igor@38 299 def error_message(text):
igor@38 300 print text
igor@38 301
igor@40 302 def find_wordgroups_weights(word_pairs, normalizator):
igor@38 303 weight = {}
igor@40 304 for (num, word) in word_pairs:
igor@38 305 normalized = normalizator.normalize(word)
igor@38 306 weight.setdefault(normalized, 0)
igor@40 307 weight[normalized] += num
igor@38 308 return weight
igor@38 309
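# For example (sketch, made-up pairs): word_pairs of
# [(3, u'walking'), (2, u'walked'), (5, u'cat')] yield the group weights
# {u'walk': 5, u'cat': 5}; compare_word_pairs() below uses these weights to
# sort whole stem groups by their combined frequency.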
igor@38 310 def find_linked_words(notes):
igor@38 311 linked_words = {}
igor@38 312 for word in notes.keys():
igor@38 313 for note in notes[word].values():
igor@38 314 if "@" in note:
igor@38 315 result = re.search(r'\@(\S*)', note)
igor@38 316 if result:
igor@38 317 main_word = result.group(1)
igor@38 318 if main_word:
igor@38 319 linked_words[word] = main_word
igor@38 320 return linked_words
igor@38 321
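# For example (sketch): a notes line such as "ran   past tense @run" is loaded
# by load_notes() as {u'ran': {<notes filename>: u'past tense @run'}}, and
# find_linked_words() turns it into {u'ran': u'run'}, so that 'ran' is grouped
# together with 'run' during normalization.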
igor@40 322 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
igor@40 323 (num1, word1) = pair1
igor@40 324 (num2, word2) = pair2
igor@38 325
igor@38 326 normalized_word1 = normalizator.normalize(word1)
igor@38 327 normalized_word2 = normalizator.normalize(word2)
igor@38 328
igor@38 329 cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
igor@38 330 if cmp_res != 0:
igor@38 331 return cmp_res
igor@38 332 else:
igor@38 333 cmp_res = cmp(normalized_word1, normalized_word2)
igor@38 334 if cmp_res != 0:
igor@38 335 return cmp_res
igor@38 336 else:
igor@38 337 return cmp(int(num1), int(num2))
igor@38 338
igor@47 339
igor@48 340 def print_words_sorted(
igor@48 341 word_pairs,
igor@48 342 stats,
igor@48 343 normalizator,
igor@48 344 print_stats=True,
igor@48 345 stats_only=False,
igor@48 346 compressed_wordlist=False,
igor@48 347 show_range=0,
igor@48 348 show_range_percentage=0,
igor@48 349 ):
igor@40 350 if stats_only:
igor@43 351 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 352 " ".join([
igor@43 353 "%-10s" % x for x in [
igor@43 354 "LANG",
igor@43 355 "KNOWN%",
igor@43 356 "UNKNOWN%",
igor@43 357 "KNOWN",
igor@43 358 "TOTAL",
igor@43 359 "WPS",
igor@43 360 "UWPS*10"
igor@43 361 ]]) + "\n")
igor@43 362 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 363 " ".join([
igor@43 364 "%(language)-10s",
igor@43 365 "%(percentage)-10.2f",
igor@43 366 "%(percentage_unknown)-10.2f",
igor@43 367 "%(total_known)-11d"
igor@43 368 "%(total)-11d"
igor@43 369 "%(wps)-11d"
igor@43 370 "%(uwps)-11d"
igor@43 371 ]) % stats + "\n")
igor@40 372 return
igor@38 373
igor@40 374 if print_stats:
igor@40 375 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 376 "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
igor@38 377
igor@40 378 level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
igor@40 379 known = int(stats['total_known'])
igor@40 380 total = int(stats['total'])
igor@40 381 current_level = 0
igor@47 382 old_normalized_word = None
igor@47 383 words_of_this_group = []
igor@48 384 printed_words = 0
igor@40 385 for word_pair in word_pairs:
igor@47 386
igor@47 387 normalized_word = normalizator.normalize(word_pair[1])
igor@47 388 if old_normalized_word and old_normalized_word != normalized_word:
igor@47 389 #codecs.getwriter("utf-8")(sys.stdout).write(
igor@47 390 # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
igor@47 391 compressed_word_pair = (
igor@47 392 sum(x[0] for x in words_of_this_group),
igor@47 393 normalizator.best_word_from_group(words_of_this_group)
igor@47 394 )
igor@47 395 if compressed_wordlist:
igor@47 396 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
igor@48 397 printed_words += 1
igor@47 398 words_of_this_group = []
igor@47 399
igor@47 400 old_normalized_word = normalized_word
igor@47 401 words_of_this_group.append(word_pair)
igor@47 402
igor@47 403 if not compressed_wordlist:
igor@47 404 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
igor@48 405 printed_words += 1
igor@47 406
igor@47 407
igor@40 408 known += word_pair[0]
igor@40 409 if 100.0*known/total >= level_lines[0]:
igor@40 410 current_level = level_lines[0]
igor@40 411 while 100.0*known/total > level_lines[0]:
igor@40 412 current_level = level_lines[0]
igor@40 413 level_lines = level_lines[1:]
igor@40 414 codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
igor@38 415
igor@48 416 if show_range > 0 and printed_words >= show_range:
igor@48 417 break
igor@48 418 if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
igor@48 419 break
igor@48 420
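# A small worked example of the progress markers above: with
# stats['percentage'] == 72.4 the thresholds are
# range(75, 95, 5) + range(90, 102) == [75, 80, 85, 90, 90, 91, ..., 101],
# and a "# <level>" line is printed whenever the cumulative coverage
# (already known words plus the words printed so far) crosses a threshold.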
igor@39 421 def filter_add_notes(args):
igor@39 422 lines = readlines_from_file(args[0])
igor@39 423 notes = load_notes(notes_filenames())
igor@39 424 lines = add_notes(lines, notes)
igor@39 425 with codecs.open(args[0], "w", "utf-8") as f:
igor@39 426 for line in lines:
igor@39 427 f.write(line)
igor@39 428
igor@39 429 def filter_remove_notes(args):
igor@39 430 lines = readlines_from_file(args[0])
igor@39 431 notes = load_notes(notes_filenames())
igor@39 432 lines = remove_notes(lines, notes)
igor@39 433 with codecs.open(args[0], "w", "utf-8") as f:
igor@39 434 for line in lines:
igor@39 435 f.write(line)
igor@39 436
igor@40 437 def filter_get_words_group_words_add_stat(args):
igor@40 438 vocabulary = load_vocabulary()
igor@40 439 notes = load_notes(notes_filenames())
igor@40 440 lines = readlines_from_stdin()
igor@44 441 group_by = [1]
igor@48 442
igor@44 443 if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
igor@44 444 group_by.append(2)
igor@44 445 if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
igor@44 446 group_by.append(3)
igor@44 447 words = get_words(lines, group_by)
igor@43 448 stats_only = False
igor@43 449 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
igor@43 450 stats_only = True
igor@40 451
igor@47 452 compressed_wordlist = False
igor@47 453 if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
igor@47 454 compressed_wordlist = True
igor@47 455
igor@48 456 show_range = os.environ.get('SHOW_RANGE', '')
igor@48 457 if show_range != '':
igor@48 458 show_range = int(show_range)
igor@48 459 else:
igor@48 460 show_range = 0
igor@48 461 show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
igor@48 462 if show_range_percentage != '':
igor@48 463 show_range_percentage = int(show_range_percentage)
igor@48 464 else:
igor@48 465 show_range_percentage = 0
igor@48 466
igor@44 467
igor@40 468 stats = {}
igor@40 469 stats['total'] = sum(words[x] for x in words.keys())
igor@45 470 if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
igor@45 471 words = substract_dictionary(words, vocabulary)
igor@40 472
igor@40 473 stats['total_unknown'] = sum(words[x] for x in words.keys())
igor@40 474 stats['total_known'] = stats['total'] - stats['total_unknown']
igor@43 475 stats['percentage'] = 100.0*stats['total_known']/stats['total']
igor@43 476 stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
igor@40 477 stats['groups'] = 0
igor@40 478 stats['words'] = len(words)
igor@43 479 stats['sentences'] = 0 #FIXME
igor@43 480 stats['wps'] = 0 #FIXME
igor@43 481 stats['uwps'] = 0 #FIXME
igor@40 482 stats['language'] = config['language']
igor@40 483
igor@40 484 linked_words = find_linked_words(notes)
igor@40 485 normalizator = Normalizator(config['language'], linked_words)
igor@40 486
igor@44 487 words_with_freq = []
igor@40 488 for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
igor@44 489 words_with_freq.append((words[k], k))
igor@40 490
igor@44 491 wgw = find_wordgroups_weights(words_with_freq, normalizator)
igor@45 492 if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
igor@45 493 words_with_freq = sorted(
igor@44 494 words_with_freq,
igor@40 495 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
igor@40 496 reverse=True)
igor@40 497
igor@47 498 print_words_sorted(
igor@47 499 words_with_freq,
igor@47 500 stats,
igor@47 501 normalizator,
igor@47 502 stats_only=stats_only,
igor@48 503 compressed_wordlist=compressed_wordlist,
igor@48 504 show_range=show_range,
igor@48 505 show_range_percentage=show_range_percentage,
igor@47 506 )
igor@40 507
igor@37 508 (options, args) = parser.parse_args()
igor@38 509 if options.language:
igor@38 510 config['language'] = options.language
igor@37 511
igor@38 512 if options.function:
igor@38 513 function_names = {
igor@39 514 'add_notes' : filter_add_notes,
igor@39 515 'remove_notes': filter_remove_notes,
igor@40 516 'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
igor@38 517 }
igor@38 518 if options.function in function_names:
igor@38 519 function_names[options.function](args)
igor@38 520 else:
igor@38 521 error_message("Unkown function %s.\nAvailable functions:\n%s" % (
igor@38 522 options.function, "".join([" "+x for x in sorted(function_names.keys())])))
igor@38 523 sys.exit(1)
igor@37 524
igor@37 525
igor@37 526
igor@37 527
igor@38 528 #os.system("vim")
igor@37 529
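
A sketch of the other entry points reachable through -f/--function (see the dispatch table above), again assuming ~/.new-words/en.txt and ~/.new-words/notes-en.txt exist:

    # print only the statistics line for a text
    STAT_ONLY=YES FILTER_WORDS=YES \
        python new-words.py -l en -f get_words_group_words_add_stat < text.txt

    # append saved notes to a word-list file, or strip them out again
    python new-words.py -l en -f add_notes wordlist.txt
    python new-words.py -l en -f remove_notes wordlist.txt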