new-words

annotate new-words.py @ 53:f583256b7ab1

-p key support in new-words.py
author Igor Chubin <igor@chub.in>
date Mon Oct 31 20:21:20 2011 +0200 (2011-10-31)
parents 74e05d4436ee
children e25de9ea9184
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@49 6 import difflib
igor@38 7 import logging
igor@38 8 import os
igor@37 9 import optparse
igor@38 10 import re
igor@38 11 import subprocess
igor@38 12 import sys
igor@38 13 import Stemmer
igor@42 14 try:
igor@42 15 import psyco
igor@42 16 psyco.full()
igor@42 17 except:
igor@42 18 pass
igor@38 19
igor@38 20 config = {
igor@38 21 'config_directory': os.environ['HOME'] + '/.new-words',
igor@38 22 'language': 'en',
igor@38 23 }
igor@38 24
igor@38 25 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 26
igor@38 27 class Normalizator:
igor@38 28 def __init__(self, language, linked_words={}):
igor@38 29 stemmer_algorithm = {
igor@38 30 'de' : 'german',
igor@38 31 'en' : 'english',
igor@51 32 'es' : 'spanish',
igor@38 33 'ru' : 'russian',
igor@51 34 'it' : 'italian',
igor@38 35 'uk' : 'ukrainian',
igor@38 36 }
igor@38 37 self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
igor@38 38 self.linked_words = linked_words
igor@38 39
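# normalize() first follows manual word links (see find_linked_words below),
# guarding against cycles, and then returns the stem of the lower-cased
# result; e.g. (illustrative) if "geht" is linked to "gehen",
# normalize("geht") yields the stem of "gehen".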
igor@38 40 def normalize(self, word):
igor@38 41 word_chain = []
igor@38 42 while word in self.linked_words and not word in word_chain:
igor@38 43 word_chain.append(word)
igor@38 44 word = self.linked_words[word]
igor@38 45 return self.stemmer.stemWord(word.lower())
igor@37 46
igor@47 47 def best_word_from_group(self, wordpairs_group):
igor@47 48 """Returns the word that is the most relevant to the wordpairs_group.
igor@47 49
igor@47 50 At the moment: returns the word with minimal length"""
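# Illustrative example (hypothetical data): for the group
# [(5, "geht"), (3, "gehen"), (2, "ging")] the minimal word length is 4,
# the candidates are "geht" and "ging", and the more frequent of them,
# "geht", is returned.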
igor@49 51
igor@49 52 def f(x, y):
igor@49 53 return difflib.SequenceMatcher(
igor@49 54 None,
igor@49 55 #(x[-2:] == 'en' and x[:-2].lower() or x.lower()),
igor@49 56 x.lower(),
igor@49 57 y.lower()).ratio()
igor@47 58
igor@47 59 minimal_length = min(len(pair[1]) for pair in wordpairs_group)
igor@49 60 best_match = list(x[1] for x in sorted(
igor@47 61 (x for x in wordpairs_group if len(x[1]) == minimal_length),
igor@47 62 key=lambda x:x[0],
igor@47 63 reverse=True))[0]
igor@47 64
igor@51 65 return best_match
igor@51 66
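# Note: because of the early return above, the dictionary-based refinement
# below (de-variants lookups and verb-ending heuristics) is currently
# unreachable and is kept for reference.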
igor@49 67 suggestions = self.dictionary_suggestions(best_match)
igor@49 68 if len(suggestions) == 1:
igor@49 69 return best_match
igor@49 70
igor@49 71 verb = False
igor@49 72 corrected_best_match = best_match
igor@49 73 if best_match[-2:] == 'et':
igor@49 74 word = best_match[:-1]+"n"
igor@49 75 sugg = self.dictionary_suggestions(word)
igor@49 76 if len(sugg) == 1:
igor@49 77 return word
igor@49 78 suggestions += sugg
igor@49 79 corrected_best_match = word
igor@49 80 corrected_best_match = best_match[:-2]
igor@49 81 verb = True
igor@49 82
igor@49 83 if best_match[-1] == 't':
igor@49 84 word = best_match[:-1]+"en"
igor@49 85 sugg = self.dictionary_suggestions(word)
igor@49 86 if len(sugg) == 1:
igor@49 87 return word
igor@49 88 suggestions += sugg
igor@49 89 corrected_best_match = best_match[:-1]
igor@49 90 verb = True
igor@49 91
igor@49 92 if corrected_best_match[0].lower() == corrected_best_match[0]:
igor@49 93 suggestions = [ x for x in suggestions
igor@49 94 if x[0].lower() == x[0] ]
igor@49 95
igor@49 96 if suggestions == []:
igor@49 97 return best_match+"_"
igor@49 98 return best_match+" "+(" ".join(
igor@49 99 sorted(
igor@49 100 suggestions,
igor@49 101 key = lambda x: f(x, corrected_best_match),
igor@49 102 reverse = True
igor@49 103 )
igor@49 104 )
igor@49 105 )
igor@49 106
igor@49 107 def dictionary_suggestions(self, word):
igor@49 108 return [
igor@49 109 x.decode('utf-8').rstrip('\n')
igor@49 110 for x
igor@49 111 in subprocess.Popen(
igor@49 112 ["de-variants", word],
igor@49 113 stdout=subprocess.PIPE
igor@49 114 ).stdout.readlines() ]
igor@49 115
igor@49 116
igor@37 117 parser = optparse.OptionParser()
igor@37 118
igor@37 119 parser.add_option(
igor@37 120 "-a", "--no-marks",
igor@37 121 help="don't add marks (and don't save marks added by user)",
igor@37 122 action="store_true",
igor@37 123 dest="no_marks")
igor@37 124
igor@37 125 parser.add_option(
igor@37 126 "-c", "--compressed",
igor@37 127 help="show compressed wordlist: one word per group",
igor@37 128 action="store_true",
igor@37 129 dest="compressed")
igor@37 130
igor@37 131 parser.add_option(
igor@37 132 "-k", "--known-words",
igor@37 133 help="put higher words that are similar to the known words (only for English)",
igor@37 134 action="store_true",
igor@37 135 dest="compressed")
igor@37 136
igor@37 137 parser.add_option(
igor@37 138 "-l", "--language",
igor@37 139 help="specify language of text",
igor@37 140 action="store",
igor@37 141 dest="language")
igor@37 142
igor@37 143 parser.add_option(
igor@38 144 "-f", "--function",
igor@38 145 help="filter through subsystem [INTERNAL]",
igor@38 146 action="store",
igor@38 147 dest="function")
igor@38 148
igor@38 149 parser.add_option(
igor@37 150 "-m", "--merge-tag",
igor@37 151 help="merge words tagged with specified tag into the main vocabulary",
igor@37 152 action="store",
igor@37 153 dest="merge_tag")
igor@37 154
igor@37 155 parser.add_option(
igor@37 156 "-M", "--merge-tagged",
igor@37 157 help="merge words tagged with ANY tag into the main vocabulary",
igor@37 158 action="store_true",
igor@37 159 dest="merge_tagged")
igor@37 160
igor@37 161 parser.add_option(
igor@37 162 "-n", "--non-interactive",
igor@37 163 help="non-interactive mode (don't run vi)",
igor@37 164 action="store_true",
igor@37 165 dest="non_interactive")
igor@37 166
igor@37 167 parser.add_option(
igor@37 168 "-N", "--no-filter",
igor@37 169 help="switch off known words filtering",
igor@37 170 action="store_true",
igor@37 171 dest="no_filter")
igor@37 172
igor@37 173 parser.add_option(
igor@37 174 "-p", "--pages",
igor@37 175 help="work with specified pages only (pages = start-stop/total )",
igor@37 176 action="store",
igor@37 177 dest="pages")
igor@37 178
igor@37 179 parser.add_option(
igor@48 180 "-d", "--delete-tag",
igor@48 181 help="delete subvocabulary of specified tag",
igor@37 182 action="store",
igor@48 183 dest="delete_tag")
igor@37 184
igor@37 185 parser.add_option(
igor@37 186 "-s", "--text-stats",
igor@37 187 help="show the text statistics (percentage of known words and so on) and exit",
igor@37 188 action="store_true",
igor@37 189 dest="text_stats")
igor@37 190
igor@37 191 parser.add_option(
igor@37 192 "-S", "--voc-stats",
igor@37 193 help="show your vocabulary statistics (number of words and word groups)",
igor@37 194 action="store_true",
igor@37 195 dest="voc_stats")
igor@37 196
igor@37 197 parser.add_option(
igor@37 198 "-t", "--tag",
igor@37 199 help="tag known words with tag",
igor@37 200 action="store",
igor@37 201 dest="tag")
igor@37 202
igor@37 203 parser.add_option(
igor@37 204 "-T", "--show-tags",
igor@37 205 help="tag known words with tag",
igor@37 206 action="store_true",
igor@37 207 dest="show_tags")
igor@37 208
igor@37 209 parser.add_option(
igor@37 210 "-2", "--two-words",
igor@37 211 help="find 2 words' sequences",
igor@37 212 action="store_true",
igor@37 213 dest="two_words")
igor@37 214
igor@37 215 parser.add_option(
igor@37 216 "-3", "--three-words",
igor@37 217 help="find 3 words' sequences",
igor@37 218 action="store_true",
igor@37 219 dest="three_words")
igor@37 220
igor@38 221 def readlines_from_file(filename):
igor@38 222 res = []
igor@38 223 with codecs.open(filename, "r", "utf-8") as f:
igor@38 224 for line in f.readlines():
igor@38 225 res += [line]
igor@38 226 return res
igor@38 227
igor@38 228 def readlines_from_stdin():
igor@38 229 return codecs.getreader("utf-8")(sys.stdin).readlines()
igor@38 230
igor@38 231 def words_from_line(line):
igor@38 232 line = line.rstrip('\n')
igor@38 233 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
igor@38 234 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
igor@44 235 return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
igor@38 236
igor@44 237 def get_words(lines, group_by=[1]):
igor@38 238 """
igor@38 239 Returns hash of words in a file
igor@38 240 word => number
igor@38 241 """
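# Illustrative example: get_words(["a rose is a rose"]) returns
# {"a": 2, "rose": 2, "is": 1}; when group_by also contains 2 and/or 3,
# underscore-joined pairs and triples such as "a_rose" and "a_rose_is"
# are counted as well.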
igor@38 242 result = {}
igor@44 243 (a, b, c) = ("", "", "")
igor@38 244 for line in lines:
igor@38 245 words = words_from_line(line)
igor@38 246 for word in words:
igor@41 247 if re.match('[0-9]*$', word):
igor@41 248 continue
igor@38 249 result.setdefault(word, 0)
igor@38 250 result[word] += 1
igor@44 251 if 2 in group_by and a != "" and b != "":
igor@44 252 w = "%s_%s" % (a,b)
igor@44 253 result.setdefault(w, 0)
igor@44 254 result[w] += 1
igor@44 255 if 3 in group_by and not "" in [a,b,c]:
igor@44 256 w = "%s_%s_%s" % (a,b,c)
igor@44 257 result.setdefault(w, 0)
igor@44 258 result[w] += 1
igor@44 259 (a,b,c) = (b, c, word)
igor@44 260
igor@44 261 logging.debug(result)
igor@38 262 return result
igor@38 263
igor@38 264 def load_vocabulary():
igor@38 265 return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
igor@38 266
igor@38 267 def notes_filenames():
igor@38 268 return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
igor@38 269
igor@38 270 def load_notes(files):
igor@38 271 notes = {}
igor@38 272 for filename in files:
igor@39 273 with codecs.open(filename, "r", "utf-8") as f:
igor@38 274 for line in f.readlines():
igor@38 275 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@38 276 notes.setdefault(word, {})
igor@38 277 notes[word][filename] = note
igor@38 278 return notes
igor@38 279
igor@39 280 def add_notes(lines, notes):
igor@39 281 notes_filename = notes_filenames()[0]
igor@39 282 result = []
igor@39 283 for line in lines:
igor@39 284 if line.startswith('#'):
igor@39 285 result += [line]
igor@39 286 else:
igor@39 287 match_object = re.search('^\s*\S+\s*(\S+)', line)
igor@39 288 if match_object:
igor@39 289 word = match_object.group(1)
igor@39 290 if word in notes:
igor@39 291 if notes_filename in notes[word]:
igor@39 292 line = line.rstrip('\n')
igor@39 293 line = "%-30s %s\n" % (line, notes[word][notes_filename])
igor@39 294 result += [line]
igor@39 295 else:
igor@39 296 result += [line]
igor@39 297 else:
igor@39 298 result += [line]
igor@39 299 return result
igor@39 300
igor@39 301 def remove_notes(lines, notes_group):
igor@39 302 notes_filename = notes_filenames()[0]
igor@39 303 notes = {}
igor@39 304 for k in notes_group.keys():
igor@39 305 if notes_filename in notes_group[k]:
igor@39 306 notes[k] = notes_group[k][notes_filename]
igor@39 307
igor@39 308 result = []
igor@39 309 for line in lines:
igor@39 310 line = line.rstrip('\n')
igor@39 311 match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
igor@39 312 if match_object:
igor@39 313 result.append("".join([
igor@39 314 match_object.group(1),
igor@39 315 match_object.group(2),
igor@39 316 match_object.group(3),
igor@39 317 match_object.group(4),
igor@39 318 "\n"
igor@39 319 ]))
igor@39 320 notes[match_object.group(4)] = match_object.group(6)
igor@39 321 else:
igor@39 322 result.append(line+"\n")
igor@39 323
igor@39 324 save_notes(notes_filename, notes)
igor@39 325 return result
igor@39 326
igor@39 327 def save_notes(filename, notes):
igor@39 328 lines = []
igor@39 329 saved_words = []
igor@39 330 with codecs.open(filename, "r", "utf-8") as f:
igor@39 331 for line in f.readlines():
igor@39 332 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@39 333 if word in notes:
igor@39 334 line = "%-29s %s\n" % (word, notes[word])
igor@39 335 saved_words.append(word)
igor@39 336 lines.append(line)
igor@39 337 for word in [x for x in notes.keys() if not x in saved_words]:
igor@39 338 line = "%-29s %s\n" % (word, notes[word])
igor@39 339 lines.append(line)
igor@39 340
igor@39 341 with codecs.open(filename, "w", "utf-8") as f:
igor@39 342 for line in lines:
igor@39 343 f.write(line)
igor@39 344
igor@39 345
igor@38 346 def substract_dictionary(dict1, dict2):
igor@38 347 """
igor@38 348 returns dict1 - dict2
igor@38 349 """
igor@38 350 result = {}
igor@38 351 for (k,v) in dict1.items():
igor@38 352 if not k in dict2:
igor@38 353 result[k] = v
igor@38 354 return result
igor@38 355
igor@38 356 def dump_words(words, filename):
igor@38 357 with codecs.open(filename, "w+", "utf-8") as f:
igor@38 358 for word in words.keys():
igor@38 359 f.write(("%s\n"%word)*words[word])
igor@38 360
igor@38 361 def error_message(text):
igor@38 362 print text
igor@38 363
igor@40 364 def find_wordgroups_weights(word_pairs, normalizator):
igor@38 365 weight = {}
igor@40 366 for (num, word) in word_pairs:
igor@38 367 normalized = normalizator.normalize(word)
igor@38 368 weight.setdefault(normalized, 0)
igor@40 369 weight[normalized] += num
igor@38 370 return weight
igor@38 371
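# find_linked_words() scans the notes for "@" references: a note such as
# "@gehen" (illustrative) attached to the word "geht" makes "geht" a linked
# word whose main form is "gehen".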
igor@38 372 def find_linked_words(notes):
igor@38 373 linked_words = {}
igor@38 374 for word in notes.keys():
igor@38 375 for note in notes[word].values():
igor@38 376 if "@" in note:
igor@38 377 result = re.search(r'\@(\S*)', note)
igor@38 378 if result:
igor@38 379 main_word = result.group(1)
igor@38 380 if main_word:
igor@38 381 linked_words[word] = main_word
igor@38 382 return linked_words
igor@38 383
igor@40 384 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
igor@40 385 (num1, word1) = pair1
igor@40 386 (num2, word2) = pair2
igor@38 387
igor@38 388 normalized_word1 = normalizator.normalize(word1)
igor@38 389 normalized_word2 = normalizator.normalize(word2)
igor@38 390
igor@38 391 cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
igor@38 392 if cmp_res != 0:
igor@38 393 return cmp_res
igor@38 394 else:
igor@38 395 cmp_res = cmp(normalized_word1, normalized_word2)
igor@38 396 if cmp_res != 0:
igor@38 397 return cmp_res
igor@38 398 else:
igor@38 399 return cmp(int(num1), int(num2))
igor@38 400
igor@47 401
igor@48 402 def print_words_sorted(
igor@48 403 word_pairs,
igor@48 404 stats,
igor@48 405 normalizator,
igor@48 406 print_stats=True,
igor@48 407 stats_only=False,
igor@48 408 compressed_wordlist=False,
igor@48 409 show_range=0,
igor@48 410 show_range_percentage=0,
igor@48 411 ):
igor@40 412 if stats_only:
igor@43 413 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 414 " ".join([
igor@43 415 "%-10s" % x for x in [
igor@43 416 "LANG",
igor@43 417 "KNOWN%",
igor@43 418 "UNKNOWN%",
igor@43 419 "KNOWN",
igor@43 420 "TOTAL",
igor@43 421 "WPS",
igor@43 422 "UWPS*10"
igor@43 423 ]]) + "\n")
igor@43 424 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 425 " ".join([
igor@43 426 "%(language)-10s",
igor@43 427 "%(percentage)-10.2f",
igor@43 428 "%(percentage_unknown)-10.2f",
igor@43 429 "%(total_known)-11d"
igor@43 430 "%(total)-11d"
igor@43 431 "%(wps)-11d"
igor@43 432 "%(uwps)-11d"
igor@43 433 ]) % stats + "\n")
igor@40 434 return
igor@38 435
igor@40 436 if print_stats:
igor@40 437 codecs.getwriter("utf-8")(sys.stdout).write(
igor@43 438 "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
igor@38 439
igor@40 440 level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
igor@40 441 known = int(stats['total_known'])
igor@40 442 total = int(stats['total'])
igor@40 443 current_level = 0
igor@47 444 old_normalized_word = None
igor@47 445 words_of_this_group = []
igor@48 446 printed_words = 0
igor@40 447 for word_pair in word_pairs:
igor@47 448
igor@47 449 normalized_word = normalizator.normalize(word_pair[1])
igor@47 450 if old_normalized_word and old_normalized_word != normalized_word:
igor@47 451 #codecs.getwriter("utf-8")(sys.stdout).write(
igor@47 452 # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
igor@47 453 if compressed_wordlist:
igor@49 454 compressed_word_pair = (
igor@49 455 sum(x[0] for x in words_of_this_group),
igor@49 456 normalizator.best_word_from_group(words_of_this_group)
igor@49 457 )
igor@47 458 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
igor@48 459 printed_words += 1
igor@47 460 words_of_this_group = []
igor@47 461
igor@47 462 old_normalized_word = normalized_word
igor@47 463 words_of_this_group.append(word_pair)
igor@47 464
igor@47 465 if not compressed_wordlist:
igor@47 466 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
igor@48 467 printed_words += 1
igor@47 468
igor@47 469
igor@40 470 known += word_pair[0]
igor@40 471 if 100.0*known/total >= level_lines[0]:
igor@40 472 current_level = level_lines[0]
igor@40 473 while 100.0*known/total > level_lines[0]:
igor@40 474 current_level = level_lines[0]
igor@40 475 level_lines = level_lines[1:]
igor@40 476 codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
igor@38 477
igor@48 478 if show_range > 0 and printed_words >= show_range:
igor@48 479 break
igor@48 480 if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
igor@48 481 break
igor@48 482
igor@39 483 def filter_add_notes(args):
igor@39 484 lines = readlines_from_file(args[0])
igor@39 485 notes = load_notes(notes_filenames())
igor@39 486 lines = add_notes(lines, notes)
igor@39 487 with codecs.open(args[0], "w", "utf-8") as f:
igor@39 488 for line in lines:
igor@39 489 f.write(line)
igor@39 490
igor@39 491 def filter_remove_notes(args):
igor@39 492 lines = readlines_from_file(args[0])
igor@39 493 notes = load_notes(notes_filenames())
igor@39 494 lines = remove_notes(lines, notes)
igor@39 495 with codecs.open(args[0], "w", "utf-8") as f:
igor@39 496 for line in lines:
igor@39 497 f.write(line)
igor@39 498
igor@53 499 def parse_parts_description(parts_description):
igor@53 500 """
igor@53 501 Returns triad (start, stop, step)
igor@53 502 basing on parts_description string.
igor@53 503 from-to/step
igor@53 504 from+delta/step
igor@53 505 """
igor@53 506 def incorrect_parts_description(pd):
igor@53 507 raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
igor@53 508
igor@53 509 try:
igor@53 510 (a, step) = parts_description.split("/", 1)
igor@53 511 step = int(step)
igor@53 512 start = 0
igor@53 513 stop = 0
igor@53 514 if '-' in a:
igor@53 515 (start, stop) = a.split("-", 1)
igor@53 516 start = int(start)
igor@53 517 stop = int(stop)
igor@53 518 elif '+' in a:
igor@53 519 (start, delta) = a.split("+", 1)
igor@53 520 start = int(start)
igor@53 521 stop = start + int(delta)
igor@53 522 else:
igor@53 523 start = int(a)
igor@53 524 stop = start + 1
igor@53 525 return (start, stop, step)
igor@53 526
igor@53 527 except:
igor@53 528 incorrect_parts_description(parts_description)
igor@53 529
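# Illustrative examples: parse_parts_description("5-7/20") -> (5, 7, 20),
# parse_parts_description("5+2/20") -> (5, 7, 20) and
# parse_parts_description("5/20") -> (5, 6, 20).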
igor@53 530
igor@53 531 def take_part(lines, part_description = None):
igor@53 532 if not part_description:
igor@53 533 return lines
igor@53 534 (start, stop, step) = parse_parts_description(part_description)
igor@53 535 n = len(lines)
igor@53 536 part_size = (1.0*n) / step
igor@53 537 result = []
igor@53 538 for i in range(n):
igor@53 539 if i / part_size >= start and i / part_size <= stop:
igor@53 540 result.append(lines[i])
igor@53 541 return result
igor@53 542
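# filter_get_words_group_words_add_stat() is configured through environment
# variables (typically exported by the calling wrapper script):
# GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE, STAT_ONLY, COMPRESSED_WORDLIST,
# FILTER_WORDS and WORDS_GROUPING are compared against "YES", while
# SHOW_RANGE and SHOW_RANGE_PERCENTAGE are numeric and ALLOWED_WORDS_FILENAME
# points to a file with allowed words.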
igor@40 543 def filter_get_words_group_words_add_stat(args):
igor@40 544 vocabulary = load_vocabulary()
igor@40 545 notes = load_notes(notes_filenames())
igor@53 546 lines = take_part(readlines_from_stdin(), config.get('pages', ''))
igor@44 547 group_by = [1]
igor@48 548
igor@44 549 if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
igor@44 550 group_by.append(2)
igor@44 551 if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
igor@44 552 group_by.append(3)
igor@44 553 words = get_words(lines, group_by)
igor@43 554 stats_only = False
igor@43 555 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
igor@43 556 stats_only = True
igor@40 557
igor@47 558 compressed_wordlist = False
igor@47 559 if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
igor@47 560 compressed_wordlist = True
igor@47 561
igor@48 562 show_range = os.environ.get('SHOW_RANGE', '')
igor@48 563 if show_range != '':
igor@48 564 show_range = int(show_range)
igor@48 565 else:
igor@48 566 show_range = 0
igor@48 567 show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
igor@48 568 if show_range_percentage != '':
igor@48 569 show_range_percentage = int(show_range_percentage)
igor@48 570 else:
igor@48 571 show_range_percentage = 0
igor@48 572
igor@44 573
igor@40 574 stats = {}
igor@40 575 stats['total'] = sum(words[x] for x in words.keys())
igor@45 576 if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
igor@45 577 words = substract_dictionary(words, vocabulary)
igor@40 578
igor@40 579 stats['total_unknown'] = sum(words[x] for x in words.keys())
igor@40 580 stats['total_known'] = stats['total'] - stats['total_unknown']
igor@43 581 stats['percentage'] = 100.0*stats['total_known']/stats['total']
igor@43 582 stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
igor@40 583 stats['groups'] = 0
igor@40 584 stats['words'] = len(words)
igor@43 585 stats['sentences'] = 0 #FIXME
igor@43 586 stats['wps'] = 0 #FIXME
igor@43 587 stats['uwps'] = 0 #FIXME
igor@40 588 stats['language'] = config['language']
igor@40 589
igor@40 590 linked_words = find_linked_words(notes)
igor@40 591 normalizator = Normalizator(config['language'], linked_words)
igor@40 592
igor@50 593 # filter words by allowed_words_filter
igor@50 594 if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
igor@50 595 allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
igor@50 596 normalized_allowed_words = [
igor@50 597 normalizator.normalize(w.rstrip('\n'))
igor@50 598 for w in readlines_from_file(allowed_words_filename)
igor@50 599 ]
igor@50 600
igor@50 601 result = {}
igor@50 602 for w, wn in words.iteritems():
igor@50 603 if normalizator.normalize(w) in normalized_allowed_words:
igor@50 604 result[w] = wn
igor@50 605 words = result
igor@50 606
igor@44 607 words_with_freq = []
igor@40 608 for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
igor@44 609 words_with_freq.append((words[k], k))
igor@40 610
igor@44 611 wgw = find_wordgroups_weights(words_with_freq, normalizator)
igor@45 612 if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
igor@45 613 words_with_freq = sorted(
igor@44 614 words_with_freq,
igor@40 615 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
igor@40 616 reverse=True)
igor@40 617
igor@47 618 print_words_sorted(
igor@47 619 words_with_freq,
igor@47 620 stats,
igor@47 621 normalizator,
igor@47 622 stats_only=stats_only,
igor@48 623 compressed_wordlist=compressed_wordlist,
igor@48 624 show_range=show_range,
igor@48 625 show_range_percentage=show_range_percentage,
igor@47 626 )
igor@40 627
igor@37 628 (options, args) = parser.parse_args()
igor@38 629 if options.language:
igor@38 630 config['language'] = options.language
# make the -p/--pages value available to take_part() via config
if options.pages:
config['pages'] = options.pages
igor@37 631
igor@38 632 if options.function:
igor@38 633 function_names = {
igor@39 634 'add_notes' : filter_add_notes,
igor@39 635 'remove_notes': filter_remove_notes,
igor@40 636 'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
igor@38 637 }
igor@38 638 if options.function in function_names:
igor@38 639 function_names[options.function](args)
igor@38 640 else:
igor@38 641 error_message("Unkown function %s.\nAvailable functions:\n%s" % (
igor@38 642 options.function, "".join([" "+x for x in sorted(function_names.keys())])))
igor@38 643 sys.exit(1)
igor@37 644
igor@37 645
igor@37 646
igor@37 647
igor@38 648 #os.system("vim")
igor@37 649