new-words: new-words.py annotate

new-words

annotate new-words.py @ 54:e25de9ea9184

new-words.py is almost ready

author	Igor Chubin <igor@chub.in>
date	Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents	f583256b7ab1
children	2a1a25e61872

rev	line source
igor@37	1 #!/usr/bin/env python
igor@38	2 # -- coding: utf-8 --
igor@37	3
igor@40	4 from __future__ import with_statement
igor@38	5 import codecs
igor@49	6 import difflib
igor@38	7 import logging
igor@38	8 import os
igor@37	9 import optparse
igor@38	10 import re
igor@38	11 import subprocess
igor@38	12 import sys
igor@38	13 import Stemmer
igor@54	14 import tempfile
# psyco is an optional accelerator: use it when it is installed and run
# unaccelerated otherwise.  Only a missing module is tolerated here; any
# other failure is a real error and must not be hidden.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Settings shared by the functions of this module.
# expanduser('~') yields $HOME when it is set and still works when it is not.
config = {
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38	27
class Normalizator(object):
    """Maps words to the stem of their word group.

    A word is first redirected through ``linked_words`` (word -> main word)
    and the final word is lower-cased and stemmed with the Snowball stemmer
    of the configured language.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- two-letter code; must be a key of the table below,
                        otherwise KeyError is raised.
        linked_words -- optional dict mapping a word to its main word.
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'es': 'spanish',
            'ru': 'russian',
            'it': 'italian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a shared mutable default would leak
        # links from one Normalizator into another.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of `word` after following its links.

        Links are followed until a word without a link is reached; a word
        that was already visited stops the walk, so cyclic links terminate.
        """
        word_chain = []
        while word in self.linked_words and word not in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents `wordpairs_group`.

        `wordpairs_group` is a non-empty sequence of (count, word) pairs.
        The shortest word wins; among equally short words the one with the
        highest count is chosen, and the earliest pair wins a remaining tie.
        Raises ValueError for an empty group.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() returns the first maximal element, which matches a stable
        # descending sort followed by taking element 0.
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external `de-variants` tool.

        The tool is run with `word` as its only argument; its standard
        output is decoded as UTF-8 and returned as a list of lines without
        the trailing newline.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        # communicate() reads everything, closes the pipe and reaps the child.
        output = process.communicate()[0]
        lines = output.decode('utf-8').split(u'\n')
        if lines and lines[-1] == u'':
            # Text ending in a newline has no further line after it.
            lines.pop()
        return lines
igor@49	116
igor@49	117
parser = optparse.OptionParser()

# One row per command-line option, in the order they appear in --help:
# (short flag, long flag, optparse action, destination attribute, help text).
_OPTION_TABLE = [
    ("-a", "--no-marks", "store_true", "no_marks",
     "don't add marks (and don't save marks added by user)"),
    ("-c", "--compressed", "store_true", "compressed",
     "show compressed wordlist: one word per group"),
    # This option used to store into "compressed", so -k silently switched
    # on the compressed word list; it now has its own destination.
    ("-k", "--known-words", "store_true", "known_words",
     "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "store", "language",
     "specify language of text"),
    ("-f", "--allowed-words", "store", "allowed_words",
     "file with list of allowed words (words that will be shown in the output)"),
    ("-X", "--function", "store", "function",
     "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "store", "merge_tag",
     "merge words tagged with specified tag into the main vocabulary"),
    ("-M", "--merge-tagged", "store_true", "merge_tagged",
     "merge words tagged with ANY tag into the main vocabulary"),
    ("-n", "--non-interactive", "store_true", "non_interactive",
     "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "store_true", "no_filter",
     "switch off known words filtering"),
    ("-p", "--pages", "store", "pages",
     "work with specified pages only (pages = start-stop/total )"),
    ("-d", "--delete-tag", "store", "delete_tag",
     "delete subvocabulary of specified tag"),
    ("-R", "--show-range-percentage", "store", "show_range_percentage",
     "show only words that cover specified percentage of the text, skip the rest"),
    ("-s", "--text-stats", "store_true", "text_stats",
     "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "store_true", "voc_stats",
     "show your vocabulary statistics (number of words and word groups)"),
    ("-t", "--tag", "store", "tag",
     "tag known words with tag"),
    # The help text used to be a copy of the -t text.
    ("-T", "--show-tags", "store_true", "show_tags",
     "show tags"),
    ("-2", "--two-words", "store_true", "two_words",
     "find 2 words' sequences"),
    ("-3", "--three-words", "store_true", "three_words",
     "find 3 words' sequences"),
]

for (_short, _long, _action, _dest, _help) in _OPTION_TABLE:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)
igor@37	233
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file `filename` as a list.

    Line endings are kept on the returned lines.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())
igor@38	240
def readlines_from_url(url):
    """Return the text of the page at `url` as a list of unicode lines.

    The page is rendered with ``lynx -dump``; standard error is merged into
    the captured output.  The first http:// URL on every line is removed.
    Lines are returned without their trailing newline.
    """
    # The URL is passed as a separate argv element and no shell is involved,
    # so quotes or shell metacharacters in it cannot run other commands.
    dump = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
        ).communicate()[0]
    url_pattern = re.compile(r"http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*")
    return [
        # count=1: only the first URL of a line is dropped.
        url_pattern.sub(u'', line, 1)
        for line in dump.decode('utf-8').split(u'\n')
        ]
igor@54	250
def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    stdin_reader = utf8_reader(sys.stdin)
    return stdin_reader.readlines()
igor@38	253
def words_from_line(line):
    """Split `line` into a list of words.

    Every run of non-word characters is a separator, but a run never starts
    at an apostrophe, so an apostrophe that directly follows a word
    character stays attached to the word ("don't" is one word).  Leading or
    trailing separators produce empty strings in the result.
    """
    line = line.rstrip('\n')
    return re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE).split(line)


def get_words(lines, group_by=None):
    """Count the words found in `lines`.

    Returns a dict word -> number of occurrences.  Empty strings and tokens
    made only of digits are ignored.  Single words are always counted.
    When `group_by` contains 2 (or 3), every sequence of two (or three)
    consecutive counted words is counted as well, under the key built by
    joining the words with "_".  Sequences continue across line boundaries.
    `group_by` defaults to [1].
    """
    if group_by is None:
        group_by = [1]

    result = {}
    # The two counted words that precede the current one ("" = none yet).
    (before_previous, previous) = ("", "")
    for line in lines:
        for word in words_from_line(line):
            if re.match('[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1
            # The sequences are built so that they END at the current word;
            # building them from older words only would lose the sequences
            # at the end of the text.
            if 2 in group_by and previous != "":
                w = "%s_%s" % (previous, word)
                result[w] = result.get(w, 0) + 1
            if 3 in group_by and before_previous != "" and previous != "":
                w = "%s_%s_%s" % (before_previous, previous, word)
                result[w] = result.get(w, 0) + 1
            (before_previous, previous) = (previous, word)

    logging.debug(result)
    return result
igor@38	286
def voc_filename():
    """Return the path of the vocabulary file of the configured language.

    The file is <config_directory>/<language>.txt.
    """
    return "%s/%s.txt" % (
        config['config_directory'],
        config['language'],
    )
igor@54	289
def load_vocabulary():
    """Return the word counts (see get_words) of the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
igor@38	292
def notes_filenames():
    """Return the list of notes files of the configured language.

    The list has one element: <config_directory>/notes-<language>.txt.
    """
    notes_path = "%s/notes-%s.txt" % (
        config['config_directory'],
        config['language'],
    )
    return [notes_path]
igor@38	295
def load_notes(files):
    """Read the notes stored in `files`.

    Every line of a notes file has the form "<word><whitespace><note>".
    Returns a dict word -> {filename: note}.  Lines that do not contain both
    parts (blank lines, a word without a note) are skipped.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) != 2:
                    # Nothing to record; unpacking would raise ValueError.
                    continue
                (word, note) = fields
                notes.setdefault(word, {})[filename] = note
    return notes
igor@38	305
def add_notes(lines, notes):
    """Append the stored note to every word line that has one.

    lines -- word list lines ("<count> <word>", possibly indented) and
             comment lines starting with "#".
    notes -- dict word -> {filename: note} as returned by load_notes.

    Returns a new list.  A word line whose word has a note in the first
    notes file is padded to 30 columns and followed by the note; all other
    lines are returned unchanged.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            # Word lines are right-aligned, so any amount of leading
            # whitespace has to be accepted in front of the count.
            match_object = re.search(r'^\s*\S+\s+(\S+)', line)
            if match_object:
                notes_of_word = notes.get(match_object.group(1), {})
                if notes_filename in notes_of_word:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes_of_word[notes_filename])
        result.append(line)
    return result
igor@39	326
def remove_notes(lines, notes_group):
    """Strip the notes from `lines` and store them in the first notes file.

    lines       -- word list lines, possibly followed by a note.
    notes_group -- dict word -> {filename: note} with the known notes.

    A line of the form "<space><count><space><word><space><rest>" is cut
    after the word and <rest> becomes the note of the word; other lines are
    kept.  Every returned line ends with a newline.  The notes already known
    for the first notes file, updated with the notes found in `lines`, are
    written with save_notes.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    stripped = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', text)
        if found is None:
            stripped.append(text + "\n")
            continue
        (indent, count, gap, word, _, note) = found.groups()
        stripped.append(indent + count + gap + word + "\n")
        notes[word] = note

    save_notes(notes_filename, notes)
    return stripped
igor@39	352
def save_notes(filename, notes):
    """Write `notes` (dict word -> note) into the notes file `filename`.

    Lines of the existing file keep their order.  The line of a word that
    is in `notes` is replaced by "<word padded to 29 columns> <note>"; all
    other lines, including blank ones, are kept as they are.  Words of
    `notes` that the file does not mention are appended.  A missing file is
    treated as empty and created.
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                # Only the first field matters here, so lines without a
                # note cannot break the parsing.
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                lines.append(line)

    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.write("".join(lines))
igor@39	370
igor@39	371
def substract_dictionary(dict1, dict2):
    """
    Return a new dict with the items of dict1 whose key is not in dict2.
    """
    return dict(
        (key, dict1[key])
        for key in dict1
        if key not in dict2)
igor@38	381
def dump_words(words, filename):
    """Write `words` (dict word -> count) into the UTF-8 file `filename`.

    Every word is written on its own line, repeated `count` times.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            one_line = "%s\n" % word
            out.write(one_line * count)
igor@38	386
def error_message(text):
    """Report `text`, followed by a newline, on standard error.

    Standard output carries the word list when the script is used as a
    filter, so diagnostics must not be mixed into it.
    """
    sys.stderr.write("%s\n" % text)
igor@38	389
def find_wordgroups_weights(word_pairs, normalizator):
    """Return the total count of every word group.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) names the group of a word.

    The result maps the normalized word to the sum of the counts of all
    words that normalize to it.
    """
    totals = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        totals[group] = totals.get(group, 0) + count
    return totals
igor@38	397
def find_linked_words(notes):
    """Collect the word links written in the notes.

    `notes` maps a word to {filename: note}.  A note containing "@<main>"
    links the word to <main>, the run of non-space characters after the
    first "@".  Returns a dict word -> main word; a bare "@" is ignored.
    """
    links = {}
    for (word, notes_of_word) in notes.items():
        for note in notes_of_word.values():
            found = re.search(r'\@(\S*)', note)
            target = found.group(1) if found else ""
            if target:
                links[word] = target
    return links
igor@38	409
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs for sorting.

    The pairs are ordered by the weight of their word group (`wgw` maps a
    normalized word to its weight), then by the normalized word, then by
    their own count.  Returns -1, 0 or 1.  `linked_words` is accepted for
    the callers but is not used.
    """
    def three_way(left, right):
        # Same result as the cmp() builtin, which Python 3 no longer has.
        return (left > right) - (left < right)

    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # `or` moves on to the next criterion only while the result is 0 (tie).
    return (
        three_way(wgw[normalized_word1], wgw[normalized_word2])
        or three_way(normalized_word1, normalized_word2)
        or three_way(int(num1), int(num2)))
igor@38	426
igor@47	427
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list; nothing is printed, the text is returned.

    word_pairs            -- (count, word) pairs in output order.
    stats                 -- dict with the text statistics ('language',
                             'percentage', 'total_known', 'total', ...).
    normalizator          -- provides normalize() and best_word_from_group().
    print_stats           -- start the list with a "# ..." statistics line.
    stats_only            -- return only a two-line statistics table.
    compressed_wordlist   -- emit one line per word group (summed count and
                             best word) instead of one line per word.
    show_range            -- stop after this many emitted lines (0 = off).
    show_range_percentage -- stop once this percentage of the text is
                             covered (0 = off).

    Returns a single string when `stats_only` is set and a list of lines
    otherwise.  "# <level>" lines mark the coverage reached so far.
    """
    result = []
    if stats_only:
        result.append(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # The counters are one string: 11 columns each, which is the
                # width of a 10 column header plus the joining space.
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    def compressed_line(group):
        # One output line for a whole word group.
        return "%10s %s\n" % (
            sum(x[0] for x in group),
            normalizator.best_word_from_group(group))

    # Coverage marks: steps of 5 above the known percentage up to 90,
    # then every percent from 90 to 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        # `is not None`: an empty stem is a valid group name as well.
        if old_normalized_word is not None and old_normalized_word != normalized_word:
            if compressed_wordlist:
                result.append(compressed_line(words_of_this_group))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        coverage = 100.0*known/total
        if level_lines and coverage >= level_lines[0]:
            current_level = level_lines[0]
            while level_lines and coverage > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and coverage >= show_range_percentage:
            break

    # The group that is still open when the loop ends has not been written
    # yet.  It is left out only when the line limit is already reached.
    if (compressed_wordlist and words_of_this_group
            and not (show_range > 0 and printed_words >= show_range)):
        result.append(compressed_line(words_of_this_group))

    return result
igor@39	510
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.

    Accepted forms:
        from-to/step        ->  (from, to, step)
        from+delta/step     ->  (from, from + delta, step)
        from/step           ->  (from, from + 1, step)

    Raises ValueError when the string does not have one of these forms.
    """
    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            (start, delta) = a.split("+", 1)
            start = int(start)
            # The number after "+" is a length, not an end position.
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the description is not a string.
        # ValueError: a part is missing or is not a number.
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
igor@53	539
igor@53	540
def take_part(lines, part_description = None):
    """Return the part of `lines` selected by `part_description`.

    The description (see parse_parts_description) divides the lines into
    `step` equal parts and selects the parts from `start` up to, but not
    including, `stop`.  Without a description (None or an empty string)
    all lines are returned.
    """
    # An empty string means "no selection" too: that is what the caller
    # passes when the user gave no pages option.
    if not part_description:
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    first = start * part_size
    last = stop * part_size
    # Half-open interval, so that adjacent parts do not share a line.
    return [line for (i, line) in enumerate(lines) if first <= i < last]
igor@53	552
def filter_get_words_group_words_add_stat(args):
    """Build the list of unknown words of a text and let the user review it.

    The text is read from the URL or file named by args[0], or from
    standard input when `args` is empty.  The words of the selected pages
    are counted, the words of the vocabulary are removed (unless 'no_filter'
    is configured) and the remaining words are formatted by
    print_words_sorted.

    With 'non_interactive' or 'text_stats' configured the result is written
    to standard output.  Otherwise the list is opened in vim together with
    the stored notes; afterwards the notes are saved again and every word
    whose line is no longer in the list is appended to the vocabulary file.

    Exits with status 1 when the input is empty.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    if len(args) > 0:
        if 'http://' in args[0]:
            input_lines = readlines_from_url(args[0])
        else:
            input_lines = readlines_from_file(args[0])
    else:
        input_lines = readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    # mkstemp returns an open descriptor; the file is reopened by name
    # below, so the descriptor is closed right away instead of leaking.
    (fd, original_text_tempfile) = tempfile.mkstemp(prefix='new-word')
    os.close(fd)
    with codecs.open(original_text_tempfile, "w", "utf-8") as f:
        f.write("".join(lines))

    group_by = [1]
    if 'two_words' in config:
        group_by.append(2)
    if 'three_words' in config:
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = 'text_stats' in config
    compressed_wordlist = 'compressed' in config

    show_range = os.environ.get('SHOW_RANGE', '')
    if show_range != '':
        show_range = int(show_range)
    else:
        show_range = 0

    if 'show_range_percentage' in config:
        show_range_percentage = int(config['show_range_percentage'])
    else:
        show_range_percentage = 0

    stats = {}
    stats['total'] = sum(words.values())
    if 'no_filter' not in config:
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    else:
        # A text without any countable word has no meaningful ratio.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0  #FIXME
    stats['uwps'] = 0  #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # filter words by allowed_words_filter
    if 'allowed_words' in config:
        allowed_words_filename = config['allowed_words']
        # A set: one lookup per word of the text.
        normalized_allowed_words = set(
            normalizator.normalize(w.rstrip('\n'))
            for w in readlines_from_file(allowed_words_filename))

        result = {}
        for w, wn in words.items():
            if normalizator.normalize(w) in normalized_allowed_words:
                result[w] = wn
        words = result

    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
        words_with_freq = sorted(
            words_with_freq,
            cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
            reverse=True)

    output = print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )

    if ('non_interactive' in config or 'text_stats' in config):
        codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
    else:
        (fd1, temp1) = tempfile.mkstemp(prefix='new-word')
        os.close(fd1)
        (fd2, temp2) = tempfile.mkstemp(prefix='new-word')
        os.close(fd2)

        with codecs.open(temp1, "w", "utf-8") as f:
            f.write("".join(output))
        with codecs.open(temp2, "w", "utf-8") as f:
            f.write("".join(add_notes(output, notes)))

        # Exported for the editor session.
        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=temp2))

        lines = remove_notes(readlines_from_file(temp2), notes)

        # compare lines_before and lines_after and return deleted words
        lines_before = output
        lines_after = set(lines)
        deleted_words = []

        for line in lines_before:
            if line not in lines_after:
                line = line.strip()
                if ' ' in line:
                    # The second field of the line is the word.
                    word = re.split('\s+', line, 1)[1]
                    if ' ' in word:
                        word = re.split('\s+', word, 1)[0]
                    deleted_words.append(word)

        with codecs.open(voc_filename(), "a", "utf-8") as f:
            f.write("\n".join(deleted_words + ['']))

        os.unlink(temp1)
        os.unlink(temp2)

    os.unlink(original_text_tempfile)
igor@54	701
# Command line handling: copy the given options into `config`, then run the
# function selected with -X.  Nothing is run when -X is not given.
(options, args) = parser.parse_args()

if options.language:
    config['language'] = options.language

# 'pages' is always present in config; an empty string stands for "not given".
config['pages'] = options.pages or ""

# Options with a value are copied only when they were given.
for _name in ('allowed_words', 'show_range_percentage'):
    _value = getattr(options, _name)
    if _value:
        config[_name] = _value

# Flags are recorded by their presence in config.
for _name in (
        'non_interactive',
        'text_stats',
        'compressed',
        'no_filter',
        'two_words',
        'three_words',
        ):
    if getattr(options, _name):
        config[_name] = True

if options.function:
    function_names = {
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    _handler = function_names.get(options.function)
    if _handler is None:
        error_message("Unkown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
    _handler(args)
igor@37	745
igor@37	746
igor@37	747
igor@37	748
igor@38	749 #os.system("vim")
igor@37	750