new-words: new-words.py annotate

new-words

annotate new-words.py @ 68:846240941452

added -C key: compress to lines; fixed bug with #90-line

author	Igor Chubin <igor@chub.in>
date	Sun Sep 23 16:07:29 2012 +0300 (2012-09-23)
parents	5a003076eb11
children

rev	line source
igor@37	1 #!/usr/bin/env python
igor@38	2 # -- coding: utf-8 --
igor@37	3
igor@40	4 from __future__ import with_statement
igor@38	5 import codecs
igor@49	6 import difflib
igor@38	7 import logging
igor@38	8 import os
igor@37	9 import optparse
igor@38	10 import re
igor@38	11 import subprocess
igor@38	12 import sys
igor@38	13 import Stemmer
igor@54	14 import tempfile
# psyco is an optional accelerator: use it when present, carry on silently
# when it is missing or refuses to start.
try:
    import psyco
    psyco.full()
except Exception:
    pass

# Program-wide settings.  The command-line handling at the bottom of the
# file adds more keys to this dictionary.
config = {
    # expanduser() honours $HOME when it is set and falls back to the
    # password database otherwise, so an unset HOME no longer raises KeyError.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

# NOTE(review): the debug log goes to a fixed file name under /tmp.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38	27
class Normalizator:
    """Reduce words to a canonical form so that variants can be grouped.

    A word is first replaced by the word it is linked to (following the
    ``linked_words`` mapping), then lower-cased and, when a stemmer is
    available for the language, stemmed.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- two-letter language code used to pick the stemmer
        linked_words -- optional mapping word -> main word; the mapping is
                        used as given (not copied)
        """
        stemmer_algorithm = {
            'de': 'german',
            'fr': 'french',
            'en': 'english',
            'es': 'spanish',
            'ru': 'russian',
            'it': 'italian',
            'uk': 'ukrainian',
        }
        try:
            self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        except Exception:
            # Unknown language code or no usable stemmer: fall back to
            # plain lower-casing in normalize().
            self.stemmer = None
        # A fresh dict per instance; a shared mutable default would leak
        # links between instances.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the normalized form of *word*."""
        # Follow the chain of links, stopping as soon as a word repeats so
        # that cyclic links cannot loop forever.
        seen = set()
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]
        lowered = word.lower()
        if self.stemmer:
            return self.stemmer.stemWord(lowered)
        return lowered

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents *wordpairs_group* best.

        *wordpairs_group* is a non-empty sequence of (count, word) pairs.
        The shortest word wins; among equally short words the one with the
        highest count wins, and on a further tie the earliest one.

        The code that used to follow the return statement (ranking
        ``de-variants`` suggestions for German verb forms) could never be
        reached and has been dropped; dictionary_suggestions() is kept.

        Raises ValueError when the group is empty.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() returns the first maximal element, which matches a stable
        # descending sort followed by taking element 0.
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` tool.

        Each line is decoded as UTF-8 and stripped of its trailing newline.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        try:
            return [x.decode('utf-8').rstrip('\n') for x in process.stdout]
        finally:
            # Close the pipe and reap the child so that no descriptor or
            # zombie process is left behind.
            process.stdout.close()
            process.wait()
igor@49	124
parser = optparse.OptionParser()

# One entry per command-line option, in the order shown by --help:
# (short flag, long flag, action, dest, help text).
_OPTION_SPECS = [
    ("-a", "--no-marks", "store_true", "no_marks",
     "don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]"),
    ("-c", "--compressed", "store_true", "compressed",
     "show compressed wordlist: one word per group"),
    ("-C", "--compressed-to-line", "store_true", "compressed_to_line",
     "show compressed wordlist: all words of the group in a line"),
    # This option used to store into "compressed", which made -k behave
    # like -c.  It now has a destination of its own.
    ("-k", "--known-words", "store_true", "known_words",
     "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "store", "language",
     "specify language of text"),
    ("-f", "--allowed-words", "store", "allowed_words",
     "file with list of allowed words (words that will be shown in the output)"),
    ("-G", "--words-grouping", "store_true", "no_words_grouping",
     "turn off word grouping"),
    ("-X", "--function", "store", "function",
     "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "store", "merge_tag",
     "merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-M", "--merge-tagged", "store_true", "merge_tagged",
     "merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-n", "--non-interactive", "store_true", "non_interactive",
     "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "store_true", "no_filter",
     "switch off known words filtering"),
    ("-p", "--pages", "store", "pages",
     "work with specified pages only (pages = start-stop/total )"),
    ("-d", "--delete-tag", "store", "delete_tag",
     "delete subvocabulary of specified tag"),
    ("-r", "--show-range", "store", "show_range",
     "show only words specified number of words"),
    ("-R", "--show-range-percentage", "store", "show_range_percentage",
     "show only words that cover specified percentage of the text, skip the rest"),
    ("-s", "--text-stats", "store_true", "text_stats",
     "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "store_true", "voc_stats",
     "show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]"),
    ("-t", "--tag", "store", "tag",
     "tag known words with tag"),
    # NOTE(review): this help text is the same as for --tag; it looks like
    # a copy-paste leftover -- confirm the intended wording.
    ("-T", "--show-tags", "store_true", "show_tags",
     "tag known words with tag"),
    ("-v", "--vocabulary-filename", "store", "vocabulary_filename",
     "use specified file as a vocabulary"),
    ("-w", "--web", "store_true", "web",
     "Web browser version"),
    ("-2", "--two-words", "store_true", "two_words",
     "find 2 words' sequences"),
    ("-3", "--three-words", "store_true", "three_words",
     "find 3 words' sequences"),
]

for _short, _long, _action, _dest, _help in _OPTION_SPECS:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)
igor@37	270
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    Line endings are kept.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return source.readlines()
igor@38	277
def readlines_from_url(url):
    """Return the text of the page at *url* as a list of lines.

    The page is rendered with ``lynx -dump``; the first http:// link on
    each line is removed by a perl filter.  Error output of the pipeline is
    part of the result.  The returned lines are decoded as UTF-8 and carry
    no trailing newline.
    """
    # Quote the URL for the shell: wrap it in single quotes and replace any
    # embedded quote by '\'' so that it cannot end the quoting and inject
    # further shell commands.
    quoted_url = "'" + url.replace("'", "'\\''") + "'"
    command = (
        "lynx -dump {url} | perl -p -e "
        "'s@http://[a-zA-Z&_.:/0-9%?=,#+()\\[\\]~-]*@@'"
    ).format(url=quoted_url)
    output = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
    ).communicate()[0]
    # The pipe delivers bytes, so split on a bytes separator before decoding.
    return [x.decode('utf-8') for x in output.split(b'\n')]
igor@54	287
def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
igor@38	290
# A separator is a run of non-word characters that does not start with an
# apostrophe, so the apostrophe in "don't" does not split the word.
# Compiled once at import time.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split *line* into words.

    A trailing newline is removed first.  The result may contain empty
    strings when the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))
igor@38	296
def get_words(lines, group_by=(1,)):
    """Count the words in *lines*.

    Returns a dictionary word => number of occurrences.  Empty strings and
    words consisting only of digits are skipped.

    When 2 is in *group_by*, every pair of consecutive words is counted as
    well under the key "first_second"; when 3 is in *group_by*, every
    triple is counted under "first_second_third".  Sequences continue
    across line boundaries.
    """
    result = {}
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    # The last three accepted words, oldest first ("" = not seen yet).
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            if re.match('[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1

            # Shift the window first so that the sequences counted below
            # end with the current word.  Counting before the shift made
            # the sequences lag behind and lose the last ones of the text.
            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                w = "%s_%s" % (b, c)
                result[w] = result.get(w, 0) + 1
            if count_triples and a != "" and b != "":
                w = "%s_%s_%s" % (a, b, c)
                result[w] = result.get(w, 0) + 1

    logging.debug(result)
    return result
igor@38	323
def voc_filename():
    """Return the path of the vocabulary file.

    config['vocabulary_filename'] is used when present; otherwise the file
    is <config_directory>/<language>.txt.
    """
    try:
        return config['vocabulary_filename']
    except KeyError:
        pass
    return "%s/%s.txt" % (config['config_directory'], config['language'])
igor@54	328
def load_vocabulary():
    """Return the word counts (see get_words) of the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
igor@38	331
def notes_filenames():
    """Return the list of notes files for the configured language.

    The list holds one entry: <config_directory>/notes-<language>.txt.
    """
    notes_path = "%s/notes-%s.txt" % (
        config['config_directory'], config['language'])
    return [notes_path]
igor@38	334
def load_notes(files):
    """Read the notes from the UTF-8 encoded *files*.

    Each line holds a word, whitespace and the note for that word.
    Returns a dictionary word => {filename => note}.

    Blank lines are skipped and a word without a note gets the empty note;
    both used to abort the load with a ValueError.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                if not line.strip():
                    continue
                parts = re.split(r'\s+', line, maxsplit=1)
                word = parts[0]
                note = parts[1] if len(parts) > 1 else ''
                notes.setdefault(word, {})[filename] = note
    return notes
igor@38	344
def add_notes(lines, notes):
    """Append the stored note to every word line in *lines*.

    lines -- output lines; lines starting with '#' are markers and are
             passed through unchanged, the others look like
             "<number> <word>"
    notes -- dictionary word => {filename => note} as built by load_notes

    Returns a new list.  A line whose word has a note in the first notes
    file is padded to 30 columns and followed by that note.
    """
    notes_filename = notes_filenames()[0]
    # Optional indentation, the number, whitespace, then the word.  The
    # number is right-aligned in 10 columns, so the indentation must be
    # allowed to be of any width.
    word_pattern = re.compile(r'^\s*\S+\s+(\S+)')
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = word_pattern.search(line)
            if match_object:
                word = match_object.group(1)
                if word in notes and notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
        result.append(line)
    return result
igor@39	365
def remove_notes(lines, notes_group):
    """Cut the notes off *lines* and store them in the first notes file.

    lines       -- lines of the form "<indent><number> <word> <note>"
    notes_group -- dictionary word => {filename => note}

    Returns the lines without their notes.  Notes found on the lines
    replace those taken from *notes_group*; the merged notes are written
    with save_notes().
    """
    notes_filename = notes_filenames()[0]

    # Start from the notes already known for this notes file.
    notes = {}
    for known_word, per_file in notes_group.items():
        if notes_filename in per_file:
            notes[known_word] = per_file[notes_filename]

    stripped_lines = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', text)
        if found is None:
            stripped_lines.append(text + "\n")
            continue
        indent, number, gap, word, _, note = found.groups()
        stripped_lines.append(indent + number + gap + word + "\n")
        notes[word] = note

    save_notes(notes_filename, notes)
    return stripped_lines
igor@39	391
def save_notes(filename, notes):
    """Merge *notes* (word => note) into the notes file *filename*.

    Lines of words present in *notes* are rewritten with the new note, all
    other lines are kept, and words not yet in the file are appended.  The
    file must already exist.

    Blank lines and lines without a note no longer abort the save, and a
    last line without a newline is terminated before notes are appended.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            elif not line.endswith('\n'):
                line += '\n'
            lines.append(line)

    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
igor@39	409
igor@39	410
def substract_dictionary(dict1, dict2):
    """
    Returns a new dictionary with the items of dict1
    whose keys are not in dict2.
    """
    return dict(
        (key, value) for (key, value) in dict1.items() if key not in dict2)
igor@38	420
def dump_words(words, filename):
    """Write every word of *words* (word => count) to *filename*.

    Each word goes on a line of its own, repeated as many times as its
    count says.  The file is written as UTF-8.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for word, count in words.items():
            out.write(("%s\n" % word) * count)
igor@38	425
def error_message(text):
    """Print *text* on standard output."""
    print(text)
igor@38	428
def find_wordgroups_weights(word_pairs, normalizator):
    """Return the total count of every word group.

    word_pairs   -- iterable of (count, word) pairs
    normalizator -- object whose normalize(word) names the word's group

    The result maps normalized word => sum of the counts of its words.
    """
    totals = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        totals[group] = totals.get(group, 0) + count
    return totals
igor@38	436
def find_linked_words(notes):
    """Collect the word links written in the notes.

    notes -- dictionary word => {filename => note}

    A note containing "@main" links its word to "main".  Returns a
    dictionary word => main word; notes without a non-empty "@..." part
    are ignored.
    """
    links = {}
    for word, per_file in notes.items():
        for note in per_file.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            if not found:
                continue
            target = found.group(1)
            if target:
                links[word] = target
    return links
igor@38	448
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs for sorting.

    The pairs are ordered by the weight of their word group (taken from
    *wgw*), then by the normalized word, then by the count.  Returns -1, 0
    or 1.  *linked_words* is accepted but not used.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    group1 = normalizator.normalize(word1)
    group2 = normalizator.normalize(word2)

    # The first criterion that differs decides; the counts are converted
    # only when the first two criteria are equal.
    for (left, right) in ((wgw[group1], wgw[group2]), (group1, group2)):
        if left != right:
            return -1 if left < right else 1

    count1 = int(num1)
    count2 = int(num2)
    return (count1 > count2) - (count1 < count2)
igor@38	465
igor@47	466
def _advance_level(percentage_now, level_lines):
    """Consume the thresholds in *level_lines* that *percentage_now* passed.

    Returns (level, remaining thresholds); level is None when the first
    threshold has not been reached.
    """
    if percentage_now < level_lines[0]:
        return (None, level_lines)
    level = level_lines[0]
    passed = 0
    while percentage_now > level_lines[passed]:
        level = level_lines[passed]
        passed += 1
    return (level, level_lines[passed:])


def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        compressed_to_line=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list for output.

    word_pairs            -- (count, word) pairs, words of a group adjacent
    stats                 -- dictionary with the text statistics
    normalizator          -- provides normalize() and best_word_from_group()
    print_stats           -- start the list with a "# ..." statistics line
    stats_only            -- return only a two-line statistics table
    compressed_wordlist   -- one line per word group instead of per word
    compressed_to_line    -- with compressed_wordlist: append the other
                             words of the group to the line
    show_range            -- stop after this many lines (0 = no limit)
    show_range_percentage -- stop once this share of the text is covered
                             (0 = no limit)

    Returns a string when *stats_only* is set, otherwise a list of lines.
    Lines of the form "# <level>" mark the share of the text that is known
    once the words above the marker have been learnt.
    """
    result = []
    if stats_only:
        result.append(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # One string on purpose: the 11-column fields include the
                # separating space.
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
            ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    def append_group(group):
        # Emit one line for a whole word group.
        compressed_word_pair = (
            sum(x[0] for x in group),
            normalizator.best_word_from_group(group)
        )
        if compressed_to_line:
            others = " ".join(
                y for x, y in group if y not in compressed_word_pair)
            result.append("%10s %s %s\n" % (compressed_word_pair + (others,)))
        else:
            result.append("%10s %s\n" % compressed_word_pair)

    known = int(stats['total_known'])
    total = int(stats['total'])
    # Thresholds: steps of 5 from just above the current percentage up to
    # 85, then every percent from 90 on.
    level_lines = (
        list(range(int(float(stats['percentage'])) // 5 * 5 + 5, 90, 5))
        + list(range(90, 102)))
    # Skip the thresholds that are already passed before any word is added.
    (_, level_lines) = _advance_level(100.0 * known / total, level_lines)

    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:
        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: emit the finished one.
            if compressed_wordlist:
                append_group(words_of_this_group)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        (level, level_lines) = _advance_level(
            100.0 * known / total, level_lines)
        if level is not None:
            result.append("# %s\n" % level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0 * known / total >= show_range_percentage:
            break
    else:
        # All words were consumed: the last group is still pending and
        # would otherwise be missing from the compressed list.
        if compressed_wordlist and words_of_this_group:
            append_group(words_of_this_group)

    return result
igor@39	557
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.

    Accepted forms:
        from-to/step      -> (from, to, step)
        from+delta/step   -> (from, from + delta, step)
        from/step         -> (from, from + 1, step)

    Raises ValueError when the description has another form.
    """
    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            (start, delta) = a.split("+", 1)
            start = int(start)
            # The second number is a length, not an end position.
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except (AttributeError, TypeError, ValueError):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
igor@53	586
igor@53	587
def take_part(lines, part_description = None):
    """Return the part of *lines* selected by *part_description*.

    The description (see parse_parts_description) divides the lines into
    `step` equal parts and selects the lines from part `start` up to and
    including the first line of part `stop`.  Without a description the
    lines are returned as they are.
    """
    if part_description in (None, ''):
        return lines

    (start, stop, step) = parse_parts_description(part_description)
    line_count = len(lines)
    part_size = (1.0 * line_count) / step
    first = start * part_size
    last = stop * part_size
    return [lines[index] for index in range(line_count)
            if first <= index <= last]
igor@53	599
def web_editor(output):
    """Serve the word list *output* over HTTP and run the Twisted reactor.

    output -- iterable of text lines; each line is split on whitespace into
              a number, a word and the rest of the line (the comment)

    The word list is available as JSON under "json"; POST requests to
    "save" are only printed.  The call blocks in reactor.run().
    """
    # Imported here so that Twisted is needed only when this function runs.
    from twisted.internet import reactor
    from twisted.web.server import Site
    from twisted.web.static import File
    from twisted.web.resource import Resource
    import json

    word_list = []

    for o in output:
        # At most three fields: number, word, comment.
        a = re.split('\s+', o.strip(), 2)
        # Pad shorter lines with empty strings so that all fields exist.
        a = a + ['']*(3-len(a))
        word_list.append({'number':a[0], 'word':a[1], 'comment':a[2]})

    print "Loaded ", len(word_list)

    # NOTE(review): hard-coded absolute path to the static files; it looks
    # specific to one machine -- confirm before running elsewhere.
    new_words_html = "/home/igor/hg/new-words/web"

    class JSONPage(Resource):
        # Answers GET with the whole word list as JSON.
        isLeaf = True
        def render_GET(self, request):
            return json.dumps({"word_list": word_list})

    class SaveJSON(Resource):
        # Answers POST: prints the decoded "selected_words" argument and
        # reports success; nothing is stored.
        isLeaf = True
        def render_POST(self, request):
            print json.loads(request.args["selected_words"][0])
            return json.dumps({"status": "ok"})

    json_page = JSONPage()
    save_json = SaveJSON()

    # Static files at the root, the two JSON resources below it.
    resource = File(new_words_html)
    resource.putChild("json", json_page)
    resource.putChild("save", save_json)

    factory = Site(resource)
    reactor.listenTCP(8880, factory)
    reactor.run()
igor@65	639
igor@65	640
def _new_tempfile():
    """Create an empty temporary file and return its path.

    The descriptor returned by mkstemp is closed right away; the callers
    reopen the file by name.
    """
    (fd, path) = tempfile.mkstemp(prefix='new-word')
    os.close(fd)
    return path


def _deleted_words(lines_before, lines_after):
    """Return the words of the lines that are in *lines_before* only.

    Lines starting with '#' are markers, not words, and are ignored.
    """
    remaining = set(lines_after)
    deleted_words = []
    for line in lines_before:
        if line in remaining or line.startswith('#'):
            continue
        line = line.strip()
        if ' ' in line:
            # The word is the second field; anything after it is dropped.
            word = re.split(r'\s+', line, 1)[1]
            if ' ' in word:
                word = re.split(r'\s+', word, 1)[0]
            deleted_words.append(word)
    return deleted_words


def filter_get_words_group_words_add_stat(args):
    """Run the whole program: read the text, list its unknown words.

    args -- file names or URLs (anything containing "http://") to read;
            standard input is read when the list is empty

    The words of the text that are not in the vocabulary are counted,
    optionally filtered and grouped, and formatted with
    print_words_sorted().  Depending on `config` the list is written to
    standard output, served by web_editor(), or opened in vim; in the last
    case the words of the lines deleted in the editor are appended to the
    vocabulary file.

    Exits with status 1 when there is no input or the input has no words.
    """
    import functools  # only needed for cmp_to_key below

    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    input_lines = []
    if len(args) > 0:
        for arg in args:
            if 'http://' in arg:
                input_lines += readlines_from_url(arg)
            else:
                input_lines += readlines_from_file(arg)
    else:
        input_lines += readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    # The text is kept in a file for the editor session (see ORIGINAL_TEXT
    # below).  The file is removed however this function ends.
    original_text_tempfile = _new_tempfile()
    try:
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(lines))

        group_by = [1]
        if 'two_words' in config:
            group_by.append(2)
        if 'three_words' in config:
            group_by.append(3)
        words = get_words(lines, group_by)

        stats_only = 'text_stats' in config
        compressed_to_line = 'compressed_to_line' in config
        compressed_wordlist = 'compressed' in config or compressed_to_line
        show_range = int(config.get('show_range', 0))
        show_range_percentage = int(config.get('show_range_percentage', 0))

        stats = {}
        stats['total'] = sum(words.values())
        if stats['total'] == 0:
            # Without this check the percentages below divide by zero.
            sys.stderr.write("Nothing to do, no words found in the input, exiting.\n")
            sys.exit(1)

        if not 'no_filter' in config:
            words = substract_dictionary(words, vocabulary)

        stats['total_unknown'] = sum(words.values())
        stats['total_known'] = stats['total'] - stats['total_unknown']
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
        stats['groups'] = 0
        stats['words'] = len(words)
        stats['sentences'] = 0  #FIXME
        stats['wps'] = 0  #FIXME
        stats['uwps'] = 0  #FIXME
        stats['language'] = config['language']

        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)

        # filter words by allowed_words_filter
        if 'allowed_words' in config:
            allowed_words_filename = config['allowed_words']
            # A set makes the membership test below constant-time.
            normalized_allowed_words = set(
                normalizator.normalize(w.rstrip('\n'))
                for w in readlines_from_file(allowed_words_filename)
            )
            words = dict(
                (w, wn) for (w, wn) in words.items()
                if normalizator.normalize(w) in normalized_allowed_words
            )

        # Most frequent words first.
        words_with_freq = [
            (words[k], k)
            for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)
        ]

        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        if not config.get('no_words_grouping'):
            # Put the words of a group next to each other, heaviest group
            # first.
            words_with_freq = sorted(
                words_with_freq,
                key=functools.cmp_to_key(
                    lambda x, y: compare_word_pairs(x, y, wgw, normalizator, linked_words)),
                reverse=True)

        output = print_words_sorted(
            words_with_freq,
            stats,
            normalizator,
            stats_only=stats_only,
            compressed_wordlist=compressed_wordlist,
            compressed_to_line=compressed_to_line,
            show_range=show_range,
            show_range_percentage=show_range_percentage,
        )

        if ('non_interactive' in config or 'text_stats' in config):
            codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
        elif config.get('web', False):
            web_editor(output)
        else:
            temp1 = _new_tempfile()
            temp2 = _new_tempfile()
            try:
                with codecs.open(temp1, "w", "utf-8") as f:
                    f.write("".join(output))
                with codecs.open(temp2, "w", "utf-8") as f:
                    f.write("".join(add_notes(output, notes)))

                os.putenv('ORIGINAL_TEXT', original_text_tempfile)
                os.system((
                    "vim"
                    " -c 'setlocal spell spelllang={language}'"
                    " -c 'set keywordprg={language}'"
                    " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
                    " {filename}"
                    " < /dev/tty > /dev/tty"
                ).format(language=config['language'], filename=temp2))

                # NOTE(review): with compressed_to_line the extra words on
                # a line are taken for a note by remove_notes(), so every
                # such line differs from `output` afterwards -- confirm
                # that this mode is meant to be used with the editor.
                lines = remove_notes(readlines_from_file(temp2), notes)

                # The words of the lines that were deleted in the editor
                # are known words now.
                deleted_words = _deleted_words(output, lines)

                with codecs.open(voc_filename(), "a", "utf-8") as f:
                    f.write("\n".join(deleted_words + ['']))
            finally:
                os.unlink(temp1)
                os.unlink(temp2)
    finally:
        os.unlink(original_text_tempfile)
igor@54	796
# Copy the command-line options into `config` and run the program.
(options, args) = parser.parse_args()

if options.language:
    config['language'] = options.language

# 'pages' is always present; the empty string selects the whole text.
config['pages'] = options.pages if options.pages else ""

# Options with a value are copied only when they were given.
# vocabulary_filename used to be missing here, so -v had no effect although
# voc_filename() looks for it in config.
for _name in (
        'allowed_words',
        'show_range',
        'show_range_percentage',
        'vocabulary_filename',
        ):
    _value = getattr(options, _name)
    if _value:
        config[_name] = _value

# Flags are stored as True when set and left out otherwise; the rest of the
# program tests most of them with "in config".
for _name in (
        'non_interactive',
        'text_stats',
        'compressed',
        'compressed_to_line',
        'no_filter',
        'two_words',
        'three_words',
        'no_words_grouping',
        'web',
        ):
    if getattr(options, _name):
        config[_name] = True

filter_get_words_group_words_add_stat(args)
igor@55	843
igor@55	844 #if options.function:
igor@55	845 # function_names = {
igor@55	846 # 'get_words_group_words_add_stat': ,
igor@55	847 # }
igor@55	848 # if options.function in function_names:
igor@55	849 # function_names[options.function](args)
igor@55	850 # else:
igor@55	851 # error_message("Unkown function %s.\nAvailable functions:\n%s" % (
igor@55	852 # options.function, "".join([" "+x for x in sorted(function_names.keys())])))
igor@55	853 # sys.exit(1)
igor@55	854 #
igor@37	855
igor@37	856
igor@37	857
igor@38	858 #os.system("vim")
igor@37	859