#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Helpers of the new-words tool: text tokenizing, word normalization,
vocabulary/notes files and the command-line option table.

The code is written to run unchanged on Python 2.6+ and Python 3.
"""

from __future__ import with_statement
import codecs
import difflib
import logging
import optparse
import os
import re
import subprocess
import sys
import tempfile

try:
    import Stemmer
except ImportError:
    # PyStemmer is optional: without it words are only lower-cased.
    Stemmer = None

try:
    import psyco
    psyco.full()
except Exception:
    # psyco is an optional accelerator; any failure to load it is harmless.
    pass

config = {
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

# Separator between words: a run of non-word characters that does not
# start with an apostrophe or an underscore.
_WORD_SEPARATOR_RE = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)
# Tokens that are not counted as words: empty strings and plain numbers.
_NOT_A_WORD_RE = re.compile(r'[0-9]*$')
# Hyperlinks that lynx leaves in its text dump.
_URL_RE = re.compile(r'https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*')
# "<count> <word>" at the start of an output line; group 1 is the word.
_OUTPUT_WORD_RE = re.compile(r'^\s*\S+\s*(\S+)')
# "<spaces><count><spaces><word><spaces><note>" in an edited output line.
_ANNOTATED_LINE_RE = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')


class Normalizator(object):
    """Maps a word to the key of its word group.

    Linked words (taken from the user's notes) are followed first, then
    the result is lower-cased and, when a stemmer is available for the
    language, stemmed.
    """

    STEMMER_ALGORITHMS = {
        'de': 'german',
        'fr': 'french',
        'en': 'english',
        'es': 'spanish',
        'ru': 'russian',
        'it': 'italian',
        'uk': 'ukrainian',
    }

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- two-letter language code used to pick the stemmer
        linked_words -- dict word -> main word; stored by reference
        """
        # A fresh dict per instance instead of a shared mutable default.
        self.linked_words = {} if linked_words is None else linked_words
        self.stemmer = None
        algorithm = self.STEMMER_ALGORITHMS.get(language)
        if Stemmer is None or algorithm is None:
            return
        try:
            self.stemmer = Stemmer.Stemmer(algorithm)
        except Exception:
            # Best effort, as before: an algorithm that the installed
            # stemmer does not provide just disables stemming.
            logging.warning("no stemmer for language %r", language)

    def normalize(self, word):
        """Return the normalized form (group key) of word."""
        seen = set()
        # Follow the chain of links; `seen` protects against cycles.
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]
        word = word.lower()
        if self.stemmer:
            return self.stemmer.stemWord(word)
        return word

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents wordpairs_group best.

        wordpairs_group is a non-empty sequence of (count, word) pairs.
        The shortest word wins; among equally short words the one with
        the highest count, and among those the first one in the group.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() returns the first maximal element, which is what the
        # stable reverse sort used before also selected.
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the lines printed by the external `de-variants` tool
        for word, decoded from UTF-8."""
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        # communicate() also waits for the child, so no zombie is left.
        lines = process.communicate()[0].decode('utf-8').split('\n')
        if lines and lines[-1] == '':
            lines.pop()
        return lines


# (short option, long option, action, dest, help) in --help order.
_OPTIONS = (
    ("-a", "--no-marks", "store_true", "no_marks",
     "don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]"),
    ("-c", "--compressed", "store_true", "compressed",
     "show compressed wordlist: one word per group"),
    # This option used to write to dest="compressed", so -k silently
    # behaved like -c.  Nothing visible in this file reads known_words yet.
    ("-k", "--known-words", "store_true", "known_words",
     "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "store", "language",
     "specify language of text"),
    ("-f", "--allowed-words", "store", "allowed_words",
     "file with list of allowed words (words that will be shown in the output)"),
    ("-G", "--words-grouping", "store_true", "no_words_grouping",
     "turn off word grouping"),
    ("-X", "--function", "store", "function",
     "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "store", "merge_tag",
     "merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-M", "--merge-tagged", "store_true", "merge_tagged",
     "merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-n", "--non-interactive", "store_true", "non_interactive",
     "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "store_true", "no_filter",
     "switch off known words filtering"),
    ("-p", "--pages", "store", "pages",
     "work with specified pages only (pages = start-stop/total )"),
    ("-d", "--delete-tag", "store", "delete_tag",
     "delete subvocabulary of specified tag"),
    ("-r", "--show-range", "store", "show_range",
     "show only words specified number of words"),
    ("-R", "--show-range-percentage", "store", "show_range_percentage",
     "show only words that cover specified percentage of the text, skip the rest"),
    ("-s", "--text-stats", "store_true", "text_stats",
     "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "store_true", "voc_stats",
     "show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]"),
    ("-t", "--tag", "store", "tag",
     "tag known words with tag"),
    # NOTE(review): this help text is identical to the one of -t and
    # looks copy-pasted; confirm the intended wording.
    ("-T", "--show-tags", "store_true", "show_tags",
     "tag known words with tag"),
    ("-v", "--vocabulary-filename", "store", "vocabulary_filename",
     "use specified file as a vocabulary"),
    ("-w", "--web", "store_true", "web",
     "Web browser version"),
    ("-2", "--two-words", "store_true", "two_words",
     "find 2 words' sequences"),
    ("-3", "--three-words", "store_true", "three_words",
     "find 3 words' sequences"),
)

parser = optparse.OptionParser()
for (_short, _long, _action, _dest, _help) in _OPTIONS:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)


def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file filename."""
    with codecs.open(filename, "r", "utf-8") as f:
        return f.readlines()


def readlines_from_url(url):
    """Return the text of the page at url, rendered by lynx, as a list of
    lines (without line terminators); the first hyperlink of every line
    is removed.

    The URL is passed to lynx as a single argument, never through a
    shell.  Raises OSError when lynx cannot be started.
    """
    process = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    text = process.communicate()[0].decode('utf-8')
    return [_URL_RE.sub('', line, count=1) for line in text.split('\n')]


def readlines_from_stdin():
    """Return the lines of standard input, decoded from UTF-8."""
    # Python 3 exposes the byte stream as sys.stdin.buffer.
    stream = getattr(sys.stdin, 'buffer', sys.stdin)
    return codecs.getreader("utf-8")(stream).readlines()


def words_from_line(line):
    """Split line into words; empty strings may appear at the edges."""
    return _WORD_SEPARATOR_RE.split(line.rstrip('\n'))


def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    When group_by contains 2 (or 3), every sequence of two (or three)
    consecutive words is counted as well, under the key "w1_w2"
    ("w1_w2_w3").  Numbers and empty tokens are ignored; sequences
    continue across line boundaries.
    """
    result = {}
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            if _NOT_A_WORD_RE.match(word):
                continue
            result[word] = result.get(word, 0) + 1
            # Shift the window first so that the sequences END at the
            # current word; otherwise the last ones are never counted.
            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                key = "%s_%s" % (b, c)
                result[key] = result.get(key, 0) + 1
            if count_triples and a != "":
                key = "%s_%s_%s" % (a, b, c)
                result[key] = result.get(key, 0) + 1

    logging.debug(result)
    return result


def voc_filename():
    """Return the path of the vocabulary file in use."""
    if 'vocabulary_filename' in config:
        return config['vocabulary_filename']
    return "%s/%s.txt" % (config['config_directory'], config['language'])


def load_vocabulary():
    """Return the known words as a dict word -> number of occurrences."""
    return get_words(readlines_from_file(voc_filename()))


def notes_filenames():
    """Return the list of notes files for the current language."""
    return ["%s/notes-%s.txt" % (config['config_directory'], config['language'])]


def _split_note_line(line):
    """Return (word, note) for a line of a notes file, or None when the
    line is blank or holds a word without a note."""
    fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
    if len(fields) != 2:
        return None
    return (fields[0], fields[1])


def load_notes(files):
    """Return dict word -> {filename: note} read from the notes files.

    Lines that do not have the form "word note" are skipped.
    """
    notes = {}
    for filename in files:
        for line in readlines_from_file(filename):
            parsed = _split_note_line(line)
            if parsed is None:
                continue
            (word, note) = parsed
            notes.setdefault(word, {})[filename] = note
    return notes


def add_notes(lines, notes):
    """Return lines with the user's note appended to every word line
    that has one.  Comment lines (starting with '#') and lines without
    a note are passed through unchanged; no line is ever dropped.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = _OUTPUT_WORD_RE.search(line)
            if match_object:
                note = notes.get(match_object.group(1), {}).get(notes_filename)
                if note is not None:
                    line = "%-30s %s\n" % (line.rstrip('\n'), note)
        result.append(line)
    return result


def remove_notes(lines, notes_group):
    """Strip the notes from edited lines and save them.

    Returns the lines reduced to "<count> <word>"; the notes found on
    the lines are merged over the existing ones and written to the
    first notes file.
    """
    notes_filename = notes_filenames()[0]
    notes = {}
    for (word, notes_by_file) in notes_group.items():
        if notes_filename in notes_by_file:
            notes[word] = notes_by_file[notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        match_object = _ANNOTATED_LINE_RE.match(line)
        if match_object:
            result.append("".join(match_object.group(1, 2, 3, 4)) + "\n")
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line + "\n")

    save_notes(notes_filename, notes)
    return result


def save_notes(filename, notes):
    """Update the notes file filename with notes (dict word -> note).

    Existing entries keep their position and get the new text, unknown
    words are appended, lines that cannot be parsed are kept verbatim.
    """
    lines = []
    saved_words = set()
    for line in readlines_from_file(filename):
        parsed = _split_note_line(line)
        if parsed is not None and parsed[0] in notes:
            word = parsed[0]
            line = "%-29s %s\n" % (word, notes[word])
            saved_words.add(word)
        lines.append(line)
    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.write("".join(lines))


def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2
    """
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)


def dump_words(words, filename):
    """Write every word to filename as many times as it was counted."""
    with codecs.open(filename, "w+", "utf-8") as f:
        for (word, count) in words.items():
            f.write(("%s\n" % word) * count)


def error_message(text):
    """Print text to standard output."""
    print(text)


def find_wordgroups_weights(word_pairs, normalizator):
    """Return dict normalized word -> total count of its word group."""
    weight = {}
    for (num, word) in word_pairs:
        normalized = normalizator.normalize(word)
        weight[normalized] = weight.get(normalized, 0) + num
    return weight
# Row of the statistics table; the last four columns are 11 wide so that
# they line up with the 10-wide, space-separated header columns.
_STATS_ROW_FORMAT = (
    "%(language)-10s %(percentage)-10.2f %(percentage_unknown)-10.2f "
    "%(total_known)-11d%(total)-11d%(wps)-11d%(uwps)-11d")


def find_linked_words(notes):
    """Return dict word -> main word for every note containing "@main".

    notes is a dict word -> {filename: note}.
    """
    linked_words = {}
    for (word, notes_by_file) in notes.items():
        for note in notes_by_file.values():
            match_object = re.search(r'\@(\S*)', note)
            if match_object and match_object.group(1):
                linked_words[word] = match_object.group(1)
    return linked_words


def _cmp(a, b):
    """Three-way comparison (the cmp() builtin is gone in Python 3)."""
    return (a > b) - (a < b)


def _word_pair_sort_key(pair, wgw, normalizator):
    """Sort key equivalent to compare_word_pairs: weight of the group,
    then normalized word, then the count of the word itself."""
    (num, word) = pair
    normalized = normalizator.normalize(word)
    return (wgw[normalized], normalized, int(num))


def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Compare two (count, word) pairs: by weight of their word group
    (wgw), then by normalized word, then by count.

    Returns -1, 0 or 1.  linked_words is accepted for compatibility and
    is not used.
    """
    return _cmp(
        _word_pair_sort_key(pair1, wgw, normalizator),
        _word_pair_sort_key(pair2, wgw, normalizator))


def _compressed_line(words_of_group, normalizator):
    """Return the single output line that stands for a whole group."""
    return "%10s %s\n" % (
        sum(pair[0] for pair in words_of_group),
        normalizator.best_word_from_group(words_of_group))


def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list.

    word_pairs -- (count, word) pairs, words of one group adjacent
    stats      -- dict with the text statistics used in the header
    stats_only -- return only the statistics table, as ONE string
    compressed_wordlist -- one line per word group instead of per word
    show_range -- stop after that many printed lines (0 = no limit)
    show_range_percentage -- stop once that share of the text is
                  covered (0 = no limit)

    Returns a list of lines (a single string when stats_only is set).
    "# N" lines mark the points where N percent of the text is covered.
    """
    result = []
    if stats_only:
        columns = ["LANG", "KNOWN%", "UNKNOWN%", "KNOWN", "TOTAL", "WPS", "UWPS*10"]
        result.append(" ".join(["%-10s" % x for x in columns]) + "\n")
        result.append(_STATS_ROW_FORMAT % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:
        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            if compressed_wordlist:
                result.append(_compressed_line(words_of_this_group, normalizator))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        coverage = 100.0 * known / total
        if level_lines and coverage >= level_lines[0]:
            current_level = level_lines[0]
            while level_lines and coverage > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and coverage >= show_range_percentage:
            break
    else:
        # The list was not cut short: the last group is still pending
        # because a group is only written when the next one starts.
        if compressed_wordlist and words_of_this_group:
            result.append(_compressed_line(words_of_this_group, normalizator))

    return result


def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.
    from-to/step
    from+delta/step
    from/step           (same as from+1/step)

    Raises ValueError when the description is malformed or step is not
    positive.
    """
    message = ("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect"
               % parts_description)
    try:
        (span, step) = parts_description.split("/", 1)
        step = int(step)
        if '-' in span:
            (start, stop) = [int(x) for x in span.split("-", 1)]
        elif '+' in span:
            (start, delta) = [int(x) for x in span.split("+", 1)]
            # The second number is a length, not an absolute position.
            stop = start + delta
        else:
            start = int(span)
            stop = start + 1
    except (AttributeError, ValueError):
        raise ValueError(message)
    if step <= 0:
        # take_part divides by step.
        raise ValueError(message)
    return (start, stop, step)


def take_part(lines, part_description=None):
    """Return the lines of the requested part of the text.

    The text is divided into `step` equal parts; the lines from part
    `start` up to and including the first line of part `stop` are
    returned.  Without a description all lines are returned.
    """
    if not part_description:
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    first = start * part_size
    last = stop * part_size
    return [line for (i, line) in enumerate(lines) if first <= i <= last]


def web_editor(output):
    """Serve the word list over HTTP on port 8880 (requires twisted).

    Blocks until the twisted reactor is stopped.
    """
    from twisted.internet import reactor
    from twisted.web.server import Site
    from twisted.web.static import File
    from twisted.web.resource import Resource
    import json

    word_list = []
    for o in output:
        a = re.split(r'\s+', o.strip(), maxsplit=2)
        a = a + [''] * (3 - len(a))
        word_list.append({'number': a[0], 'word': a[1], 'comment': a[2]})

    print("Loaded  %d" % len(word_list))

    # NOTE(review): developer-specific absolute path; the static files
    # are only found on a machine that has this directory.
    new_words_html = "/home/igor/hg/new-words/web"

    class JSONPage(Resource):
        isLeaf = True

        def render_GET(self, request):
            return json.dumps({"word_list": word_list})

    class SaveJSON(Resource):
        isLeaf = True

        def render_POST(self, request):
            print(json.loads(request.args["selected_words"][0]))
            return json.dumps({"status": "ok"})

    resource = File(new_words_html)
    resource.putChild("json", JSONPage())
    resource.putChild("save", SaveJSON())

    reactor.listenTCP(8880, Site(resource))
    reactor.run()


def _read_input_lines(args):
    """Read the text from the given files/URLs, or from standard input
    when no argument is given."""
    if not args:
        return readlines_from_stdin()
    input_lines = []
    for arg in args:
        if 'http://' in arg or 'https://' in arg:
            input_lines += readlines_from_url(arg)
        else:
            input_lines += readlines_from_file(arg)
    return input_lines


def _make_temp_file():
    """Create an empty temporary file and return its name."""
    (fd, name) = tempfile.mkstemp(prefix='new-word')
    # mkstemp returns an OPEN descriptor; close it, the file is
    # reopened by name later.
    os.close(fd)
    return name


def _find_deleted_words(lines_before, lines_after):
    """Return the words of the lines that are present in lines_before
    and missing in lines_after.  Comment lines ('#') carry no word."""
    remaining = set(lines_after)
    deleted_words = []
    for line in lines_before:
        if line in remaining or line.startswith('#'):
            continue
        line = line.strip()
        if ' ' in line:
            word = re.split(r'\s+', line, maxsplit=1)[1]
            if ' ' in word:
                word = re.split(r'\s+', word, maxsplit=1)[0]
            deleted_words.append(word)
    return deleted_words


def _edit_interactively(output, notes, original_lines):
    """Open the annotated word list in vim and return the edited lines
    with the notes stripped (and saved).  The original text is made
    available to vim through $ORIGINAL_TEXT."""
    original_text_tempfile = _make_temp_file()
    wordlist_tempfile = _make_temp_file()
    try:
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(original_lines))
        with codecs.open(wordlist_tempfile, "w", "utf-8") as f:
            f.write("".join(add_notes(output, notes)))

        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=wordlist_tempfile))

        return remove_notes(readlines_from_file(wordlist_tempfile), notes)
    finally:
        # Remove the temporary files even when something above failed.
        os.unlink(wordlist_tempfile)
        os.unlink(original_text_tempfile)


def filter_get_words_group_words_add_stat(args):
    """Main pipeline: read the text, count its unknown words, group and
    sort them, then print the list, serve it over HTTP or open it in
    vim, depending on config.

    args -- file names and URLs to read; standard input when empty.

    In the interactive mode the words whose lines the user deleted are
    appended to the vocabulary file.  Exits with status 1 when there is
    no input.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    input_lines = _read_input_lines(args)
    if not input_lines:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    group_by = [1]
    if 'two_words' in config:
        group_by.append(2)
    if 'three_words' in config:
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = 'text_stats' in config
    compressed_wordlist = 'compressed' in config
    show_range = int(config.get('show_range', 0))
    show_range_percentage = float(config.get('show_range_percentage', 0))

    total = sum(words.values())
    if 'no_filter' not in config:
        words = substract_dictionary(words, vocabulary)
    total_unknown = sum(words.values())
    total_known = total - total_unknown
    if total:
        percentage = 100.0 * total_known / total
        percentage_unknown = 100.0 - percentage
    else:
        # A text without a single countable word (only numbers or
        # punctuation) must not end in a division by zero.
        percentage = percentage_unknown = 0.0

    stats = {
        'total': total,
        'total_unknown': total_unknown,
        'total_known': total_known,
        'percentage': percentage,
        'percentage_unknown': percentage_unknown,
        'groups': 0,
        'words': len(words),
        'sentences': 0,  # FIXME
        'wps': 0,  # FIXME
        'uwps': 0,  # FIXME
        'language': config['language'],
    }

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # filter words by allowed_words_filter
    if 'allowed_words' in config:
        normalized_allowed_words = set(
            normalizator.normalize(w.rstrip('\n'))
            for w in readlines_from_file(config['allowed_words']))
        words = dict(
            (w, wn) for (w, wn) in words.items()
            if normalizator.normalize(w) in normalized_allowed_words)

    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if not config.get('no_words_grouping'):
        # Same order as sorting with compare_word_pairs, but every word
        # is normalized only once.
        words_with_freq = sorted(
            words_with_freq,
            key=lambda pair: _word_pair_sort_key(pair, wgw, normalizator),
            reverse=True)

    output = print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )

    if 'non_interactive' in config or 'text_stats' in config:
        # Python 3 exposes the byte stream as sys.stdout.buffer.
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        codecs.getwriter("utf-8")(stream).write("".join(output))
    elif config.get('web', False):
        web_editor(output)
    else:
        lines_after = _edit_interactively(output, notes, lines)
        deleted_words = _find_deleted_words(output, lines_after)
        with codecs.open(voc_filename(), "a", "utf-8") as f:
            f.write("\n".join(deleted_words + ['']))


def main():
    """Copy the command-line options into config and run the pipeline."""
    (options, args) = parser.parse_args()

    if options.language:
        config['language'] = options.language
    config['pages'] = options.pages or ""

    # Options with a value.  vocabulary_filename used to be parsed but
    # never copied, so -v had no effect.
    for name in ('allowed_words', 'show_range', 'show_range_percentage',
                 'vocabulary_filename'):
        value = getattr(options, name)
        if value:
            config[name] = value

    # Flags.
    for name in ('non_interactive', 'text_stats', 'compressed', 'no_filter',
                 'two_words', 'three_words', 'no_words_grouping', 'web'):
        if getattr(options, name):
            config[name] = True

    filter_get_words_group_words_add_stat(args)


if __name__ == '__main__':
    main()