igor@37: #!/usr/bin/env python
igor@38: # -*- coding: utf-8 -*-
igor@37: 
igor@40: from __future__ import with_statement
igor@38: import codecs
igor@49: import difflib
igor@38: import logging
igor@38: import os
igor@37: import optparse
igor@38: import re
igor@38: import subprocess
igor@38: import sys
igor@38: import Stemmer
igor@54: import tempfile
igor@42: try:
igor@42:     import psyco
igor@42:     psyco.full()
igor@42: except:
igor@42:     pass
igor@38: 
# Runtime settings shared by the helpers below.  The option handling at the
# end of the file overrides 'language' and adds further keys.
config = {
    # expanduser() still yields a usable path when $HOME is not set, where
    # os.environ['HOME'] would raise KeyError while the module is loading.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

# Debug output (e.g. the word counts computed by get_words) goes to this file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38: 
class Normalizator:
    """Maps words to the normalized form that is used to group them.

    A word is first resolved through the ``linked_words`` mapping
    (word -> main word) and then lowercased and stemmed with the stemmer
    of the selected language.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- language code; must be one of the keys of the
                        table below, otherwise KeyError is raised
        linked_words -- optional dict  word -> main word; it is used as
                        is (not copied)
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # None instead of a {} default: a default dict would be one object
        # shared by every instance created without linked_words.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of word after following its chain of links.

        Links are followed until a word without a link is reached.  A word
        that was already visited ends the walk, so cyclic links cannot
        loop forever.
        """
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the representative word of a group of (count, word) pairs.

        The shortest word wins.  Among equally short words the one with
        the highest count is taken, and on a further tie the one that
        comes first in wordpairs_group.

        Raises ValueError if wordpairs_group is empty.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [
            pair for pair in wordpairs_group
            if len(pair[1]) == minimal_length]
        # max() returns the first of several maximal elements, which is the
        # same element a stable descending sort would put first.
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` command.

        The command is run with word as its only argument.  Its standard
        output is decoded as UTF-8 and returned as a list of lines without
        the trailing newline.

        Raises OSError if the command cannot be started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        try:
            raw_lines = process.stdout.readlines()
        finally:
            # Close the pipe and reap the child so that repeated calls do
            # not pile up open descriptors and zombie processes.
            process.stdout.close()
            process.wait()
        return [line.decode('utf-8').rstrip('\n') for line in raw_lines]
igor@49: 
igor@49: 
# Command-line interface.  Options are registered in the order in which they
# appear in the --help output.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# This option used to share dest="compressed" with -c, so that -k silently
# switched on the compressed word list.  It has its own destination now.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# NOTE(review): this help text is identical to the one of --tag; presumably
# it should describe listing the tags -- confirm the intended wording.
parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
igor@37: 
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file filename as a list.

    Line endings are kept.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return source.readlines()
igor@38: 
def readlines_from_url(url):
    """Return the text rendering of a web page as a list of unicode lines.

    The page is rendered with ``lynx -dump`` (its stderr is merged into
    the output) and decoded as UTF-8.  The first http:// link on every
    line is removed.  The output is split on newlines, so the lines carry
    no line ending and the list ends with an empty string when the dump
    ends with a newline.

    The URL is handed to lynx as a single argument without going through
    a shell, so quotes or other shell metacharacters in it cannot run
    commands.

    Raises OSError if lynx cannot be started.
    """
    # Same expression that was previously applied by a perl one-liner
    # (first match per line only).
    link = re.compile(r'http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*')
    dump = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        ).communicate()[0]
    return [
        link.sub('', line, 1)
        for line in dump.decode('utf-8').split('\n')
    ]
igor@54: 
def readlines_from_stdin():
    """Read standard input to its end, decoding it as UTF-8, and return
    the list of lines."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
igor@38: 
# Word separator: a run of non-word characters that does not start with an
# apostrophe (an underscore is a word character anyway), so that words
# like "don't" stay in one piece.  Compiled once, written as a raw string.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split a line of text into words.

    Trailing newlines are removed first.  The result can contain empty
    strings, e.g. when the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))
igor@38: 
def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    lines    -- iterable of text lines
    group_by -- container of sequence lengths.  Single words are always
                counted; 2 adds every pair and 3 every triple of
                consecutive words, keyed "w1_w2" and "w1_w2_w3".

    Tokens that are empty or consist of digits only are ignored and are
    not part of any sequence.  Sequences continue across line boundaries.
    """
    skipped_token = re.compile(r'[0-9]*$')
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The two words seen before the current one ("" = not seen yet).  A
    # sequence is counted when its LAST word is read, so the sequences at
    # the very end of the text are counted as well.
    before_previous = previous = ""
    for line in lines:
        for word in words_from_line(line):
            if skipped_token.match(word):
                continue
            result[word] = result.get(word, 0) + 1
            if count_pairs and previous != "":
                w = "%s_%s" % (previous, word)
                result[w] = result.get(w, 0) + 1
            if count_triples and before_previous != "":
                w = "%s_%s_%s" % (before_previous, previous, word)
                result[w] = result.get(w, 0) + 1
            (before_previous, previous) = (previous, word)

    logging.debug(result)
    return result
igor@38: 
def voc_filename():
    """Return the path of the vocabulary file of the configured language:
    <config_directory>/<language>.txt"""
    return "%(config_directory)s/%(language)s.txt" % config
igor@54: 
def load_vocabulary():
    """Return the words of the vocabulary file as a dict  word -> count."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
igor@38: 
def notes_filenames():
    """Return the list of notes files of the configured language.

    The list has one element: <config_directory>/notes-<language>.txt
    """
    return ["%(config_directory)s/notes-%(language)s.txt" % config]
igor@38: 
def load_notes(files):
    """Read the notes from the given UTF-8 files.

    Every line is expected to look like  "<word><whitespace><note>".
    Lines that do not split into these two parts (blank lines, a word
    without note) are skipped instead of aborting with ValueError.

    Returns a dict  word -> {filename: note}.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) != 2:
                    continue
                (word, note) = fields
                notes.setdefault(word, {})[filename] = note
    return notes
igor@38: 
def add_notes(lines, notes):
    """Append the user's notes to the lines of a word list.

    lines -- word list lines ("<count> <word>"); lines starting with '#'
             are passed through unchanged
    notes -- dict  word -> {filename: note}; only notes that come from the
             first notes file are used

    Returns a new list with one line per input line.  A line whose word
    has a note is padded to 30 columns and followed by the note.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                note = notes.get(word, {}).get(notes_filename)
                if note is not None:
                    line = "%-30s %s\n" % (line.rstrip('\n'), note)
        # Every line is kept.  Previously a line was lost when its word had
        # notes, but none from notes_filename.
        result.append(line)
    return result
igor@39: 
def remove_notes(lines, notes_group):
    """Strip the notes from the lines of an edited word list and save them.

    lines       -- lines of the form "<space><count><space><word><space><note>";
                   lines of any other form are passed through
    notes_group -- dict  word -> {filename: note}

    The notes found in the lines replace or extend the notes that
    notes_group holds for the first notes file, and the outcome is written
    to that file with save_notes().

    Returns the lines without their notes, each ending with a newline.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        # keep indentation, count, gap and word; the rest is the note
        result.append("".join(found.group(1, 2, 3, 4)) + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(notes_filename, notes)
    return result
igor@39: 
def save_notes(filename, notes):
    """Merge notes into the notes file filename.

    filename -- existing UTF-8 notes file; it is read and then rewritten
    notes    -- dict  word -> note

    A line whose first field is a word of notes is replaced by that word
    and its new note.  All other lines, including blank lines and lines
    consisting of a single word, are kept as they are.  Words that are
    not in the file yet are appended at the end.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the first field matters here; taking it by index also
            # works for lines that have no note part.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
igor@39: 
igor@39: 
def substract_dictionary(dict1, dict2):
    """
    Returns a new dict holding the items of dict1 whose keys are not
    in dict2 (dict1 - dict2).
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)
igor@38: 
def dump_words(words, filename):
    """Write the words of the dict  word -> count  to the UTF-8 file
    filename, one word per line, every word repeated count times."""
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(("%s\n" % word) * count)
igor@38: 
def error_message(text):
    """Print text, followed by a newline, on standard output."""
    # Python 2 print statement
    print text
igor@38: 
def find_wordgroups_weights(word_pairs, normalizator):
    """Add up the counts of all words that share a normalized form.

    word_pairs   -- iterable of (count, word)
    normalizator -- object whose normalize(word) gives the group of a word

    Returns a dict  normalized word -> sum of counts.
    """
    weight = {}
    for (num, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + num
    return weight
igor@38: 
def find_linked_words(notes):
    """Collect the word links that are written in the notes as "@main_word".

    notes -- dict  word -> {filename: note}

    Only the first "@" of a note is looked at; it counts as a link when
    non-blank characters follow it directly.  If several notes of a word
    hold a link, the one visited last wins.

    Returns a dict  word -> main word.
    """
    link = re.compile(r'\@(\S*)')
    linked_words = {}
    for (word, per_file) in notes.items():
        for note in per_file.values():
            found = link.search(note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
igor@38: 
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs, for sorting.

    The pairs are compared by the weight of their word group
    (wgw[normalized word]), then by the normalized word itself and finally
    by their own count converted to int.

    Returns -1, 0 or 1.  linked_words is not used.
    """
    def three_way(left, right):
        return (left > right) - (left < right)

    (num1, word1) = pair1
    (num2, word2) = pair2
    stem1 = normalizator.normalize(word1)
    stem2 = normalizator.normalize(word2)

    # "or" moves on to the next criterion only while the result is 0
    return (
        three_way(wgw[stem1], wgw[stem2])
        or three_way(stem1, stem2)
        or three_way(int(num1), int(num2)))
igor@38: 
igor@47: 
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Render the word list, or only the text statistics, as text.

    word_pairs            -- sequence of (count, word) tuples in output order
    stats                 -- dict with the values used by the format
                             strings below ('language', 'percentage',
                             'total_known', 'total', ...)
    normalizator          -- object with normalize(word) and
                             best_word_from_group(pairs)
    print_stats           -- start the list with a "# ..." summary line
    stats_only            -- return the statistics table only
    compressed_wordlist   -- one line per run of words with the same
                             normalized form (summed count, best word)
    show_range            -- if > 0, stop after that many word lines
    show_range_percentage -- if > 0, stop once that percentage of the text
                             is covered

    Lines of the form "# <level>" mark the coverage percentage reached so
    far (known words plus all words listed up to that point).

    Returns a list of lines.  With stats_only=True a single string with
    the two lines of the statistics table is returned instead.
    """
    result = []
    if stats_only:
        result.append(
            " ".join([
                "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
                ]]) + "\n")
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # The four integer columns form ONE string (no commas);
                # their width of 11 includes the column separator.
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    def compressed_line(group):
        # one output line for a whole group of (count, word) pairs
        return "%10s %s\n" % (
            sum(x[0] for x in group),
            normalizator.best_word_from_group(group))

    # Coverage levels that get a marker: steps of 5 above the current
    # percentage, then every single percent from 90 on.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # the previous group is complete
            if compressed_wordlist:
                result.append(compressed_line(words_of_this_group))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # The list was not cut short, so the last group is complete too.
        # It has no successor that would trigger its output in the loop.
        if compressed_wordlist and words_of_this_group:
            result.append(compressed_line(words_of_this_group))

    return result
igor@39: 
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.
     from-to/step      ->  (from, to, step)
     from+delta/step   ->  (from, from + delta, step)
     from/step         ->  (from, from + 1, step)

    Raises ValueError if parts_description does not have this format.
    """

    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            # the number after '+' is a length, not an end position
            (start, delta) = a.split("+", 1)
            start = int(start)
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: parts_description is not a string
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)

    return (start, stop, step)
igor@53: 
igor@53: 
def take_part(lines, part_description = None):
    """Return the part of lines selected by part_description.

    part_description -- string understood by parse_parts_description(),
                        giving (start, stop, step); None or '' selects
                        everything and returns lines itself

    The lines are divided into step parts of equal (fractional) size.  The
    lines whose index lies between start and stop part sizes are returned,
    both bounds included.

    Raises ValueError for a malformed description.
    """
    # '' must be accepted as well: it is the value stored in the
    # configuration when no page range was requested.
    if not part_description:
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    first = start * part_size
    last = stop * part_size
    return [line for (i, line) in enumerate(lines) if first <= i <= last]
igor@53: 
def _write_tempfile(text):
    """Store text UTF-8 encoded in a new temporary file; return its path."""
    (fd, path) = tempfile.mkstemp(prefix='new-word')
    # mkstemp() returns an open descriptor; only the name is needed here
    os.close(fd)
    with codecs.open(path, "w", "utf-8") as f:
        f.write(text)
    return path


def _review_in_editor(output, notes, original_lines):
    """Let the user edit the word list in vim and store the deleted words.

    The word list (output, with the notes added) is opened in vim; the
    path of a copy of the analysed text is exported as ORIGINAL_TEXT.
    Afterwards the notes are saved by remove_notes(), and the words of all
    list lines that are gone are appended to the vocabulary file.
    """
    original_text_tempfile = _write_tempfile("".join(original_lines))
    wordlist_tempfile = None
    try:
        wordlist_tempfile = _write_tempfile("".join(add_notes(output, notes)))

        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=wordlist_tempfile))

        lines_after = set(
            remove_notes(readlines_from_file(wordlist_tempfile), notes))
    finally:
        # remove the temporary files even if something above failed
        for path in (wordlist_tempfile, original_text_tempfile):
            if path is not None:
                os.unlink(path)

    # compare the lines before and after editing and collect deleted words
    deleted_words = []
    for line in output:
        if line in lines_after:
            continue
        line = line.strip()
        # "# ..." lines are markers, not words; a line without a space has
        # no word part at all
        if line.startswith('#') or ' ' not in line:
            continue
        word = re.split(r'\s+', line, maxsplit=1)[1]
        if ' ' in word:
            word = re.split(r'\s+', word, maxsplit=1)[0]
        deleted_words.append(word)

    with codecs.open(voc_filename(), "a", "utf-8") as f:
        f.write("\n".join(deleted_words + ['']))


def filter_get_words_group_words_add_stat(args):
    """Build the list of unknown words of a text and show or edit it.

    args -- positional command-line arguments.  args[0], if present, is
            the text to analyse: a URL if it contains 'http://', otherwise
            a file name.  Without arguments standard input is read.

    The behaviour is controlled by the keys of config ('pages',
    'two_words', 'three_words', 'text_stats', 'compressed', 'no_filter',
    'allowed_words', 'show_range_percentage', 'non_interactive') and by
    the environment variables SHOW_RANGE and WORDS_GROUPING.

    With 'non_interactive' or 'text_stats' the result is written to
    standard output.  Otherwise it is opened in vim, and the words whose
    lines the user deleted are appended to the vocabulary file.

    Exits with status 1 if the input is empty.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    if len(args) > 0:
        if 'http://' in args[0]:
            input_lines = readlines_from_url(args[0])
        else:
            input_lines = readlines_from_file(args[0])
    else:
        input_lines = readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    # config['pages'] is '' when no page range was requested; pass None in
    # that case so that no range is parsed.
    lines = take_part(input_lines, config.get('pages') or None)

    group_by = [1]
    if 'two_words' in config:
        group_by.append(2)
    if 'three_words' in config:
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = 'text_stats' in config
    compressed_wordlist = 'compressed' in config
    show_range = int(os.environ.get('SHOW_RANGE', '') or 0)
    show_range_percentage = int(config.get('show_range_percentage', 0))

    stats = {}
    stats['total'] = sum(words.values())
    if not 'no_filter' in config:
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    else:
        # no countable word in the text: avoid dividing by zero
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0        #FIXME
    stats['uwps'] = 0       #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # keep only the words whose normalized form is in the allowed words file
    if 'allowed_words' in config:
        normalized_allowed_words = set(
            normalizator.normalize(w.rstrip('\n'))
            for w in readlines_from_file(config['allowed_words']))
        words = dict(
            (w, wn) for (w, wn) in words.items()
            if normalizator.normalize(w) in normalized_allowed_words)

    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if os.environ.get('WORDS_GROUPING') == 'YES':
        # cmp= is a Python 2 feature of sorted()
        words_with_freq = sorted(
                words_with_freq,
                cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
                reverse=True)

    output = print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )

    if ('non_interactive' in config or 'text_stats' in config):
        codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
    else:
        _review_in_editor(output, notes, lines)
igor@54: 
# Parse the command line, copy the options into config and run the
# requested internal function.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

# 'pages' is always present; '' means "no page range requested"
config['pages'] = options.pages or ""

# options with a value: stored only when given
for name in ('allowed_words', 'show_range_percentage'):
    value = getattr(options, name)
    if value:
        config[name] = value

# switches: the key is present in config only when the switch is on
for name in (
        'non_interactive',
        'text_stats',
        'compressed',
        'no_filter',
        'two_words',
        'three_words',
        ):
    if getattr(options, name):
        config[name] = True

if options.function:
    function_names = {
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function,
            "\n".join(["   "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
igor@37: 
igor@37: 
igor@37: 
igor@37: 
igor@38: #os.system("vim")
igor@37: