| rev | 
   line source | 
| 
igor@37
 | 
     1 #!/usr/bin/env python
 | 
| 
igor@38
 | 
     2 # -*- coding: utf-8 -*-
 | 
| 
igor@37
 | 
     3 
 | 
| 
igor@40
 | 
     4 from __future__ import with_statement
 | 
| 
igor@38
 | 
     5 import codecs
 | 
| 
igor@49
 | 
     6 import difflib
 | 
| 
igor@38
 | 
     7 import logging
 | 
| 
igor@38
 | 
     8 import os
 | 
| 
igor@37
 | 
     9 import optparse
 | 
| 
igor@38
 | 
    10 import re
 | 
| 
igor@38
 | 
    11 import subprocess
 | 
| 
igor@38
 | 
    12 import sys
 | 
| 
igor@38
 | 
    13 import Stemmer
 | 
| 
igor@42
 | 
    14 try:
 | 
| 
igor@42
 | 
    15     import psyco
 | 
| 
igor@42
 | 
    16     psyco.full()
 | 
| 
igor@42
 | 
    17 except:
 | 
| 
igor@42
 | 
    18     pass
 | 
| 
igor@38
 | 
    19 
 | 
| 
igor@38
 | 
    20 config = {
 | 
| 
igor@38
 | 
    21     'config_directory': os.environ['HOME'] + '/.new-words',
 | 
| 
igor@38
 | 
    22     'language': 'en',
 | 
| 
igor@38
 | 
    23 }
 | 
| 
igor@38
 | 
    24 
 | 
| 
igor@38
 | 
    25 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
 | 
| 
igor@38
 | 
    26 
 | 
| 
igor@38
 | 
    27 class Normalizator:
 | 
| 
igor@38
 | 
    28     def __init__(self, language, linked_words={}):
 | 
| 
igor@38
 | 
    29         stemmer_algorithm = {
 | 
| 
igor@38
 | 
    30             'de' : 'german',
 | 
| 
igor@38
 | 
    31             'en' : 'english',
 | 
| 
igor@51
 | 
    32             'es' : 'spanish',
 | 
| 
igor@38
 | 
    33             'ru' : 'russian',
 | 
| 
igor@51
 | 
    34             'it' : 'italian',
 | 
| 
igor@38
 | 
    35             'uk' : 'ukrainian',
 | 
| 
igor@38
 | 
    36         }
 | 
| 
igor@38
 | 
    37         self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
 | 
| 
igor@38
 | 
    38         self.linked_words = linked_words
 | 
| 
igor@38
 | 
    39 
 | 
| 
igor@38
 | 
    40     def normalize(self, word):
 | 
| 
igor@38
 | 
    41         word_chain = []
 | 
| 
igor@38
 | 
    42         while word in self.linked_words and not word in word_chain:
 | 
| 
igor@38
 | 
    43             word_chain.append(word)
 | 
| 
igor@38
 | 
    44             word = self.linked_words[word]
 | 
| 
igor@38
 | 
    45         return self.stemmer.stemWord(word.lower())
 | 
| 
igor@37
 | 
    46 
 | 
| 
igor@47
 | 
    47     def best_word_from_group(self, wordpairs_group):
 | 
| 
igor@47
 | 
    48         """Returns the word that is the most relevant to the wordpairs_group.
 | 
| 
igor@47
 | 
    49 
 | 
| 
igor@47
 | 
    50         At the moment: returns the word with minimal length"""
 | 
| 
igor@49
 | 
    51         
 | 
| 
igor@49
 | 
    52         def f(x, y):
 | 
| 
igor@49
 | 
    53             return difflib.SequenceMatcher(
 | 
| 
igor@49
 | 
    54                         None, 
 | 
| 
igor@49
 | 
    55                         #(x[-2:] == 'en' and x[:-2].lower() or x.lower()), 
 | 
| 
igor@49
 | 
    56                         x.lower(),
 | 
| 
igor@49
 | 
    57                         y.lower()).ratio()
 | 
| 
igor@47
 | 
    58 
 | 
| 
igor@47
 | 
    59         minimal_length = min(len(pair[1]) for pair in wordpairs_group)
 | 
| 
igor@49
 | 
    60         best_match = list(x[1] for x in sorted(
 | 
| 
igor@47
 | 
    61             (x for x in wordpairs_group if len(x[1]) == minimal_length),
 | 
| 
igor@47
 | 
    62             key=lambda x:x[0],
 | 
| 
igor@47
 | 
    63             reverse=True))[0]
 | 
| 
igor@47
 | 
    64 
 | 
| 
igor@51
 | 
    65         return best_match
 | 
| 
igor@51
 | 
    66 
 | 
| 
igor@49
 | 
    67         suggestions = self.dictionary_suggestions(best_match)
 | 
| 
igor@49
 | 
    68         if len(suggestions) == 1:
 | 
| 
igor@49
 | 
    69             return best_match
 | 
| 
igor@49
 | 
    70 
 | 
| 
igor@49
 | 
    71         verb = False
 | 
| 
igor@49
 | 
    72         corrected_best_match = best_match
 | 
| 
igor@49
 | 
    73         if best_match[-2:] == 'et':
 | 
| 
igor@49
 | 
    74             word = best_match[:-1]+"n"
 | 
| 
igor@49
 | 
    75             sugg = self.dictionary_suggestions(word)
 | 
| 
igor@49
 | 
    76             if len(sugg) == 1:
 | 
| 
igor@49
 | 
    77                 return word
 | 
| 
igor@49
 | 
    78             suggestions += sugg
 | 
| 
igor@49
 | 
    79             corrected_best_match = word
 | 
| 
igor@49
 | 
    80             corrected_best_match = best_match[:-2]
 | 
| 
igor@49
 | 
    81             verb = True
 | 
| 
igor@49
 | 
    82 
 | 
| 
igor@49
 | 
    83         if best_match[-1] == 't':
 | 
| 
igor@49
 | 
    84             word = best_match[:-1]+"en"
 | 
| 
igor@49
 | 
    85             sugg = self.dictionary_suggestions(word)
 | 
| 
igor@49
 | 
    86             if len(sugg) == 1:
 | 
| 
igor@49
 | 
    87                 return word
 | 
| 
igor@49
 | 
    88             suggestions += sugg
 | 
| 
igor@49
 | 
    89             corrected_best_match = best_match[:-1]
 | 
| 
igor@49
 | 
    90             verb = True
 | 
| 
igor@49
 | 
    91 
 | 
| 
igor@49
 | 
    92         if corrected_best_match[0].lower() == corrected_best_match[0]:
 | 
| 
igor@49
 | 
    93             suggestions = [ x for x in suggestions
 | 
| 
igor@49
 | 
    94                 if x[0].lower() == x[0] ]
 | 
| 
igor@49
 | 
    95 
 | 
| 
igor@49
 | 
    96         if suggestions == []:
 | 
| 
igor@49
 | 
    97             return best_match+"_"
 | 
| 
igor@49
 | 
    98         return best_match+" "+(" ".join(
 | 
| 
igor@49
 | 
    99                             sorted(
 | 
| 
igor@49
 | 
   100                                 suggestions,
 | 
| 
igor@49
 | 
   101                                 key = lambda x: f(x, corrected_best_match),
 | 
| 
igor@49
 | 
   102                                 reverse = True
 | 
| 
igor@49
 | 
   103                                 )
 | 
| 
igor@49
 | 
   104                             )
 | 
| 
igor@49
 | 
   105                         )
 | 
| 
igor@49
 | 
   106 
 | 
| 
igor@49
 | 
   107     def dictionary_suggestions(self, word):
 | 
| 
igor@49
 | 
   108         return [
 | 
| 
igor@49
 | 
   109             x.decode('utf-8').rstrip('\n')
 | 
| 
igor@49
 | 
   110             for x 
 | 
| 
igor@49
 | 
   111                 in subprocess.Popen(
 | 
| 
igor@49
 | 
   112                     ["de-variants", word],
 | 
| 
igor@49
 | 
   113                     stdout=subprocess.PIPE
 | 
| 
igor@49
 | 
   114                 ).stdout.readlines() ]
 | 
| 
igor@49
 | 
   115 
 | 
| 
igor@49
 | 
   116 
 | 
| 
igor@37
 | 
   117 parser = optparse.OptionParser()
 | 
| 
igor@37
 | 
   118 
 | 
| 
igor@37
 | 
   119 parser.add_option(
 | 
| 
igor@37
 | 
   120     "-a", "--no-marks",
 | 
| 
igor@37
 | 
   121     help="don't add marks (and don't save marks added by user)",
 | 
| 
igor@37
 | 
   122     action="store_true",
 | 
| 
igor@37
 | 
   123     dest="no_marks")
 | 
| 
igor@37
 | 
   124 
 | 
| 
igor@37
 | 
   125 parser.add_option(
 | 
| 
igor@37
 | 
   126     "-c", "--compressed",
 | 
| 
igor@37
 | 
   127     help="show compressed wordlist: one word per group",
 | 
| 
igor@37
 | 
   128     action="store_true",
 | 
| 
igor@37
 | 
   129     dest="compressed")
 | 
| 
igor@37
 | 
   130 
 | 
| 
igor@37
 | 
   131 parser.add_option(
 | 
| 
igor@37
 | 
   132     "-k", "--known-words",
 | 
| 
igor@37
 | 
   133     help="put higher words that are similar to the known words (only for English)",
 | 
| 
igor@37
 | 
   134     action="store_true",
 | 
| 
igor@37
 | 
   135     dest="compressed")
 | 
| 
igor@37
 | 
   136 
 | 
| 
igor@37
 | 
   137 parser.add_option(
 | 
| 
igor@37
 | 
   138     "-l", "--language",
 | 
| 
igor@37
 | 
   139     help="specify language of text",
 | 
| 
igor@37
 | 
   140     action="store",
 | 
| 
igor@37
 | 
   141     dest="language")
 | 
| 
igor@37
 | 
   142 
 | 
| 
igor@37
 | 
   143 parser.add_option(
 | 
| 
igor@38
 | 
   144     "-f", "--function",
 | 
| 
igor@38
 | 
   145     help="filter through subsystem [INTERNAL]",
 | 
| 
igor@38
 | 
   146     action="store",
 | 
| 
igor@38
 | 
   147     dest="function")
 | 
| 
igor@38
 | 
   148 
 | 
| 
igor@38
 | 
   149 parser.add_option(
 | 
| 
igor@37
 | 
   150     "-m", "--merge-tag",
 | 
| 
igor@37
 | 
   151     help="merge words tagged with specified tag into the main vocabulary",
 | 
| 
igor@37
 | 
   152     action="store",
 | 
| 
igor@37
 | 
   153     dest="merge_tag")
 | 
| 
igor@37
 | 
   154 
 | 
| 
igor@37
 | 
   155 parser.add_option(
 | 
| 
igor@37
 | 
   156     "-M", "--merge-tagged",
 | 
| 
igor@37
 | 
   157     help="merge words tagged with ANY tag into the main vocabulary",
 | 
| 
igor@37
 | 
   158     action="store_true",
 | 
| 
igor@37
 | 
   159     dest="merge_tagged")
 | 
| 
igor@37
 | 
   160 
 | 
| 
igor@37
 | 
   161 parser.add_option(
 | 
| 
igor@37
 | 
   162     "-n", "--non-interactive",
 | 
| 
igor@37
 | 
   163     help="non-interactive mode (don't run vi)",
 | 
| 
igor@37
 | 
   164     action="store_true",
 | 
| 
igor@37
 | 
   165     dest="non_interactive")
 | 
| 
igor@37
 | 
   166 
 | 
| 
igor@37
 | 
   167 parser.add_option(
 | 
| 
igor@37
 | 
   168     "-N", "--no-filter",
 | 
| 
igor@37
 | 
   169     help="switch off known words filtering",
 | 
| 
igor@37
 | 
   170     action="store_true",
 | 
| 
igor@37
 | 
   171     dest="no_filter")
 | 
| 
igor@37
 | 
   172 
 | 
| 
igor@37
 | 
   173 parser.add_option(
 | 
| 
igor@37
 | 
   174     "-p", "--pages",
 | 
| 
igor@37
 | 
   175     help="work with specified pages only (pages = start-stop/total )",
 | 
| 
igor@37
 | 
   176     action="store",
 | 
| 
igor@37
 | 
   177     dest="pages")
 | 
| 
igor@37
 | 
   178 
 | 
| 
igor@37
 | 
   179 parser.add_option(
 | 
| 
igor@48
 | 
   180     "-d", "--delete-tag",
 | 
| 
igor@48
 | 
   181     help="delete subvocabulary of specified tag",
 | 
| 
igor@37
 | 
   182     action="store",
 | 
| 
igor@48
 | 
   183     dest="delete_tag")
 | 
| 
igor@37
 | 
   184 
 | 
| 
igor@37
 | 
   185 parser.add_option(
 | 
| 
igor@37
 | 
   186     "-s", "--text-stats",
 | 
| 
igor@37
 | 
   187     help="show the text statistics (percentage of known words and so on) and exit",
 | 
| 
igor@37
 | 
   188     action="store_true",
 | 
| 
igor@37
 | 
   189     dest="text_stats")
 | 
| 
igor@37
 | 
   190 
 | 
| 
igor@37
 | 
   191 parser.add_option(
 | 
| 
igor@37
 | 
   192     "-S", "--voc-stats",
 | 
| 
igor@37
 | 
   193     help="show your vocabulary statistics (number of words and word groups)",
 | 
| 
igor@37
 | 
   194     action="store_true",
 | 
| 
igor@37
 | 
   195     dest="voc_stats")
 | 
| 
igor@37
 | 
   196 
 | 
| 
igor@37
 | 
   197 parser.add_option(
 | 
| 
igor@37
 | 
   198     "-t", "--tag",
 | 
| 
igor@37
 | 
   199     help="tag known words with tag",
 | 
| 
igor@37
 | 
   200     action="store",
 | 
| 
igor@37
 | 
   201     dest="tag")
 | 
| 
igor@37
 | 
   202 
 | 
| 
igor@37
 | 
   203 parser.add_option(
 | 
| 
igor@37
 | 
   204     "-T", "--show-tags",
 | 
| 
igor@37
 | 
   205     help="tag known words with tag",
 | 
| 
igor@37
 | 
   206     action="store_true",
 | 
| 
igor@37
 | 
   207     dest="show_tags")
 | 
| 
igor@37
 | 
   208 
 | 
| 
igor@37
 | 
   209 parser.add_option(
 | 
| 
igor@37
 | 
   210     "-2", "--two-words",
 | 
| 
igor@37
 | 
   211     help="find 2 words' sequences",
 | 
| 
igor@37
 | 
   212     action="store_true",
 | 
| 
igor@37
 | 
   213     dest="two_words")
 | 
| 
igor@37
 | 
   214 
 | 
| 
igor@37
 | 
   215 parser.add_option(
 | 
| 
igor@37
 | 
   216     "-3", "--three-words",
 | 
| 
igor@37
 | 
   217     help="find 3 words' sequences",
 | 
| 
igor@37
 | 
   218     action="store_true",
 | 
| 
igor@37
 | 
   219     dest="three_words")
 | 
| 
igor@37
 | 
   220 
 | 
| 
igor@38
 | 
   221 def readlines_from_file(filename):
 | 
| 
igor@38
 | 
   222     res = []
 | 
| 
igor@38
 | 
   223     with codecs.open(filename, "r", "utf-8") as f:
 | 
| 
igor@38
 | 
   224         for line in f.readlines():
 | 
| 
igor@38
 | 
   225             res += [line]
 | 
| 
igor@38
 | 
   226     return res
 | 
| 
igor@38
 | 
   227 
 | 
| 
igor@38
 | 
   228 def readlines_from_stdin():
 | 
| 
igor@38
 | 
   229     return codecs.getreader("utf-8")(sys.stdin).readlines()
 | 
| 
igor@38
 | 
   230 
 | 
| 
igor@38
 | 
   231 def words_from_line(line):
 | 
| 
igor@38
 | 
   232     line = line.rstrip('\n')
 | 
| 
igor@38
 | 
   233     #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
 | 
| 
igor@38
 | 
   234     #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
 | 
| 
igor@44
 | 
   235     return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
 | 
| 
igor@38
 | 
   236 
 | 
| 
igor@44
 | 
   237 def get_words(lines, group_by=[1]):
 | 
| 
igor@38
 | 
   238     """
 | 
| 
igor@38
 | 
   239     Returns hash of words in a file
 | 
| 
igor@38
 | 
   240     word => number
 | 
| 
igor@38
 | 
   241     """
 | 
| 
igor@38
 | 
   242     result = {}
 | 
| 
igor@44
 | 
   243     (a, b, c) = ("", "", "")
 | 
| 
igor@38
 | 
   244     for line in lines:
 | 
| 
igor@38
 | 
   245         words = words_from_line(line)
 | 
| 
igor@38
 | 
   246         for word in words:
 | 
| 
igor@41
 | 
   247             if re.match('[0-9]*$', word):
 | 
| 
igor@41
 | 
   248                 continue
 | 
| 
igor@38
 | 
   249             result.setdefault(word, 0)
 | 
| 
igor@38
 | 
   250             result[word] += 1
 | 
| 
igor@44
 | 
   251             if 2 in group_by and a != "" and b != "":
 | 
| 
igor@44
 | 
   252                 w = "%s_%s" % (a,b)
 | 
| 
igor@44
 | 
   253                 result.setdefault(w, 0)
 | 
| 
igor@44
 | 
   254                 result[w] += 1
 | 
| 
igor@44
 | 
   255             if 3 in group_by and not "" in [a,b,c]:
 | 
| 
igor@44
 | 
   256                 w = "%s_%s_%s" % (a,b,c)
 | 
| 
igor@44
 | 
   257                 result.setdefault(w, 0)
 | 
| 
igor@44
 | 
   258                 result[w] += 1
 | 
| 
igor@44
 | 
   259             (a,b,c) = (b, c, word)
 | 
| 
igor@44
 | 
   260 
 | 
| 
igor@44
 | 
   261     logging.debug(result)
 | 
| 
igor@38
 | 
   262     return result
 | 
| 
igor@38
 | 
   263 
 | 
| 
igor@38
 | 
   264 def load_vocabulary():
 | 
| 
igor@38
 | 
   265     return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
 | 
| 
igor@38
 | 
   266 
 | 
| 
igor@38
 | 
   267 def notes_filenames():
 | 
| 
igor@38
 | 
   268     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
 | 
| 
igor@38
 | 
   269 
 | 
| 
igor@38
 | 
   270 def load_notes(files):
 | 
| 
igor@38
 | 
   271     notes = {}
 | 
| 
igor@38
 | 
   272     for filename in files:
 | 
| 
igor@39
 | 
   273         with codecs.open(filename, "r", "utf-8") as f:
 | 
| 
igor@38
 | 
   274             for line in f.readlines():
 | 
| 
igor@38
 | 
   275                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
 | 
| 
igor@38
 | 
   276                 notes.setdefault(word, {})
 | 
| 
igor@38
 | 
   277                 notes[word][filename] = note
 | 
| 
igor@38
 | 
   278     return notes
 | 
| 
igor@38
 | 
   279 
 | 
| 
igor@39
 | 
   280 def add_notes(lines, notes):
 | 
| 
igor@39
 | 
   281     notes_filename = notes_filenames()[0]
 | 
| 
igor@39
 | 
   282     result = []
 | 
| 
igor@39
 | 
   283     for line in lines:
 | 
| 
igor@39
 | 
   284         if line.startswith('#'):
 | 
| 
igor@39
 | 
   285             result += [line]
 | 
| 
igor@39
 | 
   286         else:
 | 
| 
igor@39
 | 
   287             match_object = re.search('^\s*\S+\s*(\S+)', line)
 | 
| 
igor@39
 | 
   288             if match_object:
 | 
| 
igor@39
 | 
   289                 word = match_object.group(1)
 | 
| 
igor@39
 | 
   290                 if word in notes:
 | 
| 
igor@39
 | 
   291                     if notes_filename in notes[word]:
 | 
| 
igor@39
 | 
   292                         line = line.rstrip('\n')
 | 
| 
igor@39
 | 
   293                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
 | 
| 
igor@39
 | 
   294                         result += [line]
 | 
| 
igor@39
 | 
   295                 else:
 | 
| 
igor@39
 | 
   296                     result += [line]
 | 
| 
igor@39
 | 
   297             else:
 | 
| 
igor@39
 | 
   298                 result += [line]
 | 
| 
igor@39
 | 
   299     return result
 | 
| 
igor@39
 | 
   300 
 | 
| 
igor@39
 | 
   301 def remove_notes(lines, notes_group):
 | 
| 
igor@39
 | 
   302     notes_filename = notes_filenames()[0]
 | 
| 
igor@39
 | 
   303     notes = {}
 | 
| 
igor@39
 | 
   304     for k in notes_group.keys():
 | 
| 
igor@39
 | 
   305         if notes_filename in notes_group[k]:
 | 
| 
igor@39
 | 
   306             notes[k] = notes_group[k][notes_filename]
 | 
| 
igor@39
 | 
   307 
 | 
| 
igor@39
 | 
   308     result = []
 | 
| 
igor@39
 | 
   309     for line in lines:
 | 
| 
igor@39
 | 
   310         line = line.rstrip('\n')
 | 
| 
igor@39
 | 
   311         match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
 | 
| 
igor@39
 | 
   312         if match_object:
 | 
| 
igor@39
 | 
   313             result.append("".join([
 | 
| 
igor@39
 | 
   314                 match_object.group(1),
 | 
| 
igor@39
 | 
   315                 match_object.group(2),
 | 
| 
igor@39
 | 
   316                 match_object.group(3),
 | 
| 
igor@39
 | 
   317                 match_object.group(4),
 | 
| 
igor@39
 | 
   318                 "\n"
 | 
| 
igor@39
 | 
   319                 ]))
 | 
| 
igor@39
 | 
   320             notes[match_object.group(4)] = match_object.group(6)
 | 
| 
igor@39
 | 
   321         else:
 | 
| 
igor@39
 | 
   322             result.append(line+"\n")
 | 
| 
igor@39
 | 
   323 
 | 
| 
igor@39
 | 
   324     save_notes(notes_filename, notes)
 | 
| 
igor@39
 | 
   325     return result
 | 
| 
igor@39
 | 
   326 
 | 
| 
igor@39
 | 
   327 def save_notes(filename, notes):
 | 
| 
igor@39
 | 
   328     lines = []
 | 
| 
igor@39
 | 
   329     saved_words = []
 | 
| 
igor@39
 | 
   330     with codecs.open(filename, "r", "utf-8") as f:
 | 
| 
igor@39
 | 
   331         for line in f.readlines():
 | 
| 
igor@39
 | 
   332             (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
 | 
| 
igor@39
 | 
   333             if word in notes:
 | 
| 
igor@39
 | 
   334                 line = "%-29s %s\n" % (word, notes[word])
 | 
| 
igor@39
 | 
   335                 saved_words.append(word)
 | 
| 
igor@39
 | 
   336             lines.append(line)
 | 
| 
igor@39
 | 
   337     for word in [x for x in notes.keys() if not x in saved_words]:
 | 
| 
igor@39
 | 
   338         line = "%-29s %s\n" % (word, notes[word])
 | 
| 
igor@39
 | 
   339         lines.append(line)
 | 
| 
igor@39
 | 
   340 
 | 
| 
igor@39
 | 
   341     with codecs.open(filename, "w", "utf-8") as f:
 | 
| 
igor@39
 | 
   342         for line in lines:
 | 
| 
igor@39
 | 
   343             f.write(line)
 | 
| 
igor@39
 | 
   344 
 | 
| 
igor@39
 | 
   345 
 | 
| 
igor@38
 | 
   346 def substract_dictionary(dict1, dict2):
 | 
| 
igor@38
 | 
   347     """
 | 
| 
igor@38
 | 
   348     returns dict1 - dict2
 | 
| 
igor@38
 | 
   349     """
 | 
| 
igor@38
 | 
   350     result = {}
 | 
| 
igor@38
 | 
   351     for (k,v) in dict1.items():
 | 
| 
igor@38
 | 
   352         if not k in dict2:
 | 
| 
igor@38
 | 
   353             result[k] = v
 | 
| 
igor@38
 | 
   354     return result
 | 
| 
igor@38
 | 
   355 
 | 
| 
igor@38
 | 
   356 def dump_words(words, filename):
 | 
| 
igor@38
 | 
   357     with codecs.open(filename, "w+", "utf-8") as f:
 | 
| 
igor@38
 | 
   358         for word in words.keys():
 | 
| 
igor@38
 | 
   359             f.write(("%s\n"%word)*words[word])
 | 
| 
igor@38
 | 
   360 
 | 
| 
igor@38
 | 
def error_message(text):
    """Print *text* on standard output (Python 2 print statement)."""
    print text
 | 
| 
igor@38
 | 
   363 
 | 
| 
igor@40
 | 
   364 def find_wordgroups_weights(word_pairs, normalizator):
 | 
| 
igor@38
 | 
   365     weight = {}
 | 
| 
igor@40
 | 
   366     for (num, word) in word_pairs:
 | 
| 
igor@38
 | 
   367         normalized = normalizator.normalize(word)
 | 
| 
igor@38
 | 
   368         weight.setdefault(normalized, 0)
 | 
| 
igor@40
 | 
   369         weight[normalized] += num
 | 
| 
igor@38
 | 
   370     return weight
 | 
| 
igor@38
 | 
   371 
 | 
| 
igor@38
 | 
   372 def find_linked_words(notes):
 | 
| 
igor@38
 | 
   373     linked_words = {}
 | 
| 
igor@38
 | 
   374     for word in notes.keys():
 | 
| 
igor@38
 | 
   375         for note in notes[word].values():
 | 
| 
igor@38
 | 
   376             if "@" in note:
 | 
| 
igor@38
 | 
   377                 result = re.search(r'\@(\S*)', note)
 | 
| 
igor@38
 | 
   378                 if result:
 | 
| 
igor@38
 | 
   379                     main_word = result.group(1)
 | 
| 
igor@38
 | 
   380                     if main_word:
 | 
| 
igor@38
 | 
   381                         linked_words[word] = main_word
 | 
| 
igor@38
 | 
   382     return linked_words
 | 
| 
igor@38
 | 
   383 
 | 
| 
igor@40
 | 
   384 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
 | 
| 
igor@40
 | 
   385     (num1, word1) = pair1
 | 
| 
igor@40
 | 
   386     (num2, word2) = pair2
 | 
| 
igor@38
 | 
   387 
 | 
| 
igor@38
 | 
   388     normalized_word1 = normalizator.normalize(word1)
 | 
| 
igor@38
 | 
   389     normalized_word2 = normalizator.normalize(word2)
 | 
| 
igor@38
 | 
   390 
 | 
| 
igor@38
 | 
   391     cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
 | 
| 
igor@38
 | 
   392     if cmp_res != 0:
 | 
| 
igor@38
 | 
   393         return cmp_res
 | 
| 
igor@38
 | 
   394     else:
 | 
| 
igor@38
 | 
   395         cmp_res = cmp(normalized_word1, normalized_word2)
 | 
| 
igor@38
 | 
   396         if cmp_res != 0:
 | 
| 
igor@38
 | 
   397             return cmp_res
 | 
| 
igor@38
 | 
   398         else:
 | 
| 
igor@38
 | 
   399             return cmp(int(num1), int(num2))
 | 
| 
igor@38
 | 
   400 
 | 
| 
igor@47
 | 
   401 
 | 
| 
igor@48
 | 
   402 def print_words_sorted(
 | 
| 
igor@48
 | 
   403         word_pairs,
 | 
| 
igor@48
 | 
   404         stats,
 | 
| 
igor@48
 | 
   405         normalizator,
 | 
| 
igor@48
 | 
   406         print_stats=True,
 | 
| 
igor@48
 | 
   407         stats_only=False,
 | 
| 
igor@48
 | 
   408         compressed_wordlist=False,
 | 
| 
igor@48
 | 
   409         show_range=0,
 | 
| 
igor@48
 | 
   410         show_range_percentage=0,
 | 
| 
igor@48
 | 
   411         ):
 | 
| 
igor@40
 | 
   412     if stats_only:
 | 
| 
igor@43
 | 
   413         codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@43
 | 
   414             " ".join([
 | 
| 
igor@43
 | 
   415                 "%-10s" % x for x in [
 | 
| 
igor@43
 | 
   416                 "LANG",
 | 
| 
igor@43
 | 
   417                 "KNOWN%",
 | 
| 
igor@43
 | 
   418                 "UNKNOWN%",
 | 
| 
igor@43
 | 
   419                 "KNOWN",
 | 
| 
igor@43
 | 
   420                 "TOTAL",
 | 
| 
igor@43
 | 
   421                 "WPS",
 | 
| 
igor@43
 | 
   422                 "UWPS*10"
 | 
| 
igor@43
 | 
   423                 ]]) + "\n")
 | 
| 
igor@43
 | 
   424         codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@43
 | 
   425             " ".join([
 | 
| 
igor@43
 | 
   426                 "%(language)-10s",
 | 
| 
igor@43
 | 
   427                 "%(percentage)-10.2f",
 | 
| 
igor@43
 | 
   428                 "%(percentage_unknown)-10.2f",
 | 
| 
igor@43
 | 
   429                 "%(total_known)-11d"
 | 
| 
igor@43
 | 
   430                 "%(total)-11d"
 | 
| 
igor@43
 | 
   431                 "%(wps)-11d"
 | 
| 
igor@43
 | 
   432                 "%(uwps)-11d"
 | 
| 
igor@43
 | 
   433                 ]) % stats + "\n")
 | 
| 
igor@40
 | 
   434         return
 | 
| 
igor@38
 | 
   435 
 | 
| 
igor@40
 | 
   436     if print_stats:
 | 
| 
igor@40
 | 
   437         codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@43
 | 
   438             "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
 | 
| 
igor@38
 | 
   439 
 | 
| 
igor@40
 | 
   440     level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
 | 
| 
igor@40
 | 
   441     known = int(stats['total_known'])
 | 
| 
igor@40
 | 
   442     total = int(stats['total'])
 | 
| 
igor@40
 | 
   443     current_level = 0
 | 
| 
igor@47
 | 
   444     old_normalized_word = None
 | 
| 
igor@47
 | 
   445     words_of_this_group = []
 | 
| 
igor@48
 | 
   446     printed_words = 0
 | 
| 
igor@40
 | 
   447     for word_pair in word_pairs:
 | 
| 
igor@47
 | 
   448 
 | 
| 
igor@47
 | 
   449         normalized_word = normalizator.normalize(word_pair[1])
 | 
| 
igor@47
 | 
   450         if old_normalized_word and old_normalized_word != normalized_word:
 | 
| 
igor@47
 | 
   451             #codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@47
 | 
   452             #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
 | 
| 
igor@47
 | 
   453             if compressed_wordlist:
 | 
| 
igor@49
 | 
   454                 compressed_word_pair = (
 | 
| 
igor@49
 | 
   455                     sum(x[0] for x in words_of_this_group),
 | 
| 
igor@49
 | 
   456                     normalizator.best_word_from_group(words_of_this_group)
 | 
| 
igor@49
 | 
   457                     )
 | 
| 
igor@47
 | 
   458                 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
 | 
| 
igor@48
 | 
   459                 printed_words += 1
 | 
| 
igor@47
 | 
   460             words_of_this_group = []
 | 
| 
igor@47
 | 
   461 
 | 
| 
igor@47
 | 
   462         old_normalized_word = normalized_word
 | 
| 
igor@47
 | 
   463         words_of_this_group.append(word_pair)
 | 
| 
igor@47
 | 
   464 
 | 
| 
igor@47
 | 
   465         if not compressed_wordlist:
 | 
| 
igor@47
 | 
   466             codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
 | 
| 
igor@48
 | 
   467             printed_words += 1
 | 
| 
igor@47
 | 
   468 
 | 
| 
igor@47
 | 
   469 
 | 
| 
igor@40
 | 
   470         known += word_pair[0]
 | 
| 
igor@40
 | 
   471         if 100.0*known/total >= level_lines[0]:
 | 
| 
igor@40
 | 
   472             current_level = level_lines[0]
 | 
| 
igor@40
 | 
   473             while 100.0*known/total > level_lines[0]:
 | 
| 
igor@40
 | 
   474                 current_level = level_lines[0]
 | 
| 
igor@40
 | 
   475                 level_lines = level_lines[1:]
 | 
| 
igor@40
 | 
   476             codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
 | 
| 
igor@38
 | 
   477 
 | 
| 
igor@48
 | 
   478         if show_range >0 and printed_words >= show_range:
 | 
| 
igor@48
 | 
   479             break
 | 
| 
igor@48
 | 
   480         if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
 | 
| 
igor@48
 | 
   481             break
 | 
| 
igor@48
 | 
   482 
 | 
| 
igor@39
 | 
   483 def filter_add_notes(args):
 | 
| 
igor@39
 | 
   484     lines = readlines_from_file(args[0])
 | 
| 
igor@39
 | 
   485     notes = load_notes(notes_filenames())
 | 
| 
igor@39
 | 
   486     lines = add_notes(lines, notes)
 | 
| 
igor@39
 | 
   487     with codecs.open(args[0], "w", "utf-8") as f:
 | 
| 
igor@39
 | 
   488         for line in lines:
 | 
| 
igor@39
 | 
   489             f.write(line)
 | 
| 
igor@39
 | 
   490 
 | 
| 
igor@39
 | 
   491 def filter_remove_notes(args):
 | 
| 
igor@39
 | 
   492     lines = readlines_from_file(args[0])
 | 
| 
igor@39
 | 
   493     notes = load_notes(notes_filenames())
 | 
| 
igor@39
 | 
   494     lines = remove_notes(lines, notes)
 | 
| 
igor@39
 | 
   495     with codecs.open(args[0], "w", "utf-8") as f:
 | 
| 
igor@39
 | 
   496         for line in lines:
 | 
| 
igor@39
 | 
   497             f.write(line)
 | 
| 
igor@39
 | 
   498 
 | 
| 
igor@40
 | 
   499 def filter_get_words_group_words_add_stat(args):
 | 
| 
igor@40
 | 
   500     vocabulary = load_vocabulary()
 | 
| 
igor@40
 | 
   501     notes = load_notes(notes_filenames())
 | 
| 
igor@40
 | 
   502     lines = readlines_from_stdin()
 | 
| 
igor@44
 | 
   503     group_by = [1]
 | 
| 
igor@48
 | 
   504 
 | 
| 
igor@44
 | 
   505     if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
 | 
| 
igor@44
 | 
   506         group_by.append(2)
 | 
| 
igor@44
 | 
   507     if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
 | 
| 
igor@44
 | 
   508         group_by.append(3)
 | 
| 
igor@44
 | 
   509     words = get_words(lines, group_by)
 | 
| 
igor@43
 | 
   510     stats_only = False
 | 
| 
igor@43
 | 
   511     if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
 | 
| 
igor@43
 | 
   512         stats_only = True
 | 
| 
igor@40
 | 
   513 
 | 
| 
igor@47
 | 
   514     compressed_wordlist = False
 | 
| 
igor@47
 | 
   515     if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
 | 
| 
igor@47
 | 
   516         compressed_wordlist = True
 | 
| 
igor@47
 | 
   517 
 | 
| 
igor@48
 | 
   518     show_range = os.environ.get('SHOW_RANGE', '')
 | 
| 
igor@48
 | 
   519     if show_range != '':
 | 
| 
igor@48
 | 
   520         show_range = int(show_range)
 | 
| 
igor@48
 | 
   521     else:
 | 
| 
igor@48
 | 
   522         show_range = 0
 | 
| 
igor@48
 | 
   523     show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
 | 
| 
igor@48
 | 
   524     if show_range_percentage != '':
 | 
| 
igor@48
 | 
   525         show_range_percentage = int(show_range_percentage)
 | 
| 
igor@48
 | 
   526     else:
 | 
| 
igor@48
 | 
   527         show_range_percentage = 0
 | 
| 
igor@48
 | 
   528 
 | 
| 
igor@44
 | 
   529 
 | 
| 
igor@40
 | 
   530     stats = {}
 | 
| 
igor@40
 | 
   531     stats['total'] = sum(words[x] for x in words.keys())
 | 
| 
igor@45
 | 
   532     if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
 | 
| 
igor@45
 | 
   533         words = substract_dictionary(words, vocabulary)
 | 
| 
igor@40
 | 
   534 
 | 
| 
igor@40
 | 
   535     stats['total_unknown'] = sum(words[x] for x in words.keys())
 | 
| 
igor@40
 | 
   536     stats['total_known'] = stats['total'] - stats['total_unknown']
 | 
| 
igor@43
 | 
   537     stats['percentage'] = 100.0*stats['total_known']/stats['total']
 | 
| 
igor@43
 | 
   538     stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
 | 
| 
igor@40
 | 
   539     stats['groups'] = 0
 | 
| 
igor@40
 | 
   540     stats['words'] = len(words)
 | 
| 
igor@43
 | 
   541     stats['sentences'] = 0  #FIXME
 | 
| 
igor@43
 | 
   542     stats['wps'] = 0        #FIXME
 | 
| 
igor@43
 | 
   543     stats['uwps'] = 0       #FIXME
 | 
| 
igor@40
 | 
   544     stats['language'] = config['language']
 | 
| 
igor@40
 | 
   545 
 | 
| 
igor@40
 | 
   546     linked_words = find_linked_words(notes)
 | 
| 
igor@40
 | 
   547     normalizator = Normalizator(config['language'], linked_words)
 | 
| 
igor@40
 | 
   548 
 | 
| 
igor@50
 | 
   549     # filter words by allowed_words_filter
 | 
| 
igor@50
 | 
   550     if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
 | 
| 
igor@50
 | 
   551         allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
 | 
| 
igor@50
 | 
   552         normalized_allowed_words = [
 | 
| 
igor@50
 | 
   553             normalizator.normalize(w.rstrip('\n')) 
 | 
| 
igor@50
 | 
   554             for w in readlines_from_file(allowed_words_filename)
 | 
| 
igor@50
 | 
   555         ]
 | 
| 
igor@50
 | 
   556 
 | 
| 
igor@50
 | 
   557         result = {}
 | 
| 
igor@50
 | 
   558         for w, wn in words.iteritems():
 | 
| 
igor@50
 | 
   559             if normalizator.normalize(w) in normalized_allowed_words:
 | 
| 
igor@50
 | 
   560                 result[w] = wn
 | 
| 
igor@50
 | 
   561         words = result
 | 
| 
igor@50
 | 
   562 
 | 
| 
igor@44
 | 
   563     words_with_freq = []
 | 
| 
igor@40
 | 
   564     for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
 | 
| 
igor@44
 | 
   565         words_with_freq.append((words[k], k))
 | 
| 
igor@40
 | 
   566 
 | 
| 
igor@44
 | 
   567     wgw = find_wordgroups_weights(words_with_freq, normalizator)
 | 
| 
igor@45
 | 
   568     if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
 | 
| 
igor@45
 | 
   569         words_with_freq = sorted(
 | 
| 
igor@44
 | 
   570                 words_with_freq,
 | 
| 
igor@40
 | 
   571                 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
 | 
| 
igor@40
 | 
   572                 reverse=True)
 | 
| 
igor@40
 | 
   573 
 | 
| 
igor@47
 | 
   574     print_words_sorted(
 | 
| 
igor@47
 | 
   575         words_with_freq,
 | 
| 
igor@47
 | 
   576         stats,
 | 
| 
igor@47
 | 
   577         normalizator,
 | 
| 
igor@47
 | 
   578         stats_only=stats_only,
 | 
| 
igor@48
 | 
   579         compressed_wordlist=compressed_wordlist,
 | 
| 
igor@48
 | 
   580         show_range=show_range,
 | 
| 
igor@48
 | 
   581         show_range_percentage=show_range_percentage,
 | 
| 
igor@47
 | 
   582         )
 | 
| 
igor@40
 | 
   583 
 | 
| 
igor@37
 | 
   584 (options, args) = parser.parse_args()
 | 
| 
igor@38
 | 
   585 if options.language:
 | 
| 
igor@38
 | 
   586     config['language'] = options.language
 | 
| 
igor@37
 | 
   587 
 | 
| 
igor@38
 | 
   588 if options.function:
 | 
| 
igor@38
 | 
   589     function_names = {
 | 
| 
igor@39
 | 
   590         'add_notes' :   filter_add_notes,
 | 
| 
igor@39
 | 
   591         'remove_notes': filter_remove_notes,
 | 
| 
igor@40
 | 
   592         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
 | 
| 
igor@38
 | 
   593     }
 | 
| 
igor@38
 | 
   594     if options.function in function_names:
 | 
| 
igor@38
 | 
   595         function_names[options.function](args)
 | 
| 
igor@38
 | 
   596     else:
 | 
| 
igor@38
 | 
   597         error_message("Unkown function %s.\nAvailable functions:\n%s" % (
 | 
| 
igor@38
 | 
   598             options.function, "".join(["   "+x for x in sorted(function_names.keys())])))
 | 
| 
igor@38
 | 
   599         sys.exit(1)
 | 
| 
igor@37
 | 
   600 
 | 
| 
igor@37
 | 
   601 
 | 
| 
igor@37
 | 
   602 
 | 
| 
igor@37
 | 
   603 
 | 
| 
igor@38
 | 
   604 #os.system("vim")
 | 
| 
igor@37
 | 
   605 
 |