| rev | 
   line source | 
| 
igor@37
 | 
     1 #!/usr/bin/env python
 | 
| 
igor@38
 | 
     2 # -*- coding: utf-8 -*-
 | 
| 
igor@37
 | 
     3 
 | 
| 
igor@40
 | 
     4 from __future__ import with_statement
 | 
| 
igor@38
 | 
     5 import codecs
 | 
| 
igor@38
 | 
     6 import logging
 | 
| 
igor@38
 | 
     7 import os
 | 
| 
igor@37
 | 
     8 import optparse
 | 
| 
igor@38
 | 
     9 import re
 | 
| 
igor@38
 | 
    10 import subprocess
 | 
| 
igor@38
 | 
    11 import sys
 | 
| 
igor@38
 | 
    12 import Stemmer
 | 
| 
igor@42
 | 
    13 try:
 | 
| 
igor@42
 | 
    14     import psyco
 | 
| 
igor@42
 | 
    15     psyco.full()
 | 
| 
igor@42
 | 
    16 except:
 | 
| 
igor@42
 | 
    17     pass
 | 
| 
igor@38
 | 
    18 
 | 
| 
igor@38
 | 
    19 config = {
 | 
| 
igor@38
 | 
    20     'config_directory': os.environ['HOME'] + '/.new-words',
 | 
| 
igor@38
 | 
    21     'language': 'en',
 | 
| 
igor@38
 | 
    22 }
 | 
| 
igor@38
 | 
    23 
 | 
| 
igor@38
 | 
    24 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
 | 
| 
igor@38
 | 
    25 
 | 
| 
igor@38
 | 
    26 class Normalizator:
 | 
| 
igor@38
 | 
    27     def __init__(self, language, linked_words={}):
 | 
| 
igor@38
 | 
    28         stemmer_algorithm = {
 | 
| 
igor@38
 | 
    29             'de' : 'german',
 | 
| 
igor@38
 | 
    30             'en' : 'english',
 | 
| 
igor@38
 | 
    31             'ru' : 'russian',
 | 
| 
igor@38
 | 
    32             'uk' : 'ukrainian',
 | 
| 
igor@38
 | 
    33         }
 | 
| 
igor@38
 | 
    34         self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
 | 
| 
igor@38
 | 
    35         self.linked_words = linked_words
 | 
| 
igor@38
 | 
    36 
 | 
| 
igor@38
 | 
    37     def normalize(self, word):
 | 
| 
igor@38
 | 
    38         word_chain = []
 | 
| 
igor@38
 | 
    39         while word in self.linked_words and not word in word_chain:
 | 
| 
igor@38
 | 
    40             word_chain.append(word)
 | 
| 
igor@38
 | 
    41             word = self.linked_words[word]
 | 
| 
igor@38
 | 
    42         return self.stemmer.stemWord(word.lower())
 | 
| 
igor@37
 | 
    43 
 | 
| 
igor@47
 | 
    44     def best_word_from_group(self, wordpairs_group):
 | 
| 
igor@47
 | 
    45         """Returns the word that is the most relevant to the wordpairs_group.
 | 
| 
igor@47
 | 
    46 
 | 
| 
igor@47
 | 
    47         At the moment: returns the word with minimal length"""
 | 
| 
igor@47
 | 
    48 
 | 
| 
igor@47
 | 
    49         minimal_length = min(len(pair[1]) for pair in wordpairs_group)
 | 
| 
igor@47
 | 
    50         return list(x[1] for x in sorted(
 | 
| 
igor@47
 | 
    51             (x for x in wordpairs_group if len(x[1]) == minimal_length),
 | 
| 
igor@47
 | 
    52             key=lambda x:x[0],
 | 
| 
igor@47
 | 
    53             reverse=True))[0]
 | 
| 
igor@47
 | 
    54 
 | 
| 
igor@37
 | 
    55 parser = optparse.OptionParser()
 | 
| 
igor@37
 | 
    56 
 | 
| 
igor@37
 | 
    57 parser.add_option(
 | 
| 
igor@37
 | 
    58     "-a", "--no-marks",
 | 
| 
igor@37
 | 
    59     help="don't add marks (and don't save marks added by user)",
 | 
| 
igor@37
 | 
    60     action="store_true",
 | 
| 
igor@37
 | 
    61     dest="no_marks")
 | 
| 
igor@37
 | 
    62 
 | 
| 
igor@37
 | 
    63 parser.add_option(
 | 
| 
igor@37
 | 
    64     "-c", "--compressed",
 | 
| 
igor@37
 | 
    65     help="show compressed wordlist: one word per group",
 | 
| 
igor@37
 | 
    66     action="store_true",
 | 
| 
igor@37
 | 
    67     dest="compressed")
 | 
| 
igor@37
 | 
    68 
 | 
| 
igor@37
 | 
    69 parser.add_option(
 | 
| 
igor@37
 | 
    70     "-k", "--known-words",
 | 
| 
igor@37
 | 
    71     help="put higher words that are similar to the known words (only for English)",
 | 
| 
igor@37
 | 
    72     action="store_true",
 | 
| 
igor@37
 | 
    73     dest="compressed")
 | 
| 
igor@37
 | 
    74 
 | 
| 
igor@37
 | 
    75 parser.add_option(
 | 
| 
igor@37
 | 
    76     "-l", "--language",
 | 
| 
igor@37
 | 
    77     help="specify language of text",
 | 
| 
igor@37
 | 
    78     action="store",
 | 
| 
igor@37
 | 
    79     dest="language")
 | 
| 
igor@37
 | 
    80 
 | 
| 
igor@37
 | 
    81 parser.add_option(
 | 
| 
igor@38
 | 
    82     "-f", "--function",
 | 
| 
igor@38
 | 
    83     help="filter through subsystem [INTERNAL]",
 | 
| 
igor@38
 | 
    84     action="store",
 | 
| 
igor@38
 | 
    85     dest="function")
 | 
| 
igor@38
 | 
    86 
 | 
| 
igor@38
 | 
    87 parser.add_option(
 | 
| 
igor@37
 | 
    88     "-m", "--merge-tag",
 | 
| 
igor@37
 | 
    89     help="merge words tagged with specified tag into the main vocabulary",
 | 
| 
igor@37
 | 
    90     action="store",
 | 
| 
igor@37
 | 
    91     dest="merge_tag")
 | 
| 
igor@37
 | 
    92 
 | 
| 
igor@37
 | 
    93 parser.add_option(
 | 
| 
igor@37
 | 
    94     "-M", "--merge-tagged",
 | 
| 
igor@37
 | 
    95     help="merge words tagged with ANY tag into the main vocabulary",
 | 
| 
igor@37
 | 
    96     action="store_true",
 | 
| 
igor@37
 | 
    97     dest="merge_tagged")
 | 
| 
igor@37
 | 
    98 
 | 
| 
igor@37
 | 
    99 parser.add_option(
 | 
| 
igor@37
 | 
   100     "-n", "--non-interactive",
 | 
| 
igor@37
 | 
   101     help="non-interactive mode (don't run vi)",
 | 
| 
igor@37
 | 
   102     action="store_true",
 | 
| 
igor@37
 | 
   103     dest="non_interactive")
 | 
| 
igor@37
 | 
   104 
 | 
| 
igor@37
 | 
   105 parser.add_option(
 | 
| 
igor@37
 | 
   106     "-N", "--no-filter",
 | 
| 
igor@37
 | 
   107     help="switch off known words filtering",
 | 
| 
igor@37
 | 
   108     action="store_true",
 | 
| 
igor@37
 | 
   109     dest="no_filter")
 | 
| 
igor@37
 | 
   110 
 | 
| 
igor@37
 | 
   111 parser.add_option(
 | 
| 
igor@37
 | 
   112     "-p", "--pages",
 | 
| 
igor@37
 | 
   113     help="work with specified pages only (pages = start-stop/total )",
 | 
| 
igor@37
 | 
   114     action="store",
 | 
| 
igor@37
 | 
   115     dest="pages")
 | 
| 
igor@37
 | 
   116 
 | 
| 
igor@37
 | 
   117 parser.add_option(
 | 
| 
igor@37
 | 
   118     "-r", "--remove-tag",
 | 
| 
igor@37
 | 
   119     help="remove subvocabulary of specified tag",
 | 
| 
igor@37
 | 
   120     action="store",
 | 
| 
igor@37
 | 
   121     dest="remove_tag")
 | 
| 
igor@37
 | 
   122 
 | 
| 
igor@37
 | 
   123 parser.add_option(
 | 
| 
igor@37
 | 
   124     "-s", "--text-stats",
 | 
| 
igor@37
 | 
   125     help="show the text statistics (percentage of known words and so on) and exit",
 | 
| 
igor@37
 | 
   126     action="store_true",
 | 
| 
igor@37
 | 
   127     dest="text_stats")
 | 
| 
igor@37
 | 
   128 
 | 
| 
igor@37
 | 
   129 parser.add_option(
 | 
| 
igor@37
 | 
   130     "-S", "--voc-stats",
 | 
| 
igor@37
 | 
   131     help="show your vocabulary statistics (number of words and word groups)",
 | 
| 
igor@37
 | 
   132     action="store_true",
 | 
| 
igor@37
 | 
   133     dest="voc_stats")
 | 
| 
igor@37
 | 
   134 
 | 
| 
igor@37
 | 
   135 parser.add_option(
 | 
| 
igor@37
 | 
   136     "-t", "--tag",
 | 
| 
igor@37
 | 
   137     help="tag known words with tag",
 | 
| 
igor@37
 | 
   138     action="store",
 | 
| 
igor@37
 | 
   139     dest="tag")
 | 
| 
igor@37
 | 
   140 
 | 
| 
igor@37
 | 
   141 parser.add_option(
 | 
| 
igor@37
 | 
   142     "-T", "--show-tags",
 | 
| 
igor@37
 | 
   143     help="tag known words with tag",
 | 
| 
igor@37
 | 
   144     action="store_true",
 | 
| 
igor@37
 | 
   145     dest="show_tags")
 | 
| 
igor@37
 | 
   146 
 | 
| 
igor@37
 | 
   147 parser.add_option(
 | 
| 
igor@37
 | 
   148     "-2", "--two-words",
 | 
| 
igor@37
 | 
   149     help="find 2 words' sequences",
 | 
| 
igor@37
 | 
   150     action="store_true",
 | 
| 
igor@37
 | 
   151     dest="two_words")
 | 
| 
igor@37
 | 
   152 
 | 
| 
igor@37
 | 
   153 parser.add_option(
 | 
| 
igor@37
 | 
   154     "-3", "--three-words",
 | 
| 
igor@37
 | 
   155     help="find 3 words' sequences",
 | 
| 
igor@37
 | 
   156     action="store_true",
 | 
| 
igor@37
 | 
   157     dest="three_words")
 | 
| 
igor@37
 | 
   158 
 | 
| 
igor@38
 | 
   159 def readlines_from_file(filename):
 | 
| 
igor@38
 | 
   160     res = []
 | 
| 
igor@38
 | 
   161     with codecs.open(filename, "r", "utf-8") as f:
 | 
| 
igor@38
 | 
   162         for line in f.readlines():
 | 
| 
igor@38
 | 
   163             res += [line]
 | 
| 
igor@38
 | 
   164     return res
 | 
| 
igor@38
 | 
   165 
 | 
| 
igor@38
 | 
   166 def readlines_from_stdin():
 | 
| 
igor@38
 | 
   167     return codecs.getreader("utf-8")(sys.stdin).readlines()
 | 
| 
igor@38
 | 
   168 
 | 
| 
igor@38
 | 
   169 def words_from_line(line):
 | 
| 
igor@38
 | 
   170     line = line.rstrip('\n')
 | 
| 
igor@38
 | 
   171     #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
 | 
| 
igor@38
 | 
   172     #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
 | 
| 
igor@44
 | 
   173     return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
 | 
| 
igor@38
 | 
   174 
 | 
| 
igor@44
 | 
   175 def get_words(lines, group_by=[1]):
 | 
| 
igor@38
 | 
   176     """
 | 
| 
igor@38
 | 
   177     Returns hash of words in a file
 | 
| 
igor@38
 | 
   178     word => number
 | 
| 
igor@38
 | 
   179     """
 | 
| 
igor@38
 | 
   180     result = {}
 | 
| 
igor@44
 | 
   181     (a, b, c) = ("", "", "")
 | 
| 
igor@38
 | 
   182     for line in lines:
 | 
| 
igor@38
 | 
   183         words = words_from_line(line)
 | 
| 
igor@38
 | 
   184         for word in words:
 | 
| 
igor@41
 | 
   185             if re.match('[0-9]*$', word):
 | 
| 
igor@41
 | 
   186                 continue
 | 
| 
igor@38
 | 
   187             result.setdefault(word, 0)
 | 
| 
igor@38
 | 
   188             result[word] += 1
 | 
| 
igor@44
 | 
   189             if 2 in group_by and a != "" and b != "":
 | 
| 
igor@44
 | 
   190                 w = "%s_%s" % (a,b)
 | 
| 
igor@44
 | 
   191                 result.setdefault(w, 0)
 | 
| 
igor@44
 | 
   192                 result[w] += 1
 | 
| 
igor@44
 | 
   193             if 3 in group_by and not "" in [a,b,c]:
 | 
| 
igor@44
 | 
   194                 w = "%s_%s_%s" % (a,b,c)
 | 
| 
igor@44
 | 
   195                 result.setdefault(w, 0)
 | 
| 
igor@44
 | 
   196                 result[w] += 1
 | 
| 
igor@44
 | 
   197             (a,b,c) = (b, c, word)
 | 
| 
igor@44
 | 
   198 
 | 
| 
igor@44
 | 
   199     logging.debug(result)
 | 
| 
igor@38
 | 
   200     return result
 | 
| 
igor@38
 | 
   201 
 | 
| 
igor@38
 | 
   202 def load_vocabulary():
 | 
| 
igor@38
 | 
   203     return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
 | 
| 
igor@38
 | 
   204 
 | 
| 
igor@38
 | 
   205 def notes_filenames():
 | 
| 
igor@38
 | 
   206     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
 | 
| 
igor@38
 | 
   207 
 | 
| 
igor@38
 | 
   208 def load_notes(files):
 | 
| 
igor@38
 | 
   209     notes = {}
 | 
| 
igor@38
 | 
   210     for filename in files:
 | 
| 
igor@39
 | 
   211         with codecs.open(filename, "r", "utf-8") as f:
 | 
| 
igor@38
 | 
   212             for line in f.readlines():
 | 
| 
igor@38
 | 
   213                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
 | 
| 
igor@38
 | 
   214                 notes.setdefault(word, {})
 | 
| 
igor@38
 | 
   215                 notes[word][filename] = note
 | 
| 
igor@38
 | 
   216     return notes
 | 
| 
igor@38
 | 
   217 
 | 
| 
igor@39
 | 
   218 def add_notes(lines, notes):
 | 
| 
igor@39
 | 
   219     notes_filename = notes_filenames()[0]
 | 
| 
igor@39
 | 
   220     result = []
 | 
| 
igor@39
 | 
   221     for line in lines:
 | 
| 
igor@39
 | 
   222         if line.startswith('#'):
 | 
| 
igor@39
 | 
   223             result += [line]
 | 
| 
igor@39
 | 
   224         else:
 | 
| 
igor@39
 | 
   225             match_object = re.search('^\s*\S+\s*(\S+)', line)
 | 
| 
igor@39
 | 
   226             if match_object:
 | 
| 
igor@39
 | 
   227                 word = match_object.group(1)
 | 
| 
igor@39
 | 
   228                 if word in notes:
 | 
| 
igor@39
 | 
   229                     if notes_filename in notes[word]:
 | 
| 
igor@39
 | 
   230                         line = line.rstrip('\n')
 | 
| 
igor@39
 | 
   231                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
 | 
| 
igor@39
 | 
   232                         result += [line]
 | 
| 
igor@39
 | 
   233                 else:
 | 
| 
igor@39
 | 
   234                     result += [line]
 | 
| 
igor@39
 | 
   235             else:
 | 
| 
igor@39
 | 
   236                 result += [line]
 | 
| 
igor@39
 | 
   237     return result
 | 
| 
igor@39
 | 
   238 
 | 
| 
igor@39
 | 
   239 def remove_notes(lines, notes_group):
 | 
| 
igor@39
 | 
   240     notes_filename = notes_filenames()[0]
 | 
| 
igor@39
 | 
   241     notes = {}
 | 
| 
igor@39
 | 
   242     for k in notes_group.keys():
 | 
| 
igor@39
 | 
   243         if notes_filename in notes_group[k]:
 | 
| 
igor@39
 | 
   244             notes[k] = notes_group[k][notes_filename]
 | 
| 
igor@39
 | 
   245 
 | 
| 
igor@39
 | 
   246     result = []
 | 
| 
igor@39
 | 
   247     for line in lines:
 | 
| 
igor@39
 | 
   248         line = line.rstrip('\n')
 | 
| 
igor@39
 | 
   249         match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
 | 
| 
igor@39
 | 
   250         if match_object:
 | 
| 
igor@39
 | 
   251             result.append("".join([
 | 
| 
igor@39
 | 
   252                 match_object.group(1),
 | 
| 
igor@39
 | 
   253                 match_object.group(2),
 | 
| 
igor@39
 | 
   254                 match_object.group(3),
 | 
| 
igor@39
 | 
   255                 match_object.group(4),
 | 
| 
igor@39
 | 
   256                 "\n"
 | 
| 
igor@39
 | 
   257                 ]))
 | 
| 
igor@39
 | 
   258             notes[match_object.group(4)] = match_object.group(6)
 | 
| 
igor@39
 | 
   259         else:
 | 
| 
igor@39
 | 
   260             result.append(line+"\n")
 | 
| 
igor@39
 | 
   261 
 | 
| 
igor@39
 | 
   262     save_notes(notes_filename, notes)
 | 
| 
igor@39
 | 
   263     return result
 | 
| 
igor@39
 | 
   264 
 | 
| 
igor@39
 | 
   265 def save_notes(filename, notes):
 | 
| 
igor@39
 | 
   266     lines = []
 | 
| 
igor@39
 | 
   267     saved_words = []
 | 
| 
igor@39
 | 
   268     with codecs.open(filename, "r", "utf-8") as f:
 | 
| 
igor@39
 | 
   269         for line in f.readlines():
 | 
| 
igor@39
 | 
   270             (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
 | 
| 
igor@39
 | 
   271             if word in notes:
 | 
| 
igor@39
 | 
   272                 line = "%-29s %s\n" % (word, notes[word])
 | 
| 
igor@39
 | 
   273                 saved_words.append(word)
 | 
| 
igor@39
 | 
   274             lines.append(line)
 | 
| 
igor@39
 | 
   275     for word in [x for x in notes.keys() if not x in saved_words]:
 | 
| 
igor@39
 | 
   276         line = "%-29s %s\n" % (word, notes[word])
 | 
| 
igor@39
 | 
   277         lines.append(line)
 | 
| 
igor@39
 | 
   278 
 | 
| 
igor@39
 | 
   279     with codecs.open(filename, "w", "utf-8") as f:
 | 
| 
igor@39
 | 
   280         for line in lines:
 | 
| 
igor@39
 | 
   281             f.write(line)
 | 
| 
igor@39
 | 
   282 
 | 
| 
igor@39
 | 
   283 
 | 
| 
igor@38
 | 
   284 def substract_dictionary(dict1, dict2):
 | 
| 
igor@38
 | 
   285     """
 | 
| 
igor@38
 | 
   286     returns dict1 - dict2
 | 
| 
igor@38
 | 
   287     """
 | 
| 
igor@38
 | 
   288     result = {}
 | 
| 
igor@38
 | 
   289     for (k,v) in dict1.items():
 | 
| 
igor@38
 | 
   290         if not k in dict2:
 | 
| 
igor@38
 | 
   291             result[k] = v
 | 
| 
igor@38
 | 
   292     return result
 | 
| 
igor@38
 | 
   293 
 | 
| 
igor@38
 | 
   294 def dump_words(words, filename):
 | 
| 
igor@38
 | 
   295     with codecs.open(filename, "w+", "utf-8") as f:
 | 
| 
igor@38
 | 
   296         for word in words.keys():
 | 
| 
igor@38
 | 
   297             f.write(("%s\n"%word)*words[word])
 | 
| 
igor@38
 | 
   298 
 | 
| 
igor@38
 | 
   299 def error_message(text):
 | 
| 
igor@38
 | 
   300     print text
 | 
| 
igor@38
 | 
   301 
 | 
| 
igor@40
 | 
   302 def find_wordgroups_weights(word_pairs, normalizator):
 | 
| 
igor@38
 | 
   303     weight = {}
 | 
| 
igor@40
 | 
   304     for (num, word) in word_pairs:
 | 
| 
igor@38
 | 
   305         normalized = normalizator.normalize(word)
 | 
| 
igor@38
 | 
   306         weight.setdefault(normalized, 0)
 | 
| 
igor@40
 | 
   307         weight[normalized] += num
 | 
| 
igor@38
 | 
   308     return weight
 | 
| 
igor@38
 | 
   309 
 | 
| 
igor@38
 | 
   310 def find_linked_words(notes):
 | 
| 
igor@38
 | 
   311     linked_words = {}
 | 
| 
igor@38
 | 
   312     for word in notes.keys():
 | 
| 
igor@38
 | 
   313         for note in notes[word].values():
 | 
| 
igor@38
 | 
   314             if "@" in note:
 | 
| 
igor@38
 | 
   315                 result = re.search(r'\@(\S*)', note)
 | 
| 
igor@38
 | 
   316                 if result:
 | 
| 
igor@38
 | 
   317                     main_word = result.group(1)
 | 
| 
igor@38
 | 
   318                     if main_word:
 | 
| 
igor@38
 | 
   319                         linked_words[word] = main_word
 | 
| 
igor@38
 | 
   320     return linked_words
 | 
| 
igor@38
 | 
   321 
 | 
| 
igor@40
 | 
   322 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
 | 
| 
igor@40
 | 
   323     (num1, word1) = pair1
 | 
| 
igor@40
 | 
   324     (num2, word2) = pair2
 | 
| 
igor@38
 | 
   325 
 | 
| 
igor@38
 | 
   326     normalized_word1 = normalizator.normalize(word1)
 | 
| 
igor@38
 | 
   327     normalized_word2 = normalizator.normalize(word2)
 | 
| 
igor@38
 | 
   328 
 | 
| 
igor@38
 | 
   329     cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
 | 
| 
igor@38
 | 
   330     if cmp_res != 0:
 | 
| 
igor@38
 | 
   331         return cmp_res
 | 
| 
igor@38
 | 
   332     else:
 | 
| 
igor@38
 | 
   333         cmp_res = cmp(normalized_word1, normalized_word2)
 | 
| 
igor@38
 | 
   334         if cmp_res != 0:
 | 
| 
igor@38
 | 
   335             return cmp_res
 | 
| 
igor@38
 | 
   336         else:
 | 
| 
igor@38
 | 
   337             return cmp(int(num1), int(num2))
 | 
| 
igor@38
 | 
   338 
 | 
| 
igor@47
 | 
   339 
 | 
| 
igor@47
 | 
   340 def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
 | 
| 
igor@40
 | 
   341     if stats_only:
 | 
| 
igor@43
 | 
   342         codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@43
 | 
   343             " ".join([
 | 
| 
igor@43
 | 
   344                 "%-10s" % x for x in [
 | 
| 
igor@43
 | 
   345                 "LANG",
 | 
| 
igor@43
 | 
   346                 "KNOWN%",
 | 
| 
igor@43
 | 
   347                 "UNKNOWN%",
 | 
| 
igor@43
 | 
   348                 "KNOWN",
 | 
| 
igor@43
 | 
   349                 "TOTAL",
 | 
| 
igor@43
 | 
   350                 "WPS",
 | 
| 
igor@43
 | 
   351                 "UWPS*10"
 | 
| 
igor@43
 | 
   352                 ]]) + "\n")
 | 
| 
igor@43
 | 
   353         codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@43
 | 
   354             " ".join([
 | 
| 
igor@43
 | 
   355                 "%(language)-10s",
 | 
| 
igor@43
 | 
   356                 "%(percentage)-10.2f",
 | 
| 
igor@43
 | 
   357                 "%(percentage_unknown)-10.2f",
 | 
| 
igor@43
 | 
   358                 "%(total_known)-11d"
 | 
| 
igor@43
 | 
   359                 "%(total)-11d"
 | 
| 
igor@43
 | 
   360                 "%(wps)-11d"
 | 
| 
igor@43
 | 
   361                 "%(uwps)-11d"
 | 
| 
igor@43
 | 
   362                 ]) % stats + "\n")
 | 
| 
igor@40
 | 
   363         return
 | 
| 
igor@38
 | 
   364 
 | 
| 
igor@40
 | 
   365     if print_stats:
 | 
| 
igor@40
 | 
   366         codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@43
 | 
   367             "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
 | 
| 
igor@38
 | 
   368 
 | 
| 
igor@40
 | 
   369     level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
 | 
| 
igor@40
 | 
   370     known = int(stats['total_known'])
 | 
| 
igor@40
 | 
   371     total = int(stats['total'])
 | 
| 
igor@40
 | 
   372     current_level = 0
 | 
| 
igor@47
 | 
   373     old_normalized_word = None
 | 
| 
igor@47
 | 
   374     words_of_this_group = []
 | 
| 
igor@40
 | 
   375     for word_pair in word_pairs:
 | 
| 
igor@47
 | 
   376 
 | 
| 
igor@47
 | 
   377         normalized_word = normalizator.normalize(word_pair[1])
 | 
| 
igor@47
 | 
   378         if old_normalized_word and old_normalized_word != normalized_word:
 | 
| 
igor@47
 | 
   379             #codecs.getwriter("utf-8")(sys.stdout).write(
 | 
| 
igor@47
 | 
   380             #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
 | 
| 
igor@47
 | 
   381             compressed_word_pair = (
 | 
| 
igor@47
 | 
   382                 sum(x[0] for x in words_of_this_group),
 | 
| 
igor@47
 | 
   383                 normalizator.best_word_from_group(words_of_this_group)
 | 
| 
igor@47
 | 
   384                 )
 | 
| 
igor@47
 | 
   385             if compressed_wordlist:
 | 
| 
igor@47
 | 
   386                 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
 | 
| 
igor@47
 | 
   387             words_of_this_group = []
 | 
| 
igor@47
 | 
   388 
 | 
| 
igor@47
 | 
   389         old_normalized_word = normalized_word
 | 
| 
igor@47
 | 
   390         words_of_this_group.append(word_pair)
 | 
| 
igor@47
 | 
   391 
 | 
| 
igor@47
 | 
   392         if not compressed_wordlist:
 | 
| 
igor@47
 | 
   393             codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
 | 
| 
igor@47
 | 
   394 
 | 
| 
igor@47
 | 
   395 
 | 
| 
igor@40
 | 
   396         known += word_pair[0]
 | 
| 
igor@40
 | 
   397         if 100.0*known/total >= level_lines[0]:
 | 
| 
igor@40
 | 
   398             current_level = level_lines[0]
 | 
| 
igor@40
 | 
   399             while 100.0*known/total > level_lines[0]:
 | 
| 
igor@40
 | 
   400                 current_level = level_lines[0]
 | 
| 
igor@40
 | 
   401                 level_lines = level_lines[1:]
 | 
| 
igor@40
 | 
   402             codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
 | 
| 
igor@38
 | 
   403 
 | 
| 
igor@39
 | 
   404 def filter_add_notes(args):
 | 
| 
igor@39
 | 
   405     lines = readlines_from_file(args[0])
 | 
| 
igor@39
 | 
   406     notes = load_notes(notes_filenames())
 | 
| 
igor@39
 | 
   407     lines = add_notes(lines, notes)
 | 
| 
igor@39
 | 
   408     with codecs.open(args[0], "w", "utf-8") as f:
 | 
| 
igor@39
 | 
   409         for line in lines:
 | 
| 
igor@39
 | 
   410             f.write(line)
 | 
| 
igor@39
 | 
   411 
 | 
| 
igor@39
 | 
   412 def filter_remove_notes(args):
 | 
| 
igor@39
 | 
   413     lines = readlines_from_file(args[0])
 | 
| 
igor@39
 | 
   414     notes = load_notes(notes_filenames())
 | 
| 
igor@39
 | 
   415     lines = remove_notes(lines, notes)
 | 
| 
igor@39
 | 
   416     with codecs.open(args[0], "w", "utf-8") as f:
 | 
| 
igor@39
 | 
   417         for line in lines:
 | 
| 
igor@39
 | 
   418             f.write(line)
 | 
| 
igor@39
 | 
   419 
 | 
| 
igor@40
 | 
   420 def filter_get_words_group_words_add_stat(args):
 | 
| 
igor@40
 | 
   421     vocabulary = load_vocabulary()
 | 
| 
igor@40
 | 
   422     notes = load_notes(notes_filenames())
 | 
| 
igor@40
 | 
   423     lines = readlines_from_stdin()
 | 
| 
igor@44
 | 
   424     group_by = [1]
 | 
| 
igor@44
 | 
   425     if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
 | 
| 
igor@44
 | 
   426         group_by.append(2)
 | 
| 
igor@44
 | 
   427     if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
 | 
| 
igor@44
 | 
   428         group_by.append(3)
 | 
| 
igor@44
 | 
   429     words = get_words(lines, group_by)
 | 
| 
igor@43
 | 
   430     stats_only = False
 | 
| 
igor@43
 | 
   431     if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
 | 
| 
igor@43
 | 
   432         stats_only = True
 | 
| 
igor@40
 | 
   433 
 | 
| 
igor@47
 | 
   434     compressed_wordlist = False
 | 
| 
igor@47
 | 
   435     if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
 | 
| 
igor@47
 | 
   436         compressed_wordlist = True
 | 
| 
igor@47
 | 
   437 
 | 
| 
igor@44
 | 
   438 
 | 
| 
igor@40
 | 
   439     stats = {}
 | 
| 
igor@40
 | 
   440     stats['total'] = sum(words[x] for x in words.keys())
 | 
| 
igor@45
 | 
   441     if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
 | 
| 
igor@45
 | 
   442         words = substract_dictionary(words, vocabulary)
 | 
| 
igor@40
 | 
   443 
 | 
| 
igor@40
 | 
   444     stats['total_unknown'] = sum(words[x] for x in words.keys())
 | 
| 
igor@40
 | 
   445     stats['total_known'] = stats['total'] - stats['total_unknown']
 | 
| 
igor@43
 | 
   446     stats['percentage'] = 100.0*stats['total_known']/stats['total']
 | 
| 
igor@43
 | 
   447     stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
 | 
| 
igor@40
 | 
   448     stats['groups'] = 0
 | 
| 
igor@40
 | 
   449     stats['words'] = len(words)
 | 
| 
igor@43
 | 
   450     stats['sentences'] = 0  #FIXME
 | 
| 
igor@43
 | 
   451     stats['wps'] = 0        #FIXME
 | 
| 
igor@43
 | 
   452     stats['uwps'] = 0       #FIXME
 | 
| 
igor@40
 | 
   453     stats['language'] = config['language']
 | 
| 
igor@40
 | 
   454 
 | 
| 
igor@40
 | 
   455     linked_words = find_linked_words(notes)
 | 
| 
igor@40
 | 
   456     normalizator = Normalizator(config['language'], linked_words)
 | 
| 
igor@40
 | 
   457 
 | 
| 
igor@44
 | 
   458     words_with_freq = []
 | 
| 
igor@40
 | 
   459     for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
 | 
| 
igor@44
 | 
   460         words_with_freq.append((words[k], k))
 | 
| 
igor@40
 | 
   461 
 | 
| 
igor@44
 | 
   462     wgw = find_wordgroups_weights(words_with_freq, normalizator)
 | 
| 
igor@45
 | 
   463     if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
 | 
| 
igor@45
 | 
   464         words_with_freq = sorted(
 | 
| 
igor@44
 | 
   465                 words_with_freq,
 | 
| 
igor@40
 | 
   466                 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
 | 
| 
igor@40
 | 
   467                 reverse=True)
 | 
| 
igor@40
 | 
   468 
 | 
| 
igor@47
 | 
   469     print_words_sorted(
 | 
| 
igor@47
 | 
   470         words_with_freq,
 | 
| 
igor@47
 | 
   471         stats,
 | 
| 
igor@47
 | 
   472         normalizator,
 | 
| 
igor@47
 | 
   473         stats_only=stats_only,
 | 
| 
igor@47
 | 
   474         compressed_wordlist=compressed_wordlist
 | 
| 
igor@47
 | 
   475         )
 | 
| 
igor@40
 | 
   476 
 | 
| 
igor@37
 | 
   477 (options, args) = parser.parse_args()
 | 
| 
igor@38
 | 
   478 if options.language:
 | 
| 
igor@38
 | 
   479     config['language'] = options.language
 | 
| 
igor@37
 | 
   480 
 | 
| 
igor@38
 | 
   481 if options.function:
 | 
| 
igor@38
 | 
   482     function_names = {
 | 
| 
igor@39
 | 
   483         'add_notes' :   filter_add_notes,
 | 
| 
igor@39
 | 
   484         'remove_notes': filter_remove_notes,
 | 
| 
igor@40
 | 
   485         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
 | 
| 
igor@38
 | 
   486     }
 | 
| 
igor@38
 | 
   487     if options.function in function_names:
 | 
| 
igor@38
 | 
   488         function_names[options.function](args)
 | 
| 
igor@38
 | 
   489     else:
 | 
| 
igor@38
 | 
   490         error_message("Unkown function %s.\nAvailable functions:\n%s" % (
 | 
| 
igor@38
 | 
   491             options.function, "".join(["   "+x for x in sorted(function_names.keys())])))
 | 
| 
igor@38
 | 
   492         sys.exit(1)
 | 
| 
igor@37
 | 
   493 
 | 
| 
igor@37
 | 
   494 
 | 
| 
igor@37
 | 
   495 
 | 
| 
igor@37
 | 
   496 
 | 
| 
igor@38
 | 
   497 #os.system("vim")
 | 
| 
igor@37
 | 
   498 
 |