| rev | line source | 
| igor@37 | 1 #!/usr/bin/env python | 
| igor@38 | 2 # -*- coding: utf-8 -*- | 
| igor@37 | 3 | 
| igor@40 | 4 from __future__ import with_statement | 
| igor@38 | 5 import codecs | 
| igor@49 | 6 import difflib | 
| igor@38 | 7 import logging | 
| igor@38 | 8 import os | 
| igor@37 | 9 import optparse | 
| igor@38 | 10 import re | 
| igor@38 | 11 import subprocess | 
| igor@38 | 12 import sys | 
| igor@38 | 13 import Stemmer | 
| igor@54 | 14 import tempfile | 
| igor@42 | 15 try: | 
| igor@42 | 16     import psyco | 
| igor@42 | 17     psyco.full() | 
| igor@42 | 18 except: | 
| igor@42 | 19     pass | 
| igor@38 | 20 | 
# Global settings shared by the functions of this script.  Other functions
# in this file also look up optional keys such as 'vocabulary_filename'
# and 'pages'.
config = {
    # expanduser('~') uses $HOME when it is set and falls back to the
    # password database otherwise, so a missing HOME does not raise
    # KeyError at import time.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

# Debug output of the whole script is written to this fixed log file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
| igor@38 | 27 | 
class Normalizator(object):
    """Maps word forms onto a common normalized form.

    Normalization first follows the user-defined links between words and
    then lower-cases the result and, when a stemmer is available, stems it.
    """

    # Language code -> name of the stemming algorithm handed to Stemmer.
    _STEMMER_ALGORITHMS = {
        'de': 'german',
        'fr': 'french',
        'en': 'english',
        'es': 'spanish',
        'ru': 'russian',
        'it': 'italian',
        'uk': 'ukrainian',
    }

    def __init__(self, language, linked_words=None):
        """Create a normalizer for `language`.

        language     -- language code; see _STEMMER_ALGORITHMS.
        linked_words -- optional dict word -> main word.  The dict is used
                        as is (not copied); a fresh dict is created when
                        it is omitted.
        """
        try:
            self.stemmer = Stemmer.Stemmer(self._STEMMER_ALGORITHMS[language])
        except Exception:
            # Best effort: an unknown language or an unusable stemmer only
            # disables stemming, it must not abort the program.
            self.stemmer = None
        # A shared mutable default would leak links between instances.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the normalized form of `word`.

        Links are followed until a word without a link is reached or a
        word repeats (which guards against cycles in the links).
        """
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        lowered = word.lower()
        if self.stemmer:
            return self.stemmer.stemWord(lowered)
        return lowered

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents a group of (count, word) pairs.

        The shortest word wins; among equally short words the one with the
        highest count is taken, and the earliest pair on a further tie.

        Raises ValueError when wordpairs_group is empty.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() returns the first maximal element, which matches taking the
        # head of a stable sort in descending order.
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external `de-variants` command.

        The command is run with `word` as its only argument; each line of
        its standard output is decoded as UTF-8 and returned without the
        trailing newline.  Raises OSError when the command cannot be
        started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        # communicate() reads everything and waits for the child, so no
        # zombie process or open pipe is left behind.
        output = process.communicate()[0]
        raw_lines = output.split(b'\n')
        if raw_lines and raw_lines[-1] == b'':
            # The text after the final newline is not a line of its own.
            raw_lines.pop()
        return [line.decode('utf-8') for line in raw_lines]
| igor@49 | 123 | 
| igor@49 | 124 | 
parser = optparse.OptionParser()

# Every command-line option as (short flag, long flag, action, dest, help),
# in the order in which the options are registered.
_OPTIONS = [
    ("-a", "--no-marks", "store_true", "no_marks",
        "don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]"),
    ("-c", "--compressed", "store_true", "compressed",
        "show compressed wordlist: one word per group"),
    ("-C", "--compressed-to-line", "store_true", "compressed_to_line",
        "show compressed wordlist: all words of the group in a line"),
    # This option used to write to dest "compressed", so -k switched on -c.
    ("-k", "--known-words", "store_true", "known_words",
        "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "store", "language",
        "specify language of text"),
    ("-f", "--allowed-words", "store", "allowed_words",
        "file with list of allowed words (words that will be shown in the output)"),
    ("-G", "--words-grouping", "store_true", "no_words_grouping",
        "turn off word grouping"),
    ("-X", "--function", "store", "function",
        "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "store", "merge_tag",
        "merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-M", "--merge-tagged", "store_true", "merge_tagged",
        "merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]"),
    ("-n", "--non-interactive", "store_true", "non_interactive",
        "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "store_true", "no_filter",
        "switch off known words filtering"),
    ("-p", "--pages", "store", "pages",
        "work with specified pages only (pages = start-stop/total )"),
    ("-d", "--delete-tag", "store", "delete_tag",
        "delete subvocabulary of specified tag"),
    ("-r", "--show-range", "store", "show_range",
        "show only the specified number of words"),
    ("-R", "--show-range-percentage", "store", "show_range_percentage",
        "show only words that cover specified percentage of the text, skip the rest"),
    ("-s", "--text-stats", "store_true", "text_stats",
        "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "store_true", "voc_stats",
        "show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]"),
    ("-t", "--tag", "store", "tag",
        "tag known words with tag"),
    # The help text used to be a copy of the one for -t.
    ("-T", "--show-tags", "store_true", "show_tags",
        "show list of active tags"),
    ("-v", "--vocabulary-filename", "store", "vocabulary_filename",
        "use specified file as a vocabulary"),
    ("-w", "--web", "store_true", "web",
        "Web browser version"),
    ("-2", "--two-words", "store_true", "two_words",
        "find 2 words' sequences"),
    ("-3", "--three-words", "store_true", "three_words",
        "find 3 words' sequences"),
]

for (_short, _long, _action, _dest, _help) in _OPTIONS:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)
| igor@37 | 270 | 
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file `filename`.

    Line endings are kept, as with file.readlines().
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return source.readlines()
| igor@38 | 277 | 
def readlines_from_url(url):
    """Return the text of the page at `url`, rendered by `lynx -dump`.

    The output of lynx (standard error included) is split on newlines and
    decoded as UTF-8; the first http:// link of every line is removed.

    NOTE: unlike readlines_from_file(), the returned lines carry no
    trailing newline, and output that ends with a newline yields a final
    empty string.

    Raises OSError when lynx cannot be started and UnicodeDecodeError when
    its output is not valid UTF-8.
    """
    # The URL is passed as a separate argument instead of being pasted
    # into a shell command line, so quotes or shell metacharacters in it
    # cannot run arbitrary commands.
    dump = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ).communicate()[0]
    link = re.compile(r"http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*")
    # count=1: only the first link of a line is stripped.
    return [link.sub(u"", line.decode('utf-8'), 1)
            for line in dump.split(b'\n')]
| igor@54 | 287 | 
def readlines_from_stdin():
    """Read standard input to its end, decode it as UTF-8, return the lines."""
    make_reader = codecs.getreader("utf-8")
    stdin_reader = make_reader(sys.stdin)
    return stdin_reader.readlines()
| igor@38 | 290 | 
# Separator between words: a run of non-word characters that does not
# begin with an apostrophe, so an apostrophe inside a word does not split
# it.  (An apostrophe that follows another separator character is still
# consumed as part of that separator.)
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split `line` into words.

    A trailing newline is removed first.  The result can contain empty
    strings when the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))
| igor@38 | 296 | 
def get_words(lines, group_by=(1,)):
    """Count the words (and optionally word sequences) in `lines`.

    lines    -- iterable of text lines.
    group_by -- container of sequence lengths; single words are always
                counted, 2 adds pairs "w1_w2" and 3 adds triples
                "w1_w2_w3" of consecutive words.

    Returns a dict word -> number of occurrences.  Empty strings and words
    made of digits only are ignored and do not interrupt a sequence;
    sequences may span line boundaries.
    """
    result = {}
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    # The two words that precede the current one ("" while unknown).
    before_previous, previous = "", ""
    for line in lines:
        for word in words_from_line(line):
            if re.match(r'[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1
            # Sequences end at the current word, so the last words of the
            # input take part in them as well.
            if count_pairs and previous != "":
                pair = "%s_%s" % (previous, word)
                result[pair] = result.get(pair, 0) + 1
            if count_triples and previous != "" and before_previous != "":
                triple = "%s_%s_%s" % (before_previous, previous, word)
                result[triple] = result.get(triple, 0) + 1
            before_previous, previous = previous, word

    logging.debug(result)
    return result
| igor@38 | 323 | 
def voc_filename():
    """Return the path of the vocabulary file.

    An explicit config['vocabulary_filename'] takes precedence; otherwise
    the path is <config_directory>/<language>.txt.
    """
    try:
        return config['vocabulary_filename']
    except KeyError:
        directory = config['config_directory']
        language = config['language']
        return "%s/%s.txt" % (directory, language)
| igor@54 | 328 | 
def load_vocabulary():
    """Return a dict word -> count built from the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
| igor@38 | 331 | 
def notes_filenames():
    """Return the list (one element) of notes files of the current language."""
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
| igor@38 | 334 | 
def load_notes(files):
    """Read the notes files and return word -> {filename: note}.

    Every line holds a word, whitespace and the note.  A line that holds
    only a word gets an empty note and blank lines are skipped, instead of
    aborting with ValueError.

    Raises IOError when one of the files cannot be opened.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                word = fields[0]
                if len(fields) == 2:
                    note = fields[1]
                elif word:
                    note = u''
                else:
                    # Completely empty line.
                    continue
                notes.setdefault(word, {})[filename] = note
    return notes
| igor@38 | 344 | 
def add_notes(lines, notes):
    """Append the stored note to every word line that has one.

    lines -- output lines; lines starting with '#' are passed through.
    notes -- dict word -> {filename: note}, as returned by load_notes().

    The word is the second whitespace-separated token of a line.  When the
    first notes file has a note for it, the line is padded to 30 columns
    and the note is appended.  All other lines are returned unchanged;
    in particular a line is no longer dropped when its word has notes only
    in other files.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            # \s+ (not \s*) between the tokens: a single-token line must
            # not match with its own last character as the "word".
            match_object = re.search(r'^\s*\S+\s+(\S+)', line)
            if match_object:
                word_notes = notes.get(match_object.group(1), {})
                if notes_filename in word_notes:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), word_notes[notes_filename])
        result.append(line)
    return result
| igor@39 | 365 | 
def remove_notes(lines, notes_group):
    """Strip the notes from `lines` and store them in the notes file.

    lines       -- lines of the form <space><number><space><word><space><note>.
    notes_group -- dict word -> {filename: note} with the notes known so far.

    Returns the lines cut after the word, each terminated by a newline;
    lines of another form only get their newline normalized.  The notes
    known for the first notes file, updated with the ones found in
    `lines`, are written back through save_notes().
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated.match(text)
        if not found:
            result.append(text + "\n")
            continue
        # Groups 1-4 are adjacent and start at the beginning of the line.
        result.append(text[:found.end(4)] + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(notes_filename, notes)
    return result
| igor@39 | 391 | 
def save_notes(filename, notes):
    """Merge `notes` (dict word -> note) into the notes file `filename`.

    Lines of the file whose word is in `notes` are rewritten with the new
    note, all other lines (including blank ones and lines without a note)
    are kept as they are, and words not yet in the file are appended.

    Raises IOError when the file cannot be read or written.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the first token is needed, so a line without a note
            # cannot break the parsing.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
| igor@39 | 409 | 
| igor@39 | 410 | 
def substract_dictionary(dict1, dict2):
    """
    Returns a new dict with the items of dict1
    whose keys are not present in dict2
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)
| igor@38 | 420 | 
def dump_words(words, filename):
    """Write the words to `filename` (UTF-8), one per line.

    words -- dict word -> count; every word is written `count` times.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(("%s\n" % word) * count)
| igor@38 | 425 | 
def error_message(text):
    """Print `text` followed by a newline on standard output."""
    # With a single argument the parenthesized form does the same on
    # Python 2 and is also valid syntax on Python 3.
    print(text)
| igor@38 | 428 | 
def find_wordgroups_weights(word_pairs, normalizator):
    """Return normalized word -> summed count of all words of that group.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) names the group of a word.
    """
    weight = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + count
    return weight
| igor@38 | 436 | 
def find_linked_words(notes):
    """Return word -> main word for every note that contains "@main_word".

    notes -- dict word -> {filename: note}.  Only the first "@" of a note
    is looked at; an "@" that is not directly followed by a non-space
    character creates no link.
    """
    linked_words = {}
    for (word, per_file) in notes.items():
        for note in per_file.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
| igor@38 | 448 | 
def _three_way(left, right):
    """Return -1, 0 or 1 as `left` is less than, equal to or greater than `right`."""
    # Stand-in for the cmp() builtin, which Python 3 no longer has.
    return (left > right) - (left < right)


def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs for sorting.

    The pairs are ordered by the weight of their word group, then by the
    normalized word, then by their own count (compared as integers).

    wgw          -- dict normalized word -> group weight.
    normalizator -- object providing normalize(word).
    linked_words -- not used by this function.

    Returns a negative number, zero or a positive number.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # `or` moves on to the next criterion only while the previous one ties.
    return (
        _three_way(wgw[normalized_word1], wgw[normalized_word2])
        or _three_way(normalized_word1, normalized_word2)
        or _three_way(int(num1), int(num2)))
| igor@38 | 465 | 
| igor@47 | 466 | 
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        compressed_to_line = False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list (or only the statistics) for output.

    word_pairs            -- sequence of (count, word) tuples; words of
                             one group are expected to be adjacent.
    stats                 -- dict with 'language', 'percentage',
                             'total_known', 'total', 'groups' and 'words'
                             (plus 'percentage_unknown', 'wps' and 'uwps'
                             when stats_only is set).
    normalizator          -- object providing normalize() and
                             best_word_from_group().
    print_stats           -- start the list with a "# ..." summary line.
    stats_only            -- return only a two-line statistics table.
    compressed_wordlist   -- emit one line per word group, not per word.
    compressed_to_line    -- in compressed mode, list the other words of
                             the group on the same line.
    show_range            -- stop after that many printed lines (0: off).
    show_range_percentage -- stop once that share of the text is covered
                             (0: off).

    Returns a single string when stats_only is set, otherwise a list of
    lines.  "# <level>" marker lines are inserted where the coverage of
    the text reaches the next level.
    """
    result = []
    if stats_only:
        titles = [
            "LANG",
            "KNOWN%",
            "UNKNOWN%",
            "KNOWN",
            "TOTAL",
            "WPS",
            "UWPS*10",
            ]
        result.append(" ".join(["%-10s" % x for x in titles]) + "\n")
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # The adjacent literals below concatenate into a single
                # fragment: these four columns are padded to 11 characters
                # instead of being joined by a space.
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    def group_line(group):
        # One output line for a whole group: summed count and best word.
        compressed_word_pair = (
            sum(x[0] for x in group),
            normalizator.best_word_from_group(group)
            )
        if compressed_to_line:
            others = " ".join(
                y for x, y in group if y not in compressed_word_pair)
            return "%10s %s %s\n" % (compressed_word_pair + (others,))
        return "%10s %s\n" % compressed_word_pair

    known = int(stats['total_known'])
    total = int(stats['total'])
    # Coverage levels still ahead: multiples of 5 up to 85, then every
    # percent from 90 on.  // and list() keep this working on Python 3.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 90, 5)) + list(range(90, 102))
    # Drop the levels that the known words alone already exceed.
    while 100.0*known/total > level_lines[0]:
        level_lines = level_lines[1:]

    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: emit the one that has just ended.
            if compressed_wordlist:
                result.append(group_line(words_of_this_group))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # The input is exhausted (no range limit cut it short): the last
        # group has not met a successor that would emit it, so emit it
        # here.
        if compressed_wordlist and words_of_this_group:
            result.append(group_line(words_of_this_group))

    return result
| igor@39 | 557 | 
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.
     from-to/step      -> (from, to, step)
     from+delta/step   -> (from, from + delta, step)
     from/step         -> (from, from + 1, step)

    Raises ValueError when the description does not have one of these
    forms.
    """

    try:
        (span, step) = parts_description.split("/", 1)
        step = int(step)
        if '-' in span:
            (start, stop) = span.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in span:
            (start, delta) = span.split("+", 1)
            start = int(start)
            # The second number is a length, not an absolute position.
            stop = start + int(delta)
        else:
            start = int(span)
            stop = start + 1
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError cover descriptions that are not strings.
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
    return (start, stop, step)
| igor@53 | 586 | 
| igor@53 | 587 | 
def take_part(lines, part_description = None):
    """Return the lines that belong to the described part of the text.

    part_description -- None or '' selects everything (the very same list
    is returned); otherwise a string understood by
    parse_parts_description().  The text is divided into `step` parts of
    equal size and the lines whose index lies between start * part_size
    and stop * part_size, both bounds included, are returned.
    """
    if part_description in (None, ''):
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    lower_bound = start * part_size
    upper_bound = stop * part_size
    return [
        line
        for (index, line) in enumerate(lines)
        if lower_bound <= index <= upper_bound
    ]
| igor@53 | 599 | 
def web_editor(output, html_root="/home/igor/hg/new-words/web", port=8880):
    """Serve the word list to a web browser; blocks until the reactor stops.

    output    -- iterable of lines "<number> <word> [<comment>]".
    html_root -- directory with the static files of the web interface.
    port      -- TCP port to listen on.

    The static files are served from the root, the word list as JSON under
    "json", and POST requests to "save" have their "selected_words" field
    decoded from JSON and printed.  Requires Twisted.
    """
    from twisted.internet import reactor
    from twisted.web.server import Site
    from twisted.web.static import File
    from twisted.web.resource import Resource
    import json

    word_list = []

    for o in output:
        a = re.split(r'\s+', o.strip(), 2)
        # Pad to three fields so that short lines get empty values.
        a = a + ['']*(3-len(a))
        word_list.append({'number':a[0], 'word':a[1], 'comment':a[2]})

    # Written with explicit formatting so that it is valid on Python 2
    # and 3; the text is the one the print statement used to produce.
    print("Loaded  %s" % len(word_list))

    class JSONPage(Resource):
        isLeaf = True
        def render_GET(self, request):
            return json.dumps({"word_list": word_list})

    class SaveJSON(Resource):
        isLeaf = True
        def render_POST(self, request):
            print(json.loads(request.args["selected_words"][0]))
            return json.dumps({"status": "ok"})

    json_page = JSONPage()
    save_json = SaveJSON()

    resource = File(html_root)
    resource.putChild("json", json_page)
    resource.putChild("save", save_json)

    factory = Site(resource)
    reactor.listenTCP(port, factory)
    reactor.run()
| igor@65 | 639 | 
| igor@65 | 640 | 
def _read_input_lines(args):
    """Collect the input lines from the sources named in *args*.

    Arguments containing 'http://' are read with readlines_from_url, all
    other arguments with readlines_from_file. With no arguments the lines
    come from readlines_from_stdin.
    """
    input_lines = []
    if not args:
        input_lines += readlines_from_stdin()
        return input_lines
    for arg in args:
        if 'http://' in arg:
            input_lines += readlines_from_url(arg)
        else:
            input_lines += readlines_from_file(arg)
    return input_lines


def _write_temp_file(text):
    """Write *text* as UTF-8 into a new 'new-word*' temp file; return its path."""
    (fd, path) = tempfile.mkstemp(prefix='new-word')
    # mkstemp returns an already-open descriptor; close it right away so it
    # does not leak (the file is reopened by name below).
    os.close(fd)
    with codecs.open(path, "w", "utf-8") as f:
        f.write(text)
    return path


def _extract_deleted_words(lines_before, lines_after):
    """Return the words of the lines that were removed during editing.

    A line counts as removed when it is in *lines_before* but not in
    *lines_after*. The word is the second whitespace-separated field of the
    stripped line. Removed lines without a space have no such field and are
    skipped, so they can neither raise NameError nor re-add the previous
    line's word.
    """
    remaining = set(lines_after)
    deleted_words = []
    for line in lines_before:
        if line in remaining:
            continue
        line = line.strip()
        if ' ' not in line:
            continue
        word = re.split(r'\s+', line, 1)[1]
        if ' ' in word:
            word = re.split(r'\s+', word, 1)[0]
        deleted_words.append(word)
    return deleted_words


def _review_words_in_vim(output, notes, original_text_tempfile):
    """Let the user edit the annotated word list in vim.

    The path of the original text is exported as ORIGINAL_TEXT before vim is
    started. Returns the edited lines after remove_notes has been applied.
    """
    edited_tempfile = _write_temp_file("".join(add_notes(output, notes)))
    try:
        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        # NOTE(review): the language value is interpolated into a shell
        # command line; it comes from the command-line options/config.
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=edited_tempfile))
        return remove_notes(readlines_from_file(edited_tempfile), notes)
    finally:
        os.unlink(edited_tempfile)


def filter_get_words_group_words_add_stat(args):
    """Extract the words of the input text, filter, group and present them.

    Input is read from the files/URLs in *args*, or from standard input when
    *args* is empty; the process exits with status 1 if there is no input.
    Words are counted, the known vocabulary is subtracted unless 'no_filter'
    is configured, an optional allowed-words filter is applied, and the
    result is ordered by frequency and (unless 'no_words_grouping' is set)
    by word group.

    Depending on the configuration the resulting list is written to standard
    output ('non_interactive' / 'text_stats'), handed to web_editor ('web'),
    or opened in vim; in the vim case the words of the lines the user
    removed are appended to the vocabulary file.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    input_lines = _read_input_lines(args)
    if not input_lines:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))
    original_text_tempfile = _write_temp_file("".join(lines))

    # Everything below runs under try/finally so the temp file holding the
    # original text is removed even if a later step fails.
    try:
        group_by = [1]
        if 'two_words' in config:
            group_by.append(2)
        if 'three_words' in config:
            group_by.append(3)
        words = get_words(lines, group_by)

        stats_only = 'text_stats' in config
        compressed_to_line = 'compressed_to_line' in config
        compressed_wordlist = 'compressed' in config or compressed_to_line
        show_range = int(config.get('show_range', 0))
        show_range_percentage = int(config.get('show_range_percentage', 0))

        stats = {}
        stats['total'] = sum(words.values())
        if 'no_filter' not in config:
            words = substract_dictionary(words, vocabulary)

        stats['total_unknown'] = sum(words.values())
        stats['total_known'] = stats['total'] - stats['total_unknown']
        if stats['total']:
            stats['percentage'] = 100.0*stats['total_known']/stats['total']
            stats['percentage_unknown'] = 100.0 - stats['percentage']
        else:
            # A text without any words: avoid dividing by zero.
            stats['percentage'] = 0.0
            stats['percentage_unknown'] = 0.0
        stats['groups'] = 0
        stats['words'] = len(words)
        stats['sentences'] = 0  #FIXME
        stats['wps'] = 0        #FIXME
        stats['uwps'] = 0       #FIXME
        stats['language'] = config['language']

        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)

        # Keep only the words whose normalized form is in the allowed list.
        if 'allowed_words' in config:
            # A set gives constant-time membership tests in the loop below.
            normalized_allowed_words = set(
                normalizator.normalize(w.rstrip('\n'))
                for w in readlines_from_file(config['allowed_words'])
            )
            words = dict(
                (w, count) for w, count in words.iteritems()
                if normalizator.normalize(w) in normalized_allowed_words
            )

        # (frequency, word) pairs, most frequent first.
        words_with_freq = [
            (words[k], k)
            for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)
        ]

        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        if not config.get('no_words_grouping'):
            words_with_freq = sorted(
                    words_with_freq,
                    cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
                    reverse=True)

        output = print_words_sorted(
            words_with_freq,
            stats,
            normalizator,
            stats_only=stats_only,
            compressed_wordlist=compressed_wordlist,
            compressed_to_line=compressed_to_line,
            show_range=show_range,
            show_range_percentage=show_range_percentage,
            )

        if 'non_interactive' in config or 'text_stats' in config:
            codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
        elif config.get('web', False):
            web_editor(output)
        else:
            edited_lines = _review_words_in_vim(
                output, notes, original_text_tempfile)
            deleted_words = _extract_deleted_words(output, edited_lines)

            # The trailing '' makes the appended block end with a newline.
            with codecs.open(voc_filename(), "a", "utf-8") as f:
                f.write("\n".join(deleted_words + ['']))
    finally:
        os.unlink(original_text_tempfile)
| igor@54 | 796 | 
# Parse the command line, copy the given options into the global config and
# run the word extraction.
(options, args) = parser.parse_args()

# Options whose value is stored in config only when one was supplied.
for option_name in ('language', 'allowed_words',
                    'show_range', 'show_range_percentage'):
    option_value = getattr(options, option_name)
    if option_value:
        config[option_name] = option_value

# 'pages' is always present in config; it defaults to the empty string.
config['pages'] = options.pages or ""

# Boolean switches: recorded as True when set, otherwise left out of config.
for option_name in ('non_interactive', 'text_stats', 'compressed',
                    'compressed_to_line', 'no_filter', 'two_words',
                    'three_words', 'no_words_grouping', 'web'):
    if getattr(options, option_name):
        config[option_name] = True

filter_get_words_group_words_add_stat(args)
| igor@55 | 843 | 
| igor@55 | 844 #if options.function: | 
| igor@55 | 845 #    function_names = { | 
| igor@55 | 846 #        'get_words_group_words_add_stat': , | 
| igor@55 | 847 #    } | 
| igor@55 | 848 #    if options.function in function_names: | 
| igor@55 | 849 #        function_names[options.function](args) | 
| igor@55 | 850 #    else: | 
| igor@55 | 851 #        error_message("Unknown function %s.\nAvailable functions:\n%s" % ( | 
| igor@55 | 852 #            options.function, "".join(["   "+x for x in sorted(function_names.keys())]))) | 
| igor@55 | 853 #        sys.exit(1) | 
| igor@55 | 854 # | 
| igor@37 | 855 | 
| igor@37 | 856 | 
| igor@37 | 857 | 
| igor@38 | 858 #os.system("vim") | 
| igor@37 | 859 |