new-words

annotate new-words.py @ 67:87bb1c5e6616

added de script to misc/
author Igor Chubin <igor@chub.in>
date Wed Mar 28 15:54:30 2012 +0200 (2012-03-28)
parents 1b8b30ad7c95
children 846240941452
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@49 6 import difflib
igor@38 7 import logging
igor@38 8 import os
igor@37 9 import optparse
igor@38 10 import re
igor@38 11 import subprocess
igor@38 12 import sys
igor@38 13 import Stemmer
igor@54 14 import tempfile
igor@42 15 try:
igor@42 16 import psyco
igor@42 17 psyco.full()
igor@42 18 except:
igor@42 19 pass
igor@38 20
# Runtime settings shared by the whole script.  The command-line handling at
# the bottom of the file adds further keys (pages, compressed, ...).
config = {
    # Directory that holds the per-language vocabulary and notes files.
    # expanduser() honours $HOME but, unlike os.environ['HOME'], does not
    # raise KeyError when the variable is unset.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    # Default language of the analysed text; replaced by -l/--language.
    'language': 'en',
}

# Everything logged through the root logger (DEBUG and above) goes to this
# fixed file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 27
class Normalizator:
    """Reduces word forms to a common normalized form.

    Normalization follows the user-defined word links first and then
    lower-cases the word and, when a stemmer is available for the language,
    stems it.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizator for *language*.

        language     -- two-letter language code ('en', 'de', ...).
        linked_words -- optional dict mapping a word to the word it is linked
                        to; the dict is stored by reference.  When omitted a
                        fresh empty dict is used (a shared mutable default
                        would leak links between instances).
        """
        stemmer_algorithm = {
            'de' : 'german',
            'fr' : 'french',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = None
        algorithm = stemmer_algorithm.get(language)
        if algorithm is not None:
            try:
                self.stemmer = Stemmer.Stemmer(algorithm)
            except Exception:
                # Best effort: without a usable stemmer normalize() falls
                # back to plain lower-casing.
                logging.debug(
                    "No stemmer for language %r; falling back to lower-casing",
                    language)
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the normalized form of *word*.

        Links in self.linked_words are followed until a word without a link
        is reached or a word repeats (which stops cyclic links).  The final
        word is lower-cased and stemmed when a stemmer is available.
        """
        seen = set()
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]
        lowered = word.lower()
        if self.stemmer:
            return self.stemmer.stemWord(lowered)
        return lowered

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents *wordpairs_group* best.

        wordpairs_group is a non-empty sequence of (count, word) pairs.
        Among the shortest words the one with the highest count wins; on a
        tie the pair that comes first in the group is chosen.  Raises
        ValueError for an empty group.

        A refinement based on dictionary_suggestions() used to follow the
        early return here but could never run; that unreachable code has been
        removed.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [
            pair for pair in wordpairs_group
            if len(pair[1]) == minimal_length]
        # max() returns the first maximal element, i.e. the same pair a
        # stable descending sort would put first.
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external `de-variants` program.

        The program is run with *word* as its only argument; its standard
        output is decoded as UTF-8 and returned as a list of lines without
        the trailing newlines.  communicate() is used so that the child is
        waited for and its pipe is closed.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        (output, _) = process.communicate()
        lines = output.decode('utf-8').split('\n')
        if lines and lines[-1] == '':
            # The final newline yields one empty trailing piece, not a line.
            lines.pop()
        return lines
igor@49 123
igor@49 124
# Command-line interface.  Flags without an explicit default are None when
# they are not given on the command line.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks", action="store_true", dest="no_marks",
    help="don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]")

parser.add_option(
    "-c", "--compressed", action="store_true", dest="compressed",
    help="show compressed wordlist: one word per group")

# This option used to store into "compressed" (copy-paste slip), so -k
# silently switched on the compressed word list.  It has its own dest now.
parser.add_option(
    "-k", "--known-words", action="store_true", dest="known_words",
    help="put higher words that are similar to the known words (only for English)")

parser.add_option(
    "-l", "--language", action="store", dest="language",
    help="specify language of text")

parser.add_option(
    "-f", "--allowed-words", action="store", dest="allowed_words",
    help="file with list of allowed words (words that will be shown in the output)")

parser.add_option(
    "-G", "--words-grouping", action="store_true", dest="no_words_grouping",
    help="turn off word grouping")

parser.add_option(
    "-X", "--function", action="store", dest="function",
    help="filter through subsystem [INTERNAL]")

parser.add_option(
    "-m", "--merge-tag", action="store", dest="merge_tag",
    help="merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]")

parser.add_option(
    "-M", "--merge-tagged", action="store_true", dest="merge_tagged",
    help="merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]")

parser.add_option(
    "-n", "--non-interactive", action="store_true", dest="non_interactive",
    help="non-interactive mode (don't run vi)")

parser.add_option(
    "-N", "--no-filter", action="store_true", dest="no_filter",
    help="switch off known words filtering")

parser.add_option(
    "-p", "--pages", action="store", dest="pages",
    help="work with specified pages only (pages = start-stop/total )")

parser.add_option(
    "-d", "--delete-tag", action="store", dest="delete_tag",
    help="delete subvocabulary of specified tag")

parser.add_option(
    "-r", "--show-range", action="store", dest="show_range",
    help="show only words specified number of words")

parser.add_option(
    "-R", "--show-range-percentage", action="store", dest="show_range_percentage",
    help="show only words that cover specified percentage of the text, skip the rest")

parser.add_option(
    "-s", "--text-stats", action="store_true", dest="text_stats",
    help="show the text statistics (percentage of known words and so on) and exit")

parser.add_option(
    "-S", "--voc-stats", action="store_true", dest="voc_stats",
    help="show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]")

parser.add_option(
    "-t", "--tag", action="store", dest="tag",
    help="tag known words with tag")

# NOTE(review): the help text below is identical to the one of -t/--tag and
# looks copy-pasted; confirm the intended wording before changing it.
parser.add_option(
    "-T", "--show-tags", action="store_true", dest="show_tags",
    help="tag known words with tag")

parser.add_option(
    "-v", "--vocabulary-filename", action="store", dest="vocabulary_filename",
    help="use specified file as a vocabulary")

parser.add_option(
    "-w", "--web", action="store_true", dest="web",
    help="Web browser version")

parser.add_option(
    "-2", "--two-words", action="store_true", dest="two_words",
    help="find 2 words' sequences")

parser.add_option(
    "-3", "--three-words", action="store_true", dest="three_words",
    help="find 3 words' sequences")
igor@37 264
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    Line terminators are kept.  Errors from opening or decoding the file
    propagate to the caller.
    """
    with codecs.open(filename, "r", "utf-8") as f:
        # readlines() already builds the list; no need to copy it item by item.
        return f.readlines()
igor@38 271
def readlines_from_url(url):
    """Return the text of the web page *url* as a list of lines.

    The page is rendered by `lynx -dump`; the output (standard error
    included) is decoded as UTF-8 and split on newlines, so the returned
    lines carry no line terminator.  The first http:// URL on every line is
    removed.

    lynx is started from an argument list instead of a shell command line, so
    quotes or shell metacharacters in *url* cannot inject commands.  The URL
    stripping that used to be done by a perl one-liner in the same shell
    pipeline is done here with the same pattern.

    Raises OSError when lynx cannot be started.
    """
    url_pattern = re.compile(r'http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*')
    process = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    (output, _) = process.communicate()
    return [
        # count=1: the perl substitution had no /g flag either.
        url_pattern.sub('', line, 1)
        for line in output.decode('utf-8').split('\n')
    ]
igor@54 281
def readlines_from_stdin():
    """Read standard input to its end, decode it as UTF-8 and return its lines."""
    utf8_reader_class = codecs.getreader("utf-8")
    stdin_reader = utf8_reader_class(sys.stdin)
    return stdin_reader.readlines()
igor@38 284
# Words are separated by runs of non-word characters.  A run may not *start*
# with an apostrophe, which keeps in-word apostrophes ("don't") intact.
# Compiled once instead of on every call.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Matches the empty string and strings that consist of digits only.
_EMPTY_OR_NUMBER = re.compile(r'[0-9]*$')


def words_from_line(line):
    """Split *line* into words.

    A trailing newline is removed first.  The result may contain empty
    strings when the line starts or ends with a separator.
    """
    line = line.rstrip('\n')
    return _WORD_SEPARATOR.split(line)


def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    lines    -- iterable of text lines.
    group_by -- collection of n-gram sizes.  Single words are always counted;
                when it contains 2 (3), sequences of two (three) consecutive
                words are counted as well under the key "w1_w2"
                ("w1_w2_w3").

    Empty strings and pure numbers are ignored and do not break a sequence.
    Sequences continue across line boundaries.

    The n-grams are built from the window that ends at the current word.
    Earlier they were taken from the window *before* it was advanced, which
    dropped the last two pairs and the last triple of the text.
    """
    result = {}
    (a, b, c) = ("", "", "")
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    for line in lines:
        for word in words_from_line(line):
            if _EMPTY_OR_NUMBER.match(word):
                continue
            result[word] = result.get(word, 0) + 1

            # Advance the window first so that c is the current word.
            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                w = "%s_%s" % (b, c)
                result[w] = result.get(w, 0) + 1
            if count_triples and a != "" and b != "":
                w = "%s_%s_%s" % (a, b, c)
                result[w] = result.get(w, 0) + 1

    logging.debug(result)
    return result
igor@38 317
def voc_filename():
    """Return the path of the vocabulary file.

    config['vocabulary_filename'] is used when present; otherwise the path is
    "<config_directory>/<language>.txt".
    """
    try:
        return config['vocabulary_filename']
    except KeyError:
        directory = config['config_directory']
        language = config['language']
        return "%s/%s.txt" % (directory, language)
igor@54 322
def load_vocabulary():
    """Return the word counts (see get_words) of the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
igor@38 325
def notes_filenames():
    """Return the list of notes files: "<config_directory>/notes-<language>.txt"."""
    directory = config['config_directory']
    language = config['language']
    path = "%s/notes-%s.txt" % (directory, language)
    return [path]
igor@38 328
def load_notes(files):
    """Load the notes from the UTF-8 encoded notes *files*.

    Every line has the form "<word><whitespace><note>".  Returns a dict
    word => {filename: note}.

    Files that do not exist are skipped (a new user has no notes yet), blank
    lines are ignored and a line that consists of a word only gives an empty
    note.  Previously each of these cases raised an exception.
    """
    notes = {}
    for filename in files:
        if not os.path.exists(filename):
            continue
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) == 2:
                    (word, note) = fields
                elif fields[0]:
                    # A word without any note text.
                    (word, note) = (fields[0], '')
                else:
                    # Blank line.
                    continue
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
igor@38 338
def add_notes(lines, notes):
    """Return *lines* with the user's notes appended to the matching words.

    lines -- word list lines of the form "<count> <word>\n"; lines that start
             with '#' are passed through unchanged.
    notes -- dict word => {filename: note} as returned by load_notes().

    Only notes stored under the first file of notes_filenames() are used.  A
    line that gets a note is padded to 30 columns, followed by a space and
    the note.
    """
    notes_filename = notes_filenames()[0]
    # The second column is the word.  \s+ (instead of the former \s*) makes
    # sure the two columns are really separated by whitespace, so a line
    # holding a single token is not split in the middle by backtracking.
    word_column = re.compile(r'^\s*\S+\s+(\S+)')
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = word_column.search(line)
            if match_object:
                word = match_object.group(1)
                if word in notes and notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
        result.append(line)
    return result
igor@39 359
def remove_notes(lines, notes_group):
    """Strip the notes from edited word list *lines* and save them.

    lines       -- lines of the form "<ws><count><ws><word><ws><note>"; lines
                   that do not have this shape are kept as they are.
    notes_group -- dict word => {filename: note}; the notes stored under the
                   first file of notes_filenames() are the starting point.

    The notes found in *lines* replace or extend the starting notes and the
    result is written with save_notes().  Returns the lines without notes,
    each ending in a newline.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        (indent, count, gap, word, _, note) = found.groups()
        result.append(indent + count + gap + word + "\n")
        notes[word] = note

    save_notes(notes_filename, notes)
    return result
igor@39 385
def save_notes(filename, notes):
    """Merge *notes* (dict word => note) into the notes file *filename*.

    Lines of the file whose first field is a key of *notes* are rewritten as
    "<word padded to 29 columns> <note>"; all other lines are kept unchanged
    and in place.  Words that are not in the file yet are appended.

    A missing file is created, and blank or note-less lines no longer raise
    (they used to fail when unpacking the split result).
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                lines.append(line)

    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.write("".join(lines))
igor@39 403
igor@39 404
def substract_dictionary(dict1, dict2):
    """
    Returns a new dict with the items of dict1 whose keys
    are not keys of dict2 (dict1 - dict2)
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)
igor@38 414
def dump_words(words, filename):
    """Write every word of *words* (dict word => count) to the UTF-8 file
    *filename*, one per line, repeated as often as its count says."""
    with codecs.open(filename, "w+", "utf-8") as f:
        for (word, count) in words.items():
            repeated = ("%s\n" % word) * count
            f.write(repeated)
igor@38 419
def error_message(text):
    """Print *text* on standard output."""
    print(text)
igor@38 422
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum the counts of all words that share a normalized form.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) gives the group of a word.

    Returns a dict normalized word => total count.
    """
    weight = {}
    for (num, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + num
    return weight
igor@38 430
def find_linked_words(notes):
    """Extract word links from *notes* (dict word => {filename: note}).

    A note that contains "@target" links its word to "target"; only the first
    "@" of a note is looked at and an "@" that is not followed by any
    non-blank character is ignored.  Returns a dict word => target.
    """
    link_pattern = re.compile(r'\@(\S*)')
    linked_words = {}
    for (word, notes_of_word) in notes.items():
        for note in notes_of_word.values():
            found = link_pattern.search(note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words
igor@38 442
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs.

    The pairs are ordered by, in this order: the weight of their word group
    (wgw[normalized word]), the normalized word itself, and the count.
    Returns -1, 0 or 1.  *linked_words* is accepted for compatibility and is
    not used.

    The three chained cmp() calls are expressed as one tuple comparison,
    which gives the same result without the cmp() builtin that newer Python
    versions no longer have.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    key1 = (wgw[normalized_word1], normalized_word1, int(num1))
    key2 = (wgw[normalized_word2], normalized_word2, int(num2))
    return (key1 > key2) - (key1 < key2)
igor@38 459
igor@47 460
def _compressed_group_line(words_of_group, normalizator):
    """Format one word group as a single "<total count> <best word>" line."""
    compressed_word_pair = (
        sum(x[0] for x in words_of_group),
        normalizator.best_word_from_group(words_of_group)
    )
    return "%10s %s\n" % compressed_word_pair


def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list (or only the text statistics) for output.

    word_pairs            -- (count, word) pairs, already in output order;
                             words of one group must be adjacent.
    stats                 -- dict with the keys language, percentage,
                             total_known, total, groups and words (plus
                             percentage_unknown, wps and uwps when
                             stats_only is set).
    normalizator          -- object providing normalize() and
                             best_word_from_group().
    print_stats           -- start the list with a "# ..." summary line.
    stats_only            -- return just a two-line statistics table.
    compressed_wordlist   -- one line per word group instead of per word.
    show_range            -- stop after this many printed entries (0: off).
    show_range_percentage -- stop once the printed words raise the share of
                             known words to this percentage (0: off).

    Returns a single string when stats_only is set and a list of lines
    otherwise.  "# <level>" marker lines are inserted where the accumulated
    share of known words passes the next level.

    In compressed mode the last group is now emitted as well; it used to be
    dropped because a group was only written when the next one started.
    """
    result = []
    if stats_only:
        result.append(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        # The last four fields are one list item (adjacent string literals):
        # each is 11 columns wide, which lines up with the header columns.
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
            ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Levels (in percent) at which a marker line is written: steps of 5 up to
    # 90, then every percent.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: write out the finished one.
            if compressed_wordlist:
                result.append(
                    _compressed_group_line(words_of_this_group, normalizator))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # The list was not cut short: the group still being collected is
        # complete and must be written too.
        if compressed_wordlist and words_of_this_group:
            result.append(
                _compressed_group_line(words_of_this_group, normalizator))

    return result
igor@39 543
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.
    from-to/step
    from+delta/step
    from/step            (same as from+1/step)

    step is the number of parts the text is divided into and must be
    positive.  Raises ValueError when the description is malformed.

    The "from+delta" form now really gives stop = from + delta; it used to
    treat delta as the stop value.
    """

    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if step <= 0:
            raise ValueError("step must be positive")
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            (start, delta) = a.split("+", 1)
            start = int(start)
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except (AttributeError, TypeError, ValueError):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)


def take_part(lines, part_description = None):
    """Return the part of *lines* selected by *part_description*.

    The description (see parse_parts_description) divides the lines into
    `step` parts of equal size and selects the parts from `start` up to, but
    not including, `stop`.  Without a description (None or '') *lines* itself
    is returned.

    The upper bound is exclusive, so neighbouring parts do not share their
    boundary line any more.
    """
    if part_description is None or part_description == '':
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    lower = start * part_size
    upper = stop * part_size
    return [line for (i, line) in enumerate(lines) if lower <= i < upper]
igor@53 585
def web_editor(output, web_root="/home/igor/hg/new-words/web", port=8880):
    """Serve the word list *output* to a web browser front end.

    output   -- word list lines; each is split on whitespace into number,
                word and comment (missing fields become '').
    web_root -- directory with the static files of the front end.  The
                default is the path that used to be hard-coded here.
    port     -- TCP port to listen on.

    GET  <root>/json returns {"word_list": [...]} as JSON.
    POST <root>/save prints the JSON value of the "selected_words" form field
                     and answers {"status": "ok"}.

    The call blocks in the twisted reactor loop.
    """
    from twisted.internet import reactor
    from twisted.web.server import Site
    from twisted.web.static import File
    from twisted.web.resource import Resource
    import json

    # NOTE(review): lines starting with '#' (summary and level markers) are
    # turned into entries as well -- confirm that the front end expects them.
    word_list = []
    for o in output:
        a = re.split(r'\s+', o.strip(), 2)
        a = a + ['']*(3-len(a))
        word_list.append({'number':a[0], 'word':a[1], 'comment':a[2]})

    # Same text as the former `print "Loaded ", n` statement (two spaces).
    print("Loaded  %s" % len(word_list))

    class JSONPage(Resource):
        isLeaf = True
        def render_GET(self, request):
            return json.dumps({"word_list": word_list})

    class SaveJSON(Resource):
        isLeaf = True
        def render_POST(self, request):
            print(json.loads(request.args["selected_words"][0]))
            return json.dumps({"status": "ok"})

    json_page = JSONPage()
    save_json = SaveJSON()

    resource = File(web_root)
    resource.putChild("json", json_page)
    resource.putChild("save", save_json)

    factory = Site(resource)
    reactor.listenTCP(port, factory)
    reactor.run()
igor@65 625
igor@65 626
def _deleted_words(lines_before, lines_after):
    """Return the words of the lines that are in lines_before but not in
    lines_after.

    The word is the second whitespace-separated field of a line.  Lines that
    start with '#' are summary or level markers, not words, and are skipped.
    """
    lines_after_set = set(lines_after)
    deleted_words = []
    for line in lines_before:
        if line in lines_after_set:
            continue
        line = line.strip()
        if line.startswith('#'):
            continue
        if ' ' in line:
            word = re.split(r'\s+', line, 1)[1]
            if ' ' in word:
                word = re.split(r'\s+', word, 1)[0]
            deleted_words.append(word)
    return deleted_words


def _edit_wordlist_in_vim(output, notes, original_text_tempfile):
    """Let the user edit the word list in vim; return the words they deleted.

    The list is written, with notes added, to a temporary file that is opened
    in vim on /dev/tty.  ORIGINAL_TEXT is set in the environment to
    *original_text_tempfile*.  After vim exits the notes are stripped and
    saved again (remove_notes) and the temporary file is removed.
    """
    (fd, edited_filename) = tempfile.mkstemp(prefix='new-word')
    os.close(fd)  # only the name is needed; do not leak the descriptor
    try:
        with codecs.open(edited_filename, "w", "utf-8") as f:
            f.write("".join(add_notes(output, notes)))

        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=edited_filename))

        lines_after = remove_notes(readlines_from_file(edited_filename), notes)
    finally:
        os.unlink(edited_filename)

    return _deleted_words(output, lines_after)


def filter_get_words_group_words_add_stat(args):
    """Find the unknown words of a text and show them.

    args -- input files and URLs (arguments containing 'http://' are fetched
            with readlines_from_url); standard input is read when empty.

    The words of the input (restricted to config['pages'] if set) are
    counted, the words of the vocabulary are removed unless 'no_filter' is
    configured, and the rest is sorted by the weight of the word groups.
    Depending on the configuration the result is written to standard output
    ('non_interactive' or 'text_stats'), served by web_editor ('web') or
    opened in vim; in the last case the words the user deletes are appended
    to the vocabulary file.

    Exits with status 1 when there is no input at all.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    input_lines = []
    if len(args) > 0:
        for arg in args:
            if 'http://' in arg:
                input_lines += readlines_from_url(arg)
            else:
                input_lines += readlines_from_file(arg)
    else:
        input_lines += readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    (fd, original_text_tempfile) = tempfile.mkstemp(prefix='new-word')
    os.close(fd)  # only the name is needed; do not leak the descriptor
    try:
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(lines))

        group_by = [1]
        if 'two_words' in config:
            group_by.append(2)
        if 'three_words' in config:
            group_by.append(3)
        words = get_words(lines, group_by)

        stats_only = 'text_stats' in config
        compressed_wordlist = 'compressed' in config
        show_range = int(config.get('show_range', 0))
        show_range_percentage = int(config.get('show_range_percentage', 0))

        stats = {}
        stats['total'] = sum(words.values())
        if 'no_filter' not in config:
            words = substract_dictionary(words, vocabulary)

        stats['total_unknown'] = sum(words.values())
        stats['total_known'] = stats['total'] - stats['total_unknown']
        if stats['total']:
            stats['percentage'] = 100.0*stats['total_known']/stats['total']
        else:
            # A text without a single word: avoid dividing by zero.
            stats['percentage'] = 0.0
        stats['percentage_unknown'] = 100.0 - stats['percentage']
        stats['groups'] = 0
        stats['words'] = len(words)
        stats['sentences'] = 0  #FIXME
        stats['wps'] = 0  #FIXME
        stats['uwps'] = 0  #FIXME
        stats['language'] = config['language']

        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)

        # Keep only the words whose normalized form is in the allowed list.
        if 'allowed_words' in config:
            allowed_words_filename = config['allowed_words']
            normalized_allowed_words = set(
                normalizator.normalize(w.rstrip('\n'))
                for w in readlines_from_file(allowed_words_filename))
            words = dict(
                (w, wn) for (w, wn) in words.items()
                if normalizator.normalize(w) in normalized_allowed_words)

        words_with_freq = []
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
            words_with_freq.append((words[k], k))

        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        if not config.get('no_words_grouping'):
            # Same order as compare_word_pairs(): group weight, normalized
            # word, count.  A key function normalizes every word only once.
            def group_sort_key(pair):
                normalized = normalizator.normalize(pair[1])
                return (wgw[normalized], normalized, int(pair[0]))

            words_with_freq = sorted(
                words_with_freq,
                key=group_sort_key,
                reverse=True)

        output = print_words_sorted(
            words_with_freq,
            stats,
            normalizator,
            stats_only=stats_only,
            compressed_wordlist=compressed_wordlist,
            show_range=show_range,
            show_range_percentage=show_range_percentage,
            )

        if 'non_interactive' in config or 'text_stats' in config:
            codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
        elif config.get('web', False):
            web_editor(output)
        else:
            deleted_words = _edit_wordlist_in_vim(
                output, notes, original_text_tempfile)
            # Words removed from the list by the user are known from now on.
            with codecs.open(voc_filename(), "a", "utf-8") as f:
                f.write("\n".join(deleted_words + ['']))
    finally:
        os.unlink(original_text_tempfile)
igor@54 779
# Script entry: copy the command-line options into config and run the filter.
(options, args) = parser.parse_args()

if options.language:
    config['language'] = options.language

# 'pages' is always present; the empty string selects the whole text.
config['pages'] = options.pages or ""

# Options with a value: copied only when given.  vocabulary_filename was
# missing here, so -v/--vocabulary-filename had no effect although
# voc_filename() looks for this key.
for _name in (
        'allowed_words',
        'show_range',
        'show_range_percentage',
        'vocabulary_filename',
        ):
    _value = getattr(options, _name)
    if _value:
        config[_name] = _value

# Flags: the key is present (and True) only when the flag was given, because
# the rest of the script tests `'<flag>' in config`.
for _name in (
        'non_interactive',
        'text_stats',
        'compressed',
        'no_filter',
        'two_words',
        'three_words',
        'no_words_grouping',
        'web',
        ):
    if getattr(options, _name):
        config[_name] = True

filter_get_words_group_words_add_stat(args)
igor@55 823
igor@55 824 #if options.function:
igor@55 825 # function_names = {
igor@55 826 # 'get_words_group_words_add_stat': ,
igor@55 827 # }
igor@55 828 # if options.function in function_names:
igor@55 829 # function_names[options.function](args)
igor@55 830 # else:
igor@55 831 # error_message("Unkown function %s.\nAvailable functions:\n%s" % (
igor@55 832 # options.function, "".join([" "+x for x in sorted(function_names.keys())])))
igor@55 833 # sys.exit(1)
igor@55 834 #
igor@37 835
igor@37 836
igor@37 837
igor@38 838 #os.system("vim")
igor@37 839