new-words

annotate new-words.py @ 54:e25de9ea9184

new-words.py is almost ready
author Igor Chubin <igor@chub.in>
date Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents f583256b7ab1
children 2a1a25e61872
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@49 6 import difflib
igor@38 7 import logging
igor@38 8 import os
igor@37 9 import optparse
igor@38 10 import re
igor@38 11 import subprocess
igor@38 12 import sys
igor@38 13 import Stemmer
igor@54 14 import tempfile
# psyco is an optional accelerator; the script must work without it.
try:
    import psyco
    psyco.full()
except Exception:
    # Best effort only: a missing or unusable psyco never stops the program.
    # Unlike a bare "except:", this does not swallow KeyboardInterrupt or
    # SystemExit.
    pass

# Global settings shared by the functions of this script.
config = {
    # expanduser('~') yields $HOME when it is set and still resolves a home
    # directory when it is not, instead of failing with KeyError at import.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 27
class Normalizator:
    """Brings words to a normal form.

    A word is first replaced through the user supplied ``linked_words``
    mapping and then stemmed with the stemmer of the selected language.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- two-letter language code; must be one of the keys of
                        the table below, otherwise KeyError is raised
        linked_words -- optional mapping "word -> word it is linked to"
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a mutable default argument would be
        # shared by every Normalizator created without linked_words.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of the lower-cased word.

        Links from ``linked_words`` are followed first; a word that was
        already visited ends the walk, so cyclic links cannot loop forever.
        """
        seen = set()
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Returns the word that is the most relevant to the wordpairs_group.

        wordpairs_group is a sequence of pairs whose second item is the word
        (the first item is presumably its frequency -- confirm with callers).

        At the moment: returns the word with minimal length; among several
        words of that length the one with the greatest first item wins, and
        the earliest pair wins a remaining tie.

        Raises ValueError for an empty group.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest_pairs = [pair for pair in wordpairs_group
                          if len(pair[1]) == minimal_length]
        # max() returns the first maximal element, which matches a stable
        # descending sort followed by taking element 0.
        return max(shortest_pairs, key=lambda pair: pair[0])[1]

    def _best_word_with_suggestions(self, best_match):
        """Extend best_match with dictionary suggestions (currently unused).

        NOTE(review): this logic used to sit after an unconditional
        ``return`` inside best_word_from_group and therefore never ran. It is
        kept here so it is not lost; nothing calls it.

        The word itself is returned when the dictionary knows exactly one
        variant. For words ending in "t"/"et" the forms ending in "en"/"n"
        are tried as well (looks like German verb handling -- confirm).
        Otherwise the word is returned followed by the suggestions, the most
        similar first, or followed by "_" when there are none.
        """
        def similarity(x, y):
            return difflib.SequenceMatcher(None, x.lower(), y.lower()).ratio()

        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        corrected_best_match = best_match
        if best_match[-2:] == 'et':
            word = best_match[:-1] + "n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-2]

        if best_match[-1:] == 't':
            word = best_match[:-1] + "en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]

        # For a word written in lower case keep only lower-case suggestions.
        if corrected_best_match[:1].lower() == corrected_best_match[:1]:
            suggestions = [x for x in suggestions
                           if x[:1].lower() == x[:1]]

        if suggestions == []:
            return best_match + "_"
        return best_match + " " + " ".join(
            sorted(
                suggestions,
                key=lambda x: similarity(x, corrected_best_match),
                reverse=True))

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` program
        run for word, decoded from UTF-8 and without the trailing newline.

        Raises OSError when the program cannot be started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        try:
            raw_lines = process.stdout.readlines()
        finally:
            # Close the pipe and reap the child; otherwise every call leaves
            # an open descriptor and a zombie process behind.
            process.stdout.close()
            process.wait()
        return [x.decode('utf-8').rstrip('\n') for x in raw_lines]
igor@49 117
def _build_option_parser():
    """Create the command-line parser of the script.

    Every row of the table is (short flag, long flag, action, dest, help);
    the options are registered in table order, which is also the order in
    which --help lists them.
    """
    option_table = (
        ("-a", "--no-marks", "store_true", "no_marks",
         "don't add marks (and don't save marks added by user)"),
        ("-c", "--compressed", "store_true", "compressed",
         "show compressed wordlist: one word per group"),
        # This option used to store into "compressed" (copy-paste from -c),
        # so -k silently switched on the compressed output instead.
        ("-k", "--known-words", "store_true", "known_words",
         "put higher words that are similar to the known words (only for English)"),
        ("-l", "--language", "store", "language",
         "specify language of text"),
        ("-f", "--allowed-words", "store", "allowed_words",
         "file with list of allowed words (words that will be shown in the output)"),
        ("-X", "--function", "store", "function",
         "filter through subsystem [INTERNAL]"),
        ("-m", "--merge-tag", "store", "merge_tag",
         "merge words tagged with specified tag into the main vocabulary"),
        ("-M", "--merge-tagged", "store_true", "merge_tagged",
         "merge words tagged with ANY tag into the main vocabulary"),
        ("-n", "--non-interactive", "store_true", "non_interactive",
         "non-interactive mode (don't run vi)"),
        ("-N", "--no-filter", "store_true", "no_filter",
         "switch off known words filtering"),
        ("-p", "--pages", "store", "pages",
         "work with specified pages only (pages = start-stop/total )"),
        ("-d", "--delete-tag", "store", "delete_tag",
         "delete subvocabulary of specified tag"),
        ("-R", "--show-range-percentage", "store", "show_range_percentage",
         "show only words that cover specified percentage of the text, skip the rest"),
        ("-s", "--text-stats", "store_true", "text_stats",
         "show the text statistics (percentage of known words and so on) and exit"),
        ("-S", "--voc-stats", "store_true", "voc_stats",
         "show your vocabulary statistics (number of words and word groups)"),
        ("-t", "--tag", "store", "tag",
         "tag known words with tag"),
        # The help text used to be a copy of the one for -t.
        ("-T", "--show-tags", "store_true", "show_tags",
         "show tags"),
        ("-2", "--two-words", "store_true", "two_words",
         "find 2 words' sequences"),
        ("-3", "--three-words", "store_true", "three_words",
         "find 3 words' sequences"),
    )

    option_parser = optparse.OptionParser()
    for (short_flag, long_flag, action, dest, help_text) in option_table:
        option_parser.add_option(
            short_flag, long_flag,
            help=help_text,
            action=action,
            dest=dest)
    return option_parser


parser = _build_option_parser()
igor@37 233
def readlines_from_file(filename):
    """
    Returns the list of lines of the UTF-8 encoded text file filename;
    the line endings are kept.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())
igor@38 240
def _shell_quote(text):
    """Returns text as one single-quoted /bin/sh word; embedded single
    quotes are escaped."""
    return "'" + text.replace("'", "'\\''") + "'"


def readlines_from_url(url):
    """
    Returns the text of the page at url as a list of unicode lines.

    The page is rendered with "lynx -dump"; perl then removes the first
    http:// address from every line. stderr of the pipeline is merged into
    the result. The output is expected to be UTF-8 (UnicodeDecodeError
    otherwise). The last item is an empty string when the output ends with
    a newline.
    """
    # The URL comes from the user and goes through a shell, so it has to be
    # quoted: a quote character inside it could otherwise run any command.
    command = (
        "lynx -dump {url} | "
        "perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\\[\\]~-]*@@'"
    ).format(url=_shell_quote(url))
    output = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
    ).communicate()[0]
    # Decode once and then split: the result is the same as decoding every
    # line separately, and it does not depend on the pipe returning str.
    return output.decode('utf-8').split(u'\n')
igor@54 250
def readlines_from_stdin():
    """
    Returns everything available on the standard input, decoded as UTF-8,
    as a list of lines.
    """
    utf8_reader = codecs.getreader("utf-8")
    stdin_reader = utf8_reader(sys.stdin)
    return stdin_reader.readlines()
igor@38 253
# What separates two words: a run of non-word characters that does not start
# with an apostrophe. An apostrophe between letters therefore stays inside
# the word ("don't"); the underscore is a word character anyway.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """
    Splits line into words and returns them as a list.

    Trailing newlines are removed first. Separators at the beginning or at
    the end of the line produce empty strings in the result.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))
igor@38 259
def get_words(lines, group_by=None):
    """
    Returns hash of words in a file
    word => number

    lines    -- iterable of text lines
    group_by -- collection of sequence lengths to count in addition to the
                single words (which are always counted): 2 adds pairs
                "w1_w2", 3 adds triples "w1_w2_w3". Defaults to [1].

    Empty strings and tokens that consist of digits only are skipped; they
    are not counted and do not become part of a sequence. Sequences may
    cross line boundaries.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The last three accepted words; c is the most recent one.
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            # The pattern also matches the empty string.
            if re.match('[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1

            # Shift the window BEFORE building the sequences, so that they
            # end with the current word. Doing it afterwards made the
            # sequences lag behind and the last two pairs and the last
            # triple of the text were never counted.
            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                w = "%s_%s" % (b, c)
                result[w] = result.get(w, 0) + 1
            # a is only non-empty once b and c have been filled as well.
            if count_triples and a != "":
                w = "%s_%s_%s" % (a, b, c)
                result[w] = result.get(w, 0) + 1

    logging.debug(result)
    return result
igor@38 286
def voc_filename():
    """
    Returns the path of the vocabulary file of the configured language:
    <config_directory>/<language>.txt
    """
    directory = config['config_directory']
    language = config['language']
    return "%s/%s.txt" % (directory, language)
igor@54 289
def load_vocabulary():
    """
    Reads the vocabulary file (see voc_filename) and returns the hash
    word => number produced by get_words for its lines.
    """
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
igor@38 292
def notes_filenames():
    """
    Returns the list of notes files of the configured language. It has
    exactly one item: <config_directory>/notes-<language>.txt
    """
    notes_path = "%s/notes-%s.txt" % (
        config['config_directory'],
        config['language'])
    return [notes_path]
igor@38 295
def load_notes(files):
    """
    Reads the notes from the UTF-8 encoded files and returns
    word => {filename => note}

    Every line has the form "<word><whitespace><note>". Lines that do not
    contain both parts (blank lines, a word without a note) are skipped;
    they used to abort the whole load with ValueError.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                parts = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(parts) != 2:
                    continue
                (word, note) = parts
                notes.setdefault(word, {})[filename] = note
    return notes
igor@38 305
def add_notes(lines, notes):
    """
    Returns a copy of lines in which the lines of annotated words are
    followed by their note.

    lines -- text lines; the word is the second whitespace-separated token
             of a line. Lines starting with '#' are passed through.
    notes -- word => {filename => note}, as returned by load_notes; only
             the notes of the first file of notes_filenames() are used.

    An annotated line is padded to 30 columns and gets the note appended.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            # The two tokens must be separated by whitespace (as in
            # remove_notes). With "\s*" a line with a single token matched
            # too, and the last character of that token was taken as the
            # word.
            match_object = re.search(r'^\s*\S+\s+(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes and notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'),
                        notes[word][notes_filename])
        result.append(line)
    return result
igor@39 326
def remove_notes(lines, notes_group):
    """
    Strips the notes from lines, saves them and returns the stripped lines.

    lines       -- text lines; a line of the form
                   "<space><token><space><word><space><note>" carries a note
    notes_group -- word => {filename => note}; the notes of the first file
                   of notes_filenames() are the starting point

    The notes found in lines override the known ones; the merged notes are
    passed to save_notes for that file. Every returned line ends with
    exactly one newline.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        # Groups 1-4 are adjacent and start at column 0, so everything up
        # to the end of group 4 is the line without its note.
        result.append(text[:found.end(4)] + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(notes_filename, notes)
    return result
igor@39 352
igor@39 353 def save_notes(filename, notes):
igor@39 354 lines = []
igor@39 355 saved_words = []
igor@39 356 with codecs.open(filename, "r", "utf-8") as f:
igor@39 357 for line in f.readlines():
igor@39 358 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@39 359 if word in notes:
igor@39 360 line = "%-29s %s\n" % (word, notes[word])
igor@39 361 saved_words.append(word)
igor@39 <