new-words

annotate new-words.py @ 68:846240941452

added -C key: compress to lines; fixed bug with #90-line
author Igor Chubin <igor@chub.in>
date Sun Sep 23 16:07:29 2012 +0300 (2012-09-23)
parents 5a003076eb11
children
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@49 6 import difflib
igor@38 7 import logging
igor@38 8 import os
igor@37 9 import optparse
igor@38 10 import re
igor@38 11 import subprocess
igor@38 12 import sys
igor@38 13 import Stemmer
igor@54 14 import tempfile
# psyco is an optional accelerator: use it when it can be imported,
# otherwise run without it.
try:
    import psyco
    psyco.full()
except ImportError:
    pass

# Global settings shared by the functions in this module.
#   config_directory -- directory holding the per-language vocabulary and
#                       notes files
#   language         -- language code used to pick those files
# expanduser() is used instead of os.environ['HOME'] so that importing the
# module does not fail with KeyError when HOME is not set.
config = {
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

# Debug output of the whole script goes to a fixed log file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 27
class Normalizator:
    """Reduce words to a normal form (linked-word lookup, lowercasing, stemming).

    Attributes:
        stemmer      -- a ``Stemmer.Stemmer`` instance for the chosen language,
                        or None when no stemmer could be created
        linked_words -- mapping word -> word it should be replaced with
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer for *language*.

        language     -- language code ('de', 'fr', 'en', 'es', 'ru', 'it', 'uk')
        linked_words -- optional mapping word -> replacement word; a fresh
                        empty dict is used when omitted, so instances never
                        share a default mapping
        """
        stemmer_algorithm = {
            'de': 'german',
            'fr': 'french',
            'en': 'english',
            'es': 'spanish',
            'ru': 'russian',
            'it': 'italian',
            'uk': 'ukrainian',
        }
        try:
            self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        except Exception:
            # Best effort: an unknown language code or a failure inside the
            # stemming library simply disables stemming.
            self.stemmer = None
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the normal form of *word*.

        Follows the linked_words chain starting at *word* (stopping as soon
        as a word repeats, so cycles cannot loop forever), lowercases the
        result and, when a stemmer is available, stems it.
        """
        seen = set()
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]
        word = word.lower()
        if self.stemmer:
            return self.stemmer.stemWord(word)
        return word

    def best_word_from_group(self, wordpairs_group):
        """Return the most representative word of *wordpairs_group*.

        wordpairs_group -- re-iterable collection of pairs whose second
                           element is a word; the first element is used only
                           as a ranking key (presumably a frequency -- TODO
                           confirm against the caller)

        Among the words of minimal length, the one with the greatest ranking
        key is returned; on ties the pair that comes first in the group wins.
        Raises ValueError when the group is empty.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # NOTE: a refinement step based on dictionary_suggestions() used to
        # follow here, but it sat behind an unconditional return and could
        # never run; it has been removed.
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` program.

        Runs ``de-variants <word>`` and returns its standard output decoded
        as UTF-8, one list item per line, without the trailing newlines.
        Raises OSError when the program cannot be started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        # communicate() reads everything, closes the pipe and reaps the child.
        output = process.communicate()[0]
        lines = output.split(b'\n')
        if lines and lines[-1] == b'':
            # Output ended with a newline: there is no further line after it.
            lines.pop()
        return [line.decode('utf-8') for line in lines]
igor@49 123
igor@49 124
def _build_option_parser():
    """Create the command line parser with all options of the script.

    Options are registered in the order listed below, which is also the
    order in which they appear in the --help output.
    """
    # (short flag, long flag, optparse action, destination attribute, help)
    option_table = [
        ("-a", "--no-marks", "store_true", "no_marks",
         "don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]"),
        ("-c", "--compressed", "store_true", "compressed",
         "show compressed wordlist: one word per group"),
        ("-C", "--compressed-to-line", "store_true", "compressed_to_line",
         "show compressed wordlist: all words of the group in a line"),
        # -k used to store into "compressed", which silently switched on -c.
        ("-k", "--known-words", "store_true", "known_words",
         "put higher words that are similar to the known words (only for English)"),
        ("-l", "--language", "store", "language",
         "specify language of text"),
        ("-f", "--allowed-words", "store", "allowed_words",
         "file with list of allowed words (words that will be shown in the output)"),
        ("-G", "--words-grouping", "store_true", "no_words_grouping",
         "turn off word grouping"),
        ("-X", "--function", "store", "function",
         "filter through subsystem [INTERNAL]"),
        ("-m", "--merge-tag", "store", "merge_tag",
         "merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]"),
        ("-M", "--merge-tagged", "store_true", "merge_tagged",
         "merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]"),
        ("-n", "--non-interactive", "store_true", "non_interactive",
         "non-interactive mode (don't run vi)"),
        ("-N", "--no-filter", "store_true", "no_filter",
         "switch off known words filtering"),
        ("-p", "--pages", "store", "pages",
         "work with specified pages only (pages = start-stop/total )"),
        ("-d", "--delete-tag", "store", "delete_tag",
         "delete subvocabulary of specified tag"),
        ("-r", "--show-range", "store", "show_range",
         "show only words specified number of words"),
        ("-R", "--show-range-percentage", "store", "show_range_percentage",
         "show only words that cover specified percentage of the text, skip the rest"),
        ("-s", "--text-stats", "store_true", "text_stats",
         "show the text statistics (percentage of known words and so on) and exit"),
        ("-S", "--voc-stats", "store_true", "voc_stats",
         "show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]"),
        ("-t", "--tag", "store", "tag",
         "tag known words with tag"),
        # The help text here used to be a copy of the one for -t.
        ("-T", "--show-tags", "store_true", "show_tags",
         "show tags"),
        ("-v", "--vocabulary-filename", "store", "vocabulary_filename",
         "use specified file as a vocabulary"),
        ("-w", "--web", "store_true", "web",
         "Web browser version"),
        ("-2", "--two-words", "store_true", "two_words",
         "find 2 words' sequences"),
        ("-3", "--three-words", "store_true", "three_words",
         "find 3 words' sequences"),
    ]
    option_parser = optparse.OptionParser()
    for short_flag, long_flag, action, dest, help_text in option_table:
        option_parser.add_option(
            short_flag, long_flag,
            help=help_text,
            action=action,
            dest=dest)
    return option_parser


# Module-level command line parser used by the rest of the script.
parser = _build_option_parser()
igor@37 270
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    Each item is a decoded (unicode) string that still carries its line
    terminator. The file is closed before the function returns.
    """
    with codecs.open(filename, "r", "utf-8") as f:
        # readlines() already builds the list; no per-line copy is needed.
        return f.readlines()
igor@38 277
def readlines_from_url(url):
    """Return the text of the page at *url* as a list of lines.

    The page is rendered by the external program ``lynx -dump``; its
    standard error is merged into the captured output. The output is
    decoded as UTF-8 and split on newlines (items carry no terminator), and
    the first ``http://...`` address on every line is removed.

    The URL is handed to lynx as a separate argument instead of being
    pasted into a shell command line, so quotes or shell metacharacters in
    *url* cannot run other commands.

    Raises OSError when lynx cannot be started.
    """
    dump = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
    ).communicate()[0]
    address = re.compile(r"http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*")
    # Decode once, then split: this works on the byte string returned by
    # communicate() under both Python 2 and Python 3.
    return [address.sub('', line, 1)
            for line in dump.decode('utf-8').split('\n')]
igor@54 287
def readlines_from_stdin():
    """Return all lines of standard input decoded as UTF-8.

    Each item keeps its line terminator. When sys.stdin exposes an
    underlying byte stream as ``buffer`` (text-mode stdin on Python 3) the
    bytes are read from there; otherwise sys.stdin itself is treated as the
    byte stream.
    """
    byte_stream = getattr(sys.stdin, 'buffer', sys.stdin)
    return codecs.getreader("utf-8")(byte_stream).readlines()
igor@38 290
# Word separator: a run of one or more non-word characters. The run may not
# begin at an apostrophe, so a word such as "don't" stays in one piece.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Matches the empty string and strings made of digits only.
_NUMERIC_OR_EMPTY = re.compile('[0-9]*$')


def words_from_line(line):
    """Split *line* into words.

    Trailing newlines are removed first. The returned list can contain
    empty strings, for example when the line starts or ends with a
    separator.
    """
    line = line.rstrip('\n')
    return _WORD_SEPARATOR.split(line)


def get_words(lines, group_by=(1,)):
    """Count the words (and optionally word sequences) found in *lines*.

    lines    -- iterable of text lines
    group_by -- container of sequence lengths to count in addition to the
                single words: 2 adds every pair of consecutive words under
                the key "first_second", 3 adds every triple under
                "first_second_third". Single words are always counted.

    Empty strings and purely numeric words are skipped and do not take part
    in sequences. Sequences continue across line boundaries.

    Every sequence ends at the word currently being read, so the last words
    of the text are included in the counts as well.

    Returns a dict mapping each word or sequence to its number of occurrences.
    """
    result = {}
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    # The two most recently accepted words ("" while not yet available).
    before_previous, previous = "", ""
    for line in lines:
        for word in words_from_line(line):
            if _NUMERIC_OR_EMPTY.match(word):
                continue
            keys = [word]
            if count_pairs and previous:
                keys.append("%s_%s" % (previous, word))
            # before_previous is only ever set once previous is set too.
            if count_triples and before_previous:
                keys.append("%s_%s_%s" % (before_previous, previous, word))
            for key in keys:
                result[key] = result.get(key, 0) + 1
            before_previous, previous = previous, word

    logging.debug(result)
    return result
igor@38 323
def voc_filename():
    """Return the path of the vocabulary file.

    An explicit config['vocabulary_filename'] entry wins; without it the
    path is "<config_directory>/<language>.txt", built from the config.
    """
    if 'vocabulary_filename' not in config:
        directory = config['config_directory']
        language = config['language']
        return "%s/%s.txt"%(directory, language)
    return config['vocabulary_filename']
igor@54 328
def load_vocabulary():
    """Read the vocabulary file and return its word counts.

    The file named by voc_filename() is read line by line and passed to
    get_words(); the resulting dict is returned unchanged.
    """
    vocabulary_lines = readlines_from_file(voc_filename())
    vocabulary = get_words(vocabulary_lines)
    return vocabulary
igor@38 331
def notes_filenames():
    """Return the list of notes files for the configured language.

    The list has a single item: "<config_directory>/notes-<language>.txt".
    """
    directory = config['config_directory']
    language = config['language']
    notes_path = "%s/notes-%s.txt"%(directory, language)
    return [notes_path]
igor@38 334
def load_notes(files):
    """Load the notes stored in the UTF-8 encoded *files*.

    Every line is expected to hold a word, whitespace, and the note text
    (the rest of the line). Lines that do not split into both parts --
    blank lines, or a word with nothing after it -- are ignored instead of
    aborting the whole load.

    Returns a dict: word -> {filename: note}.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) != 2:
                    # No whitespace on the line: nothing to record.
                    continue
                word, note = fields
                notes.setdefault(word, {})[filename] = note
    return notes
igor@38 344
def add_notes(lines, notes):
    """Return a copy of *lines* with the known notes appended.

    lines -- list of text lines
    notes -- dict word -> {notes filename: note}

    Lines starting with '#' are passed through untouched. For any other
    line the regular expression below picks a word (the run of non-space
    characters captured after the first one). When *notes* holds an entry
    for that word under the first file of notes_filenames(), the line is
    padded to 30 columns and the note is appended. Every input line yields
    exactly one output line.
    """
    notes_filename = notes_filenames()[0]
    annotated = []
    for line in lines:
        if not line.startswith('#'):
            found = re.search('^\s*\S+\s*(\S+)', line)
            if found:
                word = found.group(1)
                if word in notes and notes_filename in notes[word]:
                    stripped = line.rstrip('\n')
                    line = "%-30s %s\n" % (stripped, notes[word][notes_filename])
        annotated.append(line)
    return annotated