new-words

annotate new-words.py @ 60:3682038403ad

minifix
author Igor Chubin <igor@chub.in>
date Fri Nov 04 11:27:21 2011 +0100 (2011-11-04)
parents 2a1a25e61872
children 1b8b30ad7c95
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@40 4 from __future__ import with_statement
igor@38 5 import codecs
igor@49 6 import difflib
igor@38 7 import logging
igor@38 8 import os
igor@37 9 import optparse
igor@38 10 import re
igor@38 11 import subprocess
igor@38 12 import sys
igor@38 13 import Stemmer
igor@54 14 import tempfile
# Optional speed-up: psyco JIT-compiles pure-Python code on CPython 2.x.
# Its absence is normal, so only ImportError is silenced; the original
# bare "except:" also swallowed SystemExit/KeyboardInterrupt.
try:
    import psyco
    psyco.full()
except ImportError:
    pass
igor@38 20
# Runtime configuration: vocabulary/notes files live under ~/.new-words;
# the default text language is English.
config = {
    'config_directory': "%s/.new-words" % os.environ['HOME'],
    'language': 'en',
}

# Debug trace goes to a fixed file under /tmp.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 27
class Normalizator:
    """Reduces words to a canonical form.

    User-defined word links (word -> replacement word) are followed first,
    then the result is stemmed with a Snowball stemmer for the configured
    language.
    """

    def __init__(self, language, linked_words=None):
        """language: two-letter code (must be a key of the table below).
        linked_words: optional word -> replacement-word mapping."""
        # Two-letter language codes to Snowball stemmer algorithm names.
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # Was a mutable default argument ({}); use a None sentinel so a
        # default dictionary is never shared between instances.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the stemmed, lower-cased form of *word*.

        Word links are followed first; word_chain guards against an
        endless loop when the links form a cycle.
        """
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the most relevant word of a (count, word) pair group.

        Heuristic: among the shortest words of the group, pick the one
        with the highest count.  (An earlier revision additionally
        consulted the external dictionary via dictionary_suggestions();
        that code was disabled by an early return and has been removed
        as unreachable.)
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest_pairs = (pair for pair in wordpairs_group
                          if len(pair[1]) == minimal_length)
        # Highest count first; take that pair's word.
        return sorted(shortest_pairs,
                      key=lambda pair: pair[0],
                      reverse=True)[0][1]

    def dictionary_suggestions(self, word):
        """Return dictionary variants of *word* as produced by the
        external `de-variants` helper, one suggestion per output line."""
        return [
            x.decode('utf-8').rstrip('\n')
            for x
            in subprocess.Popen(
                ["de-variants", word],
                stdout=subprocess.PIPE
            ).stdout.readlines() ]
igor@49 116
igor@49 117
# Command-line interface.  Option destinations are read by the main part
# of the script (not shown in this view).
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# FIXME: dest duplicates -c/--compressed; "known_words" was probably
# intended.  Kept as-is because downstream code may rely on the current
# behaviour.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-G", "--words-grouping",
    help="turn off word grouping",
    action="store_true",
    dest="no_words_grouping")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

# Help text fixed: was the garbled "show only words specified number of words".
parser.add_option(
    "-r", "--show-range",
    help="show only the specified number of words",
    action="store",
    dest="show_range")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# Help text fixed: was a copy-paste of -t's help ("tag known words with tag").
parser.add_option(
    "-T", "--show-tags",
    help="show the list of used tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
igor@37 245
def readlines_from_file(filename):
    """Read *filename* as UTF-8 and return its lines, newlines kept."""
    with codecs.open(filename, "r", "utf-8") as f:
        return f.readlines()
igor@38 252
def readlines_from_url(url):
    """Fetch *url*, dump it to text with lynx, strip inline http:// links
    with perl, and return the output as a list of UTF-8 decoded lines.

    NOTE(review): *url* is interpolated into a shell command line, so a
    URL containing a single quote can break out of the quoting (shell
    injection).  Confirm callers only pass trusted URLs.
    NOTE(review): communicate() returns bytes; .split('\n') on it only
    works under Python 2 -- this file appears to target Python 2.
    """
    return [x.decode('utf-8') for x in
        subprocess.Popen(
            "lynx -dump '{url}' | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'".format(url=url),
            shell = True,
            stdout = subprocess.PIPE,
            stderr = subprocess.STDOUT
        ).communicate()[0].split('\n')
    ]
igor@54 262
def readlines_from_stdin():
    """Return all lines of standard input decoded as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
igor@38 265
def words_from_line(line):
    """Split one line of text into words.

    Words are separated by runs of non-word characters; a separator run
    may not start with an apostrophe or underscore, so contractions like
    "don't" survive intact.  (Two commented-out older split patterns
    were removed as dead code.)
    """
    line = line.rstrip('\n')
    return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
igor@38 271
def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    group_by selects which n-gram sizes are counted: 1 counts single
    words; 2 and 3 additionally count two- and three-word sequences
    joined with '_'.  (Default changed from the mutable list [1] to an
    equivalent immutable tuple.)
    """
    result = {}
    # Purely numeric tokens are skipped; compile the check once instead
    # of on every word.
    number_re = re.compile('[0-9]*$')
    # (a, b, c) is a sliding window of the three most recent words,
    # used for the 2-/3-gram counts.
    (a, b, c) = ("", "", "")
    for line in lines:
        words = words_from_line(line)
        for word in words:
            if number_re.match(word):
                continue
            result.setdefault(word, 0)
            result[word] += 1
            if 2 in group_by and a != "" and b != "":
                w = "%s_%s" % (a, b)
                result.setdefault(w, 0)
                result[w] += 1
            if 3 in group_by and not "" in [a, b, c]:
                w = "%s_%s_%s" % (a, b, c)
                result.setdefault(w, 0)
                result[w] += 1
            (a, b, c) = (b, c, word)

    logging.debug(result)
    return result
igor@38 298
def voc_filename():
    """Path of the vocabulary file for the configured language."""
    directory = config['config_directory']
    language = config['language']
    return "%s/%s.txt" % (directory, language)
igor@54 301
def load_vocabulary():
    """Load the user's vocabulary file as a word -> count dictionary."""
    return get_words(readlines_from_file(voc_filename()))
igor@38 304
def notes_filenames():
    """Return the list of notes files for the configured language
    (currently a single file)."""
    filename = "%s/notes-%s.txt" % (config['config_directory'],
                                    config['language'])
    return [filename]
igor@38 307
def load_notes(files):
    """Load notes from the given files.

    Each line of a notes file is a word followed by whitespace and the
    note text.  Returns a mapping word -> {filename: note}.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as handle:
            for raw_line in handle.readlines():
                word, note = re.split('\s+', raw_line.rstrip('\n'),
                                      maxsplit=1)
                per_file = notes.setdefault(word, {})
                per_file[filename] = note
    return notes
igor@38 317
def add_notes(lines, notes):
    """Append stored notes to wordlist lines.

    Lines beginning with '#' pass through untouched.  For other lines
    the second whitespace-separated token is taken as the word; when a
    note for it exists in the primary notes file, the note is appended
    with the line padded to 30 columns.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if line.startswith('#'):
            result.append(line)
            continue
        match_object = re.search('^\s*\S+\s*(\S+)', line)
        if not match_object:
            result.append(line)
            continue
        word = match_object.group(1)
        if word in notes and notes_filename in notes[word]:
            line = "%-30s %s\n" % (line.rstrip('\n'),
                                   notes[word][notes_filename])
        result.append(line)
    return result
igor@39 338
igor@39 339 def remove_notes(lines, notes_group):
igor@39 340 notes_filename = notes_filenames()[0]
igor@39 341 notes = {}
igor@39 342 for k in notes_group.keys():
igor@39 343 if notes_filename in notes_group[k]:
igor@39 344 notes[k] = notes_group[k][notes_filename]
igor@39 345
igor@39 346 result = []
igor@39 347 for line in lines:
igor@39 348 line = line.rstrip('\n')
igor@39 349 match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
igor@39 350 if match_object:
igor@39 351 result.append("".join([
igor@39 352 match_object.group(1),
igor@39 353 match_object.group(2),
igor@39 354 match_object.group(3),
igor@39 355 match_object.group(4),
igor@39 356 "\n"
igor@39 357 ]))
igor@39 358 notes[match_object.group(4)] = match_object.group(6)
igor@39 359 else:
igor@39 360 result.append(line+"\n")