rev |
line source |
igor@37
|
1 #!/usr/bin/env python
|
igor@38
|
2 # -*- coding: utf-8 -*-
|
igor@37
|
3
|
igor@40
|
4 from __future__ import with_statement
|
igor@38
|
5 import codecs
|
igor@49
|
6 import difflib
|
igor@38
|
7 import logging
|
igor@38
|
8 import os
|
igor@37
|
9 import optparse
|
igor@38
|
10 import re
|
igor@38
|
11 import subprocess
|
igor@38
|
12 import sys
|
igor@38
|
13 import Stemmer
|
igor@54
|
14 import tempfile
|
igor@42
|
15 try:
|
igor@42
|
16 import psyco
|
igor@42
|
17 psyco.full()
|
igor@42
|
18 except:
|
igor@42
|
19 pass
|
igor@38
|
20
|
igor@38
|
# Script-wide settings. 'config_directory' is the directory in which the
# per-language vocabulary and notes files are looked up, 'language' selects
# which of those files is used.
config = {
    # expanduser() still yields a home directory when $HOME is unset, where
    # a direct os.environ['HOME'] lookup would raise KeyError at import time.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    'language': 'en',
}

# Everything from DEBUG level upwards is written to a fixed file under /tmp.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
|
igor@38
|
27
|
igor@38
|
class Normalizator:
    """Reduce words to a normalized form.

    A word is first followed through the ``linked_words`` mapping, then
    lower-cased and, when a stemmer is available for the language, stemmed.
    """

    # Stemmer algorithm name for every language code the script knows.
    _STEMMER_ALGORITHMS = {
        'de': 'german',
        'fr': 'french',
        'en': 'english',
        'es': 'spanish',
        'ru': 'russian',
        'it': 'italian',
        'uk': 'ukrainian',
    }

    def __init__(self, language, linked_words=None):
        """Create a normalizer for *language*.

        language     -- language code; codes without a usable stemmer leave
                        ``self.stemmer`` set to None (no stemming).
        linked_words -- optional mapping word -> replacement word. A fresh
                        dict is used when omitted, so instances never share
                        (and cannot accidentally mutate) a common default.
        """
        self.stemmer = None
        algorithm = self._STEMMER_ALGORITHMS.get(language)
        if algorithm is not None:
            try:
                self.stemmer = Stemmer.Stemmer(algorithm)
            except Exception:
                # Best effort: if the stemmer cannot be created (for example
                # the algorithm is not provided by the stemming library) the
                # words are only lower-cased. Unlike a bare "except:", this
                # lets KeyboardInterrupt and SystemExit through.
                logging.debug(
                    "no stemmer available for language %r", language,
                    exc_info=True)
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the normalized form of *word*.

        Links in ``self.linked_words`` are followed until a word without a
        link is reached or a word is seen for the second time (which protects
        against cycles in the mapping).
        """
        seen = set()
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]

        lowered = word.lower()
        if self.stemmer:
            return self.stemmer.stemWord(lowered)
        return lowered

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents *wordpairs_group* best.

        wordpairs_group -- non-empty iterable of pairs whose second element
                           is the word; the first element is used as a rank.

        The shortest words are the candidates; among them the one with the
        highest first element wins, and the earliest pair wins a tie.
        Raises ValueError for an empty group.
        """
        pairs = list(wordpairs_group)
        minimal_length = min(len(pair[1]) for pair in pairs)
        candidates = [pair for pair in pairs if len(pair[1]) == minimal_length]
        # max() returns the first maximal item, which is the same element a
        # stable descending sort would put in front.
        return max(candidates, key=lambda pair: pair[0])[1]

    def _best_word_with_variants(self, best_match):
        """Extend *best_match* with dictionary variants (currently unused).

        This logic used to follow an unconditional ``return`` inside
        best_word_from_group and therefore never ran. It is kept here,
        unchanged in substance, so that it can be re-enabled deliberately.
        It looks up variants of the word (and of candidate forms built by
        replacing a trailing "t") and returns either a single unambiguous
        form, the word followed by "_" when nothing fits, or the word
        followed by the variants ordered by similarity.

        NOTE(review): the two suffix checks are treated as independent,
        consecutive steps -- confirm against the original layout before
        wiring this back in.
        """
        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        corrected_best_match = best_match
        if best_match[-2:] == 'et':
            word = best_match[:-1] + "n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-2]

        if best_match[-1] == 't':
            word = best_match[:-1] + "en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]

        # A lower-case word only keeps the lower-case variants.
        if corrected_best_match[0].lower() == corrected_best_match[0]:
            suggestions = [x for x in suggestions if x[0].lower() == x[0]]

        if suggestions == []:
            return best_match + "_"

        def similarity(candidate):
            return difflib.SequenceMatcher(
                None,
                candidate.lower(),
                corrected_best_match.lower()).ratio()

        return best_match + " " + " ".join(
            sorted(suggestions, key=similarity, reverse=True))

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` command.

        The command is run with *word* as its only argument; its standard
        output is decoded as UTF-8 and returned as a list of lines without
        the trailing newline. Raises OSError when the command cannot be
        started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        # communicate() reads to EOF, closes the pipe and reaps the child,
        # so no zombie process or open descriptor is left behind.
        output = process.communicate()[0]
        lines = output.decode('utf-8').split('\n')
        if lines[-1] == '':
            # Text that ends with a newline has no further (empty) line.
            lines.pop()
        return lines
|
igor@49
|
123
|
igor@49
|
124
|
igor@37
|
# Command-line interface. Every option stores its value in the attribute
# named by "dest"; flags (action="store_true") are None unless given.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-C", "--compressed-to-line",
    help="show compressed wordlist: all words of the group in a line",
    action="store_true",
    dest="compressed_to_line")

# -k has its own destination: it used to write to "compressed", which made
# it silently behave like -c.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-G", "--words-grouping",
    help="turn off word grouping",
    action="store_true",
    dest="no_words_grouping")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-r", "--show-range",
    help="show only words specified number of words",
    action="store",
    dest="show_range")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-v", "--vocabulary-filename",
    help="use specified file as a vocabulary",
    action="store",
    dest="vocabulary_filename")

parser.add_option(
    "-w", "--web",
    help="Web browser version",
    action="store_true",
    dest="web")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
|
igor@37
|
270
|
igor@38
|
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    Line endings are kept on the returned lines.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())
|
igor@38
|
277
|
igor@54
|
def readlines_from_url(url):
    """Return the text of the page at *url*, rendered by lynx, as lines.

    ``lynx -dump`` is run on the URL and its output (error messages
    included) is decoded as UTF-8 and split at newlines, so the list ends
    with an empty string when the output ends with a newline. In every line
    the first ``http://`` address is removed.

    The URL is handed to lynx as a single argument without going through a
    shell, so quotes and shell metacharacters in it are harmless. Raises
    OSError when lynx cannot be started.
    """
    process = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    output = process.communicate()[0]

    # Same pattern that used to be applied by "perl -p -e 's@...@@'".
    address = re.compile(r"http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*")
    return [
        # count=1: like the former perl substitution (no /g flag) only the
        # first address of a line is dropped.
        address.sub('', line, 1)
        for line in output.decode('utf-8').split('\n')
    ]
|
igor@54
|
287
|
igor@38
|
def readlines_from_stdin():
    """Return everything on standard input, decoded as UTF-8, as lines."""
    make_utf8_reader = codecs.getreader("utf-8")
    decoded_stdin = make_utf8_reader(sys.stdin)
    return decoded_stdin.readlines()
|
igor@38
|
290
|
igor@38
|
# A separator is a run of non-word characters that does not start with an
# apostrophe (an underscore is a word character anyway).
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Tokens that are empty or consist of digits only are not counted.
_EMPTY_OR_DIGITS = re.compile(r'[0-9]*$')


def words_from_line(line):
    """Split *line* into its words.

    A trailing newline is removed first. The result may contain empty
    strings, e.g. when the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))


def get_words(lines, group_by=None):
    """Count the words found in *lines*.

    lines    -- iterable of text lines.
    group_by -- container of sequence lengths; single words are always
                counted, 2 additionally counts sequences of two consecutive
                words ("first_second") and 3 sequences of three
                ("first_second_third"). Defaults to [1].

    Returns a dict: word (or underscore-joined sequence) => number of
    occurrences. Empty and digits-only tokens are ignored and do not
    interrupt a sequence; sequences continue across line boundaries.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The two words seen before the current one ("" = not seen yet).
    before_previous, previous = "", ""
    for line in lines:
        for word in words_from_line(line):
            if _EMPTY_OR_DIGITS.match(word):
                continue
            result[word] = result.get(word, 0) + 1

            # Sequences end with the current word, so the last pair and the
            # last triple of the text are counted as well.
            if count_pairs and previous != "":
                pair = "%s_%s" % (previous, word)
                result[pair] = result.get(pair, 0) + 1
            if count_triples and before_previous != "" and previous != "":
                triple = "%s_%s_%s" % (before_previous, previous, word)
                result[triple] = result.get(triple, 0) + 1

            before_previous, previous = previous, word

    logging.debug(result)
    return result
|
igor@38
|
323
|
igor@54
|
def voc_filename():
    """Return the path of the vocabulary file.

    An explicit config['vocabulary_filename'] wins; otherwise the path is
    "<config_directory>/<language>.txt".
    """
    try:
        return config['vocabulary_filename']
    except KeyError:
        directory = config['config_directory']
        language = config['language']
        return "%s/%s.txt" % (directory, language)
|
igor@54
|
328
|
igor@38
|
def load_vocabulary():
    """Return the word counts (see get_words) of the vocabulary file."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)
|
igor@38
|
331
|
igor@38
|
def notes_filenames():
    """Return the list of notes files: "<config_directory>/notes-<language>.txt"."""
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
|
igor@38
|
334
|
igor@38
|
def load_notes(files):
    """Read the notes stored in *files*.

    Every line of a notes file has the form "<word><whitespace><note>";
    the files are read as UTF-8. Lines that do not consist of both parts
    (blank lines, a word without a note) are skipped instead of aborting
    the whole load with a ValueError.

    Returns a dict: word => {filename => note}.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                parts = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(parts) != 2:
                    continue
                word, note = parts
                notes.setdefault(word, {})[filename] = note
    return notes
|
igor@38
|
344
|
igor@39
|
def add_notes(lines, notes):
    """Return *lines* with the user's notes appended to the matching lines.

    lines -- list of text lines; lines starting with '#' are passed through
             untouched.
    notes -- dict word => {filename => note}; only the notes stored under
             the first file of notes_filenames() are used.

    The word of a line is whatever the pattern below captures (presumably
    the second field of a "<count> <word>" line -- confirm against the
    producer of these lines). When a note exists for it, the line is padded
    to 30 columns and the note is appended. Every input line is emitted
    exactly once, with or without a note.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes and notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
        result.append(line)
    return result
|
|