rev |
line source |
igor@37
|
1 #!/usr/bin/env python
|
igor@38
|
2 # -*- coding: utf-8 -*-
|
igor@37
|
3
|
igor@40
|
4 from __future__ import with_statement
|
igor@38
|
5 import codecs
|
igor@49
|
6 import difflib
|
igor@38
|
7 import logging
|
igor@38
|
8 import os
|
igor@37
|
9 import optparse
|
igor@38
|
10 import re
|
igor@38
|
11 import subprocess
|
igor@38
|
12 import sys
|
igor@38
|
13 import Stemmer
|
igor@54
|
14 import tempfile
|
try:
    # Optional speedup: psyco JIT-compiles the hot paths when available.
    import psyco
    psyco.full()
except Exception:
    # psyco is absent on most installs; the script works fine without it,
    # just slower.  Catch Exception (not a bare except) so SystemExit and
    # KeyboardInterrupt are not silently swallowed.
    pass
igor@38
|
20
|
# Runtime configuration: the vocabulary and notes files live under
# ~/.new-words, and the language code selects the stemmer plus the
# per-language file names.
config = {
    # os.path.expanduser falls back to the pwd database when $HOME is
    # unset, unlike the previous bare os.environ['HOME'] lookup, which
    # raised KeyError in that case.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38
|
27
|
class Normalizator:
    """Normalizes words before they are counted or grouped.

    A word is first followed through the ``linked_words`` chain
    (user-supplied "this form belongs to that word" links), then
    lowercased and stemmed with the Snowball stemmer for the
    configured language.
    """

    def __init__(self, language, linked_words=None):
        # linked_words previously defaulted to a mutable {} shared
        # between all instances; None + fallback keeps the call
        # signature compatible without that trap.
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'es': 'spanish',
            'ru': 'russian',
            'it': 'italian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        self.linked_words = linked_words if linked_words is not None else {}

    def normalize(self, word):
        # Follow explicit word links first; word_chain guards against
        # an infinite loop if the links form a cycle.
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Returns the word that is the most relevant to the wordpairs_group.

        wordpairs_group is a sequence of (frequency, word) pairs.
        At the moment: returns the shortest word; among equally short
        words, the one with the highest frequency wins.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        best_match = list(x[1] for x in sorted(
            (x for x in wordpairs_group if len(x[1]) == minimal_length),
            key=lambda x: x[0],
            reverse=True))[0]

        # NOTE(review): a German dictionary-based refinement (verb
        # de-conjugation via dictionary_suggestions) used to follow this
        # return but was unreachable dead code; it has been removed.
        return best_match

    def dictionary_suggestions(self, word):
        """Ask the external ``de-variants`` helper for dictionary
        variants of *word*; returns them as a list of unicode strings."""
        return [
            x.decode('utf-8').rstrip('\n')
            for x
            in subprocess.Popen(
                ["de-variants", word],
                stdout=subprocess.PIPE
            ).stdout.readlines()]
igor@49
|
116
|
igor@49
|
117
|
# Command-line interface definition.  The parsed options object is
# consumed elsewhere in the script via the dest names below.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    # BUG FIX: dest was "compressed", colliding with -c, so -k silently
    # toggled compression instead of its own feature.
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-G", "--words-grouping",
    help="turn off word grouping",
    action="store_true",
    dest="no_words_grouping")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-r", "--show-range",
    # BUG FIX: help text was garbled ("show only words specified number
    # of words").
    help="show only specified number of words",
    action="store",
    dest="show_range")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    # BUG FIX: help text was copy-pasted from -t; -T lists the tags.
    help="show tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
igor@37
|
245
|
def readlines_from_file(filename):
    """Return the file's lines (newlines kept) decoded as UTF-8."""
    # readlines() already yields the list the old manual append loop
    # built element by element.
    with codecs.open(filename, "r", "utf-8") as f:
        return f.readlines()
igor@38
|
252
|
def readlines_from_url(url):
    """Fetch *url*, render it to text with ``lynx -dump``, strip the
    bare http:// link dumps with perl, and return the lines as unicode.

    SECURITY FIX: the previous version interpolated *url* into a
    shell=True command string, so a URL containing a single quote could
    inject arbitrary shell commands.  The pipeline is now built from
    argument lists with no shell involved.
    """
    lynx = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    strip_links = subprocess.Popen(
        ["perl", "-p", "-e", "s@http://[a-zA-Z&_.:/0-9%?=,#+()\\[\\]~-]*@@"],
        stdin=lynx.stdout,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)
    # Close our handle so lynx receives SIGPIPE if perl exits early.
    lynx.stdout.close()
    return [x.decode('utf-8') for x in
            strip_links.communicate()[0].split('\n')]
igor@54
|
262
|
def readlines_from_stdin():
    """Read every line of standard input, decoding it as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
igor@38
|
265
|
def words_from_line(line):
    """Split one line of text into word tokens.

    Tokens are separated by runs of non-word characters, except that a
    run beginning with an apostrophe does not split (so "don't" stays
    in one piece).
    """
    stripped = line.rstrip('\n')
    separator = re.compile("(?!['_])(?:\W)+", flags=re.UNICODE)
    return separator.split(stripped)
igor@38
|
271
|
def get_words(lines, group_by=None):
    """
    Returns hash of words in a file
    word => number

    group_by selects which n-gram sizes to count besides single words:
    adding 2 and/or 3 also counts two- and three-word sequences
    (joined with '_').  Defaults to single words only.
    """
    # The default used to be the mutable literal [1], shared between
    # calls; a None sentinel gives the same behavior safely.
    if group_by is None:
        group_by = [1]
    result = {}
    # (a, b, c) is a sliding window of the three most recent words.
    (a, b, c) = ("", "", "")
    for line in lines:
        words = words_from_line(line)
        for word in words:
            # Skip tokens that are empty or purely numeric.
            if re.match('[0-9]*$', word):
                continue
            result.setdefault(word, 0)
            result[word] += 1
            # NOTE(review): the window lags one word behind the current
            # word, so n-grams containing the final word(s) of the input
            # are never counted — preserved as-is to keep output
            # identical to the original.
            if 2 in group_by and a != "" and b != "":
                w = "%s_%s" % (a, b)
                result.setdefault(w, 0)
                result[w] += 1
            if 3 in group_by and not "" in [a, b, c]:
                w = "%s_%s_%s" % (a, b, c)
                result.setdefault(w, 0)
                result[w] += 1
            (a, b, c) = (b, c, word)

    logging.debug(result)
    return result
igor@38
|
298
|
def voc_filename():
    # Vocabulary file for the configured language,
    # e.g. ~/.new-words/en.txt.
    return "%s/%s.txt"%(config['config_directory'], config['language'])
igor@54
|
301
|
def load_vocabulary():
    # Word -> occurrence-count map built from the user's vocabulary file.
    return get_words(readlines_from_file(voc_filename()))
igor@38
|
304
|
def notes_filenames():
    # Note files for the configured language (currently a single file,
    # e.g. ~/.new-words/notes-en.txt, but returned as a list).
    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
igor@38
|
307
|
def load_notes(files):
    """Load per-word notes from the given files.

    Each line is "word<whitespace>free-form note".  Returns
    {word: {filename: note}} so a note can be traced back to the file
    it came from.

    ROBUSTNESS FIX: blank lines and lines without any whitespace used
    to crash the 2-tuple unpack with ValueError; they are now skipped.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) != 2:
                    # Blank or single-token line: nothing to record.
                    continue
                (word, note) = fields
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
igor@38
|
317
|
def add_notes(lines, notes):
    """Append the saved note (if any) to each wordlist line.

    Comment lines (starting with '#') pass through untouched.  For a
    wordlist line, the second whitespace-separated token is the word;
    when that word has a note recorded for the primary notes file, the
    note is appended in a padded column.
    """
    notes_filename = notes_filenames()[0]
    annotated = []
    for line in lines:
        new_line = line
        if not line.startswith('#'):
            match_object = re.search('^\s*\S+\s*(\S+)', line)
            if match_object:
                per_file = notes.get(match_object.group(1), {})
                if notes_filename in per_file:
                    new_line = "%-30s %s\n" % (
                        line.rstrip('\n'), per_file[notes_filename])
        annotated.append(new_line)
    return annotated
igor@39
|
338
|
igor@39
|
339 def remove_notes(lines, notes_group):
|
igor@39
|
340 notes_filename = notes_filenames()[0]
|
igor@39
|
341 notes = {}
|
igor@39
|
342 for k in notes_group.keys():
|
igor@39
|
343 if notes_filename in notes_group[k]:
|
igor@39
|
344 notes[k] = notes_group[k][notes_filename]
|
igor@39
|
345
|
igor@39
|
346 result = []
|
igor@39
|
347 for line in lines:
|
igor@39
|
348 line = line.rstrip('\n')
|
igor@39
|
349 match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
|
igor@39
|
350 if match_object:
|
igor@39
|
351 result.append("".join([
|
igor@39
|
352 match_object.group(1),
|
igor@39
|
353 match_object.group(2),
|
igor@39
|
354 match_object.group(3),
|
igor@39
|
355 match_object.group(4),
|
igor@39
|
356 "\n"
|
igor@39
|
357 ]))
|
igor@39
|
358 notes[match_object.group(4)] = match_object.group(6)
|
igor@39
|
359 else:
|
igor@39
|
360 result.append(line+"\n")
|
|