new-words

view new-words.py @ 38:adbc809d3924

Transition to Python started

new-words-py.sh is a wrapper around the new-words.py version,
which is not finished yet.
author Igor Chubin <igor@chub.in>
date Sat Jan 22 23:42:31 2011 +0100 (2011-01-22)
parents be6336e98b3c
children a598e0d25784
line source
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
import codecs
import functools
import logging
import optparse
import os
import re
import subprocess
import sys

import Stemmer
# Runtime configuration shared by all filters in this script.
config = {
    # os.path.expanduser('~') equals $HOME on POSIX but, unlike
    # os.environ['HOME'], does not raise KeyError when HOME is unset
    # (e.g. cron jobs, stripped-down environments, Windows).
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

# Debug log goes to a fixed temp file; the script's stdout is reserved
# for the filtered word lists.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
class Normalizator:
    """Reduce words to a canonical group key.

    A word is first followed through the user-supplied ``linked_words``
    chain (word -> main word mappings extracted from notes), then
    lowercased and stemmed with PyStemmer for the configured language.
    """

    def __init__(self, language, linked_words=None):
        """language: one of 'de', 'en', 'ru', 'uk' (KeyError otherwise).

        linked_words: optional mapping word -> linked main word.
        The original code used a mutable default argument ({}), which is
        shared across calls; ``None`` plus a fallback avoids that pitfall.
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        self.linked_words = linked_words if linked_words is not None else {}

    def normalize(self, word):
        """Return the stemmed, lowercased end of the linked-words chain.

        word_chain guards against cycles in linked_words: the walk stops
        as soon as a word repeats.
        """
        word_chain = []
        while word in self.linked_words and word not in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
# Command-line interface. Each option mirrors the shell version of the
# tool; -f/--function selects an internal filter subcommand.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    # BUG FIX: this previously said dest="compressed" (copy-paste from -c),
    # so -k silently toggled the compressed flag instead of its own.
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    # BUG FIX: help text was a copy-paste of -t's; -T lists tags,
    # it does not tag anything.
    help="show tags that are in use",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    """Return all lines of a UTF-8 text file, newline characters preserved."""
    with codecs.open(filename, "r", "utf-8") as stream:
        return stream.readlines()
def readlines_from_stdin():
    """Read all of standard input, decoded as UTF-8, as a list of lines."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
# Compiled once at import time instead of on every call.
# Splits on runs of non-word characters, but refuses to start a separator
# at an apostrophe, so "don't" stays one token.  Raw string fixes the
# invalid "\W" escape warning of the original non-raw literal.
_WORD_SPLIT_RE = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)

def words_from_line(line):
    """Split one text line into word tokens.

    The trailing newline is stripped first; empty strings may appear at
    the edges when the line starts/ends with separators (unchanged from
    the original behavior).
    """
    line = line.rstrip('\n')
    return _WORD_SPLIT_RE.split(line)
def get_words(lines):
    """Return a mapping word -> number of occurrences over all *lines*."""
    counts = {}
    for text_line in lines:
        for token in words_from_line(text_line):
            counts[token] = counts.get(token, 0) + 1
    return counts
def load_vocabulary():
    """Load the known-words vocabulary file for the configured language."""
    vocabulary_file = "%s/%s.txt" % (config['config_directory'], config['language'])
    return get_words(readlines_from_file(vocabulary_file))
def notes_filenames():
    """Return the list of notes files for the configured language."""
    notes_file = "%s/notes-%s.txt" % (config['config_directory'], config['language'])
    return [notes_file]
def load_notes(files):
    """Load per-word notes from each file in *files*.

    Each line has the form "<word> <whitespace> <note text>".
    Returns a nested dict: word -> {filename -> note}.

    NOTE(review): a line with no whitespace (or a blank line) makes
    re.split return a single element and raises ValueError on unpacking —
    unchanged from the original; callers are assumed to keep notes files
    well-formed.
    """
    notes = {}
    for filename in files:
        # codecs.open(..., "utf-8") for consistency with the other readers
        # in this file — plain open() returned undecoded bytes on Python 2.
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f:
                (word, note) = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
def print_words_sorted(words_freq):
    """Write "count word" lines to stdout as UTF-8, most frequent first."""
    out = codecs.getwriter("utf-8")(sys.stdout)
    for word in sorted(words_freq, key=words_freq.get, reverse=True):
        out.write("%10s %s\n" % (words_freq[word], word))
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2
    """
    # NOTE: the misspelled public name is kept for callers' sake.
    return {key: value for (key, value) in dict1.items() if key not in dict2}
def dump_words(words, filename):
    """Write each word to *filename* (UTF-8), repeated once per occurrence."""
    with codecs.open(filename, "w+", "utf-8") as out:
        for word, count in words.items():
            out.write(("%s\n" % word) * count)
def error_message(text):
    """Show an error message to the user (on stdout, as the original did)."""
    # print(text) with a single argument behaves identically on Python 2
    # and 3; the bare "print text" statement is a SyntaxError on 3.x.
    print(text)
def find_wordgroups_weights(lines, normalizator):
    """Sum per-group frequencies over "count word" lines.

    Each line looks like "  <num> <word>"; the word is reduced to its
    group key via normalizator.normalize and the counts of all words in
    the same group are added up.

    Returns: dict normalized-word -> total count.
    Raises ValueError on malformed lines (no whitespace / non-integer
    count) — unchanged from the original.
    """
    weight = {}
    for raw_line in lines:
        # Raw strings fix the invalid escape sequence warnings of the
        # original '^\s*' / '\s+' literals; the patterns are unchanged.
        stripped = re.sub(r'^\s*', '', raw_line.rstrip('\n'))
        (num, word) = re.split(r'\s+', stripped, maxsplit=1)
        group_key = normalizator.normalize(word)
        weight[group_key] = weight.get(group_key, 0) + int(num)
    return weight
def find_linked_words(notes):
    """Extract word links from notes: a note containing "@main" links the
    noted word to "main".  Returns dict word -> main word."""
    linked_words = {}
    for word, notes_per_file in notes.items():
        for note in notes_per_file.values():
            if "@" not in note:
                continue
            logging.debug("%s %s" % (word, note))
            match = re.search(r'\@(\S*)', note)
            if match:
                main_word = match.group(1)
                logging.debug("%s %s" % (word, main_word))
                # "@" followed by whitespace yields an empty group: skip it.
                if main_word:
                    linked_words[word] = main_word
    return linked_words
def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
    """Three-level comparator for "count word" lines, returning -1/0/1.

    Order of comparison:
      1. total weight of each word's group (wgw, from find_wordgroups_weights)
      2. the normalized word itself
      3. the raw per-word count

    linked_words is unused here (normalization is already baked into
    *normalizator*) but kept in the signature for the existing callers.
    """
    def _cmp(a, b):
        # Python 3 removed the cmp() builtin the original relied on;
        # (a > b) - (a < b) is the documented equivalent.
        return (a > b) - (a < b)

    line1 = re.sub(r'^\s*', '', line1.rstrip('\n'))
    (num1, word1) = re.split(r'\s+', line1, maxsplit=1)
    line2 = re.sub(r'^\s*', '', line2.rstrip('\n'))
    (num2, word2) = re.split(r'\s+', line2, maxsplit=1)

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    result = _cmp(wgw[normalized_word1], wgw[normalized_word2])
    if result == 0:
        result = _cmp(normalized_word1, normalized_word2)
    if result == 0:
        result = _cmp(int(num1), int(num2))
    return result
def filter_get_words(args):
    """Filter mode for -f get_words: count the words on stdin, dump the
    raw counts to args[0], then print the words not yet in the vocabulary,
    most frequent first."""
    known = load_vocabulary()
    counts = get_words(readlines_from_stdin())
    dump_words(counts, args[0])
    unknown = substract_dictionary(counts, known)
    print_words_sorted(unknown)
def filter_group_words(args):
    """Filter mode for -f group_words: read "count word" lines from stdin
    and reprint them sorted by word-group weight (heaviest group first),
    using links found in the notes files.

    *args* is unused but kept so the dispatch table can call every filter
    the same way.
    """
    lines = readlines_from_stdin()
    notes = load_notes(notes_filenames())
    linked_words = find_linked_words(notes)
    logging.debug(linked_words)
    normalizator = Normalizator(config['language'], linked_words)

    wgw = find_wordgroups_weights(lines, normalizator)
    # functools.cmp_to_key works on Python 2.7+ and 3.x, unlike the
    # original sorted(cmp=...), which Python 3 removed.
    for line in sorted(
            lines,
            key=functools.cmp_to_key(
                lambda x, y: compare_word_lines(x, y, wgw, normalizator, linked_words)),
            reverse=True):
        codecs.getwriter("utf-8")(sys.stdout).write(line)
# ---------------------------------------------------------------------------
# Entry point: parse the command line and dispatch to the selected filter.
# ---------------------------------------------------------------------------
(options, args) = parser.parse_args()

if options.language:
    config['language'] = options.language

if options.function:
    # Dispatch table mapping -f/--function names to filter implementations.
    function_names = {
        'get_words': filter_get_words,
        'group_words': filter_group_words,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        # Fixed typo in the user-facing message ("Unkown" -> "Unknown").
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function,
            "".join(" " + name for name in sorted(function_names.keys()))))
        sys.exit(1)
293 #os.system("vim")