new-words: a598e0d25784 new-words.py

new-words

view new-words.py @ 39:a598e0d25784

add_notes (add_marks) + remove_notes (remove_marks) implemented in python

author	Igor Chubin <igor@chub.in>
date	Sun Jan 23 14:25:52 2011 +0100 (2011-01-23)
parents	adbc809d3924
children	c3a50c0d2400

line source

1 #!/usr/bin/env python

2 # -*- coding: utf-8 -*-

4 import codecs

5 import logging

6 import os

7 import optparse

8 import re

9 import subprocess

10 import sys

11 import Stemmer

# Runtime settings shared by all the filters below.
config = {
    # Per-user directory holding the vocabulary and notes files.
    # expanduser() is used instead of os.environ['HOME'] so that a missing
    # HOME variable does not abort the script with a KeyError.
    'config_directory': os.path.join(os.path.expanduser('~'), '.new-words'),
    # Default language of the text; replaced by the -l/--language option.
    'language': 'en',
}

# Send DEBUG-level (and higher) log records to a fixed log file.
logging.basicConfig(
    level=logging.DEBUG,
    filename='/tmp/new-words-py.log',
)

class Normalizator:
    """Map a word to the canonical form shared by its whole word group.

    A word is first followed through ``linked_words`` (word => the word it
    should be treated as); the word reached that way is lower-cased and
    stemmed.
    """

    # Two-letter language code => name of the stemming algorithm.
    STEMMER_ALGORITHMS = {
        'de': 'german',
        'en': 'english',
        'ru': 'russian',
        'uk': 'ukrainian',
    }

    def __init__(self, language, linked_words=None):
        """Create a normalizer for ``language``.

        language     -- key of STEMMER_ALGORITHMS; any other value raises
                        KeyError
        linked_words -- optional dict: word => word it is linked to
        """
        self.stemmer = Stemmer.Stemmer(self.STEMMER_ALGORITHMS[language])
        # A fresh dict per instance: a literal {} default would be one object
        # shared by every instance created without the argument.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the stem of the lower-cased word that ``word`` links to."""
        visited = set()
        # Follow the chain of links; stop at an unlinked word or as soon as a
        # word shows up for the second time (the links form a cycle).
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

# Command-line interface of the script.
parser = optparse.OptionParser()

# One row per option: (short flag, long flag, action, dest, help text).
# The rows are registered in this order, which is also the --help order.
_OPTION_SPECS = (
    ("-a", "--no-marks", "store_true", "no_marks",
     "don't add marks (and don't save marks added by user)"),
    ("-c", "--compressed", "store_true", "compressed",
     "show compressed wordlist: one word per group"),
    # -k used to be stored in "compressed" as well, which made it
    # indistinguishable from -c; it now has a destination of its own.
    ("-k", "--known-words", "store_true", "known_words",
     "put higher words that are similar to the known words (only for English)"),
    ("-l", "--language", "store", "language",
     "specify language of text"),
    ("-f", "--function", "store", "function",
     "filter through subsystem [INTERNAL]"),
    ("-m", "--merge-tag", "store", "merge_tag",
     "merge words tagged with specified tag into the main vocabulary"),
    ("-M", "--merge-tagged", "store_true", "merge_tagged",
     "merge words tagged with ANY tag into the main vocabulary"),
    ("-n", "--non-interactive", "store_true", "non_interactive",
     "non-interactive mode (don't run vi)"),
    ("-N", "--no-filter", "store_true", "no_filter",
     "switch off known words filtering"),
    ("-p", "--pages", "store", "pages",
     "work with specified pages only (pages = start-stop/total )"),
    ("-r", "--remove-tag", "store", "remove_tag",
     "remove subvocabulary of specified tag"),
    ("-s", "--text-stats", "store_true", "text_stats",
     "show the text statistics (percentage of known words and so on) and exit"),
    ("-S", "--voc-stats", "store_true", "voc_stats",
     "show your vocabulary statistics (number of words and word groups)"),
    ("-t", "--tag", "store", "tag",
     "tag known words with tag"),
    # NOTE(review): this help text is identical to the one of -t and looks
    # copy-pasted -- confirm the intended wording for --show-tags.
    ("-T", "--show-tags", "store_true", "show_tags",
     "tag known words with tag"),
    ("-2", "--two-words", "store_true", "two_words",
     "find 2 words' sequences"),
    ("-3", "--three-words", "store_true", "three_words",
     "find 3 words' sequences"),
)

for (_short, _long, _action, _dest, _help) in _OPTION_SPECS:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)

141

def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file ``filename`` as a list.

    Line endings are kept on the returned lines.
    """
    with codecs.open(filename, "r", "utf-8") as stream:
        return stream.readlines()

148

def readlines_from_stdin():
    """Return all lines of standard input decoded as UTF-8 (endings kept)."""
    make_reader = codecs.getreader("utf-8")
    stdin_reader = make_reader(sys.stdin)
    return stdin_reader.readlines()

151

def words_from_line(line):
    """Split one line of text into the list of its words.

    A separator is a run of non-word characters that does not *begin* with
    an apostrophe, so a contraction such as "don't" stays one word.  The
    empty strings that split() produces when the line starts or ends with
    a separator are dropped, so they are never counted as a word.
    """
    separators = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)
    pieces = separators.split(line.rstrip('\n'))
    return [piece for piece in pieces if piece]

157

def get_words(lines):
    """
    Count the occurrences of every word found in ``lines``.

    Returns a dict:
    word => number of occurrences
    """
    counts = {}
    for text_line in lines:
        for token in words_from_line(text_line):
            counts[token] = counts.get(token, 0) + 1
    return counts

170

def load_vocabulary():
    """Return the word counts of the vocabulary file of the configured language.

    The file read is <config_directory>/<language>.txt.
    """
    vocabulary_path = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    return get_words(readlines_from_file(vocabulary_path))

173

def notes_filenames():
    """Return the list of notes files of the configured language.

    The list has a single entry: <config_directory>/notes-<language>.txt.
    """
    notes_path = "%s/notes-%s.txt" % (
        config['config_directory'], config['language'])
    return [notes_path]

176

def load_notes(files):
    """Read the notes stored in the UTF-8 files ``files``.

    A line of a notes file is "<word><whitespace><note>".

    Returns a dict of dicts: word => {filename => note}.
    Files that do not exist and blank lines are skipped; a line that holds
    only a word gives that word an empty note.
    """
    notes = {}
    for filename in files:
        if not os.path.exists(filename):
            # Nothing has been saved to this notes file yet.
            continue
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                if not line.strip():
                    continue
                # maxsplit=1: everything after the first whitespace run is
                # the note, inner whitespace included.
                fields = re.split(r'\s+', line, maxsplit=1)
                word = fields[0]
                note = fields[1] if len(fields) > 1 else ''
                notes.setdefault(word, {})[filename] = note
    return notes

186

def add_notes(lines, notes):
    """Append the saved note to every word line that has one.

    lines -- lines of the form "<count> <word>"; lines that start with '#'
             are passed through untouched
    notes -- dict of dicts: word => {notes filename => note}

    Returns a new list with exactly one output line per input line.  A line
    whose word has a note in the first notes file is padded to 30 columns
    and followed by that note; every other line is returned unchanged.
    """
    notes_filename = notes_filenames()[0]
    # Second whitespace-separated token of the line.  At least one whitespace
    # character is required between the two tokens, so a line made of a
    # single token is never split in the middle of that token.
    word_pattern = re.compile(r'^\s*\S+\s+(\S+)')
    result = []
    for line in lines:
        match_object = None
        if not line.startswith('#'):
            match_object = word_pattern.search(line)
        if match_object:
            word = match_object.group(1)
            if word in notes:
                logging.debug(word)
                logging.debug(line)
                if notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
                    logging.debug(line)
        result.append(line)
    return result

210

def remove_notes(lines, notes_group):
    """Strip the notes from ``lines`` and store them in the notes file.

    lines       -- lines to clean
    notes_group -- dict of dicts: word => {notes filename => note}

    A line made of leading whitespace, two whitespace-separated tokens,
    whitespace and a rest is cut after the second token; the rest becomes
    the note of that token.  Any other line is kept (with a single trailing
    newline).  The notes already known for the first notes file, updated
    with the ones found here, are passed to save_notes().

    Returns the list of cleaned lines.
    """
    notes_filename = notes_filenames()[0]

    # Start from what is already stored for this notes file.
    notes = dict(
        (known_word, per_file[notes_filename])
        for (known_word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        (indent, first_token, gap, word, _separator, note) = found.groups()
        result.append(indent + first_token + gap + word + "\n")
        notes[word] = note

    save_notes(notes_filename, notes)
    return result

236

def save_notes(filename, notes):
    """Merge ``notes`` (dict: word => note) into the notes file ``filename``.

    Lines of the existing file whose first token is a key of ``notes`` are
    rewritten as "<word padded to 29 columns> <note>"; all other lines are
    kept.  Words of ``notes`` that the file does not contain yet are
    appended.  The file is created when it does not exist.
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                # Only the leading word is needed; taking field 0 (instead of
                # unpacking two fields) also copes with blank lines and with
                # lines that hold nothing but a word.
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                elif not line.endswith('\n'):
                    # Unterminated last line: terminate it so that appended
                    # notes start on a line of their own.
                    line += '\n'
                lines.append(line)
    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)

254

255

def print_words_sorted(words_freq):
    """Write the words of ``words_freq`` to stdout, most frequent first.

    words_freq -- dict: word => number of occurrences

    Each output line is "<count right-aligned in 10 columns> <word>",
    encoded as UTF-8.
    """
    out = codecs.getwriter("utf-8")(sys.stdout)
    by_frequency = sorted(words_freq.keys(), key=words_freq.get, reverse=True)
    for word in by_frequency:
        out.write("%10s %s\n" % (words_freq[word], word))

259

def substract_dictionary(dict1, dict2):
    """
    Return dict1 - dict2: a new dict holding the items of dict1 whose key
    is not a key of dict2.
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)

269

def dump_words(words, filename):
    """Write the words of ``words`` to the UTF-8 file ``filename``.

    words -- dict: word => number of occurrences

    Every word is written on a line of its own, as many times as it
    occurs.  Previous contents of the file are replaced.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(("%s\n" % word) * count)

274

def error_message(text):
    """Print ``text`` followed by a newline to standard output."""
    # Parenthesised form: prints the same thing as the former print
    # statement on Python 2 and is also valid syntax on Python 3.
    print(text)

277

def find_wordgroups_weights(lines, normalizator):
    """Sum the counts of all the words that share a normalized form.

    lines        -- lines of the form "<count> <word>"
    normalizator -- object whose normalize(word) gives the group of a word

    Returns a dict: normalized word => sum of the counts of its lines.
    """
    weight = {}
    for raw_line in lines:
        stripped = re.sub(r'^\s*', '', raw_line.rstrip('\n'))
        (count, word) = re.split(r'\s+', stripped, maxsplit=1)
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + int(count)
    return weight

287

def find_linked_words(notes):
    """Collect the word links written in the notes.

    notes -- dict of dicts: word => {notes filename => note}

    A note links its word to another one with "@<other word>"; only the
    first "@" of a note is looked at, and it is ignored when nothing but
    whitespace (or the end of the note) follows it.

    Returns a dict: word => word it is linked to.
    """
    link_pattern = re.compile(r'\@(\S*)')
    linked_words = {}
    for (word, per_file) in notes.items():
        for note in per_file.values():
            found = link_pattern.search(note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words

299

300

def _three_way(left, right):
    """Return -1, 0 or 1 as ``left`` is less than, equal to or greater than ``right``."""
    # Same result as the Python 2 builtin cmp(), which Python 3 no longer has.
    return (left > right) - (left < right)


def _count_and_word(line):
    """Split a "<count> <word>" line into its (count, word) strings."""
    line = re.sub(r'^\s*', '', line.rstrip('\n'))
    (num, word) = re.split(r'\s+', line, 1)
    return (num, word)


def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
    """Three-way comparison of two "<count> <word>" lines.

    The lines are ordered by, in turn:
      1. the weight of the word's group (``wgw``: normalized word => weight),
      2. the normalized word itself,
      3. the count at the start of the line.

    normalizator -- object whose normalize(word) gives the group of a word
    linked_words -- not used; kept so that existing callers keep working

    Returns a negative number, zero or a positive number.
    """
    (num1, word1) = _count_and_word(line1)
    (num2, word2) = _count_and_word(line2)

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # "or" moves on to the next criterion only while the lines are still tied.
    return (
        _three_way(wgw[normalized_word1], wgw[normalized_word2])
        or _three_way(normalized_word1, normalized_word2)
        or _three_way(int(num1), int(num2)))

319

def filter_get_words(args):
    """Filter 'get_words': list the words of stdin missing from the vocabulary.

    All the words read from stdin are first dumped to the file ``args[0]``;
    the ones that are not in the vocabulary are then printed, most frequent
    first.
    """
    known_words = load_vocabulary()
    text_words = get_words(readlines_from_stdin())
    dump_words(text_words, args[0])
    print_words_sorted(substract_dictionary(text_words, known_words))

326

def filter_group_words(args):
    """Filter 'group_words': reorder the "<count> <word>" lines of stdin.

    The lines are written to stdout (UTF-8) in descending order of
    compare_word_lines(), using the word links found in the notes.
    ``args`` is not used.
    """
    lines = readlines_from_stdin()
    linked_words = find_linked_words(load_notes(notes_filenames()))
    normalizator = Normalizator(config['language'], linked_words)
    wgw = find_wordgroups_weights(lines, normalizator)

    def by_group(first, second):
        return compare_word_lines(first, second, wgw, normalizator, linked_words)

    out = codecs.getwriter("utf-8")(sys.stdout)
    for line in sorted(lines, cmp=by_group, reverse=True):
        out.write(line)

339

def filter_add_notes(args):
    """Filter 'add_notes': rewrite the file ``args[0]`` with the saved notes added."""
    target = args[0]
    annotated = add_notes(
        readlines_from_file(target),
        load_notes(notes_filenames()))
    with codecs.open(target, "w", "utf-8") as out:
        out.writelines(annotated)

347

def filter_remove_notes(args):
    """Filter 'remove_notes': rewrite the file ``args[0]`` with its notes removed.

    The removed notes are stored by remove_notes().
    """
    target = args[0]
    cleaned = remove_notes(
        readlines_from_file(target),
        load_notes(notes_filenames()))
    with codecs.open(target, "w", "utf-8") as out:
        out.writelines(cleaned)

355

# Entry point: parse the command line and run the requested filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Value of -f/--function => function implementing that filter.
    function_names = {
        'get_words': filter_get_words,
        'group_words': filter_group_words,
        'add_notes': filter_add_notes,
        'remove_notes': filter_remove_notes,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function,
            "".join([" " + x for x in sorted(function_names.keys())])))
        sys.exit(1)

373

374

375

376

377 #os.system("vim")

378