new-words

view new-words.py @ 40:c3a50c0d2400

Functions for adding/removing notes + statistics now implemented in Python.

Option -O (old-style) is no longer supported. If you need old-style new-words, use new-words.sh.
author Igor Chubin <igor@chub.in>
date Sun Jan 23 17:09:44 2011 +0100 (2011-01-23)
parents a598e0d25784
children 4629e08b0d87
line source
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement
import codecs
import logging
import os
import optparse
import re
import subprocess
import sys
import Stemmer

config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

class Normalizator:
    def __init__(self, language, linked_words={}):
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'ru' : 'russian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        self.linked_words = linked_words

    def normalize(self, word):
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
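# Example (hypothetical data): with linked_words = {u'went': u'go'},
# Normalizator('en', {u'went': u'go'}).normalize(u'went') follows the
# link chain to u'go' before stemming; word_chain guards against cycles
# in the linked_words mapping.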
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="rank words that are similar to already known words higher (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of the text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with the specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total)",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of the specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with the specified tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    help="show all tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find two-word sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find three-word sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    res = []
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            res += [line]
    return res

def readlines_from_stdin():
    return codecs.getreader("utf-8")(sys.stdin).readlines()

def words_from_line(line):
    line = line.rstrip('\n')
    #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
    #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)

def get_words(lines):
    """
    Return a dict mapping each word in the lines to its number of occurrences.
    """
    result = {}
    for line in lines:
        words = words_from_line(line)
        for word in words:
            result.setdefault(word, 0)
            result[word] += 1
    return result
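# Example: get_words([u'a a b']) returns {u'a': 2, u'b': 1}; an empty-string
# entry can also appear when a line starts or ends with a non-word character,
# since re.split() keeps the empty leading/trailing field.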
def load_vocabulary():
    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))

def notes_filenames():
    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]

def load_notes(files):
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
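# A notes file holds one entry per line: the word, whitespace, then a
# free-form note; a note may link a word to its main word with "@", e.g.
# (hypothetical content):
#   gehen     to go
#   ging      @gehen
# load_notes() returns {word: {filename: note}}, keyed by file so notes
# from several files can coexist.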
def add_notes(lines, notes):
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if line.startswith('#'):
            result += [line]
        else:
            match_object = re.search('^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes:
                    logging.debug(word)
                    logging.debug(line)
                    if notes_filename in notes[word]:
                        line = line.rstrip('\n')
                        line = "%-30s %s\n" % (line, notes[word][notes_filename])
                        logging.debug(line)
                    result += [line]
                else:
                    result += [line]
            else:
                result += [line]
    return result
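# add_notes() expects wordlist lines of the form "<count> <word>" (as
# produced by print_words_sorted below); if the word has a note in the
# current notes file, the line is padded to 30 characters and the note
# is appended.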
def remove_notes(lines, notes_group):
    notes_filename = notes_filenames()[0]
    notes = {}
    for k in notes_group.keys():
        if notes_filename in notes_group[k]:
            notes[k] = notes_group[k][notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            result.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n"
                ]))
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line+"\n")

    save_notes(notes_filename, notes)
    return result
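# remove_notes() is the inverse of add_notes(): it strips the note column
# from lines of the form "<count> <word> <note>", keeps only
# "<count> <word>", and writes the (possibly user-edited) notes back to
# the notes file via save_notes().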
def save_notes(filename, notes):
    lines = []
    saved_words = []
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.append(word)
            lines.append(line)
    for word in [x for x in notes.keys() if not x in saved_words]:
        line = "%-29s %s\n" % (word, notes[word])
        lines.append(line)

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2
    """
    result = {}
    for (k,v) in dict1.items():
        if not k in dict2:
            result[k] = v
    return result
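# Example: substract_dictionary({u'a': 1, u'b': 2}, {u'b': 0}) returns
# {u'a': 1} -- only keys absent from dict2 survive.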
def dump_words(words, filename):
    with codecs.open(filename, "w+", "utf-8") as f:
        for word in words.keys():
            f.write(("%s\n"%word)*words[word])
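# dump_words() writes each word on its own line, repeated once per
# occurrence, so the result can be fed back through get_words().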
def error_message(text):
    print text
def find_wordgroups_weights(word_pairs, normalizator):
    weight = {}
    for (num, word) in word_pairs:
        normalized = normalizator.normalize(word)
        weight.setdefault(normalized, 0)
        weight[normalized] += num
    return weight
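# The weight of a word group is the summed frequency of all words that
# normalize to the same stem; it drives the primary sort order below.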
def find_linked_words(notes):
    linked_words = {}
    for word in notes.keys():
        for note in notes[word].values():
            if "@" in note:
                result = re.search(r'\@(\S*)', note)
                if result:
                    main_word = result.group(1)
                    if main_word:
                        linked_words[word] = main_word
    return linked_words
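# A note such as u'@gehen' marks its word as a form of the main word
# u'gehen'; the resulting {word: main_word} mapping is what
# Normalizator.normalize() follows before stemming.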
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
    if cmp_res != 0:
        return cmp_res
    else:
        cmp_res = cmp(normalized_word1, normalized_word2)
        if cmp_res != 0:
            return cmp_res
        else:
            return cmp(int(num1), int(num2))
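# Pairs are compared by group weight first, then by normalized form, then
# by their own frequency; sorted with reverse=True this lists the heaviest
# word groups first and keeps all members of a group adjacent, with the
# most frequent member on top.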
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    if stats_only:
        codecs.getwriter("utf-8")(sys.stdout).write("stat_only")
        return

    if print_stats:
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
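# level_lines holds coverage thresholds: 5%-steps from just above the
# current known-word percentage up to 90%, then 1%-steps to 101%. A "# N"
# marker line is printed whenever learning the words listed so far would
# push text coverage past threshold N.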
def filter_add_notes(args):
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = add_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)

def filter_remove_notes(args):
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = remove_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)
def filter_get_words_group_words_add_stat(args):
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    words = get_words(lines)

    stats = {}
    stats['total'] = sum(words[x] for x in words.keys())
    words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = "%7.2f"%(100.0*stats['total_known']/stats['total'])
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    word_pairs = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        word_pairs.append((words[k], k))

    wgw = find_wordgroups_weights(word_pairs, normalizator)
    word_pairs = sorted(
        word_pairs,
        cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
        reverse=True)

    print_words_sorted(word_pairs, stats)
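# This is the main filter: it reads text from stdin, subtracts the known
# vocabulary, computes the statistics header, and prints the remaining
# words grouped by stem. A hypothetical invocation through the -f
# interface:
#   new-words.py -l en -f get_words_group_words_add_stat < text.txt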
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)

#os.system("vim")