new-words
view new-words.py @ 47:d708e2c1bad8
compressed wordlist support
| author | Igor Chubin <igor@chub.in> | 
|---|---|
| date | Mon Feb 07 21:21:17 2011 +0200 (2011-02-07) | 
| parents | 5f90e44eecfc | 
| children | 7194bdb56475 | 
 line source
     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     4 from __future__ import with_statement
     5 import codecs
     6 import logging
     7 import os
     8 import optparse
     9 import re
    10 import subprocess
    11 import sys
    12 import Stemmer
    13 try:
    14     import psyco
    15     psyco.full()
    16 except:
    17     pass
    19 config = {
    20     'config_directory': os.environ['HOME'] + '/.new-words',
    21     'language': 'en',
    22 }
    24 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
    26 class Normalizator:
    27     def __init__(self, language, linked_words={}):
    28         stemmer_algorithm = {
    29             'de' : 'german',
    30             'en' : 'english',
    31             'ru' : 'russian',
    32             'uk' : 'ukrainian',
    33         }
    34         self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
    35         self.linked_words = linked_words
    37     def normalize(self, word):
    38         word_chain = []
    39         while word in self.linked_words and not word in word_chain:
    40             word_chain.append(word)
    41             word = self.linked_words[word]
    42         return self.stemmer.stemWord(word.lower())
    44     def best_word_from_group(self, wordpairs_group):
    45         """Returns the word that is the most relevant to the wordpairs_group.
    47         At the moment: returns the word with minimal length"""
    49         minimal_length = min(len(pair[1]) for pair in wordpairs_group)
    50         return list(x[1] for x in sorted(
    51             (x for x in wordpairs_group if len(x[1]) == minimal_length),
    52             key=lambda x:x[0],
    53             reverse=True))[0]
    55 parser = optparse.OptionParser()
    57 parser.add_option(
    58     "-a", "--no-marks",
    59     help="don't add marks (and don't save marks added by user)",
    60     action="store_true",
    61     dest="no_marks")
    63 parser.add_option(
    64     "-c", "--compressed",
    65     help="show compressed wordlist: one word per group",
    66     action="store_true",
    67     dest="compressed")
    69 parser.add_option(
    70     "-k", "--known-words",
    71     help="put higher words that are similar to the known words (only for English)",
    72     action="store_true",
    73     dest="compressed")
    75 parser.add_option(
    76     "-l", "--language",
    77     help="specify language of text",
    78     action="store",
    79     dest="language")
    81 parser.add_option(
    82     "-f", "--function",
    83     help="filter through subsystem [INTERNAL]",
    84     action="store",
    85     dest="function")
    87 parser.add_option(
    88     "-m", "--merge-tag",
    89     help="merge words tagged with specified tag into the main vocabulary",
    90     action="store",
    91     dest="merge_tag")
    93 parser.add_option(
    94     "-M", "--merge-tagged",
    95     help="merge words tagged with ANY tag into the main vocabulary",
    96     action="store_true",
    97     dest="merge_tagged")
    99 parser.add_option(
   100     "-n", "--non-interactive",
   101     help="non-interactive mode (don't run vi)",
   102     action="store_true",
   103     dest="non_interactive")
   105 parser.add_option(
   106     "-N", "--no-filter",
   107     help="switch off known words filtering",
   108     action="store_true",
   109     dest="no_filter")
   111 parser.add_option(
   112     "-p", "--pages",
   113     help="work with specified pages only (pages = start-stop/total )",
   114     action="store",
   115     dest="pages")
   117 parser.add_option(
   118     "-r", "--remove-tag",
   119     help="remove subvocabulary of specified tag",
   120     action="store",
   121     dest="remove_tag")
   123 parser.add_option(
   124     "-s", "--text-stats",
   125     help="show the text statistics (percentage of known words and so on) and exit",
   126     action="store_true",
   127     dest="text_stats")
   129 parser.add_option(
   130     "-S", "--voc-stats",
   131     help="show your vocabulary statistics (number of words and word groups)",
   132     action="store_true",
   133     dest="voc_stats")
   135 parser.add_option(
   136     "-t", "--tag",
   137     help="tag known words with tag",
   138     action="store",
   139     dest="tag")
   141 parser.add_option(
   142     "-T", "--show-tags",
   143     help="tag known words with tag",
   144     action="store_true",
   145     dest="show_tags")
   147 parser.add_option(
   148     "-2", "--two-words",
   149     help="find 2 words' sequences",
   150     action="store_true",
   151     dest="two_words")
   153 parser.add_option(
   154     "-3", "--three-words",
   155     help="find 3 words' sequences",
   156     action="store_true",
   157     dest="three_words")
   159 def readlines_from_file(filename):
   160     res = []
   161     with codecs.open(filename, "r", "utf-8") as f:
   162         for line in f.readlines():
   163             res += [line]
   164     return res
   166 def readlines_from_stdin():
   167     return codecs.getreader("utf-8")(sys.stdin).readlines()
   169 def words_from_line(line):
   170     line = line.rstrip('\n')
   171     #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
   172     #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
   173     return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
   175 def get_words(lines, group_by=[1]):
   176     """
   177     Returns hash of words in a file
   178     word => number
   179     """
   180     result = {}
   181     (a, b, c) = ("", "", "")
   182     for line in lines:
   183         words = words_from_line(line)
   184         for word in words:
   185             if re.match('[0-9]*$', word):
   186                 continue
   187             result.setdefault(word, 0)
   188             result[word] += 1
   189             if 2 in group_by and a != "" and b != "":
   190                 w = "%s_%s" % (a,b)
   191                 result.setdefault(w, 0)
   192                 result[w] += 1
   193             if 3 in group_by and not "" in [a,b,c]:
   194                 w = "%s_%s_%s" % (a,b,c)
   195                 result.setdefault(w, 0)
   196                 result[w] += 1
   197             (a,b,c) = (b, c, word)
   199     logging.debug(result)
   200     return result
   202 def load_vocabulary():
   203     return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
   205 def notes_filenames():
   206     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
   208 def load_notes(files):
   209     notes = {}
   210     for filename in files:
   211         with codecs.open(filename, "r", "utf-8") as f:
   212             for line in f.readlines():
   213                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   214                 notes.setdefault(word, {})
   215                 notes[word][filename] = note
   216     return notes
   218 def add_notes(lines, notes):
   219     notes_filename = notes_filenames()[0]
   220     result = []
   221     for line in lines:
   222         if line.startswith('#'):
   223             result += [line]
   224         else:
   225             match_object = re.search('^\s*\S+\s*(\S+)', line)
   226             if match_object:
   227                 word = match_object.group(1)
   228                 if word in notes:
   229                     if notes_filename in notes[word]:
   230                         line = line.rstrip('\n')
   231                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
   232                         result += [line]
   233                 else:
   234                     result += [line]
   235             else:
   236                 result += [line]
   237     return result
   239 def remove_notes(lines, notes_group):
   240     notes_filename = notes_filenames()[0]
   241     notes = {}
   242     for k in notes_group.keys():
   243         if notes_filename in notes_group[k]:
   244             notes[k] = notes_group[k][notes_filename]
   246     result = []
   247     for line in lines:
   248         line = line.rstrip('\n')
   249         match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
   250         if match_object:
   251             result.append("".join([
   252                 match_object.group(1),
   253                 match_object.group(2),
   254                 match_object.group(3),
   255                 match_object.group(4),
   256                 "\n"
   257                 ]))
   258             notes[match_object.group(4)] = match_object.group(6)
   259         else:
   260             result.append(line+"\n")
   262     save_notes(notes_filename, notes)
   263     return result
   265 def save_notes(filename, notes):
   266     lines = []
   267     saved_words = []
   268     with codecs.open(filename, "r", "utf-8") as f:
   269         for line in f.readlines():
   270             (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   271             if word in notes:
   272                 line = "%-29s %s\n" % (word, notes[word])
   273                 saved_words.append(word)
   274             lines.append(line)
   275     for word in [x for x in notes.keys() if not x in saved_words]:
   276         line = "%-29s %s\n" % (word, notes[word])
   277         lines.append(line)
   279     with codecs.open(filename, "w", "utf-8") as f:
   280         for line in lines:
   281             f.write(line)
   284 def substract_dictionary(dict1, dict2):
   285     """
   286     returns dict1 - dict2
   287     """
   288     result = {}
   289     for (k,v) in dict1.items():
   290         if not k in dict2:
   291             result[k] = v
   292     return result
   294 def dump_words(words, filename):
   295     with codecs.open(filename, "w+", "utf-8") as f:
   296         for word in words.keys():
   297             f.write(("%s\n"%word)*words[word])
   299 def error_message(text):
   300     print text
   302 def find_wordgroups_weights(word_pairs, normalizator):
   303     weight = {}
   304     for (num, word) in word_pairs:
   305         normalized = normalizator.normalize(word)
   306         weight.setdefault(normalized, 0)
   307         weight[normalized] += num
   308     return weight
   310 def find_linked_words(notes):
   311     linked_words = {}
   312     for word in notes.keys():
   313         for note in notes[word].values():
   314             if "@" in note:
   315                 result = re.search(r'\@(\S*)', note)
   316                 if result:
   317                     main_word = result.group(1)
   318                     if main_word:
   319                         linked_words[word] = main_word
   320     return linked_words
   322 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
   323     (num1, word1) = pair1
   324     (num2, word2) = pair2
   326     normalized_word1 = normalizator.normalize(word1)
   327     normalized_word2 = normalizator.normalize(word2)
   329     cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
   330     if cmp_res != 0:
   331         return cmp_res
   332     else:
   333         cmp_res = cmp(normalized_word1, normalized_word2)
   334         if cmp_res != 0:
   335             return cmp_res
   336         else:
   337             return cmp(int(num1), int(num2))
def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
    """Write the word list and/or statistics to stdout as UTF-8.

    word_pairs          -- pre-sorted sequence of (frequency, word) tuples
    stats               -- dict with 'language', 'percentage',
                           'percentage_unknown', 'total_known', 'total',
                           'groups', 'words', 'wps', 'uwps'
    normalizator        -- Normalizator used to group words by stem
    print_stats         -- print a one-line '#' stats header first
    stats_only          -- print only the statistics table, then return
    compressed_wordlist -- print one representative word per stem group
                           (with the group's summed frequency) instead of
                           every word
    """
    if stats_only:
        # Fixed-width statistics table: header row, then one data row.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
                ]]) + "\n")
        # NOTE(review): the last four format strings below are adjacent
        # literals (no commas), so they concatenate into a single element;
        # the -11d width matches the header's 10 chars + 1 joining space,
        # which keeps the columns aligned -- presumably intentional,
        # confirm before "fixing" by adding commas.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return

    if print_stats:
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Percentage milestones at which a "# <level>" marker line is emitted:
    # every 5% from just above the already-known percentage up to 90, then
    # every 1% from 90 to 101 (Python 2 list concatenation of two ranges).
    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    # State for stem-group accumulation: a group ends when the normalized
    # form of the current word differs from the previous one.
    old_normalized_word = None
    words_of_this_group = []
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A stem group just ended: in compressed mode emit one line
            # for the whole group (summed frequency, representative word).
            #codecs.getwriter("utf-8")(sys.stdout).write(
            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
            compressed_word_pair = (
                sum(x[0] for x in words_of_this_group),
                normalizator.best_word_from_group(words_of_this_group)
                )
            if compressed_wordlist:
                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
            words_of_this_group = []
            # NOTE(review): the final group is never flushed after the
            # loop, so in compressed mode the last stem group of the
            # input appears to be dropped -- confirm.

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)

        # Track cumulative known words and print a "# N" marker whenever
        # a milestone percentage is crossed (consuming passed milestones).
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
   404 def filter_add_notes(args):
   405     lines = readlines_from_file(args[0])
   406     notes = load_notes(notes_filenames())
   407     lines = add_notes(lines, notes)
   408     with codecs.open(args[0], "w", "utf-8") as f:
   409         for line in lines:
   410             f.write(line)
   412 def filter_remove_notes(args):
   413     lines = readlines_from_file(args[0])
   414     notes = load_notes(notes_filenames())
   415     lines = remove_notes(lines, notes)
   416     with codecs.open(args[0], "w", "utf-8") as f:
   417         for line in lines:
   418             f.write(line)
   420 def filter_get_words_group_words_add_stat(args):
   421     vocabulary = load_vocabulary()
   422     notes = load_notes(notes_filenames())
   423     lines = readlines_from_stdin()
   424     group_by = [1]
   425     if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
   426         group_by.append(2)
   427     if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
   428         group_by.append(3)
   429     words = get_words(lines, group_by)
   430     stats_only = False
   431     if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
   432         stats_only = True
   434     compressed_wordlist = False
   435     if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
   436         compressed_wordlist = True
   439     stats = {}
   440     stats['total'] = sum(words[x] for x in words.keys())
   441     if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
   442         words = substract_dictionary(words, vocabulary)
   444     stats['total_unknown'] = sum(words[x] for x in words.keys())
   445     stats['total_known'] = stats['total'] - stats['total_unknown']
   446     stats['percentage'] = 100.0*stats['total_known']/stats['total']
   447     stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
   448     stats['groups'] = 0
   449     stats['words'] = len(words)
   450     stats['sentences'] = 0  #FIXME
   451     stats['wps'] = 0        #FIXME
   452     stats['uwps'] = 0       #FIXME
   453     stats['language'] = config['language']
   455     linked_words = find_linked_words(notes)
   456     normalizator = Normalizator(config['language'], linked_words)
   458     words_with_freq = []
   459     for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
   460         words_with_freq.append((words[k], k))
   462     wgw = find_wordgroups_weights(words_with_freq, normalizator)
   463     if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
   464         words_with_freq = sorted(
   465                 words_with_freq,
   466                 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
   467                 reverse=True)
   469     print_words_sorted(
   470         words_with_freq,
   471         stats,
   472         normalizator,
   473         stats_only=stats_only,
   474         compressed_wordlist=compressed_wordlist
   475         )
   477 (options, args) = parser.parse_args()
   478 if options.language:
   479     config['language'] = options.language
   481 if options.function:
   482     function_names = {
   483         'add_notes' :   filter_add_notes,
   484         'remove_notes': filter_remove_notes,
   485         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
   486     }
   487     if options.function in function_names:
   488         function_names[options.function](args)
   489     else:
   490         error_message("Unkown function %s.\nAvailable functions:\n%s" % (
   491             options.function, "".join(["   "+x for x in sorted(function_names.keys())])))
   492         sys.exit(1)
   497 #os.system("vim")
