new-words: 7194bdb56475 new-words.py

new-words

view new-words.py @ 48:7194bdb56475

new feature: -r and -R can specify number of words (or percentage) to show

author	Igor Chubin <igor@chub.in>
date	Tue Feb 08 20:35:38 2011 +0200 (2011-02-08)
parents	d708e2c1bad8
children	00286f6bfa85

line source

1 #!/usr/bin/env python

2 # -*- coding: utf-8 -*-

4 from __future__ import with_statement

5 import codecs

6 import logging

7 import os

8 import optparse

9 import re

10 import subprocess

11 import sys

12 import Stemmer

# psyco is an optional accelerator: use it when it is installed and run on
# the plain interpreter otherwise.  Only a failed import is tolerated, so
# that KeyboardInterrupt/SystemExit are not swallowed here.
try:
    import psyco
except ImportError:
    pass
else:
    psyco.full()

# Settings shared by the functions below.  'language' is replaced by the
# value of the -l/--language option when one is given.
config = dict(
    config_directory=os.environ['HOME'] + '/.new-words',
    language='en',
)

# Log everything from DEBUG upwards to a fixed file.
logging.basicConfig(level=logging.DEBUG, filename='/tmp/new-words-py.log')

class Normalizator:
    """Reduces words to a canonical form so that related words can be grouped.

    A word is first replaced by the word it is linked to (links are followed
    transitively), then lower-cased and stemmed with Stemmer.Stemmer.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizator for language.

        language     -- one of 'de', 'en', 'ru', 'uk'; any other value raises
                        KeyError.
        linked_words -- optional dict mapping a word to the word it should
                        be treated as.  Defaults to no links.
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'ru' : 'russian',
            'uk' : 'ukrainian',
            }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a mutable default argument would be
        # shared by every Normalizator created without explicit links.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of word after resolving its links.

        Links are followed until a word without a link is reached; a word
        that was already visited stops the walk, so cyclic links terminate.
        """
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Returns the word that is the most relevant to the wordpairs_group.

        wordpairs_group is a sequence of (count, word) pairs.
        At the moment: returns the shortest word; among words of the same
        minimal length the one with the highest count wins, and the earliest
        of those on a tie.  An empty group raises ValueError.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() returns the first maximal item, which is exactly what a
        # stable descending sort followed by [0] would pick.
        return max(shortest, key=lambda pair: pair[0])[1]

# Command line interface.  Within this file only --language and --function
# are read; the remaining options are just declared here.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# Stored in its own attribute: it used to share dest="compressed" with -c,
# so passing -k silently switched on the compressed word list.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# NOTE(review): this help text is identical to the one of -t/--tag and looks
# copy-pasted; confirm what --show-tags does and reword it.
parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")

158

def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file filename as a list.

    Line endings are kept.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())

165

def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()

168

# A separator is a run of non-word characters that does not start with an
# apostrophe, so "don't" stays in one piece.  '_' is a word character and
# therefore never separates words.  Compiled once, not once per line.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split line into words.

    Trailing newlines are removed first.  The result may contain empty
    strings when the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))

174

def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    lines    -- iterable of text lines.
    group_by -- collection of sequence lengths to count in addition to the
                single words: 2 adds every pair of consecutive words as
                "first_second", 3 adds every triple as "first_second_third".

    Empty strings and tokens consisting only of digits are skipped; they
    neither get counted nor break a word sequence.  Sequences continue
    across line boundaries.
    """
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    is_skipped = re.compile('[0-9]*$').match

    result = {}
    # The two most recently counted words; "" means "none yet".
    (before_last, last) = ("", "")
    for line in lines:
        for word in words_from_line(line):
            if is_skipped(word):
                continue
            result[word] = result.get(word, 0) + 1
            # Sequences end at the current word.  Building them only from
            # earlier words left the final pairs/triples of the text
            # uncounted.
            if count_pairs and last != "":
                w = "%s_%s" % (last, word)
                result[w] = result.get(w, 0) + 1
            if count_triples and before_last != "":
                w = "%s_%s_%s" % (before_last, last, word)
                result[w] = result.get(w, 0) + 1
            (before_last, last) = (last, word)

    logging.debug(result)
    return result

201

def load_vocabulary():
    """Return the word counts of the vocabulary file of the configured language.

    The file is <config_directory>/<language>.txt.
    """
    vocabulary_path = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    return get_words(readlines_from_file(vocabulary_path))

204

def notes_filenames():
    """Return the list of notes files of the configured language.

    Currently a single file: <config_directory>/notes-<language>.txt.
    """
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]

207

def load_notes(files):
    """Read the notes stored in files.

    Every line is expected to look like "<word> <note>"; the note is
    whatever follows the first run of whitespace.  Lines without a note
    (blank lines, a lone word) are skipped instead of raising ValueError.

    Returns a dict: word -> {filename -> note}.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) != 2:
                    continue
                (word, note) = fields
                notes.setdefault(word, {})[filename] = note
    return notes

217

def add_notes(lines, notes):
    """Append the stored note to every word line that has one.

    lines -- lines of a word list; the word is the second
             whitespace-separated column.  Lines starting with '#' are
             passed through untouched.
    notes -- dict word -> {filename -> note}; only notes of the first notes
             file are used.

    Returns a new list of lines.
    """
    notes_filename = notes_filenames()[0]
    # \s+ between the columns: with \s* the regex could split a lone token
    # in two and look up a note for its last character.
    second_column = re.compile(r'^\s*\S+\s+(\S+)')

    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = second_column.search(line)
            if match_object:
                word_notes = notes.get(match_object.group(1), {})
                if notes_filename in word_notes:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), word_notes[notes_filename])
        result.append(line)
    return result

238

def remove_notes(lines, notes_group):
    """Strip the notes from lines and save them to the first notes file.

    lines       -- lines of a word list; a line with leading whitespace,
                   two columns and further text is cut after the second
                   column, and the remaining text becomes the note of the
                   word in the second column.
    notes_group -- dict word -> {filename -> note} with the notes known so
                   far; those of the first notes file are kept unless a
                   line provides a new note for the same word.

    Returns the lines without notes, each terminated by a newline.
    """
    notes_filename = notes_filenames()[0]
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            result.append(text + "\n")
            continue
        result.append("".join(found.group(1, 2, 3, 4)) + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(notes_filename, notes)
    return result

264

def save_notes(filename, notes):
    """Write notes (dict word -> note) to the notes file filename.

    Lines of the existing file keep their position: a line whose first
    column is a word from notes is rewritten with the new note, every other
    line (including blank or note-less ones) is kept as it is.  Words that
    are not in the file yet are appended.  A file that does not exist yet
    is created.
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                # Only the first column matters here; taking [0] instead of
                # unpacking two fields keeps note-less lines from raising.
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                elif not line.endswith('\n'):
                    # Last line without a newline: terminate it so that an
                    # appended entry does not end up on the same line.
                    line += '\n'
                lines.append(line)

    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)

282

283

def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2

    That is a new dict with the items of dict1 whose key is not a key of
    dict2; the values of dict2 are ignored.
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)

293

def dump_words(words, filename):
    """Write the words of the dict words (word -> count) to filename.

    Every word is written on a line of its own, repeated count times; the
    file is UTF-8 encoded and overwritten if it exists.
    """
    with codecs.open(filename, "w+", "utf-8") as out:
        for (word, count) in words.items():
            out.write(count * ("%s\n" % word))

298

def error_message(text):
    """Print text on standard output."""
    # The parenthesised form prints the same line under Python 2 and 3.
    print(text)

301

def find_wordgroups_weights(word_pairs, normalizator):
    """Sum up the counts of all words that share a normalized form.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) names the group of a word.

    Returns a dict: normalized word -> total count of its group.
    """
    weight = {}
    for pair in word_pairs:
        (count, word) = pair
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + count
    return weight

309

def find_linked_words(notes):
    """Collect the word links written as "@target" inside notes.

    notes -- dict word -> {filename -> note}.

    Returns a dict word -> target for every word with a note containing
    '@' directly followed by at least one non-whitespace character; only
    the first such reference of a note is used.
    """
    reference = re.compile(r'\@(\S*)')
    linked_words = {}
    for (word, notes_by_file) in notes.items():
        for note in notes_by_file.values():
            # No '@' in the note means no match at all.
            found = reference.search(note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words

321

def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs.

    The pairs are ordered by the weight of their word group (looked up in
    wgw by normalized word), then by the normalized word itself, then by
    their own count converted to int.  linked_words is not used.

    Returns -1, 0 or 1.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Tuples compare element by element, which gives the three levels of
    # the ordering in one go.
    rank1 = (wgw[normalized_word1], normalized_word1, int(num1))
    rank2 = (wgw[normalized_word2], normalized_word2, int(num2))
    return (rank1 > rank2) - (rank1 < rank2)

338

339

340 def print_words_sorted(

341 word_pairs,

342 stats,

343 normalizator,

344 print_stats=True,

345 stats_only=False,

346 compressed_wordlist=False,

347 show_range=0,

348 show_range_percentage=0,

349 ):

350 if stats_only:

351 codecs.getwriter("utf-8")(sys.stdout).write(

352 " ".join([

353 "%-10s" % x for x in [

354 "LANG",

355 "KNOWN%",

356 "UNKNOWN%",

357 "KNOWN",

358 "TOTAL",

359 "WPS",

360 "UWPS*10"

361 ]]) + "\n")

362 codecs.getwriter("utf-8")(sys.stdout).write(

363 " ".join([

364 "%(language)-10s",

365 "%(percentage)-10.2f",

366 "%(percentage_unknown)-10.2f",

367 "%(total_known)-11d"

368 "%(total)-11d"

369 "%(wps)-11d"

370 "%(uwps)-11d"

371 ]) % stats + "\n")

372 return

373

374 if print_stats:

375 codecs.getwriter("utf-8")(sys.stdout).write(

376 "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

377

378 level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)

379 known = int(stats['total_known'])

380 total = int(stats['total'])

381 current_level = 0

382 old_normalized_word = None

383 words_of_this_group = []

384 printed_words = 0

385 for word_pair in word_pairs:

386

387 normalized_word = normalizator.normalize(word_pair[1])

388 if old_normalized_word and old_normalized_word != normalized_word:

389 #codecs.getwriter("utf-8")(sys.stdout).write(

390 # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))

391 compressed_word_pair = (

392 sum(x[0] for x in words_of_this_group),

393 normalizator.best_word_from_group(words_of_this_group)

394 )

395 if compressed_wordlist:

396 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)

397 printed_words += 1

398 words_of_this_group = []

399

400 old_normalized_word = normalized_word

401 words_of_this_group.append(word_pair)

402

403 if not compressed_wordlist:

404 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)

405 printed_words += 1

406

407

408 known += word_pair[0]

409 if 100.0*known/total >= level_lines[0]:

410 current_level = level_lines[0]

411 while 100.0*known/total > level_lines[0]:

412 current_level = level_lines[0]

413 level_lines = level_lines[1:]

414 codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)

415

416 if show_range >0 and printed_words >= show_range:

417 break

418 if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:

419 break

420

def filter_add_notes(args):
    """Add the stored notes to the word list file args[0], in place."""
    target = args[0]
    original_lines = readlines_from_file(target)
    stored_notes = load_notes(notes_filenames())
    annotated = add_notes(original_lines, stored_notes)
    with codecs.open(target, "w", "utf-8") as out:
        out.write("".join(annotated))

428

def filter_remove_notes(args):
    """Strip the notes from the word list file args[0], in place.

    The notes found in the file are saved by remove_notes().
    """
    target = args[0]
    annotated_lines = readlines_from_file(target)
    stored_notes = load_notes(notes_filenames())
    stripped = remove_notes(annotated_lines, stored_notes)
    with codecs.open(target, "w", "utf-8") as out:
        out.write("".join(stripped))

436

def _env_is_yes(name):
    """Tell whether the environment variable name is set to 'YES'."""
    return os.environ.get(name) == 'YES'


def _env_int(name):
    """Return the environment variable name as int; 0 if unset or empty."""
    value = os.environ.get(name, '')
    if value == '':
        return 0
    return int(value)


def filter_get_words_group_words_add_stat(args):
    """Read a text from standard input and print its word list.

    args is not used.  The behaviour is controlled by environment variables:

    GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE -- 'YES' also counts sequences
        of two / three words.
    FILTER_WORDS        -- 'YES' drops the words found in the vocabulary.
    WORDS_GROUPING      -- 'YES' orders the words by word group.
    STAT_ONLY           -- 'YES' prints only the statistics.
    COMPRESSED_WORDLIST -- 'YES' prints one line per word group.
    SHOW_RANGE, SHOW_RANGE_PERCENTAGE -- integer limits handed on to
        print_words_sorted().
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if _env_is_yes('GROUP_WORDS_BY_TWO'):
        group_by.append(2)
    if _env_is_yes('GROUP_WORDS_BY_THREE'):
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = _env_is_yes('STAT_ONLY')
    compressed_wordlist = _env_is_yes('COMPRESSED_WORDLIST')
    show_range = _env_int('SHOW_RANGE')
    show_range_percentage = _env_int('SHOW_RANGE_PERCENTAGE')

    stats = {}
    stats['total'] = sum(words.values())
    if _env_is_yes('FILTER_WORDS'):
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0 - stats['percentage']
    else:
        # Empty input: report zeros instead of dividing by zero.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0        #FIXME
    stats['uwps'] = 0       #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # Most frequent words first.
    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if _env_is_yes('WORDS_GROUPING'):
        # Same ordering as compare_word_pairs() (group weight, normalized
        # word, own count), expressed as a sort key: each word is
        # normalized once instead of once per comparison, and no cmp=
        # argument is needed.
        def group_order(pair):
            normalized = normalizator.normalize(pair[1])
            return (wgw[normalized], normalized, int(pair[0]))

        words_with_freq = sorted(
            words_with_freq, key=group_order, reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )

507

# Script entry: parse the command line and run the requested filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Filters selectable with -f/--function; each one receives the
    # positional arguments.
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
        }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)

524

525

526

527

528 #os.system("vim")

529