new-words: 00286f6bfa85 new-words.py

new-words

view new-words.py @ 49:00286f6bfa85

experimental: when -c specified, use dictionary for compression

author	Igor Chubin <igor@chub.in>
date	Wed Feb 09 21:08:23 2011 +0200 (2011-02-09)
parents	7194bdb56475
children	4e931db74618

line source

1 #!/usr/bin/env python

2 # -*- coding: utf-8 -*-

4 from __future__ import with_statement

5 import codecs

6 import difflib

7 import logging

8 import os

9 import optparse

10 import re

11 import subprocess

12 import sys

13 import Stemmer

# psyco is an optional JIT accelerator: use it when installed, run without it
# otherwise.  Only a missing module is tolerated, so unrelated failures are
# not hidden the way a bare "except:" would hide them.
try:
    import psyco
except ImportError:
    pass
else:
    psyco.full()

# Run-time settings shared by the functions of this script.
config = {
    # Directory holding the per-language vocabulary and notes files.
    # expanduser() honours $HOME and still works when it is not set.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    # Language code; replaced by the value of -l/--language when given.
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

class Normalizator:
    """Map words to a group key and pick a display label for each group.

    A word is first redirected through ``linked_words`` (word -> main word);
    the result is lower-cased and stemmed.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer for ``language``.

        language     -- one of 'de', 'en', 'ru', 'uk'; any other value
                        raises KeyError.
        linked_words -- optional dict mapping a word to its main word.
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'ru' : 'russian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A ``{}`` default argument would be one dict shared by all instances.
        self.linked_words = {} if linked_words is None else linked_words

    def normalize(self, word):
        """Return the stem of ``word`` after following its links.

        Links are followed until a word without a link is reached or a word
        repeats, so cyclic links terminate.
        """
        visited = []
        while word in self.linked_words and word not in visited:
            visited.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return a label for a non-empty group of ``(count, word)`` pairs.

        The candidate is the most frequent of the shortest words.  If the
        dictionary returns exactly one suggestion for it, the candidate is
        returned unchanged.  Otherwise candidates ending in 'et'/'t' are also
        looked up with the ending replaced by 'en', and such a form is
        returned when it has exactly one suggestion.  In all other cases the
        result is the candidate followed by the suggestions, most similar
        first, or the candidate plus "_" when no suggestion is left.
        """

        def similarity(x, y):
            return difflib.SequenceMatcher(None, x.lower(), y.lower()).ratio()

        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() keeps the first pair among equal counts, like a stable
        # descending sort followed by [0].
        best_match = max(shortest, key=lambda pair: pair[0])[1]

        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        corrected_best_match = best_match
        if best_match.endswith('et'):
            word = best_match[:-1] + "n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-2]

        # A word ending in 'et' also takes this branch, so its final
        # corrected form is the word without the trailing 't'.
        if best_match.endswith('t'):
            word = best_match[:-1] + "en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]

        # For a lower-case candidate keep only lower-case suggestions.
        # Slices are used instead of [0] so empty strings cannot raise.
        if corrected_best_match[:1].lower() == corrected_best_match[:1]:
            suggestions = [x for x in suggestions if x[:1].lower() == x[:1]]

        if not suggestions:
            return best_match + "_"
        ranked = sorted(
            suggestions,
            key=lambda x: similarity(x, corrected_best_match),
            reverse=True)
        return best_match + " " + " ".join(ranked)

    def dictionary_suggestions(self, word):
        """Return the output lines of ``de-variants word`` as unicode strings.

        Raises OSError when the ``de-variants`` program cannot be started.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        try:
            lines = process.stdout.readlines()
        finally:
            # Close the pipe and reap the child so neither leaks.
            process.stdout.close()
            process.wait()
        return [x.decode('utf-8').rstrip('\n') for x in lines]

113

114

parser = optparse.OptionParser()

# One row per command-line option, in the order shown by --help:
# (short flag, long flag, help text, optparse action, destination attribute).
_OPTION_SPECS = [
    ("-a", "--no-marks",
     "don't add marks (and don't save marks added by user)",
     "store_true", "no_marks"),
    ("-c", "--compressed",
     "show compressed wordlist: one word per group",
     "store_true", "compressed"),
    # This option used to store into "compressed", overwriting -c.
    ("-k", "--known-words",
     "put higher words that are similar to the known words (only for English)",
     "store_true", "known_words"),
    ("-l", "--language",
     "specify language of text",
     "store", "language"),
    ("-f", "--function",
     "filter through subsystem [INTERNAL]",
     "store", "function"),
    ("-m", "--merge-tag",
     "merge words tagged with specified tag into the main vocabulary",
     "store", "merge_tag"),
    ("-M", "--merge-tagged",
     "merge words tagged with ANY tag into the main vocabulary",
     "store_true", "merge_tagged"),
    ("-n", "--non-interactive",
     "non-interactive mode (don't run vi)",
     "store_true", "non_interactive"),
    ("-N", "--no-filter",
     "switch off known words filtering",
     "store_true", "no_filter"),
    ("-p", "--pages",
     "work with specified pages only (pages = start-stop/total )",
     "store", "pages"),
    ("-d", "--delete-tag",
     "delete subvocabulary of specified tag",
     "store", "delete_tag"),
    ("-s", "--text-stats",
     "show the text statistics (percentage of known words and so on) and exit",
     "store_true", "text_stats"),
    ("-S", "--voc-stats",
     "show your vocabulary statistics (number of words and word groups)",
     "store_true", "voc_stats"),
    ("-t", "--tag",
     "tag known words with tag",
     "store", "tag"),
    # NOTE(review): this help text duplicates the one of -t/--tag; confirm
    # the intended wording for --show-tags.
    ("-T", "--show-tags",
     "tag known words with tag",
     "store_true", "show_tags"),
    ("-2", "--two-words",
     "find 2 words' sequences",
     "store_true", "two_words"),
    ("-3", "--three-words",
     "find 3 words' sequences",
     "store_true", "three_words"),
]

for (_short, _long, _help, _action, _dest) in _OPTION_SPECS:
    parser.add_option(_short, _long, help=_help, action=_action, dest=_dest)

218

def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file ``filename`` as a list.

    Line terminators are kept.  Raises IOError when the file cannot be
    opened.
    """
    with codecs.open(filename, "r", "utf-8") as f:
        return f.readlines()

225

def readlines_from_stdin():
    """Read standard input to its end, decoded as UTF-8, and return its lines."""
    utf8_reader = codecs.getreader("utf-8")
    stdin_stream = utf8_reader(sys.stdin)
    return stdin_stream.readlines()

228

# A run of non-word characters that does not *start* with an apostrophe, so
# "don't" stays one word.  ("_" is a word character and never splits.)
_WORD_SEPARATOR_RE = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)

# Tokens that are not counted: the empty string and pure digit strings.
_DIGITS_OR_EMPTY_RE = re.compile('[0-9]*$')


def words_from_line(line):
    """Split ``line`` into words on runs of non-word characters.

    Trailing newlines are removed first.  The result may contain empty
    strings, e.g. when the line ends with punctuation.
    """
    return _WORD_SEPARATOR_RE.split(line.rstrip('\n'))


def get_words(lines, group_by=None):
    """
    Returns hash of words in the given lines
    word => number

    Single words are always counted.  When ``group_by`` (default ``[1]``)
    contains 2 and/or 3, every sequence of two and/or three consecutive
    words is counted as well, under the key "w1_w2" / "w1_w2_w3".
    Sequences continue across line boundaries; empty and purely numeric
    tokens are skipped and do not interrupt a sequence.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The two most recently counted words; "" means "not seen yet".
    before_previous = previous = ""
    for line in lines:
        for word in words_from_line(line):
            if _DIGITS_OR_EMPTY_RE.match(word):
                continue
            result[word] = result.get(word, 0) + 1
            # Each sequence is counted when its last word is read, so the
            # sequences ending at the final words of the text are included.
            if count_pairs and previous != "":
                w = "%s_%s" % (previous, word)
                result[w] = result.get(w, 0) + 1
            if count_triples and before_previous != "":
                w = "%s_%s_%s" % (before_previous, previous, word)
                result[w] = result.get(w, 0) + 1
            before_previous, previous = previous, word

    logging.debug(result)
    return result

261

def load_vocabulary():
    """Return the word counts of the vocabulary file of the configured language.

    The file is "<config_directory>/<language>.txt".
    """
    vocabulary_path = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    vocabulary_lines = readlines_from_file(vocabulary_path)
    return get_words(vocabulary_lines)

264

def notes_filenames():
    """Return a one-element list with the notes file of the configured language.

    The file is "<config_directory>/notes-<language>.txt".
    """
    notes_path = "%s/notes-%s.txt" % (
        config['config_directory'], config['language'])
    return [notes_path]

267

def load_notes(files):
    """Read notes files and return ``{word: {filename: note}}``.

    Each line holds a word, whitespace, and the note (the rest of the line).
    Blank lines are skipped and a word without a note gets the note "";
    previously both raised ValueError.  Raises IOError when a file cannot
    be opened.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) == 1:
                    if not fields[0]:
                        # Blank line: nothing to record.
                        continue
                    # A bare word, e.g. after an editor stripped the
                    # trailing whitespace of an empty note.
                    fields.append('')
                (word, note) = fields
                notes.setdefault(word, {})[filename] = note
    return notes

277

def add_notes(lines, notes):
    """Return ``lines`` with the stored note appended where one exists.

    Lines starting with '#' are returned unchanged.  For other lines the
    second group of non-blank characters is taken as the word; if ``notes``
    has an entry for it from the notes file of the configured language, the
    line is left-justified to 30 columns and followed by that note.
    """
    notes_filename = notes_filenames()[0]
    word_pattern = re.compile(r'^\s*\S+\s*(\S+)')

    def annotate(line):
        if line.startswith('#'):
            return line
        found = word_pattern.search(line)
        if not found:
            return line
        word = found.group(1)
        if word not in notes or notes_filename not in notes[word]:
            return line
        return "%-30s %s\n" % (line.rstrip('\n'), notes[word][notes_filename])

    return [annotate(line) for line in lines]

298

def remove_notes(lines, notes_group):
    """Strip the notes from ``lines``, save them, and return the bare lines.

    A line of the form <blanks><field><blanks><word><blanks><note> is cut
    after <word> and its note is recorded for that word, replacing a note
    taken from ``notes_group``.  Other lines are returned as they are.  All
    returned lines end with a single newline.  The collected notes are
    written to the notes file of the configured language.
    """
    notes_filename = notes_filenames()[0]

    # Start from the notes already known for this notes file.
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found:
            notes[found.group(4)] = found.group(6)
            # Groups 1-4 are contiguous from the start of the line.
            text = text[:found.end(4)]
        result.append(text + "\n")

    save_notes(notes_filename, notes)
    return result

324

def save_notes(filename, notes):
    """Merge ``notes`` (word -> note) into the notes file ``filename``.

    Lines whose first field is a word in ``notes`` are rewritten with the
    new note; all other lines, including blank ones and bare words, are
    kept.  Words not yet in the file are appended.  The file must already
    exist; IOError is raised otherwise.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the first field matters here, so lines without a note
            # no longer raise on unpacking.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            elif not line.endswith('\n'):
                # Terminate an unterminated last line so that appended
                # entries do not end up on the same line.
                line += '\n'
            lines.append(line)

    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)

342

343

def substract_dictionary(dict1, dict2):
    """
    Return a new dict with the items of dict1 whose keys are not in dict2.
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)

353

def dump_words(words, filename):
    """Write every word of ``words`` to ``filename``, one per line.

    ``words`` maps a word to its count; the word is written count times.
    The file is written as UTF-8 and any previous content is replaced.
    """
    with codecs.open(filename, "w+", "utf-8") as f:
        for (word, count) in words.items():
            repeated = ("%s\n" % word) * count
            f.write(repeated)

358

def error_message(text):
    """Print ``text`` followed by a newline on standard output."""
    # With a single argument the parenthesised form behaves the same as the
    # print statement on Python 2 and is also valid on Python 3.
    print(text)

361

def find_wordgroups_weights(word_pairs, normalizator):
    """Return ``{normalized word: summed count}`` for ``(count, word)`` pairs.

    Words with the same ``normalizator.normalize()`` result form one group
    and their counts are added up.
    """
    weight = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        weight[group] = weight.get(group, 0) + count
    return weight

369

def find_linked_words(notes):
    """Return ``{word: main word}`` for notes that contain an "@main" link.

    ``notes`` maps a word to a dict of notes.  Only the first "@" of a note
    is examined; the non-blank characters directly after it name the main
    word, and an "@" followed by nothing links nothing.
    """
    linked_words = {}
    for (word, per_file_notes) in notes.items():
        for note in per_file_notes.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            if not found:
                continue
            main_word = found.group(1)
            if main_word:
                linked_words[word] = main_word
    return linked_words

381

def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two ``(count, word)`` pairs.

    Pairs are ordered by the weight of their word group (``wgw`` maps a
    normalized word to its weight), then by the normalized word, then by
    the count.  Returns -1, 0 or 1.  ``linked_words`` is accepted for
    interface compatibility and is not used.
    """

    def three_way(left, right):
        # Same result as Python 2's cmp(), without needing that builtin.
        return (left > right) - (left < right)

    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # "or" moves on to the next criterion only while the result is 0.
    return (
        three_way(wgw[normalized_word1], wgw[normalized_word2])
        or three_way(normalized_word1, normalized_word2)
        or three_way(int(num1), int(num2)))

398

399

400 def print_words_sorted(

401 word_pairs,

402 stats,

403 normalizator,

404 print_stats=True,

405 stats_only=False,

406 compressed_wordlist=False,

407 show_range=0,

408 show_range_percentage=0,

409 ):

410 if stats_only:

411 codecs.getwriter("utf-8")(sys.stdout).write(

412 " ".join([

413 "%-10s" % x for x in [

414 "LANG",

415 "KNOWN%",

416 "UNKNOWN%",

417 "KNOWN",

418 "TOTAL",

419 "WPS",

420 "UWPS*10"

421 ]]) + "\n")

422 codecs.getwriter("utf-8")(sys.stdout).write(

423 " ".join([

424 "%(language)-10s",

425 "%(percentage)-10.2f",

426 "%(percentage_unknown)-10.2f",

427 "%(total_known)-11d"

428 "%(total)-11d"

429 "%(wps)-11d"

430 "%(uwps)-11d"

431 ]) % stats + "\n")

432 return

433

434 if print_stats:

435 codecs.getwriter("utf-8")(sys.stdout).write(

436 "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

437

438 level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)

439 known = int(stats['total_known'])

440 total = int(stats['total'])

441 current_level = 0

442 old_normalized_word = None

443 words_of_this_group = []

444 printed_words = 0

445 for word_pair in word_pairs:

446

447 normalized_word = normalizator.normalize(word_pair[1])

448 if old_normalized_word and old_normalized_word != normalized_word:

449 #codecs.getwriter("utf-8")(sys.stdout).write(

450 # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))

451 if compressed_wordlist:

452 compressed_word_pair = (

453 sum(x[0] for x in words_of_this_group),

454 normalizator.best_word_from_group(words_of_this_group)

455 )

456 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)

457 printed_words += 1

458 words_of_this_group = []

459

460 old_normalized_word = normalized_word

461 words_of_this_group.append(word_pair)

462

463 if not compressed_wordlist:

464 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)

465 printed_words += 1

466

467

468 known += word_pair[0]

469 if 100.0*known/total >= level_lines[0]:

470 current_level = level_lines[0]

471 while 100.0*known/total > level_lines[0]:

472 current_level = level_lines[0]

473 level_lines = level_lines[1:]

474 codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)

475

476 if show_range >0 and printed_words >= show_range:

477 break

478 if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:

479 break

480

def filter_add_notes(args):
    """Rewrite the file ``args[0]`` in place with the stored notes added."""
    target = args[0]
    original_lines = readlines_from_file(target)
    stored_notes = load_notes(notes_filenames())
    annotated_lines = add_notes(original_lines, stored_notes)
    with codecs.open(target, "w", "utf-8") as f:
        f.writelines(annotated_lines)

488

def filter_remove_notes(args):
    """Rewrite the file ``args[0]`` in place with the notes stripped.

    The stripped notes are saved by ``remove_notes``.
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    stored_notes = load_notes(notes_filenames())
    bare_lines = remove_notes(original_lines, stored_notes)
    with codecs.open(target, "w", "utf-8") as f:
        f.writelines(bare_lines)

496

def filter_get_words_group_words_add_stat(args):
    """Count the words read from stdin and print them with statistics.

    ``args`` is not used.  The behaviour is controlled by environment
    variables; a switch is on when its value is exactly 'YES':

    GROUP_WORDS_BY_TWO / GROUP_WORDS_BY_THREE -- also count 2/3-word sequences
    STAT_ONLY             -- print only the statistics table
    COMPRESSED_WORDLIST   -- print one line per word group
    FILTER_WORDS          -- drop words found in the vocabulary
    WORDS_GROUPING        -- sort so that words of one group are adjacent
    SHOW_RANGE            -- integer, passed on as ``show_range``
    SHOW_RANGE_PERCENTAGE -- integer, passed on as ``show_range_percentage``
    """

    def env_flag(name):
        return os.environ.get(name) == 'YES'

    def env_int(name):
        # Unset or empty means 0; anything else must parse as an integer.
        value = os.environ.get(name, '')
        if value == '':
            return 0
        return int(value)

    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if env_flag('GROUP_WORDS_BY_TWO'):
        group_by.append(2)
    if env_flag('GROUP_WORDS_BY_THREE'):
        group_by.append(3)
    words = get_words(lines, group_by)

    stats_only = env_flag('STAT_ONLY')
    compressed_wordlist = env_flag('COMPRESSED_WORDLIST')
    show_range = env_int('SHOW_RANGE')
    show_range_percentage = env_int('SHOW_RANGE_PERCENTAGE')

    stats = {}
    stats['total'] = sum(words.values())
    if env_flag('FILTER_WORDS'):
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0 - stats['percentage']
    else:
        # Empty input: report 0% for both instead of dividing by zero.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    # NOTE(review): 'groups' is always reported as 0; confirm whether it
    # should be the number of word groups.
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['wps'] = 0 #FIXME
    stats['uwps'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # (count, word) pairs, most frequent first; the sort is stable.
    words_with_freq = [
        (count, word)
        for (word, count) in sorted(
            words.items(), key=lambda item: item[1], reverse=True)]

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if env_flag('WORDS_GROUPING'):
        def group_sort_key(pair):
            # Same criteria, in the same order, as compare_word_pairs():
            # group weight, normalized word, count.  A key function works
            # on Python 2 and 3 and normalizes each word only once.
            (num, word) = pair
            normalized = normalizator.normalize(word)
            return (wgw[normalized], normalized, int(num))

        words_with_freq = sorted(
            words_with_freq,
            key=group_sort_key,
            reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )

567

# Script entry: parse the command line and run the requested internal filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Names accepted by -f/--function and the functions they run.
    function_names = {
        'add_notes' : filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    handler = function_names.get(options.function)
    if handler is None:
        # The message used to read "Unkown function".
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function,
            "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
    handler(args)

584

585

586

587

588 #os.system("vim")

589