new-words

view new-words.py @ 53:f583256b7ab1

-p key support in new-words.py
author Igor Chubin <igor@chub.in>
date Mon Oct 31 20:21:20 2011 +0200 (2011-10-31)
parents 74e05d4436ee
children e25de9ea9184
line source
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement
import codecs
import difflib
import logging
import os
import optparse
import re
import subprocess
import sys
import Stemmer
try:
    # psyco is an optional Python 2 JIT accelerator; run without it when
    # it is not installed.
    # BUG FIX: was a bare `except:` which also swallowed KeyboardInterrupt
    # and any error raised by psyco.full() itself.
    import psyco
    psyco.full()
except ImportError:
    pass

# Global configuration; per-user data lives under ~/.new-words.
config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

# Debug log goes to a fixed temp file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
class Normalizator:
    """Normalizes words for grouping: follows manual word links
    (the '@mainword' markers from the notes files) and then lower-cases
    and stems the word with a Snowball stemmer."""

    def __init__(self, language, linked_words=None):
        """language: ISO code ('de', 'en', ...); linked_words: word -> main word."""
        # Map ISO language codes to Snowball stemmer algorithm names.
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'es': 'spanish',
            'ru': 'russian',
            'it': 'italian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # BUG FIX: was a mutable default argument (linked_words={}), i.e. a
        # single dict shared by every instance created without the argument.
        self.linked_words = linked_words if linked_words is not None else {}

    def normalize(self, word):
        """Follow word links (cycle-safe), then stem the lower-cased word."""
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Returns the word that is the most relevant to the wordpairs_group.

        At the moment: returns the shortest word; among equally short ones,
        the most frequent (highest pair[0])."""
        def f(x, y):
            # Similarity of two words, used only by the disabled code below.
            return difflib.SequenceMatcher(
                None,
                x.lower(),
                y.lower()).ratio()

        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        best_match = list(x[1] for x in sorted(
            (x for x in wordpairs_group if len(x[1]) == minimal_length),
            key=lambda x: x[0],
            reverse=True))[0]

        return best_match

        # NOTE(review): everything below is unreachable because of the early
        # return above — apparently a disabled German dictionary-based
        # heuristic ('et'/'t' verb endings). Kept for reference.
        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        verb = False
        corrected_best_match = best_match
        if best_match[-2:] == 'et':
            word = best_match[:-1] + "n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = word
            corrected_best_match = best_match[:-2]
            verb = True

        if best_match[-1] == 't':
            word = best_match[:-1] + "en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]
            verb = True

        if corrected_best_match[0].lower() == corrected_best_match[0]:
            suggestions = [x for x in suggestions
                           if x[0].lower() == x[0]]

        if suggestions == []:
            return best_match + "_"
        return best_match + " " + (" ".join(
            sorted(
                suggestions,
                key=lambda x: f(x, corrected_best_match),
                reverse=True
            )
        ))

    def dictionary_suggestions(self, word):
        """Ask the external 'de-variants' helper for dictionary variants of word."""
        return [
            x.decode('utf-8').rstrip('\n')
            for x
            in subprocess.Popen(
                ["de-variants", word],
                stdout=subprocess.PIPE
            ).stdout.readlines()]
# Command-line interface of the script (also used in INTERNAL filter mode,
# see the -f option and the dispatch at the bottom of the file).
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    # BUG FIX: dest was copy-pasted as "compressed", so -k silently behaved
    # like -c and the known-words feature could never be detected.
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

parser.add_option(
    "-T", "--show-tags",
    # BUG FIX: help text was copy-pasted from -t; -T lists tags, it does
    # not tag anything.
    help="show tags used for known words",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
def readlines_from_file(filename):
    """Return every line of *filename* decoded as UTF-8 (newlines kept)."""
    with codecs.open(filename, "r", "utf-8") as input_file:
        return input_file.readlines()
def readlines_from_stdin():
    """Read all lines from standard input, decoding them as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    return utf8_reader(sys.stdin).readlines()
def words_from_line(line):
    """Split one text line into word tokens.

    Splits on runs of non-word characters, but a run starting with an
    apostrophe or underscore does not split (so "don't" stays one token).
    May yield empty tokens at the line edges."""
    stripped = line.rstrip('\n')
    separator = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)
    return separator.split(stripped)
def get_words(lines, group_by=(1,)):
    """
    Returns hash of words in a file
    word => number

    group_by: iterable of n-gram sizes to count. 1 counts single words;
    2 and 3 additionally count two-/three-word sequences joined by '_'.
    BUG FIX: default was a mutable list ([1]); a tuple is safe and
    behaves identically since group_by is only tested with `in`.
    """
    result = {}
    (a, b, c) = ("", "", "")
    for line in lines:
        words = words_from_line(line)
        for word in words:
            # Skip empty tokens and pure numbers.
            if re.match('[0-9]*$', word):
                continue
            result.setdefault(word, 0)
            result[word] += 1
            # NOTE(review): the n-grams below are built from the words
            # *preceding* the current one (a, b, c lag one shift behind),
            # so the last sequences of a text are never counted — confirm
            # whether that lag is intended.
            if 2 in group_by and a != "" and b != "":
                w = "%s_%s" % (a, b)
                result.setdefault(w, 0)
                result[w] += 1
            if 3 in group_by and not "" in [a, b, c]:
                w = "%s_%s_%s" % (a, b, c)
                result.setdefault(w, 0)
                result[w] += 1
            (a, b, c) = (b, c, word)

    logging.debug(result)
    return result
def load_vocabulary():
    """Load the known-words vocabulary for the current language as a
    word -> count dictionary."""
    vocabulary_path = "%s/%s.txt" % (config['config_directory'], config['language'])
    return get_words(readlines_from_file(vocabulary_path))
def notes_filenames():
    """Return the list of notes files for the current language
    (currently always a single file)."""
    filename = "%s/notes-%s.txt" % (config['config_directory'], config['language'])
    return [filename]
def load_notes(files):
    """Read notes files ('word<whitespace>note' per line) into a
    word -> {filename: note} mapping."""
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as note_file:
            for raw_line in note_file:
                (word, note) = re.split(r'\s+', raw_line.rstrip('\n'), maxsplit=1)
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes
def add_notes(lines, notes):
    """Append the stored note (from the first notes file) to each
    vocabulary line whose second column has one; '#' lines pass through."""
    notes_filename = notes_filenames()[0]
    annotated = []
    for line in lines:
        if line.startswith('#'):
            annotated.append(line)
            continue
        # The word is the second whitespace-separated column of the line.
        match_object = re.search('^\s*\S+\s*(\S+)', line)
        if not match_object:
            annotated.append(line)
            continue
        word = match_object.group(1)
        if word in notes and notes_filename in notes[word]:
            line = "%-30s %s\n" % (line.rstrip('\n'), notes[word][notes_filename])
        annotated.append(line)
    return annotated
def remove_notes(lines, notes_group):
    """Strip the trailing notes column from vocabulary lines, collect the
    stripped notes, and persist them with save_notes.  Returns the
    stripped lines (each with a trailing newline)."""
    notes_filename = notes_filenames()[0]

    # Start from the notes already known for this notes file.
    notes = {}
    for word in notes_group.keys():
        if notes_filename in notes_group[word]:
            notes[word] = notes_group[word][notes_filename]

    stripped_lines = []
    for line in lines:
        line = line.rstrip('\n')
        # Columns: indent, count, space, word, space, note-text.
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            stripped_lines.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n",
            ]))
            notes[match_object.group(4)] = match_object.group(6)
        else:
            stripped_lines.append(line + "\n")

    save_notes(notes_filename, notes)
    return stripped_lines
def save_notes(filename, notes):
    """Rewrite the notes file *filename*, replacing the note of every word
    present in *notes* and appending entries for new words."""
    updated_lines = []
    seen_words = []
    # First pass: update notes for words already present in the file.
    with codecs.open(filename, "r", "utf-8") as note_file:
        for line in note_file:
            (word, old_note) = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                seen_words.append(word)
            updated_lines.append(line)
    # Second pass: append words that were not in the file yet.
    for word in notes.keys():
        if word not in seen_words:
            updated_lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as note_file:
        for line in updated_lines:
            note_file.write(line)
def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2 (the entries of dict1 whose keys are not in dict2)
    """
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)
def dump_words(words, filename):
    """Write each word to *filename* (UTF-8), repeated once per its count,
    one occurrence per line."""
    with codecs.open(filename, "w+", "utf-8") as output_file:
        for word, count in words.items():
            output_file.write(("%s\n" % word) * count)
def error_message(text):
    # Print an error message to stdout (Python 2 print statement).
    # NOTE(review): arguably this should go to stderr — confirm callers.
    print text
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum the frequencies of all words that normalize to the same stem.

    word_pairs: iterable of (count, word); returns stem -> total count."""
    weight = {}
    for (num, word) in word_pairs:
        stem = normalizator.normalize(word)
        weight[stem] = weight.get(stem, 0) + num
    return weight
def find_linked_words(notes):
    """Extract manual word links from notes: a note containing '@mainword'
    maps its word to mainword.  Returns word -> main word."""
    linked_words = {}
    for (word, per_file_notes) in notes.items():
        for note in per_file_notes.values():
            if "@" not in note:
                continue
            result = re.search(r'\@(\S*)', note)
            # Only record a link when something actually follows the '@'.
            if result and result.group(1):
                linked_words[word] = result.group(1)
    return linked_words
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-level comparison of (count, word) pairs for sorting grouped
    wordlists: by word-group weight, then by normalized form, then by
    count.  Returns -1/0/1 like Python 2's cmp().  linked_words is
    currently unused (kept for interface compatibility)."""
    def _cmp(x, y):
        # Equivalent to Python 2's built-in cmp() for comparable values.
        return (x > y) - (x < y)

    (num1, word1) = pair1
    (num2, word2) = pair2
    normalized1 = normalizator.normalize(word1)
    normalized2 = normalizator.normalize(word2)

    by_group_weight = _cmp(wgw[normalized1], wgw[normalized2])
    if by_group_weight != 0:
        return by_group_weight
    by_normalized = _cmp(normalized1, normalized2)
    if by_normalized != 0:
        return by_normalized
    return _cmp(int(num1), int(num2))
def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Print the (count, word) pairs as a UTF-8 wordlist to stdout.

    word_pairs must already be sorted; consecutive pairs with the same
    normalized form are treated as one group.  stats_only prints a
    one-line statistics table and returns; compressed_wordlist prints one
    best word per group; show_range / show_range_percentage stop the
    listing early.  Python 2 only (list-returning range, integer '/').
    """
    if stats_only:
        # Header row of the statistics table.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        # Data row.  NOTE(review): the last four format strings have no
        # commas between them, so they concatenate into a single element;
        # the %-11d field widths keep the columns aligned anyway — confirm
        # this is intentional before "fixing" it.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
            ]) % stats + "\n")
        return

    if print_stats:
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Coverage thresholds at which a "# <level>" marker line is printed:
    # every 5% from just above the starting percentage up to 90%, then
    # every 1% up to 101.  Relies on Python 2 semantics: integer '/' and
    # range() returning a list (the '+' concatenates the two lists).
    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: flush the previous group.
            #codecs.getwriter("utf-8")(sys.stdout).write(
            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
            if compressed_wordlist:
                # One line per group: summed count + representative word.
                compressed_word_pair = (
                    sum(x[0] for x in words_of_this_group),
                    normalizator.best_word_from_group(words_of_this_group)
                )
                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
            printed_words += 1

        # Track cumulative coverage and emit "# <level>" marker lines when
        # a threshold is crossed.  NOTE(review): indexes level_lines[0]
        # without an emptiness check — could raise IndexError if coverage
        # exceeds the last threshold; confirm inputs make that impossible.
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)

        # Early-stop conditions.
        if show_range >0 and printed_words >= show_range:
            break
        if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
            break
def filter_add_notes(args):
    """INTERNAL filter: rewrite the file args[0] in place, appending the
    stored notes to its vocabulary lines."""
    filename = args[0]
    annotated = add_notes(
        readlines_from_file(filename),
        load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as output_file:
        for line in annotated:
            output_file.write(line)
def filter_remove_notes(args):
    """INTERNAL filter: rewrite the file args[0] in place, stripping the
    notes column and saving the stripped notes back to the notes file."""
    filename = args[0]
    stripped = remove_notes(
        readlines_from_file(filename),
        load_notes(notes_filenames()))
    with codecs.open(filename, "w", "utf-8") as output_file:
        for line in stripped:
            output_file.write(line)
def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.
    from-to/step
    from+delta/step
    num/step

    Raises ValueError for any malformed description.
    NOTE(review): for 'from+delta/step' the code currently treats delta as
    an absolute stop value, not start+delta — confirm intent.
    """
    def incorrect_parts_description(pd):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)

    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        start = 0
        stop = 0
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            (start, stop) = a.split("+", 1)
            start = int(start)
            stop = int(stop)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except Exception:
        # BUG FIX: the original except clause referenced the undefined name
        # `pd`, so malformed input raised NameError instead of the intended
        # ValueError; route through the helper with the real argument.
        # (Narrowed from a bare `except:` as well.)
        incorrect_parts_description(parts_description)
def take_part(lines, part_description = None):
    """Return the subset of *lines* selected by *part_description*
    (see parse_parts_description); a missing/empty description selects
    everything.

    BUG FIXES:
    - `result += lines[i]` extended the result with the individual
      CHARACTERS of each selected line; it must append the whole line.
    - an empty description ('') — which the only caller passes when no
      pages are configured — crashed in parse_parts_description; treat
      any falsy description as "all lines".
    """
    if not part_description:
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    n = len(lines)
    part_size = (1.0 * n) / step
    result = []
    for i in range(n):
        # NOTE(review): this condition looks dimensionally odd — comparing
        # part_size * i (a line-scaled value) against part indices; it may
        # have been meant as start * part_size <= i <= stop * part_size.
        # Preserved as-is; confirm against the -p option's intent.
        if part_size * i >= start and part_size * i <= stop:
            result.append(lines[i])
    return result
def filter_get_words_group_words_add_stat(args):
    """INTERNAL filter: read a text from stdin, compute word frequencies
    and statistics, and print the (optionally grouped/compressed) wordlist.

    Behavior is controlled through environment variables set by the shell
    wrapper: GROUP_WORDS_BY_TWO/THREE, STAT_ONLY, COMPRESSED_WORDLIST,
    SHOW_RANGE, SHOW_RANGE_PERCENTAGE, FILTER_WORDS,
    ALLOWED_WORDS_FILENAME, WORDS_GROUPING.  Python 2 only (iteritems,
    sorted(cmp=...)).
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    # NOTE(review): config['pages'] is never set anywhere in this file, so
    # take_part always receives '' here — confirm the -p option is wired up.
    lines = take_part(readlines_from_stdin(), config.get('pages', ''))
    group_by = [1]

    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
        group_by.append(2)
    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = False
    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
        stats_only = True

    compressed_wordlist = False
    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
        compressed_wordlist = True

    # Optional early-stop limits for the printed list.
    show_range = os.environ.get('SHOW_RANGE', '')
    if show_range != '':
        show_range = int(show_range)
    else:
        show_range = 0
    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
    if show_range_percentage != '':
        show_range_percentage = int(show_range_percentage)
    else:
        show_range_percentage = 0

    # Totals are computed before filtering so known/unknown can be derived.
    # NOTE(review): empty input gives stats['total'] == 0 and the
    # percentage lines below raise ZeroDivisionError — confirm acceptable.
    stats = {}
    stats['total'] = sum(words[x] for x in words.keys())
    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = 100.0*stats['total_known']/stats['total']
    stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['wps'] = 0 #FIXME
    stats['uwps'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # filter words by allowed_words_filter
    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
        # Keep only words whose normalized form appears in the allow-list.
        normalized_allowed_words = [
            normalizator.normalize(w.rstrip('\n'))
            for w in readlines_from_file(allowed_words_filename)
        ]

        result = {}
        for w, wn in words.iteritems():
            if normalizator.normalize(w) in normalized_allowed_words:
                result[w] = wn
        words = result

    # Build (count, word) pairs sorted by descending frequency.
    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    # Optionally regroup so words sharing a normalized stem sort together.
    if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
        words_with_freq = sorted(
            words_with_freq,
            cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
            reverse=True)

    print_words_sorted(
        words_with_freq,
        stats,
        normalizator,
        stats_only=stats_only,
        compressed_wordlist=compressed_wordlist,
        show_range=show_range,
        show_range_percentage=show_range_percentage,
        )
# Top-level driver: parse the command line and, in INTERNAL filter mode
# (-f), dispatch to the corresponding filter function.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Names accepted by the -f/--function option.
    function_names = {
        'add_notes': filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        # BUG FIX: user-facing message said "Unkown".
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" " + x for x in sorted(function_names.keys())])))
        sys.exit(1)

#os.system("vim")