new-words: c3adf6452eda new-words.py

new-words

view new-words.py @ 64:c3adf6452eda

lingvo-en-ru.pl script moved to misc/

author	Igor Chubin <igor@chub.in>
date	Sat Nov 12 14:03:54 2011 +0100 (2011-11-12)
parents	3682038403ad
children	5a003076eb11

line source

1 #!/usr/bin/env python

2 # -*- coding: utf-8 -*-

4 from __future__ import with_statement

5 import codecs

6 import difflib

7 import logging

8 import os

9 import optparse

10 import re

11 import subprocess

12 import sys

13 import Stemmer

14 import tempfile

# psyco is an optional accelerator: use it when it is installed and carry
# on without it otherwise.  "except Exception" keeps this best-effort but,
# unlike a bare "except:", lets KeyboardInterrupt and SystemExit through.
try:
    import psyco
    psyco.full()
except Exception:
    pass

# Settings shared by the whole script; the command-line handling at the
# bottom of the file adds to and overrides these entries.
config = {
    # expanduser() also works when the HOME variable is not set.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

class Normalizator:
    """Reduce words to a common key so that related forms can be grouped.

    A word is first replaced by the word it is linked to (if any) and is
    then lower-cased and stemmed.
    """

    def __init__(self, language, linked_words=None):
        """Set up the stemmer for *language*.

        language     -- two-letter language code; it must be a key of the
                        table below, otherwise KeyError is raised.
        linked_words -- optional dict mapping a word to the word it should
                        be treated as; normalize() follows these links.
        """
        stemmer_algorithm = {
            'de' : 'german',
            'fr' : 'french',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # None instead of a {} default: a mutable default would be a single
        # dict shared by every instance created without the argument.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of *word* after following its chain of links.

        Links are followed for as long as the current word is a key of
        self.linked_words.  A word that was already visited ends the walk,
        so cyclic links cannot loop forever.
        """
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents a group of (count, word) pairs.

        Only the shortest words of the group are considered; among them the
        one with the highest count wins, and on equal counts the one that
        comes first in the group.  Raises ValueError for an empty group.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [pair for pair in wordpairs_group
                    if len(pair[1]) == minimal_length]
        # max() returns the first of several maximal items, which gives
        # the tie-break described above.
        return max(shortest, key=lambda pair: pair[0])[1]

    def _best_word_with_suggestions(self, best_match):
        """Refine *best_match* with the help of dictionary_suggestions().

        NOTE: nothing calls this method.  The logic used to follow an
        unconditional return inside best_word_from_group() and was never
        executed; it is kept here so that it is not lost.  The 'et'/'t'
        endings it rewrites and the `de-variants` command it relies on
        suggest that it targets German verb forms -- TODO confirm before
        re-enabling it.
        """
        def similarity(x, y):
            return difflib.SequenceMatcher(None, x.lower(), y.lower()).ratio()

        suggestions = self.dictionary_suggestions(best_match)
        if len(suggestions) == 1:
            return best_match

        corrected_best_match = best_match
        if best_match[-2:] == 'et':
            word = best_match[:-1]+"n"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-2]

        if best_match[-1] == 't':
            word = best_match[:-1]+"en"
            sugg = self.dictionary_suggestions(word)
            if len(sugg) == 1:
                return word
            suggestions += sugg
            corrected_best_match = best_match[:-1]

        # A lower-case word only keeps lower-case suggestions.
        if corrected_best_match[0].lower() == corrected_best_match[0]:
            suggestions = [x for x in suggestions
                           if x[0].lower() == x[0]]

        if suggestions == []:
            return best_match+"_"
        return best_match+" "+(" ".join(
            sorted(
                suggestions,
                key=lambda x: similarity(x, corrected_best_match),
                reverse=True)))

    def dictionary_suggestions(self, word):
        """Return the output lines of the external `de-variants` command.

        The command is run with *word* as its only argument; every line of
        its standard output is decoded as UTF-8 and returned without the
        trailing newline.
        """
        process = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE)
        try:
            raw_lines = process.stdout.readlines()
        finally:
            # Close the pipe and reap the child so that repeated calls do
            # not pile up open descriptors and zombie processes.
            process.stdout.close()
            process.wait()
        return [x.decode('utf-8').rstrip('\n') for x in raw_lines]

117

118

# Command-line interface.  Every option only stores its value; the code at
# the end of the file copies the supported ones into `config`.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# This option used to share dest="compressed" with -c (a copy-paste slip),
# so passing -k silently switched on the compressed word list.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-G", "--words-grouping",
    help="turn off word grouping",
    action="store_true",
    dest="no_words_grouping")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-r", "--show-range",
    help="show only words specified number of words",
    action="store",
    dest="show_range")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# NOTE(review): the help text below is identical to the one of -t and looks
# copy-pasted; it is left untouched until the intended wording is known.
parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-v", "--vocabulary-filename",
    help="use specified file as a vocabulary",
    action="store",
    dest="vocabulary_filename")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")

252

def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    The lines are unicode strings and keep their line endings.
    """
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())

259

def readlines_from_url(url):
    """Return the text of the page at *url* as a list of unicode lines.

    The page is rendered with `lynx -dump` and piped through a perl
    one-liner that removes an http:// link from each line (the
    substitution has no /g flag, so only the first match per line goes).
    The combined stdout/stderr of the pipeline is decoded as UTF-8 and
    split on "\n": unlike readlines_from_file(), the returned lines carry
    no trailing newline, and the list ends with an empty string when the
    output ends with a newline.
    """
    # The URL sits inside single quotes on a shell command line.  A single
    # quote inside the URL would end that quoting and let the remainder be
    # interpreted by the shell, so each one is rewritten as '\'' (close the
    # quote, emit an escaped quote, reopen the quote).
    quoted_url = "'" + url.replace("'", "'\\''") + "'"
    command = (
        "lynx -dump {url} | "
        "perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\\[\\]~-]*@@'"
    ).format(url=quoted_url)
    output = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
    ).communicate()[0]
    return output.decode('utf-8').split('\n')

269

def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8."""
    utf8_reader = codecs.getreader("utf-8")
    decoded_stdin = utf8_reader(sys.stdin)
    return decoded_stdin.readlines()

272

def words_from_line(line):
    """Split *line* into words.

    A separator is a run of non-word characters that does not start with
    an apostrophe, so "don't" stays one word.  A separator at the start or
    the end of the line produces an empty string in the result.
    """
    line = line.rstrip('\n')
    return re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE).split(line)


def get_words(lines, group_by=None):
    """Count the words found in *lines*.

    lines    -- iterable of text lines.
    group_by -- sequence of sequence lengths to count in addition to the
                single words: 2 adds "first_second" entries for every pair
                of consecutive words, 3 adds "first_second_third" entries
                for every triple.  Defaults to [1] (single words only).

    Returns a dict mapping each word (or joined sequence) to the number of
    times it occurs.  Empty strings and tokens made of digits only are not
    counted and are ignored when sequences are built; sequences continue
    across line boundaries.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The two most recently counted words, oldest first; "" means that not
    # enough words have been seen yet.
    (before_previous, previous) = ("", "")
    for line in lines:
        for word in words_from_line(line):
            if re.match('[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1
            # Sequences end at the current word.  They used to be built
            # from the words seen before it only, so the final pairs and
            # the final triple of a text were never counted.
            if count_pairs and previous != "":
                pair = "%s_%s" % (previous, word)
                result[pair] = result.get(pair, 0) + 1
            if count_triples and before_previous != "":
                triple = "%s_%s_%s" % (before_previous, previous, word)
                result[triple] = result.get(triple, 0) + 1
            (before_previous, previous) = (previous, word)

    logging.debug(result)
    return result

305

def voc_filename():
    """Return the path of the vocabulary file.

    An explicit 'vocabulary_filename' entry in config wins; otherwise the
    path is <config_directory>/<language>.txt.
    """
    try:
        return config['vocabulary_filename']
    except KeyError:
        directory = config['config_directory']
        language = config['language']
        return "%s/%s.txt" % (directory, language)

310

def load_vocabulary():
    """Read the vocabulary file and return its words as a word -> count dict."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)

313

def notes_filenames():
    """Return the list of notes files for the configured language.

    The list currently holds one path:
    <config_directory>/notes-<language>.txt.
    """
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]

316

def load_notes(files):
    """Read the notes stored in *files*.

    Every line of a notes file has the form "<word><whitespace><note>".
    Returns a dict of dicts: notes[word][filename] is the note that
    *filename* holds for *word*.  A line that consists of a word only
    yields an empty note, and blank lines are skipped; both used to abort
    the whole run with a ValueError from the tuple unpacking.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) == 2:
                    (word, note) = fields
                elif fields[0]:
                    (word, note) = (fields[0], '')
                else:
                    # blank line
                    continue
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes

326

def add_notes(lines, notes):
    """Append the stored note to every word line that has one.

    lines -- output lines; a line starting with '#' is a comment, any
             other line is expected to hold a count followed by a word.
    notes -- dict of dicts as returned by load_notes().

    Returns a new list with exactly one entry per input line: the line
    itself, or, when the first notes file holds a note for its word, the
    line padded to 30 columns and followed by that note.  Lines are never
    dropped: the earlier nested if/else chain had a path on which a line
    was not copied to the result at all.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes and notes_filename in notes[word]:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
        result.append(line)
    return result

347

def remove_notes(lines, notes_group):
    """Strip the notes from *lines* and store them in the notes file.

    lines       -- lines of the form
                   "<space><count><space><word><space><note>".
    notes_group -- dict of dicts as returned by load_notes(); the notes it
                   holds for the first notes file are the starting point
                   and are overridden by the notes found in *lines*.

    The merged notes are written with save_notes().  Returns the lines
    with the note part removed; lines that do not match the layout above
    are returned unchanged (with exactly one trailing newline).
    """
    target_file = notes_filenames()[0]
    notes = dict(
        (word, notes_group[word][target_file])
        for word in notes_group.keys()
        if target_file in notes_group[word])

    annotated_line = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')
    stripped_lines = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = annotated_line.match(text)
        if found is None:
            stripped_lines.append(text + "\n")
            continue
        # Groups 1-4 are indent, count, gap and word; group 6 is the note.
        stripped_lines.append("".join(found.group(1, 2, 3, 4)) + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(target_file, notes)
    return stripped_lines

373

def save_notes(filename, notes):
    """Merge *notes* (word -> note) into the notes file *filename*.

    Lines of the existing file whose word is a key of *notes* are
    rewritten with the new note, all other lines are kept as they are, and
    words that were not in the file yet are appended.  The file is then
    rewritten in place.  Lines without a note part (a single word, or a
    blank line) are handled as well instead of raising ValueError.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the first field (the word) is needed here.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)

391

392

def substract_dictionary(dict1, dict2):
    """
    Return a new dict with the items of dict1 whose keys are not in dict2.
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)

402

def dump_words(words, filename):
    """Write every word of *words* to *filename*, one per line.

    *words* maps a word to its count; a word is written as many times as
    its count says.  The file is written as UTF-8.
    """
    with codecs.open(filename, "w+", "utf-8") as target:
        for (word, count) in words.items():
            repeated = ("%s\n" % word) * count
            target.write(repeated)

407

def error_message(text):
    """Print *text* on standard output."""
    print(text)

410

def find_wordgroups_weights(word_pairs, normalizator):
    """Return the total count of every word group.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) gives the group key.

    The result maps each group key to the sum of the counts of its words.
    """
    weight = {}
    for (count, word) in word_pairs:
        group_key = normalizator.normalize(word)
        weight[group_key] = weight.get(group_key, 0) + count
    return weight

418

def find_linked_words(notes):
    """Extract the word links ("@main_word") from the notes.

    notes -- dict of dicts as returned by load_notes().

    Returns a dict mapping a word to the text that follows the first '@'
    in one of its notes, up to the next whitespace.  An '@' that is not
    directly followed by text creates no link.
    """
    link_pattern = re.compile(r'\@(\S*)')
    linked_words = {}
    for word in notes.keys():
        for note in notes[word].values():
            found = link_pattern.search(note)
            if found is None:
                continue
            main_word = found.group(1)
            if main_word:
                linked_words[word] = main_word
    return linked_words

430

def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs.

    The pairs are ordered by the weight of their word group (looked up in
    *wgw* by normalized word), then by the normalized word itself, then by
    their own count.  Returns -1, 0 or 1.  *linked_words* is accepted but
    not used.
    """
    def three_way(left, right):
        # -1, 0 or 1, like the cmp() builtin
        return (left > right) - (left < right)

    (count_a, word_a) = pair1
    (count_b, word_b) = pair2

    key_a = normalizator.normalize(word_a)
    key_b = normalizator.normalize(word_b)

    # "or" moves on to the next criterion only when the previous one
    # reports a tie (0).
    return (three_way(wgw[key_a], wgw[key_b])
            or three_way(key_a, key_b)
            or three_way(int(count_a), int(count_b)))

447

448

def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list (or only the statistics) for output.

    word_pairs   -- (count, word) tuples, already in output order, with
                    the words of one group next to each other.
    stats        -- dict with the keys used by the format strings below
                    ('language', 'percentage', 'total_known', 'total', ...).
    normalizator -- provides normalize() and best_word_from_group().
    print_stats  -- start the list with a "# ..." summary line.
    stats_only   -- return just a two-line statistics table.
    compressed_wordlist -- emit one line per word group (summed count and
                    representative word) instead of one line per word.
    show_range   -- stop after that many printed entries (0 = no limit).
    show_range_percentage -- stop once that percentage of the text is
                    covered (0 = no limit).

    Returns a single string when stats_only is set, otherwise a list of
    lines.  "# <level>" marker lines are inserted as the words listed so
    far, together with the known words, cover more of the text.
    """
    if stats_only:
        header = " ".join(
            "%-10s" % title for title in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10",
            ])
        # The last four fields form ONE format string (no commas between
        # them): each is 11 wide, i.e. a 10-wide column plus the space
        # that " ".join() puts between the other columns.
        values = " ".join([
            "%(language)-10s",
            "%(percentage)-10.2f",
            "%(percentage_unknown)-10.2f",
            "%(total_known)-11d"
            "%(total)-11d"
            "%(wps)-11d"
            "%(uwps)-11d"
            ]) % stats
        return header + "\n" + values + "\n"

    result = []
    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Coverage levels still to be announced: steps of 5 from the next
    # multiple of 5 above the current percentage, then steps of 1 from 90.
    level_lines = (
        list(range(int(float(stats['percentage'])) // 5 * 5 + 5, 95, 5))
        + list(range(90, 102)))
    known = int(stats['total_known'])
    total = int(stats['total'])
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0

    def compressed_line(group):
        return "%10s %s\n" % (
            sum(pair[0] for pair in group),
            normalizator.best_word_from_group(group))

    for word_pair in word_pairs:
        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A new group starts: the previous one is complete.
            if compressed_wordlist:
                result.append(compressed_line(words_of_this_group))
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # The loop ran to the end (no limit cut it short): the group that
        # was still being collected has to be written as well, otherwise
        # the last group is missing from every compressed list.
        if compressed_wordlist and words_of_this_group:
            result.append(compressed_line(words_of_this_group))

    return result

531

def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.

    Accepted forms:
        from-to/step        -> (from, to, step)
        from+delta/step     -> (from, from + delta, step)
        from/step           -> (from, from + 1, step)

    Raises ValueError when the string does not have one of these forms or
    when step is not a positive number.
    """
    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if step < 1:
            # take_part() divides by step
            raise ValueError("step must be positive")
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            # The number after '+' is a length, not an absolute position.
            (start, delta) = a.split("+", 1)
            start = int(start)
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except Exception:
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)

560

561

def take_part(lines, part_description = None):
    """Return the part of *lines* selected by *part_description*.

    Without a description (None or '') *lines* itself is returned.
    Otherwise the description is parsed by parse_parts_description() into
    (start, stop, step); the text is seen as *step* parts of equal size
    and the lines whose index lies between the beginning of part *start*
    and the beginning of part *stop* (both ends included) are returned.
    """
    if part_description in (None, ''):
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    part_size = (1.0 * len(lines)) / step
    first_index = start * part_size
    last_index = stop * part_size
    return [
        line
        for (index, line) in enumerate(lines)
        if index >= first_index and index <= last_index
    ]

573

def _new_tempfile():
    """Create an empty temporary file and return its path.

    mkstemp() also returns an open file descriptor.  The file is reopened
    by name later on, so the descriptor is closed right away instead of
    being leaked.
    """
    (descriptor, path) = tempfile.mkstemp(prefix='new-word')
    os.close(descriptor)
    return path


def _unlink_quietly(path):
    """Remove *path*; a file that is already gone is not an error."""
    try:
        os.unlink(path)
    except OSError:
        pass


def _deleted_words(lines_before, lines_after):
    """Return the words of the lines that are missing from lines_after.

    A word line is "<count> <word> ..."; the word is its second field.
    Comment lines (starting with '#') are ignored, so that deleting a
    statistics or level line does not add its text to the vocabulary.
    """
    remaining = set(lines_after)
    deleted = []
    for line in lines_before:
        if line in remaining or line.startswith('#'):
            continue
        line = line.strip()
        if ' ' in line:
            word = re.split(r'\s+', line, 1)[1]
            if ' ' in word:
                word = re.split(r'\s+', word, 1)[0]
            deleted.append(word)
    return deleted


def _review_in_editor(output, notes, original_text_tempfile):
    """Let the user edit the word list in vim and learn from the result.

    The list (with notes added) is opened in vim.  Afterwards the notes
    are saved by remove_notes(), and the words of the lines the user
    deleted are appended to the vocabulary file.
    """
    temp1 = _new_tempfile()
    temp2 = _new_tempfile()
    try:
        with codecs.open(temp1, "w", "utf-8") as f:
            f.write("".join(output))
        with codecs.open(temp2, "w", "utf-8") as f:
            f.write("".join(add_notes(output, notes)))

        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
        os.system((
            "vim"
            " -c 'setlocal spell spelllang={language}'"
            " -c 'set keywordprg={language}'"
            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
            " {filename}"
            " < /dev/tty > /dev/tty"
            ).format(language=config['language'], filename=temp2))

        lines = remove_notes(readlines_from_file(temp2), notes)

        deleted_words = _deleted_words(output, lines)
        with codecs.open(voc_filename(), "a", "utf-8") as f:
            f.write("\n".join(deleted_words + ['']))
    finally:
        _unlink_quietly(temp1)
        _unlink_quietly(temp2)


def filter_get_words_group_words_add_stat(args):
    """Run the whole pipeline for one input text.

    args -- positional command-line arguments.  args[0], if present, is a
            file name, or a URL when it contains 'http://'; without
            arguments the text is read from standard input.

    The words of the text are counted, known words are removed (unless
    'no_filter' is set in config), the rest is grouped and sorted, and
    the result is either written to standard output (with
    'non_interactive' or 'text_stats' in config) or opened in vim for
    review.  Exits with status 1 when there is no input.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    if len(args) > 0:
        if 'http://' in args[0]:
            input_lines = readlines_from_url(args[0])
        else:
            input_lines = readlines_from_file(args[0])
    else:
        input_lines = readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    original_text_tempfile = _new_tempfile()
    try:
        # The selected text is kept in a file whose name is handed to the
        # editor session through the ORIGINAL_TEXT environment variable.
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(lines))

        group_by = [1]
        if 'two_words' in config:
            group_by.append(2)
        if 'three_words' in config:
            group_by.append(3)
        words = get_words(lines, group_by)

        stats_only = 'text_stats' in config
        compressed_wordlist = 'compressed' in config
        show_range = int(config.get('show_range', 0))
        show_range_percentage = int(config.get('show_range_percentage', 0))

        stats = {}
        stats['total'] = sum(words.values())
        if not 'no_filter' in config:
            words = substract_dictionary(words, vocabulary)

        stats['total_unknown'] = sum(words.values())
        stats['total_known'] = stats['total'] - stats['total_unknown']
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
        stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
        stats['groups'] = 0
        stats['words'] = len(words)
        stats['sentences'] = 0  #FIXME
        stats['wps'] = 0  #FIXME
        stats['uwps'] = 0  #FIXME
        stats['language'] = config['language']

        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)

        # filter words by allowed_words_filter
        if 'allowed_words' in config:
            allowed_words_filename = config['allowed_words']
            # a set: membership is tested once per word of the text
            normalized_allowed_words = set(
                normalizator.normalize(w.rstrip('\n'))
                for w in readlines_from_file(allowed_words_filename))

            result = {}
            for w, wn in words.items():
                if normalizator.normalize(w) in normalized_allowed_words:
                    result[w] = wn
            words = result

        words_with_freq = [
            (words[k], k)
            for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)
        ]

        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        if not 'no_words_grouping' in config or not config['no_words_grouping']:
            words_with_freq = sorted(
                words_with_freq,
                cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
                reverse=True)

        output = print_words_sorted(
            words_with_freq,
            stats,
            normalizator,
            stats_only=stats_only,
            compressed_wordlist=compressed_wordlist,
            show_range=show_range,
            show_range_percentage=show_range_percentage,
            )

        if ('non_interactive' in config or 'text_stats' in config):
            codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
        else:
            _review_in_editor(output, notes, original_text_tempfile)
    finally:
        # Runs on errors as well, so no temporary file is left behind.
        _unlink_quietly(original_text_tempfile)

722

# Script entry point: copy the supported command-line options into
# `config` and run the pipeline.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

# 'pages' is always present; an empty string selects the whole text.
config['pages'] = options.pages or ""

# Options with a value: stored only when they were given.
# 'vocabulary_filename' is read by voc_filename() but used to be missing
# here, so the -v option had no effect.
for option_name in (
        'allowed_words',
        'show_range',
        'show_range_percentage',
        'vocabulary_filename',
        ):
    option_value = getattr(options, option_name)
    if option_value:
        config[option_name] = option_value

# Switches: the rest of the script tests for the presence of the key.
for flag_name in (
        'non_interactive',
        'text_stats',
        'compressed',
        'no_filter',
        'two_words',
        'three_words',
        'no_words_grouping',
        ):
    if getattr(options, flag_name):
        config[flag_name] = True

filter_get_words_group_words_add_stat(args)

763

764 #if options.function:

765 # function_names = {

766 # 'get_words_group_words_add_stat': ,

767 # }

768 # if options.function in function_names:

769 # function_names[options.function](args)

770 # else:

771 # error_message("Unkown function %s.\nAvailable functions:\n%s" % (

772 # options.function, "".join([" "+x for x in sorted(function_names.keys())])))

773 # sys.exit(1)

774 #

775

776

777

778 #os.system("vim")

779