new-words: 04b0280aa883 new-words.py

new-words

view new-words.py @ 57:04b0280aa883

new-words.py install by default; new-words.sh is now legacy script

author	Igor Chubin <igor@chub.in>
date	Thu Nov 03 16:03:49 2011 +0100 (2011-11-03)
parents	e25de9ea9184
children	3682038403ad

line source

1 #!/usr/bin/env python

2 # -*- coding: utf-8 -*-

4 from __future__ import with_statement

5 import codecs

6 import difflib

7 import logging

8 import os

9 import optparse

10 import re

11 import subprocess

12 import sys

13 import Stemmer

14 import tempfile

# psyco is optional: when it cannot be imported or enabled the script simply
# runs without it.  Only ordinary exceptions are swallowed, so Ctrl-C and
# SystemExit still propagate (the former bare ``except:`` caught those too).
try:
    import psyco
    psyco.full()
except Exception:
    pass

# Runtime settings.  The command-line handling at the bottom of the file adds
# further keys (e.g. 'pages', 'compressed', 'show_range').
config = {
    # expanduser() honours $HOME when it is set and otherwise falls back to
    # the account database, so a missing HOME no longer raises KeyError.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

# NOTE(review): the log goes to a fixed path in /tmp at DEBUG level; on a
# multi-user machine a second user may be unable to open it -- confirm
# whether a per-user location is wanted.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

class Normalizator:
    """Reduce words to a normal form and pick a representative for a group.

    A word is first resolved through ``linked_words`` (word -> main word)
    and then stemmed with the stemmer selected for the language.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- two-letter code; must be a key of the table below,
                        otherwise KeyError is raised.
        linked_words -- optional dict mapping a word to the word it should
                        be treated as.  Defaults to a fresh empty dict (the
                        previous ``{}`` default was shared between calls).
        """
        stemmer_algorithm = {
            'de' : 'german',
            'en' : 'english',
            'es' : 'spanish',
            'ru' : 'russian',
            'it' : 'italian',
            'uk' : 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of *word* after following its links.

        Links are followed until a word without a link is reached; a word
        that was already visited stops the walk, so cyclic links terminate.
        """
        word_chain = []
        while word in self.linked_words and not word in word_chain:
            word_chain.append(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

    def best_word_from_group(self, wordpairs_group):
        """Return the word that represents *wordpairs_group*.

        wordpairs_group is a sequence of (count, word) pairs.  Among the
        shortest words the one with the highest count wins; ties keep the
        earliest pair.  An empty group raises ValueError (from min()).

        The suggestion-based refinement that used to follow the return
        statement could never execute and has been removed together with
        the similarity helper only it used.
        """
        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
        shortest = [
            pair for pair in wordpairs_group
            if len(pair[1]) == minimal_length
        ]
        # max() returns the first maximal element, which matches the former
        # stable descending sort followed by [0].
        return max(shortest, key=lambda pair: pair[0])[1]

    def dictionary_suggestions(self, word):
        """Return the output lines of the external ``de-variants`` command.

        Lines are decoded as UTF-8 and stripped of their trailing newline.
        communicate() is used so the child process is waited for and its
        pipe is closed (previously the process was never reaped).
        """
        output = subprocess.Popen(
            ["de-variants", word],
            stdout=subprocess.PIPE
            ).communicate()[0]
        raw_lines = output.split(b'\n')
        # Output ending in a newline yields one empty trailing chunk that is
        # not a line of its own.
        if raw_lines and raw_lines[-1] == b'':
            raw_lines.pop()
        return [x.decode('utf-8') for x in raw_lines]

116

117

# Command-line interface.  Every option stores into the attribute named by
# ``dest``; the statements at the end of the file copy the values they
# understand into ``config``.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# Fixed: this option used dest="compressed", so passing -k silently switched
# on the compressed word list instead of setting its own flag.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--allowed-words",
    help="file with list of allowed words (words that will be shown in the output)",
    action="store",
    dest="allowed_words")

parser.add_option(
    "-G", "--words-grouping",
    help="turn off word grouping",
    action="store_true",
    dest="no_words_grouping")

parser.add_option(
    "-X", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-d", "--delete-tag",
    help="delete subvocabulary of specified tag",
    action="store",
    dest="delete_tag")

parser.add_option(
    "-r", "--show-range",
    help="show only the specified number of words",
    action="store",
    dest="show_range")

parser.add_option(
    "-R", "--show-range-percentage",
    help="show only words that cover specified percentage of the text, skip the rest",
    action="store",
    dest="show_range_percentage")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups) [NOT IMPLEMENTED YET]",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# NOTE(review): this help text is identical to the one for --tag and looks
# like a copy-paste; confirm the intended wording before changing it.
parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")

245

def readlines_from_file(filename):
    """Return the lines of *filename*, decoded as UTF-8, line endings kept."""
    with codecs.open(filename, "r", "utf-8") as source:
        return list(source.readlines())

252

def readlines_from_url(url):
    """Return the text of *url*, rendered by ``lynx -dump``, as unicode lines.

    Changes compared with the former shell pipeline:

    * lynx is started from an argument list, so no shell is involved and the
      URL can no longer inject shell syntax;
    * the first ``http://...`` link on every line is removed here with the
      same character class the former ``perl -p -e 's@...@@'`` filter used
      (one substitution per line, as before);
    * the output is decoded before it is split, and every returned line
      keeps its trailing newline, like the lines returned by
      readlines_from_file() and readlines_from_stdin().

    stderr is still merged into the captured output.
    """
    link_pattern = re.compile(r'http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*')
    process = subprocess.Popen(
        ["lynx", "-dump", url],
        stdout = subprocess.PIPE,
        stderr = subprocess.STDOUT
        )
    text = process.communicate()[0].decode('utf-8')

    chunks = text.split('\n')
    # Every chunk but the last was terminated by a newline in the output.
    lines = [chunk + '\n' for chunk in chunks[:-1]]
    if chunks[-1]:
        lines.append(chunks[-1])
    return [link_pattern.sub('', line, 1) for line in lines]

262

def readlines_from_stdin():
    """Return everything on standard input as UTF-8 decoded lines."""
    make_reader = codecs.getreader("utf-8")
    stdin_reader = make_reader(sys.stdin)
    return stdin_reader.readlines()

265

# Separator between words: a run of non-word characters that does not START
# with an apostrophe, so "don't" stays in one piece.  Written as a raw string
# (the former literal relied on the invalid escape "\W") and compiled once
# instead of on every call.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split *line* into words.

    Trailing newlines are dropped first.  The result may contain empty
    strings, e.g. when the line starts or ends with a separator.
    """
    line = line.rstrip('\n')
    return _WORD_SEPARATOR.split(line)

271

def get_words(lines, group_by=(1,)):
    """Count the words (and optionally word sequences) found in *lines*.

    Returns a dict mapping word -> number of occurrences.  Empty strings and
    words made only of digits are ignored.  Single words are always counted;
    when *group_by* contains 2 and/or 3, sequences of two / three consecutive
    counted words are counted as well under the keys "w1_w2" and "w1_w2_w3".
    Sequences may span line boundaries.

    Fixed: the sliding window used to be advanced only after the sequences
    had been recorded, so the current word was never part of them and the
    last two pairs and the last triple of the input were never counted.
    """
    result = {}
    count_pairs = 2 in group_by
    count_triples = 3 in group_by
    skip_pattern = re.compile('[0-9]*$')
    # The three most recent counted words, oldest first.
    (a, b, c) = ("", "", "")
    for line in lines:
        for word in words_from_line(line):
            # Matches the empty string as well as pure numbers.
            if skip_pattern.match(word):
                continue
            result[word] = result.get(word, 0) + 1

            (a, b, c) = (b, c, word)
            if count_pairs and b != "":
                w = "%s_%s" % (b, c)
                result[w] = result.get(w, 0) + 1
            if count_triples and a != "" and b != "":
                w = "%s_%s_%s" % (a, b, c)
                result[w] = result.get(w, 0) + 1

    logging.debug(result)
    return result

298

def voc_filename():
    """Return the path of the vocabulary file for the configured language."""
    directory = config['config_directory']
    language = config['language']
    return "%s/%s.txt" % (directory, language)

301

def load_vocabulary():
    """Return the word counts of the vocabulary file (see voc_filename())."""
    vocabulary_lines = readlines_from_file(voc_filename())
    return get_words(vocabulary_lines)

304

def notes_filenames():
    """Return the list (currently one entry) of notes files for the language."""
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]

307

def load_notes(files):
    """Read notes files and return {word: {filename: note}}.

    Each line holds a word, whitespace and the note (the rest of the line).
    Completely empty lines are skipped and a line holding only a word gets an
    empty note; both used to abort loading with ValueError.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                fields = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                if len(fields) == 1:
                    if not fields[0]:
                        continue          # empty line
                    fields.append('')     # word without a note
                (word, note) = fields
                notes.setdefault(word, {})
                notes[word][filename] = note
    return notes

317

def add_notes(lines, notes):
    """Return *lines* with the stored note appended to each word line.

    Lines starting with '#' are passed through.  For any other line the
    second token is looked up in *notes*; if it has a note in the first
    notes file, the line is left-justified to 30 columns and the note is
    appended.  All other lines are returned unchanged.
    """
    primary_file = notes_filenames()[0]
    annotated = []
    for text in lines:
        if not text.startswith('#'):
            found = re.search('^\s*\S+\s*(\S+)', text)
            if found:
                key = found.group(1)
                if key in notes and primary_file in notes[key]:
                    text = "%-30s %s\n" % (
                        text.rstrip('\n'), notes[key][primary_file])
        annotated.append(text)
    return annotated

338

def remove_notes(lines, notes_group):
    """Strip the notes from *lines*, save them, and return the bare lines.

    The notes already known for the first notes file are taken from
    *notes_group*; every line of the form "<space>count<space>word<space>note"
    overrides the note of its word.  The merged notes are written with
    save_notes() and the lines are returned without their note part, each
    terminated by a newline.
    """
    primary_file = notes_filenames()[0]
    collected = {}
    for key in notes_group.keys():
        per_file = notes_group[key]
        if primary_file in per_file:
            collected[key] = per_file[primary_file]

    bare_lines = []
    for raw in lines:
        text = raw.rstrip('\n')
        found = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', text)
        if not found:
            bare_lines.append(text + "\n")
            continue
        (indent, count, gap, word) = found.group(1, 2, 3, 4)
        bare_lines.append(indent + count + gap + word + "\n")
        collected[word] = found.group(6)

    save_notes(primary_file, collected)
    return bare_lines

364

def save_notes(filename, notes):
    """Merge *notes* ({word: note}) into the notes file *filename*.

    Existing lines whose word is in *notes* are rewritten with the new note;
    all other lines are kept as they are.  Words that were not in the file
    yet are appended.  The file is then rewritten completely.

    Robustness fixes:
    * a missing file is treated as empty instead of raising IOError;
    * blank lines and lines holding only a word no longer raise ValueError;
    * a last line without a newline gets one, so appended entries are not
      glued onto it;
    * already-saved words are tracked in a set instead of a list.
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                # Only the word is needed here; the old note is discarded.
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                elif not line.endswith('\n'):
                    line += '\n'
                lines.append(line)

    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.write("".join(lines))

382

383

def substract_dictionary(dict1, dict2):
    """
    Return a new dict with the items of dict1 whose keys are not in dict2
    (dict1 - dict2); the values stored in dict2 are irrelevant.
    """
    return dict(
        (key, value)
        for (key, value) in dict1.items()
        if key not in dict2)

393

def dump_words(words, filename):
    """Write every word of *words* to *filename*, one per line, repeated as
    many times as its count; the file is encoded as UTF-8."""
    chunks = [("%s\n" % word) * count for (word, count) in words.items()]
    with codecs.open(filename, "w+", "utf-8") as target:
        for chunk in chunks:
            target.write(chunk)

398

def error_message(text):
    """Print *text* followed by a newline on standard output."""
    # With a single argument the parenthesised form prints the same thing
    # under the Python 2 print statement and the Python 3 print function.
    print(text)

401

def find_wordgroups_weights(word_pairs, normalizator):
    """Return {normalized word: summed count} for the (count, word) pairs.

    Words that share a normalized form (as given by
    normalizator.normalize) add up to one weight.
    """
    totals = {}
    for (count, word) in word_pairs:
        group = normalizator.normalize(word)
        totals[group] = totals.get(group, 0) + count
    return totals

409

def find_linked_words(notes):
    """Return {word: main word} for the notes that contain an "@link".

    *notes* maps word -> {filename: note}.  In every note the first '@' is
    examined; the non-space characters right after it name the main word.
    A bare '@' is ignored.  When several notes of one word carry a link,
    the one visited last wins.
    """
    links = {}
    for (word, per_file_notes) in notes.items():
        for text in per_file_notes.values():
            found = re.search(r'\@(\S*)', text)
            if found is None:
                continue
            target = found.group(1)
            if target:
                links[word] = target
    return links

421

def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison (-1, 0, 1) of two (count, word) pairs.

    Pairs are ordered by the weight of their word group (looked up in *wgw*
    by normalized word), then by the normalized word itself, then by their
    own count.  *linked_words* is accepted but not used.
    """
    def three_way(left, right):
        # Same result as the Python 2 builtin cmp() for these operands.
        return (left > right) - (left < right)

    (count1, word1) = pair1
    (count2, word2) = pair2
    stem1 = normalizator.normalize(word1)
    stem2 = normalizator.normalize(word2)

    by_weight = three_way(wgw[stem1], wgw[stem2])
    if by_weight != 0:
        return by_weight
    by_stem = three_way(stem1, stem2)
    if by_stem != 0:
        return by_stem
    return three_way(int(count1), int(count2))

438

439

def print_words_sorted(
        word_pairs,
        stats,
        normalizator,
        print_stats=True,
        stats_only=False,
        compressed_wordlist=False,
        show_range=0,
        show_range_percentage=0,
        ):
    """Format the word list (or only the statistics) as text.

    word_pairs            -- (count, word) pairs in output order; words of
                             one group are expected to be adjacent.
    stats                 -- dict with the keys used by the format strings
                             below ('language', 'percentage', 'total_known',
                             'total', ...).
    normalizator          -- object providing normalize() and
                             best_word_from_group().
    print_stats           -- prepend a "# ..." summary line.
    stats_only            -- return just a two-line statistics table.
    compressed_wordlist   -- emit one line per word group, not per word.
    show_range            -- stop after this many emitted entries (0 = off).
    show_range_percentage -- stop once the covered percentage reaches this
                             value (0 = off).

    Returns a single string when *stats_only* is set, otherwise a list of
    lines.  "# N" marker lines are inserted as the running coverage
    percentage passes the level thresholds.

    Fixed: in compressed mode a group was only written when the next group
    started, so the last group never appeared.  It is now written when the
    input is exhausted (not after an early stop by one of the ranges).
    """
    result = []
    if stats_only:
        result.append(
            " ".join([
                "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
                ]]) + "\n")
        # The last four specifiers are adjacent string literals (no commas)
        # and therefore form ONE list element; their width of 11 provides
        # the column gap that join() inserts between the other columns.
        result.append(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return "".join(result)

    if print_stats:
        result.append(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Thresholds: multiples of 5 above the known percentage up to 90, then
    # every integer from 90 to 101.  Floor division and list() keep this
    # working where "/" is true division and range() is lazy.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))

    def emit_group(group):
        # One line for a whole group: summed count and representative word.
        result.append("%10s %s\n" % (
            sum(x[0] for x in group),
            normalizator.best_word_from_group(group)))

    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    old_normalized_word = None
    words_of_this_group = []
    printed_words = 0
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            if compressed_wordlist:
                emit_group(words_of_this_group)
                printed_words += 1
            words_of_this_group = []

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            result.append("%10s %s\n" % word_pair)
            printed_words += 1

        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            result.append("# %s\n" % current_level)

        if show_range > 0 and printed_words >= show_range:
            break
        if show_range_percentage > 0 and 100.0*known/total >= show_range_percentage:
            break
    else:
        # Input exhausted: the group still being collected has not been
        # written yet.
        if compressed_wordlist and words_of_this_group:
            emit_group(words_of_this_group)
            printed_words += 1

    return result

522

def parse_parts_description(parts_description):
    """
    Returns triad (start, stop, step)
    basing on parts_description string.

    Accepted forms:
        from-to/step      -> (from, to, step)
        from+delta/step   -> (from, from + delta, step)
        from/step         -> (from, from + 1, step)

    Raises ValueError for anything else, including a step that is not
    positive.

    Fixed: for "from+delta" the delta used to be returned as the absolute
    stop value, contradicting the documented meaning.  The bare ``except:``
    is narrowed to the errors malformed input can produce.
    """
    try:
        (a, step) = parts_description.split("/", 1)
        step = int(step)
        if step <= 0:
            raise ValueError("step must be positive")
        if '-' in a:
            (start, stop) = a.split("-", 1)
            start = int(start)
            stop = int(stop)
        elif '+' in a:
            (start, delta) = a.split("+", 1)
            start = int(start)
            stop = start + int(delta)
        else:
            start = int(a)
            stop = start + 1
        return (start, stop, step)

    except (ValueError, TypeError, AttributeError):
        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)

551

552

def take_part(lines, part_description = None):
    """Return the slice of *lines* selected by *part_description*.

    Without a description (None or '') *lines* itself is returned.
    Otherwise the description is parsed into (start, stop, step); the text
    is treated as *step* equal parts and every line whose index lies between
    start and stop parts (both bounds inclusive) is kept.
    """
    if part_description in (None, ''):
        return lines
    (start, stop, step) = parse_parts_description(part_description)
    line_count = len(lines)
    part_size = (1.0 * line_count) / step
    selected = []
    for (index, line) in enumerate(lines):
        if index >= start * part_size and index <= stop * part_size:
            selected.append(line)
    return selected

564

def filter_get_words_group_words_add_stat(args):
    """Run the whole pipeline on the text named by *args* (or on stdin).

    Steps: load the vocabulary and the notes, read the input (URL, file or
    standard input), keep the requested part, count the words, drop the
    known ones, compute statistics, sort and format the word list.  In
    non-interactive or statistics mode the result is written to stdout.
    Otherwise the list (with notes) is opened in vim; afterwards the notes
    are saved and every word whose line was removed in the editor is
    appended to the vocabulary file.

    Fixes compared with the previous version:
    * the descriptors returned by tempfile.mkstemp() are closed, and the
      temporary files are removed even when an error occurs;
    * a text without any countable word no longer divides by zero;
    * "https://" arguments are fetched as URLs like "http://" ones;
    * allowed words and edited lines are looked up in sets;
    * grouping sorts with a key function that yields the same order as
      compare_word_pairs() and calls the stemmer once per word.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())

    if len(args) > 0:
        if 'http://' in args[0] or 'https://' in args[0]:
            input_lines = readlines_from_url(args[0])
        else:
            input_lines = readlines_from_file(args[0])
    else:
        input_lines = readlines_from_stdin()

    if len(input_lines) == 0:
        sys.stderr.write("Nothing to do, standard input is empty, exiting.\n")
        sys.exit(1)

    lines = take_part(input_lines, config.get('pages', ''))

    temp_paths = []

    def new_tempfile():
        # mkstemp() hands back an OPEN descriptor; the files are reopened by
        # name below, so it is closed right away.
        (descriptor, path) = tempfile.mkstemp(prefix='new-word')
        os.close(descriptor)
        temp_paths.append(path)
        return path

    try:
        original_text_tempfile = new_tempfile()
        with codecs.open(original_text_tempfile, "w", "utf-8") as f:
            f.write("".join(lines))

        group_by = [1]
        if 'two_words' in config:
            group_by.append(2)
        if 'three_words' in config:
            group_by.append(3)
        words = get_words(lines, group_by)

        stats_only = 'text_stats' in config
        compressed_wordlist = 'compressed' in config
        show_range = int(config.get('show_range', 0))
        show_range_percentage = int(config.get('show_range_percentage', 0))

        stats = {}
        stats['total'] = sum(words.values())
        if not 'no_filter' in config:
            words = substract_dictionary(words, vocabulary)

        stats['total_unknown'] = sum(words.values())
        stats['total_known'] = stats['total'] - stats['total_unknown']
        if stats['total']:
            stats['percentage'] = 100.0*stats['total_known']/stats['total']
            stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
        else:
            # No countable words at all: report 0% instead of crashing.
            stats['percentage'] = 0.0
            stats['percentage_unknown'] = 0.0
        stats['groups'] = 0
        stats['words'] = len(words)
        stats['sentences'] = 0  #FIXME
        stats['wps'] = 0        #FIXME
        stats['uwps'] = 0       #FIXME
        stats['language'] = config['language']

        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)

        # filter words by allowed_words_filter
        if 'allowed_words' in config:
            allowed_words_filename = config['allowed_words']
            normalized_allowed_words = set(
                normalizator.normalize(w.rstrip('\n'))
                for w in readlines_from_file(allowed_words_filename)
            )

            result = {}
            for w, wn in words.items():
                if normalizator.normalize(w) in normalized_allowed_words:
                    result[w] = wn
            words = result

        words_with_freq = []
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
            words_with_freq.append((words[k], k))

        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        if not 'no_words_grouping' in config or not config['no_words_grouping']:
            def group_sort_key(pair):
                # Group weight, then normalized word, then own count: the
                # same precedence compare_word_pairs() applies.
                stem = normalizator.normalize(pair[1])
                return (wgw[stem], stem, int(pair[0]))

            words_with_freq = sorted(
                    words_with_freq,
                    key=group_sort_key,
                    reverse=True)

        output = print_words_sorted(
            words_with_freq,
            stats,
            normalizator,
            stats_only=stats_only,
            compressed_wordlist=compressed_wordlist,
            show_range=show_range,
            show_range_percentage=show_range_percentage,
            )

        if ('non_interactive' in config or 'text_stats' in config):
            codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
        else:
            # temp1 receives the plain list; it is not read back by this
            # function.  temp2 (list plus notes) is the file that is edited.
            temp1 = new_tempfile()
            temp2 = new_tempfile()

            with codecs.open(temp1, "w", "utf-8") as f:
                f.write("".join(output))
            with codecs.open(temp2, "w", "utf-8") as f:
                f.write("".join(add_notes(output, notes)))

            os.putenv('ORIGINAL_TEXT', original_text_tempfile)
            os.system((
                "vim"
                " -c 'setlocal spell spelllang={language}'"
                " -c 'set keywordprg={language}'"
                " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
                " {filename}"
                " < /dev/tty > /dev/tty"
                ).format(language=config['language'], filename=temp2))

            lines = remove_notes(readlines_from_file(temp2), notes)

            # compare lines_before and lines_after and return deleted words
            lines_before = output
            lines_after = set(lines)
            deleted_words = []

            for line in lines_before:
                if line not in lines_after:
                    line = line.strip()
                    if ' ' in line:
                        word = re.split(r'\s+', line, 1)[1]
                        if ' ' in word:
                            word = re.split(r'\s+', word, 1)[0]
                        deleted_words.append(word)

            with codecs.open(voc_filename(), "a", "utf-8") as f:
                f.write("\n".join(deleted_words + ['']))
    finally:
        for path in temp_paths:
            try:
                os.unlink(path)
            except OSError:
                # Already gone; nothing left to clean up for this path.
                pass

712

# Script entry: parse the command line, copy the recognised options into
# ``config`` and run the pipeline.
(options, args) = parser.parse_args()

if options.language:
    config['language'] = options.language

# 'pages' is always present in config; an absent option means "whole text".
config['pages'] = options.pages or ""

# Options that carry a value: stored only when given.
for value_option in ('allowed_words', 'show_range', 'show_range_percentage'):
    option_value = getattr(options, value_option)
    if option_value:
        config[value_option] = option_value

# Switches: the key is created (as True) only when the switch was given,
# because the pipeline tests for the presence of the key.
for switch in (
        'non_interactive',
        'text_stats',
        'compressed',
        'no_filter',
        'two_words',
        'three_words',
        'no_words_grouping',
        ):
    if getattr(options, switch):
        config[switch] = True

filter_get_words_group_words_add_stat(args)

753

754 #if options.function:

755 # function_names = {

756 # 'get_words_group_words_add_stat': ,

757 # }

758 # if options.function in function_names:

759 # function_names[options.function](args)

760 # else:

761 # error_message("Unkown function %s.\nAvailable functions:\n%s" % (

762 # options.function, "".join([" "+x for x in sorted(function_names.keys())])))

763 # sys.exit(1)

764 #

765

766

767

768 #os.system("vim")

769