new-words: d532e7b52ab2 new-words.py

new-words

view new-words.py @ 43:d532e7b52ab2

-s key support in new-words.py

Now new-words-py.sh -s works in the same way as new-words.sh.
(WPS and UWPS fields are not calculated correctly yet).

author	Igor Chubin <igor@chub.in>
date	Fri Jan 28 12:40:58 2011 +0200 (2011-01-28)
parents	3ec83a7cc544
children	7eb1a8c3eade

line source

1 #!/usr/bin/env python

2 # -*- coding: utf-8 -*-

4 from __future__ import with_statement

5 import codecs

6 import logging

7 import os

8 import optparse

9 import re

10 import subprocess

11 import sys

12 import Stemmer

# psyco is an optional accelerator: enable it when it can be imported and
# carry on without it otherwise.
try:
    import psyco
    psyco.full()
except Exception:
    # Stay best-effort, but let KeyboardInterrupt/SystemExit propagate
    # (a bare "except:" would swallow those too).
    pass

# Settings shared by the whole script. 'language' is overwritten from the
# -l/--language option once the command line has been parsed.
config = {
    # expanduser() honours $HOME but does not raise KeyError when it is unset.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

# All debug output of the script goes to a fixed log file.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

class Normalizator(object):
    """Map a word to its canonical form: follow word links, then stem.

    ``linked_words`` maps a word to the word it should be treated as.
    Links are followed transitively before the stemmer is applied.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer for *language*.

        language     -- two-letter code; must be one of the keys of the
                        table below, otherwise KeyError is raised.
        linked_words -- optional dict word -> replacement word.
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance; a mutable default argument would be
        # shared by every instance created without explicit links.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of *word* after resolving its chain of links."""
        visited = set()
        # Stop as soon as a word repeats so that cyclic links terminate.
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())

# Command-line interface of the script.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# Stored under its own name: it previously wrote to "compressed", which
# made -k indistinguishable from -c.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# NOTE(review): this help text is identical to the one of -t/--tag and
# looks like a copy-paste; confirm the intended wording.
parser.add_option(
    "-T", "--show-tags",
    help="tag known words with tag",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")

147

def readlines_from_file(filename):
    """Return the lines of *filename*, decoded as UTF-8, line ends kept."""
    with codecs.open(filename, "r", "utf-8") as f:
        return f.readlines()

154

def readlines_from_stdin():
    """Return the lines of standard input, decoded as UTF-8."""
    # The decoder needs a byte stream: on Python 3 that is sys.stdin.buffer,
    # on Python 2 sys.stdin itself already yields bytes.
    stream = getattr(sys.stdin, 'buffer', sys.stdin)
    return codecs.getreader("utf-8")(stream).readlines()

157

# A run of non-word characters that does not start with an apostrophe, so
# that forms like "don't" stay in one piece. Compiled once, not per line.
_WORD_SEPARATOR = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split *line* into words.

    The trailing newline is removed first. The returned list contains an
    empty string wherever the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))

163

def get_words(lines):
    """
    Returns hash of words in a file
    word => number

    Empty strings and tokens that consist only of the digits 0-9 are
    not counted.
    """
    result = {}
    for line in lines:
        for word in words_from_line(line):
            # '[0-9]*$' also matches the empty string produced by a
            # leading or trailing separator.
            if re.match(r'[0-9]*$', word):
                continue
            result[word] = result.get(word, 0) + 1
    return result

178

def load_vocabulary():
    """Return the word counts of <config_directory>/<language>.txt."""
    vocabulary_file = "%s/%s.txt" % (config['config_directory'], config['language'])
    return get_words(readlines_from_file(vocabulary_file))

181

def notes_filenames():
    """Return the list of notes files for the current language.

    The list has exactly one entry:
    <config_directory>/notes-<language>.txt
    """
    return ["%s/notes-%s.txt" % (config['config_directory'], config['language'])]

184

def load_notes(files):
    """Read notes files and return {word: {filename: note}}.

    Each line holds a word, whitespace, and the note (the rest of the
    line). A line that holds only a word gives an empty note; blank lines
    are skipped. Both used to abort loading with ValueError.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                parts = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)
                word = parts[0]
                if len(parts) < 2:
                    if not word:
                        continue
                    parts.append('')
                notes.setdefault(word, {})[filename] = parts[1]
    return notes

194

def add_notes(lines, notes):
    """Return *lines* with the stored note appended to each annotated word.

    lines -- word list lines; lines starting with '#' are passed through.
    notes -- {word: {filename: note}}, as returned by load_notes().

    The word is the second token of a line. When the first notes file has
    a note for it, the line is padded to 30 columns and the note is
    appended. Every input line is emitted exactly once.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
            if match_object:
                word = match_object.group(1)
                if word in notes:
                    logging.debug(word)
                    logging.debug(line)
                    if notes_filename in notes[word]:
                        line = "%-30s %s\n" % (
                            line.rstrip('\n'), notes[word][notes_filename])
                        logging.debug(line)
        result.append(line)
    return result

218

def remove_notes(lines, notes_group):
    """Strip the notes from *lines* and save them to the first notes file.

    lines       -- word list lines, possibly annotated with notes.
    notes_group -- {word: {filename: note}}, as returned by load_notes().

    A line of the form <space><count><space><word><space><note> is cut
    after the word and its note is recorded, replacing the stored one.
    Other lines are kept. Returns the stripped lines, each terminated
    with a newline.
    """
    notes_filename = notes_filenames()[0]

    # Start from the notes already stored in the first notes file.
    notes = {}
    for (word, word_notes) in notes_group.items():
        if notes_filename in word_notes:
            notes[word] = word_notes[notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        match_object = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            # Groups 1-4 are indent, count, gap and word; 6 is the note.
            result.append("".join(match_object.groups()[:4]) + "\n")
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line + "\n")

    save_notes(notes_filename, notes)
    return result

244

def save_notes(filename, notes):
    """Merge *notes* ({word: note}) into the notes file *filename*.

    Lines whose first token is a key of *notes* are rewritten with the new
    note; all other lines, including blank and word-only ones, are kept
    verbatim. Words that have no line yet are appended at the end.
    The file must already exist.
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the word is needed here; taking the first field also
            # copes with lines that carry no note at all.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes.keys():
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        for line in lines:
            f.write(line)

262

263

def substract_dictionary(dict1, dict2):
    """
    returns dict1 - dict2

    The result is a new dict with the items of dict1 whose keys are not
    in dict2; neither argument is modified.
    """
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)

273

def dump_words(words, filename):
    """Write every word of *words* ({word: count}) to *filename*.

    Each word is written on its own line, repeated count times. The file
    is overwritten and encoded as UTF-8.
    """
    with codecs.open(filename, "w+", "utf-8") as f:
        for (word, count) in words.items():
            f.write(("%s\n" % word) * count)

278

def error_message(text):
    """Print *text* on standard output."""
    # Function-call form: valid on Python 2 and Python 3 alike.
    print(text)

281

def find_wordgroups_weights(word_pairs, normalizator):
    """Return {normalized word: sum of the counts of its group}.

    word_pairs   -- iterable of (count, word) pairs.
    normalizator -- object whose normalize(word) returns the group key.
    """
    weight = {}
    for (num, word) in word_pairs:
        normalized = normalizator.normalize(word)
        weight[normalized] = weight.get(normalized, 0) + num
    return weight

289

def find_linked_words(notes):
    """Return {word: main word} for every note that contains "@main_word".

    notes -- {word: {filename: note}}, as returned by load_notes().

    Only the first "@" of a note is considered; an "@" that is not
    directly followed by a non-space character creates no link.
    """
    linked_words = {}
    for (word, word_notes) in notes.items():
        for note in word_notes.values():
            result = re.search(r'\@(\S*)', note)
            if result and result.group(1):
                linked_words[word] = result.group(1)
    return linked_words

301

def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs.

    The pairs are ordered by the weight of their word group, then by the
    normalized word, then by their own count. Returns -1, 0 or 1.

    wgw          -- {normalized word: group weight}.
    normalizator -- object whose normalize(word) returns the group key.
    linked_words -- not used; kept for the callers.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Tuples compare field by field, which is the same cascade as three
    # nested cmp() calls but does not need the Python-2-only builtin.
    key1 = (wgw[normalized_word1], normalized_word1, int(num1))
    key2 = (wgw[normalized_word2], normalized_word2, int(num2))
    return (key1 > key2) - (key1 < key2)

318

def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """Write the word list, or only the statistics, to standard output.

    word_pairs  -- sequence of (count, word) tuples in output order.
    stats       -- dict of statistics; the keys read are 'language',
                   'percentage', 'total_known', 'total', 'groups' and
                   'words', plus 'percentage_unknown', 'wps' and 'uwps'
                   when stats_only is set.
    print_stats -- write the "# ..." summary line before the words.
    stats_only  -- write a two-line statistics table and return.

    After each word the running percentage of known words is compared
    with the next level line, and a "# <level>" marker is written when
    that line is reached.
    """
    # The encoder needs a byte stream: on Python 3 that is
    # sys.stdout.buffer, on Python 2 sys.stdout itself accepts bytes.
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, 'buffer', sys.stdout))

    if stats_only:
        out.write(
            " ".join([
                "%-10s" % x for x in [
                    "LANG",
                    "KNOWN%",
                    "UNKNOWN%",
                    "KNOWN",
                    "TOTAL",
                    "WPS",
                    "UWPS*10"
                ]]) + "\n")
        out.write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                # The four integer fields are 11 wide and emitted back to
                # back, which lines up with the 10+1 wide header columns.
                "%(total_known)-11d%(total)-11d%(wps)-11d%(uwps)-11d"
            ]) % stats + "\n")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Level lines: multiples of 5 above the current percentage, up to 90,
    # followed by every integer from 90 to 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        out.write("%10s %s\n" % word_pair)
        known += word_pair[0]
        percentage = 100.0 * known / total
        if level_lines and percentage >= level_lines[0]:
            current_level = level_lines[0]
            # Drop every level line that has been passed; the last one
            # dropped is the one reported.
            while level_lines and percentage > level_lines[0]:
                current_level = level_lines.pop(0)
            out.write("# %s\n" % current_level)

361

def filter_add_notes(args):
    """Add the stored notes to the word list file args[0], in place."""
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = add_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)

369

def filter_remove_notes(args):
    """Strip the notes from the word list file args[0], in place.

    The notes found in the file are saved by remove_notes().
    """
    lines = readlines_from_file(args[0])
    notes = load_notes(notes_filenames())
    lines = remove_notes(lines, notes)
    with codecs.open(args[0], "w", "utf-8") as f:
        for line in lines:
            f.write(line)

377

def filter_get_words_group_words_add_stat(args):
    """Read a text from stdin and print its unknown words, grouped.

    Words found in the vocabulary are removed. The remaining words are
    ordered by the weight of their word group, then by normalized word,
    then by their own count (all descending), and printed together with
    the text statistics. When the environment variable STAT_ONLY is
    'YES', only the statistics are printed.

    args -- not used.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    words = get_words(lines)
    stats_only = os.environ.get('STAT_ONLY') == 'YES'

    stats = {}
    stats['total'] = sum(words.values())
    words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0*stats['total_known']/stats['total']
    else:
        # Empty input: report 0% known rather than dividing by zero.
        stats['percentage'] = 0.0
    stats['percentage_unknown'] = 100.0 - stats['percentage']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0  #FIXME
    stats['uwps'] = 0  #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    word_pairs = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        word_pairs.append((words[k], k))

    wgw = find_wordgroups_weights(word_pairs, normalizator)

    def group_order(pair):
        # Same ordering as compare_word_pairs(), expressed as a sort key
        # because sorted(cmp=...) exists on Python 2 only.
        (num, word) = pair
        normalized = normalizator.normalize(word)
        return (wgw[normalized], normalized, int(num))

    word_pairs = sorted(word_pairs, key=group_order, reverse=True)

    print_words_sorted(word_pairs, stats, stats_only=stats_only)

416

# Script entry: parse the command line and run the subsystem selected
# with -f/--function, if any.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    function_names = {
        'add_notes': filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)

433

434

435

436

437 #os.system("vim")

438