new-words
view new-words.py @ 45:5f90e44eecfc
new-words.py: turn words filtering and grouping on and off
| author | Igor Chubin <igor@chub.in> | 
|---|---|
| date | Fri Feb 04 06:18:50 2011 +0100 (2011-02-04) | 
| parents | 7eb1a8c3eade | 
| children | d708e2c1bad8 | 
 line source
     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     4 from __future__ import with_statement
     5 import codecs
     6 import logging
     7 import os
     8 import optparse
     9 import re
    10 import subprocess
    11 import sys
    12 import Stemmer
    13 try:
    14     import psyco
    15     psyco.full()
    16 except:
    17     pass
# Runtime configuration.  'language' defaults to English and may be
# overridden by the -l/--language command-line option further below;
# 'config_directory' holds the per-user vocabulary and notes files.
config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

# Debug log for the whole script (get_words() dumps its word table here).
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
    26 class Normalizator:
    27     def __init__(self, language, linked_words={}):
    28         stemmer_algorithm = {
    29             'de' : 'german',
    30             'en' : 'english',
    31             'ru' : 'russian',
    32             'uk' : 'ukrainian',
    33         }
    34         self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
    35         self.linked_words = linked_words
    37     def normalize(self, word):
    38         word_chain = []
    39         while word in self.linked_words and not word in word_chain:
    40             word_chain.append(word)
    41             word = self.linked_words[word]
    42         return self.stemmer.stemWord(word.lower())
    44 parser = optparse.OptionParser()
    46 parser.add_option(
    47     "-a", "--no-marks",
    48     help="don't add marks (and don't save marks added by user)",
    49     action="store_true",
    50     dest="no_marks")
    52 parser.add_option(
    53     "-c", "--compressed",
    54     help="show compressed wordlist: one word per group",
    55     action="store_true",
    56     dest="compressed")
    58 parser.add_option(
    59     "-k", "--known-words",
    60     help="put higher words that are similar to the known words (only for English)",
    61     action="store_true",
    62     dest="compressed")
    64 parser.add_option(
    65     "-l", "--language",
    66     help="specify language of text",
    67     action="store",
    68     dest="language")
    70 parser.add_option(
    71     "-f", "--function",
    72     help="filter through subsystem [INTERNAL]",
    73     action="store",
    74     dest="function")
    76 parser.add_option(
    77     "-m", "--merge-tag",
    78     help="merge words tagged with specified tag into the main vocabulary",
    79     action="store",
    80     dest="merge_tag")
    82 parser.add_option(
    83     "-M", "--merge-tagged",
    84     help="merge words tagged with ANY tag into the main vocabulary",
    85     action="store_true",
    86     dest="merge_tagged")
    88 parser.add_option(
    89     "-n", "--non-interactive",
    90     help="non-interactive mode (don't run vi)",
    91     action="store_true",
    92     dest="non_interactive")
    94 parser.add_option(
    95     "-N", "--no-filter",
    96     help="switch off known words filtering",
    97     action="store_true",
    98     dest="no_filter")
   100 parser.add_option(
   101     "-p", "--pages",
   102     help="work with specified pages only (pages = start-stop/total )",
   103     action="store",
   104     dest="pages")
   106 parser.add_option(
   107     "-r", "--remove-tag",
   108     help="remove subvocabulary of specified tag",
   109     action="store",
   110     dest="remove_tag")
   112 parser.add_option(
   113     "-s", "--text-stats",
   114     help="show the text statistics (percentage of known words and so on) and exit",
   115     action="store_true",
   116     dest="text_stats")
   118 parser.add_option(
   119     "-S", "--voc-stats",
   120     help="show your vocabulary statistics (number of words and word groups)",
   121     action="store_true",
   122     dest="voc_stats")
   124 parser.add_option(
   125     "-t", "--tag",
   126     help="tag known words with tag",
   127     action="store",
   128     dest="tag")
   130 parser.add_option(
   131     "-T", "--show-tags",
   132     help="tag known words with tag",
   133     action="store_true",
   134     dest="show_tags")
   136 parser.add_option(
   137     "-2", "--two-words",
   138     help="find 2 words' sequences",
   139     action="store_true",
   140     dest="two_words")
   142 parser.add_option(
   143     "-3", "--three-words",
   144     help="find 3 words' sequences",
   145     action="store_true",
   146     dest="three_words")
   148 def readlines_from_file(filename):
   149     res = []
   150     with codecs.open(filename, "r", "utf-8") as f:
   151         for line in f.readlines():
   152             res += [line]
   153     return res
   155 def readlines_from_stdin():
   156     return codecs.getreader("utf-8")(sys.stdin).readlines()
   158 def words_from_line(line):
   159     line = line.rstrip('\n')
   160     #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
   161     #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
   162     return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
   164 def get_words(lines, group_by=[1]):
   165     """
   166     Returns hash of words in a file
   167     word => number
   168     """
   169     result = {}
   170     (a, b, c) = ("", "", "")
   171     for line in lines:
   172         words = words_from_line(line)
   173         for word in words:
   174             if re.match('[0-9]*$', word):
   175                 continue
   176             result.setdefault(word, 0)
   177             result[word] += 1
   178             if 2 in group_by and a != "" and b != "":
   179                 w = "%s_%s" % (a,b)
   180                 result.setdefault(w, 0)
   181                 result[w] += 1
   182             if 3 in group_by and not "" in [a,b,c]:
   183                 w = "%s_%s_%s" % (a,b,c)
   184                 result.setdefault(w, 0)
   185                 result[w] += 1
   186             (a,b,c) = (b, c, word)
   188     logging.debug(result)
   189     return result
   191 def load_vocabulary():
   192     return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
   194 def notes_filenames():
   195     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
   197 def load_notes(files):
   198     notes = {}
   199     for filename in files:
   200         with codecs.open(filename, "r", "utf-8") as f:
   201             for line in f.readlines():
   202                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   203                 notes.setdefault(word, {})
   204                 notes[word][filename] = note
   205     return notes
   207 def add_notes(lines, notes):
   208     notes_filename = notes_filenames()[0]
   209     result = []
   210     for line in lines:
   211         if line.startswith('#'):
   212             result += [line]
   213         else:
   214             match_object = re.search('^\s*\S+\s*(\S+)', line)
   215             if match_object:
   216                 word = match_object.group(1)
   217                 if word in notes:
   218                     #logging.debug(word)
   219                     #logging.debug(line)
   220                     if notes_filename in notes[word]:
   221                         line = line.rstrip('\n')
   222                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
   223                         #logging.debug(line)
   224                         result += [line]
   225                 else:
   226                     result += [line]
   227             else:
   228                 result += [line]
   229     return result
def remove_notes(lines, notes_group):
    """Strip trailing note text from wordlist lines and persist the notes.

    Lines of the form "<spaces><count><spaces><word><spaces><note>" are
    truncated after the word; the removed note is remembered under its word
    and written back to the primary notes file via save_notes().  Lines
    that do not match the pattern are kept unchanged.  Returns the
    rewritten list of lines (each with a trailing newline).
    """
    notes_filename = notes_filenames()[0]
    # Seed with the notes already stored for this file so that save_notes()
    # rewrites a complete set, not only the notes harvested from lines.
    notes = {}
    for k in notes_group.keys():
        if notes_filename in notes_group[k]:
            notes[k] = notes_group[k][notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        # Groups: (1) indent (2) frequency (3) gap (4) word (5) gap (6) note.
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            # Reassemble the line without the note portion.
            result.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n"
                ]))
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line+"\n")

    save_notes(notes_filename, notes)
    return result
   257 def save_notes(filename, notes):
   258     lines = []
   259     saved_words = []
   260     with codecs.open(filename, "r", "utf-8") as f:
   261         for line in f.readlines():
   262             (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   263             if word in notes:
   264                 line = "%-29s %s\n" % (word, notes[word])
   265                 saved_words.append(word)
   266             lines.append(line)
   267     for word in [x for x in notes.keys() if not x in saved_words]:
   268         line = "%-29s %s\n" % (word, notes[word])
   269         lines.append(line)
   271     with codecs.open(filename, "w", "utf-8") as f:
   272         for line in lines:
   273             f.write(line)
   276 def substract_dictionary(dict1, dict2):
   277     """
   278     returns dict1 - dict2
   279     """
   280     result = {}
   281     for (k,v) in dict1.items():
   282         if not k in dict2:
   283             result[k] = v
   284     return result
   286 def dump_words(words, filename):
   287     with codecs.open(filename, "w+", "utf-8") as f:
   288         for word in words.keys():
   289             f.write(("%s\n"%word)*words[word])
   291 def error_message(text):
   292     print text
   294 def find_wordgroups_weights(word_pairs, normalizator):
   295     weight = {}
   296     for (num, word) in word_pairs:
   297         normalized = normalizator.normalize(word)
   298         weight.setdefault(normalized, 0)
   299         weight[normalized] += num
   300     return weight
   302 def find_linked_words(notes):
   303     linked_words = {}
   304     for word in notes.keys():
   305         for note in notes[word].values():
   306             if "@" in note:
   307                 result = re.search(r'\@(\S*)', note)
   308                 if result:
   309                     main_word = result.group(1)
   310                     if main_word:
   311                         linked_words[word] = main_word
   312     return linked_words
   314 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
   315     (num1, word1) = pair1
   316     (num2, word2) = pair2
   318     normalized_word1 = normalizator.normalize(word1)
   319     normalized_word2 = normalizator.normalize(word2)
   321     cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
   322     if cmp_res != 0:
   323         return cmp_res
   324     else:
   325         cmp_res = cmp(normalized_word1, normalized_word2)
   326         if cmp_res != 0:
   327             return cmp_res
   328         else:
   329             return cmp(int(num1), int(num2))
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """Write the word list and/or statistics to stdout as UTF-8.

    word_pairs is a sequence of (frequency, word) tuples.  With
    stats_only, a two-line statistics table is printed and the function
    returns.  Otherwise an optional one-line stats header is printed,
    followed by one line per word pair; "# N" marker lines are emitted
    whenever the cumulative share of known words crosses the next level.

    NOTE(review): this function uses Python-2-only constructs
    (range()+range() concatenation, integer '/' division).
    """
    if stats_only:
        # Header row of the statistics table.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
                ]]) + "\n")
        # Values row.  The last four format fields are deliberately (it
        # seems) string-concatenated rather than comma-separated: they use
        # width -11 instead of -10 to make up for the missing join space
        # — TODO confirm this was intended and not missing commas.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return

    if print_stats:
        # One-line summary prepended to the word list as a '#' comment.
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Thresholds for "# N" marker lines: every 5% from just above the
    # current known percentage up to 95, then every 1% from 90 to 101.
    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
        # Each printed word is assumed learned, raising the known count.
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            # Consume every level the cumulative percentage has passed,
            # remembering the last one crossed.
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
   374 def filter_add_notes(args):
   375     lines = readlines_from_file(args[0])
   376     notes = load_notes(notes_filenames())
   377     lines = add_notes(lines, notes)
   378     with codecs.open(args[0], "w", "utf-8") as f:
   379         for line in lines:
   380             f.write(line)
   382 def filter_remove_notes(args):
   383     lines = readlines_from_file(args[0])
   384     notes = load_notes(notes_filenames())
   385     lines = remove_notes(lines, notes)
   386     with codecs.open(args[0], "w", "utf-8") as f:
   387         for line in lines:
   388             f.write(line)
def filter_get_words_group_words_add_stat(args):
    """Main pipeline: read text from stdin, count words, filter known ones,
    optionally group by stem, and print the sorted list with statistics.

    Behavior is driven by environment variables (set by the shell wrapper,
    presumably — TODO confirm): GROUP_WORDS_BY_TWO / GROUP_WORDS_BY_THREE
    enable 2-/3-word sequence counting, STAT_ONLY prints statistics only,
    FILTER_WORDS removes known vocabulary, WORDS_GROUPING sorts words so
    that same-stem groups appear together.  args is unused.

    NOTE(review): sorted(cmp=...) below is Python 2 only.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    group_by = [1]
    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
        group_by.append(2)
    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = False
    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
        stats_only = True

    # 'total' is counted before known-word filtering so the known/unknown
    # split can be derived from the difference below.
    stats = {}
    stats['total'] = sum(words[x] for x in words.keys())
    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = 100.0*stats['total_known']/stats['total']
    stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0        #FIXME
    stats['uwps'] = 0       #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # Pre-sort by raw frequency, highest first.
    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
        words_with_freq = sorted(
                words_with_freq,
                cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
                reverse=True)

    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
   437 (options, args) = parser.parse_args()
   438 if options.language:
   439     config['language'] = options.language
   441 if options.function:
   442     function_names = {
   443         'add_notes' :   filter_add_notes,
   444         'remove_notes': filter_remove_notes,
   445         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
   446     }
   447     if options.function in function_names:
   448         function_names[options.function](args)
   449     else:
   450         error_message("Unkown function %s.\nAvailable functions:\n%s" % (
   451             options.function, "".join(["   "+x for x in sorted(function_names.keys())])))
   452         sys.exit(1)
   457 #os.system("vim")
