new-words: new-words.py annotate

new-words

annotate new-words.py @ 40:c3a50c0d2400

Functions for adding/removing notes and computing statistics are now implemented in Python.

Option -O (old-style) is no longer supported. If you need the old-style new-words, use new-words.sh.

author	Igor Chubin <igor@chub.in>
date	Sun Jan 23 17:09:44 2011 +0100 (2011-01-23)
parents	a598e0d25784
children	4629e08b0d87

rev	line source
igor@37	1 #!/usr/bin/env python
igor@38	2 # -*- coding: utf-8 -*-
igor@37	3
igor@40	4 from __future__ import with_statement
igor@38	5 import codecs
igor@38	6 import logging
igor@38	7 import os
igor@37	8 import optparse
igor@38	9 import re
igor@38	10 import subprocess
igor@38	11 import sys
igor@38	12 import Stemmer
igor@38	13
# Runtime settings shared by the functions in this script.
config = {
    # expanduser('~') returns $HOME when it is set and falls back to the
    # password database otherwise, whereas os.environ['HOME'] raises
    # KeyError at import time if the variable is missing.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    # Default language; replaced by the -l/--language option when given.
    'language': 'en',
}

# NOTE(review): the log goes to a fixed path in a shared directory; consider
# a per-user location if this runs on multi-user machines.
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38	20
class Normalizator(object):
    """Reduce a word to the stem that identifies its word group.

    A word is first redirected through ``linked_words`` (word -> main word)
    and the final word is then lower-cased and passed to the stemmer.
    """

    def __init__(self, language, linked_words=None):
        """Create a normalizer for *language*.

        language     -- one of 'de', 'en', 'ru', 'uk'; any other value
                        raises KeyError.
        linked_words -- optional dict mapping a word to the word it is
                        linked to; the dict is used as-is, not copied.
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A None default avoids sharing one dict object between every
        # instance created without an explicit mapping.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Return the stem of *word* after following its links."""
        # Remember visited words so a cycle of links (a -> b -> a) stops.
        seen = set()
        while word in self.linked_words and word not in seen:
            seen.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())
igor@37	38
# Command-line interface.  Every flag is a plain boolean (store_true) or a
# single string value (store); unset options are None after parsing.
parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# Stored under its own name: it used to write to "compressed", which made
# -k indistinguishable from -c.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# The help text used to be a verbatim copy of the one for -t/--tag.
parser.add_option(
    "-T", "--show-tags",
    help="show tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")
igor@37	142
def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file *filename* as a list.

    Each line keeps its trailing newline.  Errors from opening or decoding
    the file propagate to the caller.
    """
    with codecs.open(filename, "r", "utf-8") as f:
        # readlines() already builds the list; no need to copy it item by item.
        return f.readlines()
igor@38	149
def readlines_from_stdin():
    """Return all lines of standard input, decoded as UTF-8.

    Each line keeps its trailing newline.
    """
    # The UTF-8 reader needs a byte stream.  Where sys.stdin is a text
    # wrapper it exposes the underlying bytes as ``.buffer``; otherwise
    # sys.stdin itself is used, as before.
    byte_stream = getattr(sys.stdin, "buffer", sys.stdin)
    return codecs.getreader("utf-8")(byte_stream).readlines()
igor@38	152
# One or more non-word characters, where the run may not *start* with an
# apostrophe -- so "don't" stays a single word.  Compiled once, not per line.
_WORD_SEPARATOR = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)


def words_from_line(line):
    """Split *line* into its words.

    Trailing newlines are stripped first.  Empty strings, which the split
    yields for a blank line or for a separator at either end of the line,
    are dropped so that they are never treated as words.
    """
    line = line.rstrip('\n')
    return [word for word in _WORD_SEPARATOR.split(line) if word]
igor@38	158
def get_words(lines):
    """Count the words found in *lines*.

    lines -- iterable of text lines.

    Returns a dict mapping each word to the number of times it occurs.
    """
    result = {}
    for line in lines:
        for word in words_from_line(line):
            # Splitting can yield '' (blank line, separator at a line end);
            # that is not a word and must not be counted.
            if not word:
                continue
            result[word] = result.get(word, 0) + 1
    return result
igor@38	171
def load_vocabulary():
    """Return the word counts of the vocabulary file for the current language.

    The file is ``<config_directory>/<language>.txt``, both taken from
    ``config``.
    """
    vocabulary_path = "%s/%s.txt" % (
        config['config_directory'], config['language'])
    vocabulary_lines = readlines_from_file(vocabulary_path)
    return get_words(vocabulary_lines)
igor@38	174
def notes_filenames():
    """Return the list of notes files for the current language.

    The list holds a single path, ``<config_directory>/notes-<language>.txt``.
    """
    directory = config['config_directory']
    language = config['language']
    return ["%s/notes-%s.txt" % (directory, language)]
igor@38	177
def load_notes(files):
    """Read the notes stored in *files*.

    Each non-blank line is "<word><whitespace><note>".  A line that holds
    only a word gets an empty note; blank lines are ignored.

    Returns a dict: word -> {filename: note}.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                if not line.strip():
                    # Nothing to split; unpacking would raise ValueError.
                    continue
                fields = re.split(r'\s+', line, maxsplit=1)
                word = fields[0]
                # A word without any note yields a single field.
                note = fields[1] if len(fields) > 1 else u''
                notes.setdefault(word, {})[filename] = note
    return notes
igor@38	187
# "<count> <word>" with optional leading whitespace; group 1 is the word.
# Word-list lines are written as "%10s %s", i.e. with several leading blanks,
# so the leading whitespace must be allowed to repeat.
_COUNT_AND_WORD = re.compile(r'^\s*\S+\s+(\S+)')


def add_notes(lines, notes):
    """Append the stored note to every word line that has one.

    lines -- word-list lines; lines starting with '#' are passed through.
    notes -- dict word -> {filename: note}.

    Only notes from the first file returned by notes_filenames() are used.
    Returns a new list of lines.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        match_object = None
        if not line.startswith('#'):
            match_object = _COUNT_AND_WORD.search(line)
        if match_object:
            word = match_object.group(1)
            if word in notes:
                logging.debug(word)
                logging.debug(line)
                if notes_filename in notes[word]:
                    # Pad the original line to 30 columns, then the note.
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), notes[word][notes_filename])
                    logging.debug(line)
        # Every line is emitted exactly once, annotated or not.
        result.append(line)
    return result
igor@39	211
def remove_notes(lines, notes_group):
    """Strip notes from word-list lines and store them in the notes file.

    lines       -- word-list lines, possibly carrying a note after the word.
    notes_group -- dict word -> {filename: note} of already known notes.

    A line of the form "<ws><count><ws><word><ws><note>" is cut back to
    "<ws><count><ws><word>" and its note replaces the stored note for that
    word.  All other lines are kept.  The resulting notes are written with
    save_notes() to the first file from notes_filenames().

    Returns the list of lines without notes.
    """
    notes_filename = notes_filenames()[0]

    # Start from the notes already recorded for this notes file.
    notes = dict(
        (word, per_file[notes_filename])
        for (word, per_file) in notes_group.items()
        if notes_filename in per_file)

    result = []
    for raw_line in lines:
        text = raw_line.rstrip('\n')
        found = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', text)
        if found is None:
            result.append(text + "\n")
            continue
        # Keep everything up to and including the word; drop the note.
        result.append("".join(found.group(1, 2, 3, 4)) + "\n")
        notes[found.group(4)] = found.group(6)

    save_notes(notes_filename, notes)
    return result
igor@39	237
def save_notes(filename, notes):
    """Write *notes* (word -> note) into the notes file *filename*.

    Existing lines keep their order: a line whose first field is a word in
    *notes* is rewritten with the new note, any other line is kept verbatim.
    Words that were not in the file yet are appended at the end.  A missing
    file is treated as empty and gets created.
    """
    lines = []
    saved_words = set()
    if os.path.exists(filename):
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                # Only the first field is needed; taking it by index also
                # copes with blank lines and lines without a note.
                word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
                if word in notes:
                    line = "%-29s %s\n" % (word, notes[word])
                    saved_words.add(word)
                lines.append(line)

    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)
igor@39	255
igor@39	256
def substract_dictionary(dict1, dict2):
    """Return a new dict with the items of *dict1* whose key is not in *dict2*.

    Values of *dict2* are ignored; neither argument is modified.
    """
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)
igor@38	266
def dump_words(words, filename):
    """Write every word to *filename* as many times as its count.

    words    -- dict word -> count.
    filename -- file to (over)write, UTF-8 encoded, one word per line.
    """
    # The file is only written, so plain "w" is enough.
    with codecs.open(filename, "w", "utf-8") as f:
        for word, count in words.items():
            f.write(("%s\n" % word) * count)
igor@38	271
def error_message(text):
    """Report *text*, followed by a newline, on standard error.

    Diagnostics are kept off standard output so they cannot end up inside
    the word list this script writes there when used as a filter.
    """
    sys.stderr.write("%s\n" % text)
igor@38	274
def find_wordgroups_weights(word_pairs, normalizator):
    """Sum the counts of all words that normalize to the same form.

    word_pairs   -- iterable of (count, word) tuples.
    normalizator -- object whose normalize(word) names the word's group.

    Returns a dict: normalized word -> total count of its group.
    """
    totals = {}
    for pair in word_pairs:
        count, word = pair
        group = normalizator.normalize(word)
        totals[group] = totals.get(group, 0) + count
    return totals
igor@38	282
def find_linked_words(notes):
    """Collect the word links written as "@main_word" inside notes.

    notes -- dict word -> {filename: note}.

    For each note, the non-blank text right after the first '@' is the main
    word the noted word is linked to; an '@' followed by nothing or by
    whitespace creates no link.

    Returns a dict: word -> main word.
    """
    links = {}
    for word, notes_by_file in notes.items():
        for note in notes_by_file.values():
            if "@" not in note:
                continue
            found = re.search(r'\@(\S*)', note)
            target = found.group(1) if found else ''
            if target:
                links[word] = target
    return links
igor@38	294
def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Three-way comparison of two (count, word) pairs.

    Pairs are ordered by, in this priority: the weight of the word's group
    (``wgw[normalized word]``), the normalized word itself, and finally the
    word's own count.

    pair1, pair2 -- (count, word) tuples.
    wgw          -- dict normalized word -> group weight.
    normalizator -- object providing normalize(word).
    linked_words -- not used here; kept so existing callers keep working.

    Returns -1, 0 or 1.
    """
    (num1, word1) = pair1
    (num2, word2) = pair2

    normalized_word1 = normalizator.normalize(word1)
    normalized_word2 = normalizator.normalize(word2)

    # Tuples compare element by element, which is exactly the cascade
    # "weight, then normalized word, then count".
    key1 = (wgw[normalized_word1], normalized_word1, int(num1))
    key2 = (wgw[normalized_word2], normalized_word2, int(num2))

    # Spelled out instead of cmp(), which is not a builtin on Python 3.
    return (key1 > key2) - (key1 < key2)
igor@38	311
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """Write the word list, with statistics and level markers, to stdout.

    word_pairs  -- sequence of (count, word) tuples, already ordered.
    stats       -- dict with the keys 'language', 'percentage',
                   'total_known', 'total', 'groups' and 'words'.
    print_stats -- write the "# language, percentage, ..." header first.
    stats_only  -- write only the placeholder text "stat_only" and return.

    After each word its count is added to the number of known words.  When
    the resulting percentage of known words has reached the next pending
    level, a "# <level>" line follows: the highest level strictly below the
    new percentage, or the pending level itself if it was hit exactly.
    Output is encoded as UTF-8.
    """
    # One writer for the whole call instead of a new one per line; use the
    # byte layer of stdout where stdout is a text stream.
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))

    if stats_only:
        out.write("stat_only")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Levels: multiples of 5 above the current percentage up to 90, then
    # every percent from 90 on.  101 can never be reached, so the list
    # never runs empty.  '//' keeps the floor division explicit.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))

    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        out.write("%10s %s\n" % word_pair)
        known += word_pair[0]
        if 100.0 * known / total >= level_lines[0]:
            current_level = level_lines[0]
            # Drop every level that has been strictly passed.
            while 100.0 * known / total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            out.write("# %s\n" % current_level)
igor@38	334
def filter_add_notes(args):
    """Rewrite the file ``args[0]`` in place with the stored notes added.

    The file is read completely, passed through add_notes() together with
    the notes loaded from notes_filenames(), and written back as UTF-8.
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    annotated = add_notes(original_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as out:
        for text in annotated:
            out.write(text)
igor@39	342
def filter_remove_notes(args):
    """Rewrite the file ``args[0]`` in place with the notes stripped.

    The file is read completely and passed through remove_notes() together
    with the notes loaded from notes_filenames(); remove_notes() also stores
    the notes it finds.  The stripped lines are written back as UTF-8.
    """
    target = args[0]
    original_lines = readlines_from_file(target)
    known_notes = load_notes(notes_filenames())
    stripped = remove_notes(original_lines, known_notes)
    with codecs.open(target, "w", "utf-8") as out:
        for text in stripped:
            out.write(text)
igor@39	350
def filter_get_words_group_words_add_stat(args):
    """Print the unknown words of the text on stdin, grouped, with statistics.

    Words already in the vocabulary are removed; the rest are ordered by
    the weight of their word group, then by normalized form, then by their
    own count (all descending) and handed to print_words_sorted().

    args -- positional command-line arguments; not used by this filter.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    words = get_words(lines)

    stats = {}
    stats['total'] = sum(words.values())
    words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    # Empty input has no words at all; report 0% instead of dividing by 0.
    if stats['total']:
        known_share = 100.0 * stats['total_known'] / stats['total']
    else:
        known_share = 0.0
    stats['percentage'] = "%7.2f" % known_share
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0 #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # Most frequent words first; this is the order ties keep below.
    word_pairs = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    wgw = find_wordgroups_weights(word_pairs, normalizator)

    def group_order(pair):
        # Same ordering as compare_word_pairs(), expressed as a sort key so
        # each word is normalized once rather than on every comparison.
        (num, word) = pair
        normalized = normalizator.normalize(word)
        return (wgw[normalized], normalized, int(num))

    word_pairs = sorted(word_pairs, key=group_order, reverse=True)

    print_words_sorted(word_pairs, stats)
igor@40	383
# Script entry: parse the command line and run the requested filter.
(options, args) = parser.parse_args()
if options.language:
    config['language'] = options.language

if options.function:
    # Names accepted by -f/--function and the filter each one runs.
    function_names = {
        'add_notes': filter_add_notes,
        'remove_notes': filter_remove_notes,
        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
    }
    if options.function in function_names:
        function_names[options.function](args)
    else:
        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
        sys.exit(1)
igor@37	400
igor@37	401
igor@37	402
igor@37	403
igor@38	404 #os.system("vim")
igor@37	405