#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Word-list filters: count the words of a text and manage per-word notes.

The functions here split text into words, count them (optionally together
with two- and three-word sequences), and add notes to / strip notes from
"<count> <word>" listings.
"""

from __future__ import with_statement
import codecs
import logging
import os
import optparse
import re
import subprocess
import sys

# The stemmer is only used by Normalizator, so a missing PyStemmer is
# reported when a Normalizator is built instead of at import time; the
# note filters never stem anything and keep working without it.
try:
    import Stemmer
except ImportError:
    Stemmer = None

# psyco is an optional accelerator: carry on silently when it is unusable,
# but do not swallow KeyboardInterrupt/SystemExit as a bare except would.
try:
    import psyco
    psyco.full()
except Exception:
    pass

config = {
    # expanduser('~') yields $HOME when it is set and still resolves a
    # home directory when it is not.
    'config_directory': os.path.expanduser('~') + '/.new-words',
    'language': 'en',
}

logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)

# One or more non-word characters; a separator may not *start* at an
# apostrophe, so words such as "don't" stay in one piece.
_WORD_SEPARATOR = re.compile(r"(?!['_])(?:\W)+", flags=re.UNICODE)
# Matches tokens made only of digits, and the empty token.
_NUMBER_OR_EMPTY = re.compile(r'[0-9]*$')
# "<count> <word>" at the start of a listing line; group 1 is the word.
_COUNT_AND_WORD = re.compile(r'^\s*\S+\s+(\S+)')
# "<ws><count><ws><word><ws><note>": a listing line that carries a note.
_ANNOTATED_LINE = re.compile(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)')


class Normalizator(object):
    """Reduce a word to the stem of the word it is linked to."""

    def __init__(self, language, linked_words=None):
        """Create a normalizer.

        language     -- one of 'de', 'en', 'ru', 'uk'; any other value
                        raises KeyError.
        linked_words -- optional dict mapping a word to its "main" word.

        Raises ImportError when the Stemmer module is not installed.
        """
        stemmer_algorithm = {
            'de': 'german',
            'en': 'english',
            'ru': 'russian',
            'uk': 'ukrainian',
        }
        if Stemmer is None:
            raise ImportError(
                "the Stemmer module (PyStemmer) is required to normalize words")
        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
        # A fresh dict per instance: a mutable default would be shared.
        if linked_words is None:
            linked_words = {}
        self.linked_words = linked_words

    def normalize(self, word):
        """Follow the links starting at `word`, then return the stem of the
        lower-cased word reached.

        Words already visited are remembered so that a cycle of links
        terminates instead of looping forever.
        """
        visited = set()
        while word in self.linked_words and word not in visited:
            visited.add(word)
            word = self.linked_words[word]
        return self.stemmer.stemWord(word.lower())


parser = optparse.OptionParser()

parser.add_option(
    "-a", "--no-marks",
    help="don't add marks (and don't save marks added by user)",
    action="store_true",
    dest="no_marks")

parser.add_option(
    "-c", "--compressed",
    help="show compressed wordlist: one word per group",
    action="store_true",
    dest="compressed")

# This option used to store into "compressed", which made -k
# indistinguishable from -c.
parser.add_option(
    "-k", "--known-words",
    help="put higher words that are similar to the known words (only for English)",
    action="store_true",
    dest="known_words")

parser.add_option(
    "-l", "--language",
    help="specify language of text",
    action="store",
    dest="language")

parser.add_option(
    "-f", "--function",
    help="filter through subsystem [INTERNAL]",
    action="store",
    dest="function")

parser.add_option(
    "-m", "--merge-tag",
    help="merge words tagged with specified tag into the main vocabulary",
    action="store",
    dest="merge_tag")

parser.add_option(
    "-M", "--merge-tagged",
    help="merge words tagged with ANY tag into the main vocabulary",
    action="store_true",
    dest="merge_tagged")

parser.add_option(
    "-n", "--non-interactive",
    help="non-interactive mode (don't run vi)",
    action="store_true",
    dest="non_interactive")

parser.add_option(
    "-N", "--no-filter",
    help="switch off known words filtering",
    action="store_true",
    dest="no_filter")

parser.add_option(
    "-p", "--pages",
    help="work with specified pages only (pages = start-stop/total )",
    action="store",
    dest="pages")

parser.add_option(
    "-r", "--remove-tag",
    help="remove subvocabulary of specified tag",
    action="store",
    dest="remove_tag")

parser.add_option(
    "-s", "--text-stats",
    help="show the text statistics (percentage of known words and so on) and exit",
    action="store_true",
    dest="text_stats")

parser.add_option(
    "-S", "--voc-stats",
    help="show your vocabulary statistics (number of words and word groups)",
    action="store_true",
    dest="voc_stats")

parser.add_option(
    "-t", "--tag",
    help="tag known words with tag",
    action="store",
    dest="tag")

# The help text used to be a copy of the one for -t.
parser.add_option(
    "-T", "--show-tags",
    help="show the list of tags",
    action="store_true",
    dest="show_tags")

parser.add_option(
    "-2", "--two-words",
    help="find 2 words' sequences",
    action="store_true",
    dest="two_words")

parser.add_option(
    "-3", "--three-words",
    help="find 3 words' sequences",
    action="store_true",
    dest="three_words")


def readlines_from_file(filename):
    """Return the lines of the UTF-8 encoded file `filename` as a list."""
    with codecs.open(filename, "r", "utf-8") as f:
        return f.readlines()


def readlines_from_stdin():
    """Return the lines read from standard input, decoded as UTF-8."""
    # Decode from the byte stream: sys.stdin itself on Python 2, its
    # .buffer attribute where one exists (Python 3).
    stream = getattr(sys.stdin, "buffer", sys.stdin)
    return codecs.getreader("utf-8")(stream).readlines()


def words_from_line(line):
    """Split `line` into words on runs of non-word characters.

    Trailing newlines are stripped first.  The returned list may contain
    empty strings when the line starts or ends with a separator.
    """
    return _WORD_SEPARATOR.split(line.rstrip('\n'))


def get_words(lines, group_by=None):
    """Count the words found in `lines`.

    Returns a dict mapping word => number of occurrences.  Empty tokens
    and tokens consisting only of digits are ignored.  Single words are
    always counted; when `group_by` (default [1]) contains 2 and/or 3,
    sequences of two and/or three consecutive words are counted as well
    under keys joined with "_" ("word1_word2").  Sequences continue across
    line boundaries and across ignored tokens.
    """
    if group_by is None:
        group_by = [1]
    count_pairs = 2 in group_by
    count_triples = 3 in group_by

    result = {}
    # The two words preceding the current one ("" = not seen yet).  Every
    # n-gram is counted when its *last* word is read, so the sequences at
    # the very end of the input are counted too.
    prev2, prev1 = "", ""
    for line in lines:
        for word in words_from_line(line):
            if _NUMBER_OR_EMPTY.match(word):
                continue
            result[word] = result.get(word, 0) + 1
            if count_pairs and prev1:
                key = "%s_%s" % (prev1, word)
                result[key] = result.get(key, 0) + 1
            if count_triples and prev2 and prev1:
                key = "%s_%s_%s" % (prev2, prev1, word)
                result[key] = result.get(key, 0) + 1
            prev2, prev1 = prev1, word

    logging.debug(result)
    return result


def load_vocabulary():
    """Return the word counts of "<config_directory>/<language>.txt"."""
    return get_words(readlines_from_file(
        "%s/%s.txt" % (config['config_directory'], config['language'])))


def notes_filenames():
    """Return the list of notes files for the configured language."""
    return ["%s/notes-%s.txt" % (config['config_directory'], config['language'])]


def load_notes(files):
    """Read notes from `files`.

    Each line is "<word><whitespace><note>".  Returns a dict
    word => {filename: note}.  Blank lines are skipped and a word without
    a note gets an empty note, instead of aborting with ValueError.
    """
    notes = {}
    for filename in files:
        with codecs.open(filename, "r", "utf-8") as f:
            for line in f.readlines():
                line = line.rstrip('\n')
                if not line.strip():
                    continue
                parts = re.split(r'\s+', line, maxsplit=1)
                word = parts[0]
                note = parts[1] if len(parts) > 1 else ""
                notes.setdefault(word, {})[filename] = note
    return notes


def add_notes(lines, notes):
    """Return `lines` with the note of each known word appended.

    Lines starting with '#' and lines that are not of the form
    "<count> <word>" are passed through unchanged.  Only notes stored for
    the first file of notes_filenames() are used.
    """
    notes_filename = notes_filenames()[0]
    result = []
    for line in lines:
        if not line.startswith('#'):
            # At least one blank is required between count and word, so a
            # single-token line is never split in the middle of the token.
            found = _COUNT_AND_WORD.search(line)
            if found:
                word_notes = notes.get(found.group(1), {})
                if notes_filename in word_notes:
                    line = "%-30s %s\n" % (
                        line.rstrip('\n'), word_notes[notes_filename])
        result.append(line)
    return result


def remove_notes(lines, notes_group):
    """Strip the notes from `lines` and store them in the notes file.

    notes_group is a dict word => {filename: note}, as returned by
    load_notes().  The notes found in `lines` override the stored ones for
    the first file of notes_filenames(), and the merged set is written
    back with save_notes().  Returns the lines without their notes.
    """
    notes_filename = notes_filenames()[0]
    notes = {}
    for word, by_file in notes_group.items():
        if notes_filename in by_file:
            notes[word] = by_file[notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        found = _ANNOTATED_LINE.match(line)
        if found:
            # Keep "<ws><count><ws><word>"; whatever follows is the note.
            result.append("".join(found.group(1, 2, 3, 4)) + "\n")
            notes[found.group(4)] = found.group(6)
        else:
            result.append(line + "\n")

    save_notes(notes_filename, notes)
    return result
def save_notes(filename, notes):
    """Merge `notes` (dict word => note) into the notes file `filename`.

    Lines of the file whose word is in `notes` are rewritten with the new
    note, all other lines are kept untouched, and words not yet in the
    file are appended.  Lines are written as "%-29s %s".
    """
    lines = []
    saved_words = set()
    with codecs.open(filename, "r", "utf-8") as f:
        for line in f.readlines():
            # Only the first token matters here, so a line without a note
            # no longer aborts the save with ValueError.
            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
            if word in notes:
                line = "%-29s %s\n" % (word, notes[word])
                saved_words.add(word)
            lines.append(line)
    for word in notes:
        if word not in saved_words:
            lines.append("%-29s %s\n" % (word, notes[word]))

    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)


def substract_dictionary(dict1, dict2):
    """Return a new dict with the items of dict1 whose key is not in dict2."""
    return dict((k, v) for (k, v) in dict1.items() if k not in dict2)


def dump_words(words, filename):
    """Write every word of `words` (dict word => count) to `filename`,
    one per line, repeated `count` times."""
    with codecs.open(filename, "w+", "utf-8") as f:
        for word in words:
            f.write(("%s\n" % word) * words[word])


def error_message(text):
    """Print `text` on standard output."""
    # The parenthesised single-argument form prints the same text whether
    # `print` is a statement or a function.
    print(text)


def find_wordgroups_weights(word_pairs, normalizator):
    """Return a dict normalized word => sum of the counts of all words of
    `word_pairs` (a sequence of (count, word)) that normalize to it."""
    weight = {}
    for (num, word) in word_pairs:
        normalized = normalizator.normalize(word)
        weight[normalized] = weight.get(normalized, 0) + num
    return weight


def find_linked_words(notes):
    """Return a dict word => main word, built from notes containing
    "@<main word>".

    `notes` is a dict word => {filename: note}.  The main word is the run
    of non-blank characters after the first "@" of a note; a bare "@" is
    ignored.
    """
    linked_words = {}
    for word, by_file in notes.items():
        for note in by_file.values():
            found = re.search(r'\@(\S*)', note)
            if found and found.group(1):
                linked_words[word] = found.group(1)
    return linked_words


def _word_pair_sort_key(pair, wgw, normalizator):
    """Return the grouping sort key of a (count, word) pair: weight of the
    word's group, then the normalized word, then the word's own count."""
    (num, word) = pair
    normalized = normalizator.normalize(word)
    return (wgw[normalized], normalized, int(num))


def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    """Compare two (count, word) pairs; return -1, 0 or 1.

    Pairs are ordered by the weight of their word group (`wgw`, as built
    by find_wordgroups_weights), then by normalized word, then by count.
    `linked_words` is accepted for compatibility and is not used.
    """
    key1 = _word_pair_sort_key(pair1, wgw, normalizator)
    key2 = _word_pair_sort_key(pair2, wgw, normalizator)
    # Same result as the cmp() builtin, without depending on it.
    return (key1 > key2) - (key1 < key2)


def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """Write the word list and/or its statistics to standard output as UTF-8.

    word_pairs  -- sequence of (count, word), already in output order.
    stats       -- dict with the keys used by the format strings below.
    print_stats -- write a "# ..." summary line before the words.
    stats_only  -- write only a two-line statistics table and return.

    After a word, a "# N" marker line is written whenever the running
    percentage of covered words has reached the lowest remaining threshold.
    """
    # One writer for the whole call.  Encoded output goes to the byte
    # stream: sys.stdout itself on Python 2, its .buffer where one exists.
    out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))

    if stats_only:
        header = " ".join([
            "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
            ]])
        out.write(header + "\n")
        # The four integer columns are deliberately one format string:
        # width 11 stands for a 10-wide column plus the separating blank.
        row_format = " ".join([
            "%(language)-10s",
            "%(percentage)-10.2f",
            "%(percentage_unknown)-10.2f",
            "%(total_known)-11d%(total)-11d%(wps)-11d%(uwps)-11d",
        ])
        out.write(row_format % stats + "\n")
        return

    if print_stats:
        out.write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Thresholds: multiples of 5 above the current percentage, then every
    # percent from 90 to 101.
    first_level = int(float(stats['percentage'])) // 5 * 5 + 5
    level_lines = list(range(first_level, 95, 5)) + list(range(90, 102))
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        out.write("%10s %s\n" % word_pair)
        known += word_pair[0]
        if not total:
            # No percentage can be computed; list the words without markers.
            continue
        percentage = 100.0 * known / total
        if percentage >= level_lines[0]:
            current_level = level_lines[0]
            # Drop every threshold that has been passed.
            while percentage > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            out.write("# %s\n" % current_level)


def _rewrite_file(filename, transform):
    """Replace the lines of `filename` with transform(lines, notes), where
    `notes` are the notes loaded from notes_filenames()."""
    lines = readlines_from_file(filename)
    notes = load_notes(notes_filenames())
    lines = transform(lines, notes)
    with codecs.open(filename, "w", "utf-8") as f:
        f.writelines(lines)


def filter_add_notes(args):
    """Add the stored notes to the word list in the file args[0], in place."""
    _rewrite_file(args[0], add_notes)


def filter_remove_notes(args):
    """Strip the notes from the word list in the file args[0], in place,
    saving them to the notes file."""
    _rewrite_file(args[0], remove_notes)


def _env_enabled(name):
    """Return True when the environment variable `name` is set to 'YES'."""
    return os.environ.get(name) == 'YES'


def filter_get_words_group_words_add_stat(args):
    """Read text from stdin and print its words with their counts.

    Behaviour is controlled by environment variables set to 'YES':
    GROUP_WORDS_BY_TWO / GROUP_WORDS_BY_THREE (also count word sequences),
    FILTER_WORDS (drop words found in the vocabulary), WORDS_GROUPING
    (sort words by word group), STAT_ONLY (print statistics only).
    `args` is not used.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()

    group_by = [1]
    if _env_enabled('GROUP_WORDS_BY_TWO'):
        group_by.append(2)
    if _env_enabled('GROUP_WORDS_BY_THREE'):
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = _env_enabled('STAT_ONLY')

    stats = {}
    stats['total'] = sum(words.values())
    if _env_enabled('FILTER_WORDS'):
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words.values())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    if stats['total']:
        stats['percentage'] = 100.0 * stats['total_known'] / stats['total']
        stats['percentage_unknown'] = 100.0 - stats['percentage']
    else:
        # Empty input: report zeros instead of dividing by zero.
        stats['percentage'] = 0.0
        stats['percentage_unknown'] = 0.0
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  # FIXME
    stats['wps'] = 0  # FIXME
    stats['uwps'] = 0  # FIXME
    stats['language'] = config['language']

    # Most frequent words first.
    words_with_freq = [
        (words[k], k)
        for k in sorted(words.keys(), key=lambda k: words[k], reverse=True)]

    if _env_enabled('WORDS_GROUPING'):
        linked_words = find_linked_words(notes)
        normalizator = Normalizator(config['language'], linked_words)
        wgw = find_wordgroups_weights(words_with_freq, normalizator)
        # Same ordering as sorting with compare_word_pairs, but each word
        # is normalized once instead of once per comparison.
        words_with_freq = sorted(
            words_with_freq,
            key=lambda pair: _word_pair_sort_key(pair, wgw, normalizator),
            reverse=True)

    print_words_sorted(words_with_freq, stats, stats_only=stats_only)


def main():
    """Parse the command line and run the filter selected with -f."""
    (options, args) = parser.parse_args()
    if options.language:
        config['language'] = options.language

    if options.function:
        function_names = {
            'add_notes': filter_add_notes,
            'remove_notes': filter_remove_notes,
            'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
        }
        if options.function in function_names:
            function_names[options.function](args)
        else:
            error_message("Unknown function %s.\nAvailable functions:\n%s" % (
                options.function,
                "".join([" " + x for x in sorted(function_names.keys())])))
            sys.exit(1)


if __name__ == "__main__":
    main()