new-words
diff new-words.py @ 38:adbc809d3924
Transition to Python started
new-words-py.sh is a wrapper around the new-words.py version, which is not finished yet.
| author   | Igor Chubin <igor@chub.in>                  |
|----------|---------------------------------------------|
| date     | Sat Jan 22 23:42:31 2011 +0100 (2011-01-22) |
| parents  | be6336e98b3c                                |
| children | a598e0d25784                                |
line diff
--- a/new-words.py	Fri Jan 21 15:59:45 2011 +0200
+++ b/new-words.py	Sat Jan 22 23:42:31 2011 +0100
@@ -1,6 +1,39 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
+import codecs
+import logging
+import os
 import optparse
+import re
+import subprocess
+import sys
+import Stemmer
+
+config = {
+    'config_directory': os.environ['HOME'] + '/.new-words',
+    'language': 'en',
+}
+
+logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
+
+class Normalizator:
+    def __init__(self, language, linked_words={}):
+        stemmer_algorithm = {
+            'de' : 'german',
+            'en' : 'english',
+            'ru' : 'russian',
+            'uk' : 'ukrainian',
+        }
+        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
+        self.linked_words = linked_words
+
+    def normalize(self, word):
+        word_chain = []
+        while word in self.linked_words and not word in word_chain:
+            word_chain.append(word)
+            word = self.linked_words[word]
+        return self.stemmer.stemWord(word.lower())
 
 parser = optparse.OptionParser()
 
@@ -29,6 +62,12 @@
     dest="language")
 
 parser.add_option(
+    "-f", "--function",
+    help="filter through subsystem [INTERNAL]",
+    action="store",
+    dest="function")
+
+parser.add_option(
     "-m", "--merge-tag",
     help="merge words tagged with specified tag into the main vocabulary",
     action="store",
@@ -100,31 +139,156 @@
     action="store_true",
     dest="three_words")
 
+def readlines_from_file(filename):
+    res = []
+    with codecs.open(filename, "r", "utf-8") as f:
+        for line in f.readlines():
+            res += [line]
+    return res
+
+def readlines_from_stdin():
+    return codecs.getreader("utf-8")(sys.stdin).readlines()
+
+def words_from_line(line):
+    line = line.rstrip('\n')
+    #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
+    #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
+    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
+
+def get_words(lines):
+    """
+    Returns hash of words in a file
+    word => number
+    """
+    result = {}
+    for line in lines:
+        words = words_from_line(line)
+        for word in words:
+            result.setdefault(word, 0)
+            result[word] += 1
+    return result
+
+def load_vocabulary():
+    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
+
+def notes_filenames():
+    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
+
+def load_notes(files):
+    notes = {}
+    for filename in files:
+        with open(filename) as f:
+            for line in f.readlines():
+                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
+                notes.setdefault(word, {})
+                notes[word][filename] = note
+    return notes
+
+def print_words_sorted(words_freq):
+    for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
+
+def substract_dictionary(dict1, dict2):
+    """
+    returns dict1 - dict2
+    """
+    result = {}
+    for (k,v) in dict1.items():
+        if not k in dict2:
+            result[k] = v
+    return result
+
+def dump_words(words, filename):
+    with codecs.open(filename, "w+", "utf-8") as f:
+        for word in words.keys():
+            f.write(("%s\n"%word)*words[word])
+
+def error_message(text):
+    print text
+
+def find_wordgroups_weights(lines, normalizator):
+    weight = {}
+    for line in lines:
+        line = re.sub('^\s*', '', line.rstrip('\n'))
+        (num, word) = re.split('\s+', line, maxsplit=1)
+        normalized = normalizator.normalize(word)
+        weight.setdefault(normalized, 0)
+        weight[normalized] += int(num)
+    return weight
+
+def find_linked_words(notes):
+    linked_words = {}
+    for word in notes.keys():
+        for note in notes[word].values():
+            if "@" in note:
+                logging.debug("%s %s" % (word, note))
+                result = re.search(r'\@(\S*)', note)
+                if result:
+                    main_word = result.group(1)
+                    logging.debug("%s %s" % (word, main_word))
+                    if main_word:
+                        linked_words[word] = main_word
+    return linked_words
+
+
+def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
+    line1 = re.sub('^\s*', '', line1.rstrip('\n'))
+    (num1, word1) = re.split('\s+', line1, 1)
+    line2 = re.sub('^\s*', '', line2.rstrip('\n'))
+    (num2, word2) = re.split('\s+', line2, 1)
+
+    normalized_word1 = normalizator.normalize(word1)
+    normalized_word2 = normalizator.normalize(word2)
+
+    cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
+    if cmp_res != 0:
+        return cmp_res
+    else:
+        cmp_res = cmp(normalized_word1, normalized_word2)
+        if cmp_res != 0:
+            return cmp_res
+        else:
+            return cmp(int(num1), int(num2))
+
+def filter_get_words(args):
+    vocabulary = load_vocabulary()
+    words = get_words(readlines_from_stdin())
+    dump_words(words, args[0])
+    words = substract_dictionary(words, vocabulary)
+    print_words_sorted(words)
+
+def filter_group_words(args):
+    lines = readlines_from_stdin()
+    notes = load_notes(notes_filenames())
+    linked_words = find_linked_words(notes)
+    logging.debug(linked_words)
+    normalizator = Normalizator(config['language'], linked_words)
+
+    wgw = find_wordgroups_weights(lines, normalizator)
+    for line in sorted(
+            lines,
+            cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
+            reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write(line)
+
 (options, args) = parser.parse_args()
+if options.language:
+    config['language'] = options.language
 
-def get_words():
-    pass
+if options.function:
+    function_names = {
+        'get_words' : filter_get_words,
+        'group_words' : filter_group_words,
+    }
+    if options.function in function_names:
+        function_names[options.function](args)
+    else:
+        error_message("Unkown function %s.\nAvailable functions:\n%s" % (
+            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
+        sys.exit(1)
 
-def add_stat():
-    pass
 
-def two_and_three_words():
-    pass
 
-def grep_v_english():
-    pass
 
-def group_words():
-    pass
+#os.system("vim")
 
-def add_marks():
-    pass
-
-def remove_marks():
-    pass
-
-def text_from_url():
-    pass
-
-def part():
-    pass
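The core of the new grouping code is Normalizator.normalize(): it first follows any manual word links (built from "@" marks in the notes file, see find_linked_words()) and then stems the result, so inflected forms and hand-linked irregular forms fall into one group. Below is a minimal sketch of that logic, assuming the Stemmer module imported above comes from the PyStemmer package; the linked_words mapping is made up for illustration:

```python
# Sketch of Normalizator.normalize(), assuming PyStemmer provides Stemmer.
import Stemmer

linked_words = {'better': 'good', 'best': 'good'}  # hypothetical "@good" links
stemmer = Stemmer.Stemmer('english')

def normalize(word):
    seen = []
    # Follow manual links to the main word first; 'seen' guards against
    # accidental cycles (a -> b -> a) in the notes file.
    while word in linked_words and word not in seen:
        seen.append(word)
        word = linked_words[word]
    # Then reduce the main word to its stem.
    return stemmer.stemWord(word.lower())

print(normalize('best'))   # -> 'good'
print(normalize('books'))  # -> 'book'
```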
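Elsewhere in the hunk, words_from_line() settles on a Unicode-aware splitter after two commented-out attempts: it splits on runs of non-word characters, while the (?!') lookahead refuses to start a match at an apostrophe, keeping contractions whole. A quick behaviour check (the sample sentence is mine, not from the repository):

```python
# Behaviour of the words_from_line() splitter: runs of non-word characters
# separate tokens, the (?!') lookahead keeps apostrophes inside words, and
# re.UNICODE treats accented letters as word characters.
import re

splitter = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)
print(splitter.split(u"Don't panic, it's fine (really)."))
# [u"Don't", u'panic', u"it's", u'fine', u'really', u'']  (Python 2 output)
# Note the trailing empty string: get_words() counts it like any other
# token unless empty tokens are filtered out.
```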
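find_linked_words() is what turns "@" marks into the mapping the normalizer consumes: load_notes() parses each notes line as `<word> <note>`, and a note containing `@mainword` links the word to its main form. A self-contained illustration, with made-up notes data in place of a real notes-en.txt:

```python
# How "@" marks in notes become the linked_words mapping. The notes dict
# below mimics load_notes() output ({word: {filename: note}}) with
# hypothetical entries.
import re

notes = {
    'better': {'notes-en.txt': 'comparative @good'},
    'worse':  {'notes-en.txt': 'comparative @bad'},
    'table':  {'notes-en.txt': 'a piece of furniture'},
}

linked_words = {}
for word, per_file in notes.items():
    for note in per_file.values():
        match = re.search(r'@(\S*)', note)
        if match and match.group(1):
            linked_words[word] = match.group(1)

print(linked_words)  # {'better': 'good', 'worse': 'bad'}
```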
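One portability caveat: compare_word_lines() and the sorted(..., cmp=...) call in filter_group_words() depend on Python 2's cmp machinery, which Python 3 removed. Because the comparison is simply group weight, then normalized word, then count, a future port could swap the comparator for a tuple key. The sketch below is an assumption about such a port, not code from this changeset:

```python
# Tuple-key equivalent of compare_word_lines() for Python 3, where sorted()
# no longer accepts cmp=. Assumes the same "  <count> <word>" line format
# the script reads on stdin.
import re

def sort_key(line, wgw, normalize):
    num, word = re.split(r'\s+', line.strip(), maxsplit=1)
    norm = normalize(word)
    # Three comparison levels: group weight, normalized word, raw count.
    return (wgw[norm], norm, int(num))

# usage:
#   for line in sorted(lines,
#                      key=lambda l: sort_key(l, wgw, normalizator.normalize),
#                      reverse=True):
#       sys.stdout.write(line)
```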