new-words
annotate new-words.py @ 38:adbc809d3924
Transition to Python started
new-words-py.sh is a wrapper around the
new-words.py version, which is not finished yet.
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sat Jan 22 23:42:31 2011 +0100 (2011-01-22) |
parents | be6336e98b3c |
children | a598e0d25784 |
rev | line source |
---|---|
igor@37 | 1 #!/usr/bin/env python |
igor@38 | 2 # -*- coding: utf-8 -*- |
igor@37 | 3 |
igor@38 | 4 import codecs |
igor@38 | 5 import logging |
igor@38 | 6 import os |
igor@37 | 7 import optparse |
igor@38 | 8 import re |
igor@38 | 9 import subprocess |
igor@38 | 10 import sys |
igor@38 | 11 import Stemmer |
igor@38 | 12 |
igor@38 | 13 config = { |
igor@38 | 14 'config_directory': os.environ['HOME'] + '/.new-words', |
igor@38 | 15 'language': 'en', |
igor@38 | 16 } |
igor@38 | 17 |
igor@38 | 18 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG) |
igor@38 | 19 |
igor@38 | 20 class Normalizator: |
igor@38 | 21 def __init__(self, language, linked_words={}): |
igor@38 | 22 stemmer_algorithm = { |
igor@38 | 23 'de' : 'german', |
igor@38 | 24 'en' : 'english', |
igor@38 | 25 'ru' : 'russian', |
igor@38 | 26 'uk' : 'ukrainian', |
igor@38 | 27 } |
igor@38 | 28 self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language]) |
igor@38 | 29 self.linked_words = linked_words |
igor@38 | 30 |
igor@38 | 31 def normalize(self, word): |
igor@38 | 32 word_chain = [] |
igor@38 | 33 while word in self.linked_words and not word in word_chain: |
igor@38 | 34 word_chain.append(word) |
igor@38 | 35 word = self.linked_words[word] |
igor@38 | 36 return self.stemmer.stemWord(word.lower()) |
igor@37 | 37 |
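
A minimal usage sketch of the Normalizator class above (assuming the PyStemmer package that provides the `Stemmer` module is installed; the sample words and the `went -> go` link are made up for illustration):

```python
# Run after the definitions above; PyStemmer must be installed.
# "went" is manually linked to "go", so it normalizes into the "go" group;
# any other word is simply lower-cased and stemmed.
norm = Normalizator('en', linked_words={'went': 'go'})

print(norm.normalize(u'Walking'))  # walk - lower-cased, then stemmed
print(norm.normalize(u'went'))     # go   - the link chain is followed first, then stemmed
```
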
igor@37 | 38 parser = optparse.OptionParser() |
igor@37 | 39 |
igor@37 | 40 parser.add_option( |
igor@37 | 41 "-a", "--no-marks", |
igor@37 | 42 help="don't add marks (and don't save marks added by user)", |
igor@37 | 43 action="store_true", |
igor@37 | 44 dest="no_marks") |
igor@37 | 45 |
igor@37 | 46 parser.add_option( |
igor@37 | 47 "-c", "--compressed", |
igor@37 | 48 help="show compressed wordlist: one word per group", |
igor@37 | 49 action="store_true", |
igor@37 | 50 dest="compressed") |
igor@37 | 51 |
igor@37 | 52 parser.add_option( |
igor@37 | 53 "-k", "--known-words", |
igor@37 | 54 help="rank words that are similar to known words higher (English only)", |
igor@37 | 55 action="store_true", |
igor@37 | 56 dest="known_words") |
igor@37 | 57 |
igor@37 | 58 parser.add_option( |
igor@37 | 59 "-l", "--language", |
igor@37 | 60 help="specify language of text", |
igor@37 | 61 action="store", |
igor@37 | 62 dest="language") |
igor@37 | 63 |
igor@37 | 64 parser.add_option( |
igor@38 | 65 "-f", "--function", |
igor@38 | 66 help="filter through subsystem [INTERNAL]", |
igor@38 | 67 action="store", |
igor@38 | 68 dest="function") |
igor@38 | 69 |
igor@38 | 70 parser.add_option( |
igor@37 | 71 "-m", "--merge-tag", |
igor@37 | 72 help="merge words tagged with specified tag into the main vocabulary", |
igor@37 | 73 action="store", |
igor@37 | 74 dest="merge_tag") |
igor@37 | 75 |
igor@37 | 76 parser.add_option( |
igor@37 | 77 "-M", "--merge-tagged", |
igor@37 | 78 help="merge words tagged with ANY tag into the main vocabulary", |
igor@37 | 79 action="store_true", |
igor@37 | 80 dest="merge_tagged") |
igor@37 | 81 |
igor@37 | 82 parser.add_option( |
igor@37 | 83 "-n", "--non-interactive", |
igor@37 | 84 help="non-interactive mode (don't run vi)", |
igor@37 | 85 action="store_true", |
igor@37 | 86 dest="non_interactive") |
igor@37 | 87 |
igor@37 | 88 parser.add_option( |
igor@37 | 89 "-N", "--no-filter", |
igor@37 | 90 help="switch off known words filtering", |
igor@37 | 91 action="store_true", |
igor@37 | 92 dest="no_filter") |
igor@37 | 93 |
igor@37 | 94 parser.add_option( |
igor@37 | 95 "-p", "--pages", |
igor@37 | 96 help="work with specified pages only (pages = start-stop/total)", |
igor@37 | 97 action="store", |
igor@37 | 98 dest="pages") |
igor@37 | 99 |
igor@37 | 100 parser.add_option( |
igor@37 | 101 "-r", "--remove-tag", |
igor@37 | 102 help="remove subvocabulary of specified tag", |
igor@37 | 103 action="store", |
igor@37 | 104 dest="remove_tag") |
igor@37 | 105 |
igor@37 | 106 parser.add_option( |
igor@37 | 107 "-s", "--text-stats", |
igor@37 | 108 help="show the text statistics (percentage of known words and so on) and exit", |
igor@37 | 109 action="store_true", |
igor@37 | 110 dest="text_stats") |
igor@37 | 111 |
igor@37 | 112 parser.add_option( |
igor@37 | 113 "-S", "--voc-stats", |
igor@37 | 114 help="show your vocabulary statistics (number of words and word groups)", |
igor@37 | 115 action="store_true", |
igor@37 | 116 dest="voc_stats") |
igor@37 | 117 |
igor@37 | 118 parser.add_option( |
igor@37 | 119 "-t", "--tag", |
igor@37 | 120 help="tag known words with tag", |
igor@37 | 121 action="store", |
igor@37 | 122 dest="tag") |
igor@37 | 123 |
igor@37 | 124 parser.add_option( |
igor@37 | 125 "-T", "--show-tags", |
igor@37 | 126 help="show all tags", |
igor@37 | 127 action="store_true", |
igor@37 | 128 dest="show_tags") |
igor@37 | 129 |
igor@37 | 130 parser.add_option( |
igor@37 | 131 "-2", "--two-words", |
igor@37 | 132 help="find two-word sequences", |
igor@37 | 133 action="store_true", |
igor@37 | 134 dest="two_words") |
igor@37 | 135 |
igor@37 | 136 parser.add_option( |
igor@37 | 137 "-3", "--three-words", |
igor@37 | 138 help="find three-word sequences", |
igor@37 | 139 action="store_true", |
igor@37 | 140 dest="three_words") |
igor@37 | 141 |
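
At this revision only a few of the options above are actually consumed by the code that follows (notably -l and -f); the rest presumably mirror the flags of the original shell version and are still to be wired up. A quick illustration of how the parser maps an argument vector (the dump path is made up):

```python
# Illustration only: parse a hand-made argument vector with the parser above
# instead of sys.argv (the real parse_args() call happens at the bottom of the file).
(opts, rest) = parser.parse_args(['-l', 'de', '-f', 'get_words', '/tmp/words.dump'])

print(opts.language)   # de
print(opts.function)   # get_words
print(rest)            # ['/tmp/words.dump'] - positional args handed to the filter
```
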
igor@38 | 142 def readlines_from_file(filename): |
igor@38 | 143 res = [] |
igor@38 | 144 with codecs.open(filename, "r", "utf-8") as f: |
igor@38 | 145 for line in f.readlines(): |
igor@38 | 146 res += [line] |
igor@38 | 147 return res |
igor@38 | 148 |
igor@38 | 149 def readlines_from_stdin(): |
igor@38 | 150 return codecs.getreader("utf-8")(sys.stdin).readlines() |
igor@38 | 151 |
igor@38 | 152 def words_from_line(line): |
igor@38 | 153 line = line.rstrip('\n') |
igor@38 | 154 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line) |
igor@38 | 155 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line) |
igor@38 | 156 return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line) |
igor@38 | 157 |
igor@38 | 158 def get_words(lines): |
igor@38 | 159 """ |
igor@38 | 160 Return a dict mapping each word found in the given lines |
igor@38 | 161 to its frequency: word => count |
igor@38 | 162 """ |
igor@38 | 163 result = {} |
igor@38 | 164 for line in lines: |
igor@38 | 165 words = words_from_line(line) |
igor@38 | 166 for word in words: |
igor@38 | 167 result.setdefault(word, 0) |
igor@38 | 168 result[word] += 1 |
igor@38 | 169 return result |
igor@38 | 170 |
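
A small example of what get_words produces for a couple of made-up lines (run after the definitions above):

```python
sample = [u"the cat sat on the mat\n", u"the dog\n"]

# words_from_line() splits on runs of non-word characters (written so that
# in-word apostrophes, as in "don't", are not split on); get_words() counts them.
print(get_words(sample))
# {u'the': 3, u'cat': 1, u'sat': 1, u'on': 1, u'mat': 1, u'dog': 1}  (key order may vary)
```
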
igor@38 | 171 def load_vocabulary(): |
igor@38 | 172 return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language']))) |
igor@38 | 173 |
igor@38 | 174 def notes_filenames(): |
igor@38 | 175 return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])] |
igor@38 | 176 |
igor@38 | 177 def load_notes(files): |
igor@38 | 178 notes = {} |
igor@38 | 179 for filename in files: |
igor@38 | 180 with open(filename) as f: |
igor@38 | 181 for line in f.readlines(): |
igor@38 | 182 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1) |
igor@38 | 183 notes.setdefault(word, {}) |
igor@38 | 184 notes[word][filename] = note |
igor@38 | 185 return notes |
igor@38 | 186 |
igor@38 | 187 def print_words_sorted(words_freq): |
igor@38 | 188 for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True): |
igor@38 | 189 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k)) |
igor@38 | 190 |
igor@38 | 191 def substract_dictionary(dict1, dict2): |
igor@38 | 192 """ |
igor@38 | 193 returns dict1 - dict2 |
igor@38 | 194 """ |
igor@38 | 195 result = {} |
igor@38 | 196 for (k,v) in dict1.items(): |
igor@38 | 197 if not k in dict2: |
igor@38 | 198 result[k] = v |
igor@38 | 199 return result |
igor@38 | 200 |
igor@38 | 201 def dump_words(words, filename): |
igor@38 | 202 with codecs.open(filename, "w+", "utf-8") as f: |
igor@38 | 203 for word in words.keys(): |
igor@38 | 204 f.write(("%s\n"%word)*words[word]) |
igor@38 | 205 |
igor@38 | 206 def error_message(text): |
igor@38 | 207 print text |
igor@38 | 208 |
igor@38 | 209 def find_wordgroups_weights(lines, normalizator): |
igor@38 | 210 weight = {} |
igor@38 | 211 for line in lines: |
igor@38 | 212 line = re.sub('^\s*', '', line.rstrip('\n')) |
igor@38 | 213 (num, word) = re.split('\s+', line, maxsplit=1) |
igor@38 | 214 normalized = normalizator.normalize(word) |
igor@38 | 215 weight.setdefault(normalized, 0) |
igor@38 | 216 weight[normalized] += int(num) |
igor@38 | 217 return weight |
igor@38 | 218 |
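
find_wordgroups_weights expects lines in the same "count word" shape that print_words_sorted emits, and sums the counts of all words that normalize to the same group. A made-up example (run after the definitions above, with PyStemmer installed):

```python
norm = Normalizator('en')
freq_lines = [u"       3 walk\n", u"       2 walking\n", u"       1 dog\n"]

# "walk" and "walking" share the stem "walk", so their counts are added up.
print(find_wordgroups_weights(freq_lines, norm))
# walk: 5 (3 + 2), dog: 1
```
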
igor@38 | 219 def find_linked_words(notes): |
igor@38 | 220 linked_words = {} |
igor@38 | 221 for word in notes.keys(): |
igor@38 | 222 for note in notes[word].values(): |
igor@38 | 223 if "@" in note: |
igor@38 | 224 logging.debug("%s %s" % (word, note)) |
igor@38 | 225 result = re.search(r'\@(\S*)', note) |
igor@38 | 226 if result: |
igor@38 | 227 main_word = result.group(1) |
igor@38 | 228 logging.debug("%s %s" % (word, main_word)) |
igor@38 | 229 if main_word: |
igor@38 | 230 linked_words[word] = main_word |
igor@38 | 231 return linked_words |
igor@38 | 232 |
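
The notes files read by load_notes above are assumed to hold one entry per line: a word, whitespace, then a free-form note; a note of the form "@other-word" is what find_linked_words turns into a manual link between word groups. A hypothetical round trip (the file contents are made up):

```python
import tempfile

# A tiny notes file in the format load_notes() parses: word, whitespace, note.
# The "@go" note links "went" into the word group of "go".
tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
tmp.write("went @go\n")
tmp.write("mat  a small rug\n")
tmp.close()

notes = load_notes([tmp.name])
print(notes['mat'][tmp.name])     # a small rug
print(find_linked_words(notes))   # {'went': 'go'}
```
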
igor@38 | 233 |
igor@38 | 234 def compare_word_lines(line1, line2, wgw, normalizator, linked_words): |
igor@38 | 235 line1 = re.sub('^\s*', '', line1.rstrip('\n')) |
igor@38 | 236 (num1, word1) = re.split('\s+', line1, 1) |
igor@38 | 237 line2 = re.sub('^\s*', '', line2.rstrip('\n')) |
igor@38 | 238 (num2, word2) = re.split('\s+', line2, 1) |
igor@38 | 239 |
igor@38 | 240 normalized_word1 = normalizator.normalize(word1) |
igor@38 | 241 normalized_word2 = normalizator.normalize(word2) |
igor@38 | 242 |
igor@38 | 243 cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2]) |
igor@38 | 244 if cmp_res != 0: |
igor@38 | 245 return cmp_res |
igor@38 | 246 else: |
igor@38 | 247 cmp_res = cmp(normalized_word1, normalized_word2) |
igor@38 | 248 if cmp_res != 0: |
igor@38 | 249 return cmp_res |
igor@38 | 250 else: |
igor@38 | 251 return cmp(int(num1), int(num2)) |
igor@38 | 252 |
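
compare_word_lines is a Python 2 cmp-style comparator over "count word" lines: it compares group weight first, then the normalized word, then the individual count, so that sorted(..., reverse=True) in filter_group_words below keeps whole word groups together, heaviest group first. A small sketch (Python 2, run after the definitions above):

```python
norm = Normalizator('en')
freq_lines = [u"       2 walking\n", u"       3 walk\n", u"       4 dog\n"]
wgw = find_wordgroups_weights(freq_lines, norm)   # walk group: 5, dog: 4

ordered = sorted(freq_lines,
                 cmp=lambda x, y: compare_word_lines(x, y, wgw, norm, {}),
                 reverse=True)
for line in ordered:
    print(line.rstrip('\n'))
#        3 walk      <- the "walk" group (total weight 5) outranks "dog" (4),
#        2 walking      and within the group the higher individual count comes first
#        4 dog
```
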
igor@38 | 253 def filter_get_words(args): |
igor@38 | 254 vocabulary = load_vocabulary() |
igor@38 | 255 words = get_words(readlines_from_stdin()) |
igor@38 | 256 dump_words(words, args[0]) |
igor@38 | 257 words = substract_dictionary(words, vocabulary) |
igor@38 | 258 print_words_sorted(words) |
igor@38 | 259 |
igor@38 | 260 def filter_group_words(args): |
igor@38 | 261 lines = readlines_from_stdin() |
igor@38 | 262 notes = load_notes(notes_filenames()) |
igor@38 | 263 linked_words = find_linked_words(notes) |
igor@38 | 264 logging.debug(linked_words) |
igor@38 | 265 normalizator = Normalizator(config['language'], linked_words) |
igor@38 | 266 |
igor@38 | 267 wgw = find_wordgroups_weights(lines, normalizator) |
igor@38 | 268 for line in sorted( |
igor@38 | 269 lines, |
igor@38 | 270 cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words), |
igor@38 | 271 reverse=True): |
igor@38 | 272 codecs.getwriter("utf-8")(sys.stdout).write(line) |
igor@38 | 273 |
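
The two filter functions above are the building blocks behind the -f/--function option: get_words turns raw text on stdin into a frequency list of words not yet in the vocabulary (dumping the raw words to the file given as a positional argument), and group_words re-sorts such a list so that related word forms stay together. Presumably the new-words-py.sh wrapper pipes one into the other; a hypothetical sketch of that chaining with subprocess (the script name, input file, and dump path are assumptions, and the vocabulary/notes files under ~/.new-words must exist):

```python
import subprocess

# Hypothetical: chain the two internal filter modes the way a wrapper script might.
# Assumes "new-words.py" is executable on PATH and "text.txt" exists.
with open("text.txt") as text:
    get = subprocess.Popen(
        ["new-words.py", "-l", "en", "-f", "get_words", "/tmp/words.dump"],
        stdin=text, stdout=subprocess.PIPE)
    group = subprocess.Popen(
        ["new-words.py", "-l", "en", "-f", "group_words"],
        stdin=get.stdout, stdout=subprocess.PIPE)
    get.stdout.close()                 # so the first filter sees a broken pipe if the second exits
    grouped_list = group.communicate()[0]
```
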
igor@37 | 274 (options, args) = parser.parse_args() |
igor@38 | 275 if options.language: |
igor@38 | 276 config['language'] = options.language |
igor@37 | 277 |
igor@38 | 278 if options.function: |
igor@38 | 279 function_names = { |
igor@38 | 280 'get_words' : filter_get_words, |
igor@38 | 281 'group_words' : filter_group_words, |
igor@38 | 282 } |
igor@38 | 283 if options.function in function_names: |
igor@38 | 284 function_names[options.function](args) |
igor@38 | 285 else: |
igor@38 | 286 error_message("Unknown function %s.\nAvailable functions:\n%s" % ( |
igor@38 | 287 options.function, "".join([" "+x for x in sorted(function_names.keys())]))) |
igor@38 | 288 sys.exit(1) |
igor@37 | 289 |
igor@37 | 290 |
igor@37 | 291 |
igor@37 | 292 |
igor@38 | 293 #os.system("vim") |
igor@37 | 294 |