new-words

annotate new-words.py @ 38:adbc809d3924

Transition to Python started

new-words-py.sh is a wrapper around the new-words.py version, which is not finished yet.
author Igor Chubin <igor@chub.in>
date Sat Jan 22 23:42:31 2011 +0100 (2011-01-22)
parents be6336e98b3c
children a598e0d25784
rev   line source
igor@37 1 #!/usr/bin/env python
igor@38 2 # -*- coding: utf-8 -*-
igor@37 3
igor@38 4 import codecs
igor@38 5 import logging
igor@38 6 import os
igor@37 7 import optparse
igor@38 8 import re
igor@38 9 import subprocess
igor@38 10 import sys
igor@38 11 import Stemmer  # PyStemmer's Snowball stemmer bindings
igor@38 12
igor@38 13 config = {
igor@38 14 'config_directory': os.environ['HOME'] + '/.new-words',
igor@38 15 'language': 'en',
igor@38 16 }
igor@38 17
igor@38 18 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
igor@38 19
igor@38 20 class Normalizator:
igor@38 21 def __init__(self, language, linked_words={}):
igor@38 22 stemmer_algorithm = {
igor@38 23 'de' : 'german',
igor@38 24 'en' : 'english',
igor@38 25 'ru' : 'russian',
igor@38 26 'uk' : 'ukrainian',
igor@38 27 }
igor@38 28 self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
igor@38 29 self.linked_words = linked_words
igor@38 30
igor@38 31 def normalize(self, word):
igor@38 32 word_chain = []
igor@38 33 while word in self.linked_words and not word in word_chain:
igor@38 34 word_chain.append(word)
igor@38 35 word = self.linked_words[word]
igor@38 36 return self.stemmer.stemWord(word.lower())
igor@37 37
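
As an illustration (the linked_words mapping below is invented), the class above first follows the chain of linked words and then stems the result with the imported Stemmer module:

    norm = Normalizator('en', linked_words={'went': 'go'})
    print(norm.normalize('went'))     # follows went -> go, then stems: 'go'
    print(norm.normalize('Running'))  # no link, lowercased and stemmed: 'run'
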
igor@37 38 parser = optparse.OptionParser()
igor@37 39
igor@37 40 parser.add_option(
igor@37 41 "-a", "--no-marks",
igor@37 42 help="don't add marks (and don't save marks added by user)",
igor@37 43 action="store_true",
igor@37 44 dest="no_marks")
igor@37 45
igor@37 46 parser.add_option(
igor@37 47 "-c", "--compressed",
igor@37 48 help="show compressed wordlist: one word per group",
igor@37 49 action="store_true",
igor@37 50 dest="compressed")
igor@37 51
igor@37 52 parser.add_option(
igor@37 53 "-k", "--known-words",
igor@37 54 help="rank words similar to already known words higher (English only)",
igor@37 55 action="store_true",
igor@37 56 dest="known_words")
igor@37 57
igor@37 58 parser.add_option(
igor@37 59 "-l", "--language",
igor@37 60 help="specify language of text",
igor@37 61 action="store",
igor@37 62 dest="language")
igor@37 63
igor@37 64 parser.add_option(
igor@38 65 "-f", "--function",
igor@38 66 help="filter through subsystem [INTERNAL]",
igor@38 67 action="store",
igor@38 68 dest="function")
igor@38 69
igor@38 70 parser.add_option(
igor@37 71 "-m", "--merge-tag",
igor@37 72 help="merge words tagged with specified tag into the main vocabulary",
igor@37 73 action="store",
igor@37 74 dest="merge_tag")
igor@37 75
igor@37 76 parser.add_option(
igor@37 77 "-M", "--merge-tagged",
igor@37 78 help="merge words tagged with ANY tag into the main vocabulary",
igor@37 79 action="store_true",
igor@37 80 dest="merge_tagged")
igor@37 81
igor@37 82 parser.add_option(
igor@37 83 "-n", "--non-interactive",
igor@37 84 help="non-interactive mode (don't run vi)",
igor@37 85 action="store_true",
igor@37 86 dest="non_interactive")
igor@37 87
igor@37 88 parser.add_option(
igor@37 89 "-N", "--no-filter",
igor@37 90 help="switch off known words filtering",
igor@37 91 action="store_true",
igor@37 92 dest="no_filter")
igor@37 93
igor@37 94 parser.add_option(
igor@37 95 "-p", "--pages",
igor@37 96 help="work with specified pages only (pages = start-stop/total)",
igor@37 97 action="store",
igor@37 98 dest="pages")
igor@37 99
igor@37 100 parser.add_option(
igor@37 101 "-r", "--remove-tag",
igor@37 102 help="remove subvocabulary of specified tag",
igor@37 103 action="store",
igor@37 104 dest="remove_tag")
igor@37 105
igor@37 106 parser.add_option(
igor@37 107 "-s", "--text-stats",
igor@37 108 help="show the text statistics (percentage of known words and so on) and exit",
igor@37 109 action="store_true",
igor@37 110 dest="text_stats")
igor@37 111
igor@37 112 parser.add_option(
igor@37 113 "-S", "--voc-stats",
igor@37 114 help="show your vocabulary statistics (number of words and word groups)",
igor@37 115 action="store_true",
igor@37 116 dest="voc_stats")
igor@37 117
igor@37 118 parser.add_option(
igor@37 119 "-t", "--tag",
igor@37 120 help="tag known words with tag",
igor@37 121 action="store",
igor@37 122 dest="tag")
igor@37 123
igor@37 124 parser.add_option(
igor@37 125 "-T", "--show-tags",
igor@37 126 help="show all tags",
igor@37 127 action="store_true",
igor@37 128 dest="show_tags")
igor@37 129
igor@37 130 parser.add_option(
igor@37 131 "-2", "--two-words",
igor@37 132 help="find two-word sequences",
igor@37 133 action="store_true",
igor@37 134 dest="two_words")
igor@37 135
igor@37 136 parser.add_option(
igor@37 137 "-3", "--three-words",
igor@37 138 help="find three-word sequences",
igor@37 139 action="store_true",
igor@37 140 dest="three_words")
igor@37 141
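
For illustration (the command line below is invented), optparse can also be fed an explicit argument list, which shows what the options defined above produce:

    opts, rest = parser.parse_args(['-l', 'de', '-n', 'book.txt'])
    # opts.language == 'de', opts.non_interactive == True, rest == ['book.txt']
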
igor@38 142 def readlines_from_file(filename):
igor@38 143 res = []
igor@38 144 with codecs.open(filename, "r", "utf-8") as f:
igor@38 145 for line in f.readlines():
igor@38 146 res += [line]
igor@38 147 return res
igor@38 148
igor@38 149 def readlines_from_stdin():
igor@38 150 return codecs.getreader("utf-8")(sys.stdin).readlines()
igor@38 151
igor@38 152 def words_from_line(line):
igor@38 153 line = line.rstrip('\n')
igor@38 154 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
igor@38 155 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
igor@38 156 return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
igor@38 157
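
For example (the input line is invented), the splitter above breaks on runs of non-word characters while keeping intra-word apostrophes:

    words_from_line(u"Hello, world! don't stop")
    # -> [u'Hello', u'world', u"don't", u'stop']
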
igor@38 158 def get_words(lines):
igor@38 159 """
igor@38 160 Return a dict mapping each word in the given lines
igor@38 161 to the number of times it occurs.
igor@38 162 """
igor@38 163 result = {}
igor@38 164 for line in lines:
igor@38 165 words = words_from_line(line)
igor@38 166 for word in words:
igor@38 167 result.setdefault(word, 0)
igor@38 168 result[word] += 1
igor@38 169 return result
igor@38 170
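
With another invented input, get_words() simply accumulates a per-word count over all lines:

    get_words([u'to be or not to be\n'])
    # -> {u'to': 2, u'be': 2, u'or': 1, u'not': 1}
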
igor@38 171 def load_vocabulary():
igor@38 172 return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
igor@38 173
igor@38 174 def notes_filenames():
igor@38 175 return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
igor@38 176
igor@38 177 def load_notes(files):
igor@38 178 notes = {}
igor@38 179 for filename in files:
igor@38 180 with open(filename) as f:
igor@38 181 for line in f.readlines():
igor@38 182 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
igor@38 183 notes.setdefault(word, {})
igor@38 184 notes[word][filename] = note
igor@38 185 return notes
igor@38 186
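
The notes file format is not documented in this revision; judging from the parser above, each line holds a word, some whitespace, and a free-form note. An invented notes-en.txt entry and the structure it produces:

    # a made-up line in ~/.new-words/notes-en.txt:
    #   went    @go  irregular past tense
    # load_notes() keeps the whole remainder of the line as the note:
    #   {'went': {'/home/user/.new-words/notes-en.txt': '@go  irregular past tense'}}
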
igor@38 187 def print_words_sorted(words_freq):
igor@38 188 for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
igor@38 189 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
igor@38 190
igor@38 191 def substract_dictionary(dict1, dict2):
igor@38 192 """
igor@38 193 returns dict1 - dict2
igor@38 194 """
igor@38 195 result = {}
igor@38 196 for (k,v) in dict1.items():
igor@38 197 if not k in dict2:
igor@38 198 result[k] = v
igor@38 199 return result
igor@38 200
igor@38 201 def dump_words(words, filename):
igor@38 202 with codecs.open(filename, "w+", "utf-8") as f:
igor@38 203 for word in words.keys():
igor@38 204 f.write(("%s\n"%word)*words[word])
igor@38 205
igor@38 206 def error_message(text):
igor@38 207 print text
igor@38 208
igor@38 209 def find_wordgroups_weights(lines, normalizator):
igor@38 210 weight = {}
igor@38 211 for line in lines:
igor@38 212 line = re.sub('^\s*', '', line.rstrip('\n'))
igor@38 213 (num, word) = re.split('\s+', line, maxsplit=1)
igor@38 214 normalized = normalizator.normalize(word)
igor@38 215 weight.setdefault(normalized, 0)
igor@38 216 weight[normalized] += int(num)
igor@38 217 return weight
igor@38 218
igor@38 219 def find_linked_words(notes):
igor@38 220 linked_words = {}
igor@38 221 for word in notes.keys():
igor@38 222 for note in notes[word].values():
igor@38 223 if "@" in note:
igor@38 224 logging.debug("%s %s" % (word, note))
igor@38 225 result = re.search(r'\@(\S*)', note)
igor@38 226 if result:
igor@38 227 main_word = result.group(1)
igor@38 228 logging.debug("%s %s" % (word, main_word))
igor@38 229 if main_word:
igor@38 230 linked_words[word] = main_word
igor@38 231 return linked_words
igor@38 232
igor@38 233
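
The '@' reference in such a note is what find_linked_words() turns into a word link; continuing the invented example:

    notes = {'went': {'notes-en.txt': '@go  irregular past tense'}}
    find_linked_words(notes)
    # -> {'went': 'go'}
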
igor@38 234 def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
igor@38 235 line1 = re.sub('^\s*', '', line1.rstrip('\n'))
igor@38 236 (num1, word1) = re.split('\s+', line1, 1)
igor@38 237 line2 = re.sub('^\s*', '', line2.rstrip('\n'))
igor@38 238 (num2, word2) = re.split('\s+', line2, 1)
igor@38 239
igor@38 240 normalized_word1 = normalizator.normalize(word1)
igor@38 241 normalized_word2 = normalizator.normalize(word2)
igor@38 242
igor@38 243 cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
igor@38 244 if cmp_res != 0:
igor@38 245 return cmp_res
igor@38 246 else:
igor@38 247 cmp_res = cmp(normalized_word1, normalized_word2)
igor@38 248 if cmp_res != 0:
igor@38 249 return cmp_res
igor@38 250 else:
igor@38 251 return cmp(int(num1), int(num2))
igor@38 252
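
A small worked instance of the three-level ordering above (weights, words and links are all invented):

    # Assume wgw == {'go': 12, 'walk': 3} and the normalizator maps
    # 'went' -> 'go' and 'walked' -> 'walk'.
    # compare_word_lines('  7 went', '  2 walked', wgw, normalizator, linked_words)
    # compares the group weights first and returns cmp(12, 3) == 1, so the
    # reverse=True sort in filter_group_words() puts the 'went' line first.
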
igor@38 253 def filter_get_words(args):
igor@38 254 vocabulary = load_vocabulary()
igor@38 255 words = get_words(readlines_from_stdin())
igor@38 256 dump_words(words, args[0])
igor@38 257 words = substract_dictionary(words, vocabulary)
igor@38 258 print_words_sorted(words)
igor@38 259
igor@38 260 def filter_group_words(args):
igor@38 261 lines = readlines_from_stdin()
igor@38 262 notes = load_notes(notes_filenames())
igor@38 263 linked_words = find_linked_words(notes)
igor@38 264 logging.debug(linked_words)
igor@38 265 normalizator = Normalizator(config['language'], linked_words)
igor@38 266
igor@38 267 wgw = find_wordgroups_weights(lines, normalizator)
igor@38 268 for line in sorted(
igor@38 269 lines,
igor@38 270 cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
igor@38 271 reverse=True):
igor@38 272 codecs.getwriter("utf-8")(sys.stdout).write(line)
igor@38 273
igor@37 274 (options, args) = parser.parse_args()
igor@38 275 if options.language:
igor@38 276 config['language'] = options.language
igor@37 277
igor@38 278 if options.function:
igor@38 279 function_names = {
igor@38 280 'get_words' : filter_get_words,
igor@38 281 'group_words' : filter_group_words,
igor@38 282 }
igor@38 283 if options.function in function_names:
igor@38 284 function_names[options.function](args)
igor@38 285 else:
igor@38 286 error_message("Unknown function %s.\nAvailable functions:\n%s" % (
igor@38 287 options.function, "".join([" "+x for x in sorted(function_names.keys())])))
igor@38 288 sys.exit(1)
igor@37 289
igor@37 290
igor@37 291
igor@37 292
igor@38 293 #os.system("vim")
igor@37 294
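
The two filters above are meant to be called through the -f flag (marked [INTERNAL]) by the new-words-py.sh wrapper mentioned in the commit message; a plausible invocation, stated only as an assumption about that wrapper's calling convention, would be:

    new-words.py -f get_words /tmp/words.dump < book.txt
    new-words.py -l de -f group_words < frequency_list.txt
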