new-words
diff new-words.py @ 38:adbc809d3924
Transition to Python started
new-words-py.sh is a wrapper around the new-words.py version, which is not finished yet.
| author   | Igor Chubin <igor@chub.in>                  |
|----------|---------------------------------------------|
| date     | Sat Jan 22 23:42:31 2011 +0100 (2011-01-22) |
| parents  | be6336e98b3c                                |
| children | a598e0d25784                                |
line diff
--- a/new-words.py	Fri Jan 21 15:59:45 2011 +0200
+++ b/new-words.py	Sat Jan 22 23:42:31 2011 +0100
@@ -1,6 +1,39 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 
+import codecs
+import logging
+import os
 import optparse
+import re
+import subprocess
+import sys
+import Stemmer
+
+config = {
+    'config_directory': os.environ['HOME'] + '/.new-words',
+    'language': 'en',
+}
+
+logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
+
+class Normalizator:
+    def __init__(self, language, linked_words={}):
+        stemmer_algorithm = {
+            'de' : 'german',
+            'en' : 'english',
+            'ru' : 'russian',
+            'uk' : 'ukrainian',
+        }
+        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
+        self.linked_words = linked_words
+
+    def normalize(self, word):
+        word_chain = []
+        while word in self.linked_words and not word in word_chain:
+            word_chain.append(word)
+            word = self.linked_words[word]
+        return self.stemmer.stemWord(word.lower())
 
 parser = optparse.OptionParser()
 
@@ -29,6 +62,12 @@
     dest="language")
 
 parser.add_option(
+    "-f", "--function",
+    help="filter through subsystem [INTERNAL]",
+    action="store",
+    dest="function")
+
+parser.add_option(
     "-m", "--merge-tag",
     help="merge words tagged with specified tag into the main vocabulary",
     action="store",
@@ -100,31 +139,156 @@
     action="store_true",
     dest="three_words")
 
+def readlines_from_file(filename):
+    res = []
+    with codecs.open(filename, "r", "utf-8") as f:
+        for line in f.readlines():
+            res += [line]
+    return res
+
+def readlines_from_stdin():
+    return codecs.getreader("utf-8")(sys.stdin).readlines()
+
+def words_from_line(line):
+    line = line.rstrip('\n')
+    #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
+    #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
+    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
+
+def get_words(lines):
+    """
+    Returns hash of words in a file
+    word => number
+    """
+    result = {}
+    for line in lines:
+        words = words_from_line(line)
+        for word in words:
+            result.setdefault(word, 0)
+            result[word] += 1
+    return result
+
+def load_vocabulary():
+    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
+
+def notes_filenames():
+    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
+
+def load_notes(files):
+    notes = {}
+    for filename in files:
+        with open(filename) as f:
+            for line in f.readlines():
+                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
+                notes.setdefault(word, {})
+                notes[word][filename] = note
+    return notes
+
+def print_words_sorted(words_freq):
+    for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
+
+def substract_dictionary(dict1, dict2):
+    """
+    returns dict1 - dict2
+    """
+    result = {}
+    for (k,v) in dict1.items():
+        if not k in dict2:
+            result[k] = v
+    return result
+
+def dump_words(words, filename):
+    with codecs.open(filename, "w+", "utf-8") as f:
+        for word in words.keys():
+            f.write(("%s\n"%word)*words[word])
+
+def error_message(text):
+    print text
+
+def find_wordgroups_weights(lines, normalizator):
+    weight = {}
+    for line in lines:
+        line = re.sub('^\s*', '', line.rstrip('\n'))
+        (num, word) = re.split('\s+', line, maxsplit=1)
+        normalized = normalizator.normalize(word)
+        weight.setdefault(normalized, 0)
+        weight[normalized] += int(num)
+    return weight
+
+def find_linked_words(notes):
+    linked_words = {}
+    for word in notes.keys():
+        for note in notes[word].values():
+            if "@" in note:
+                logging.debug("%s %s" % (word, note))
+                result = re.search(r'\@(\S*)', note)
+                if result:
+                    main_word = result.group(1)
+                    logging.debug("%s %s" % (word, main_word))
+                    if main_word:
+                        linked_words[word] = main_word
+    return linked_words
+
+
+def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
+    line1 = re.sub('^\s*', '', line1.rstrip('\n'))
+    (num1, word1) = re.split('\s+', line1, 1)
+    line2 = re.sub('^\s*', '', line2.rstrip('\n'))
+    (num2, word2) = re.split('\s+', line2, 1)
+
+    normalized_word1 = normalizator.normalize(word1)
+    normalized_word2 = normalizator.normalize(word2)
+
+    cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
+    if cmp_res != 0:
+        return cmp_res
+    else:
+        cmp_res = cmp(normalized_word1, normalized_word2)
+        if cmp_res != 0:
+            return cmp_res
+        else:
+            return cmp(int(num1), int(num2))
+
+def filter_get_words(args):
+    vocabulary = load_vocabulary()
+    words = get_words(readlines_from_stdin())
+    dump_words(words, args[0])
+    words = substract_dictionary(words, vocabulary)
+    print_words_sorted(words)
+
+def filter_group_words(args):
+    lines = readlines_from_stdin()
+    notes = load_notes(notes_filenames())
+    linked_words = find_linked_words(notes)
+    logging.debug(linked_words)
+    normalizator = Normalizator(config['language'], linked_words)
+
+    wgw = find_wordgroups_weights(lines, normalizator)
+    for line in sorted(
+            lines,
+            cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
+            reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write(line)
+
 (options, args) = parser.parse_args()
+if options.language:
+    config['language'] = options.language
 
-def get_words():
-    pass
+if options.function:
+    function_names = {
+        'get_words' : filter_get_words,
+        'group_words' : filter_group_words,
+    }
+    if options.function in function_names:
+        function_names[options.function](args)
+    else:
+        error_message("Unkown function %s.\nAvailable functions:\n%s" % (
+            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
+        sys.exit(1)
 
-def add_stat():
-    pass
 
-def two_and_three_words():
-    pass
 
-def grep_v_english():
-    pass
 
-def group_words():
-    pass
+#os.system("vim")
 
-def add_marks():
-    pass
-
-def remove_marks():
-    pass
-
-def text_from_url():
-    pass
-
-def part():
-    pass
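The core of the new grouping code is Normalizator.normalize(): it first follows any manual word links (built from "@" marks in the notes file, see find_linked_words()) and then stems the result, so inflected forms and hand-linked irregular forms fall into one group. Below is a minimal sketch of that logic, assuming the Stemmer module imported above comes from the PyStemmer package; the linked_words mapping is made up for illustration:

```python
# Sketch of Normalizator.normalize(), assuming PyStemmer provides Stemmer.
import Stemmer

linked_words = {'better': 'good', 'best': 'good'}  # hypothetical "@good" links
stemmer = Stemmer.Stemmer('english')

def normalize(word):
    seen = []
    # Follow manual links to the main word first; 'seen' guards against
    # accidental cycles (a -> b -> a) in the notes file.
    while word in linked_words and word not in seen:
        seen.append(word)
        word = linked_words[word]
    # Then reduce the main word to its stem.
    return stemmer.stemWord(word.lower())

print(normalize('best'))   # -> 'good'
print(normalize('books'))  # -> 'book'
```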
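Elsewhere in the hunk, words_from_line() settles on a Unicode-aware splitter after two commented-out attempts: it splits on runs of non-word characters, while the (?!') lookahead refuses to start a match at an apostrophe, keeping contractions whole. A quick behaviour check (the sample sentence is mine, not from the repository):

```python
# Behaviour of the words_from_line() splitter: runs of non-word characters
# separate tokens, the (?!') lookahead keeps apostrophes inside words, and
# re.UNICODE treats accented letters as word characters.
import re

splitter = re.compile(r"(?!')(?:\W)+", flags=re.UNICODE)
print(splitter.split(u"Don't panic, it's fine (really)."))
# [u"Don't", u'panic', u"it's", u'fine', u'really', u'']  (Python 2 output)
# Note the trailing empty string: get_words() counts it like any other
# token unless empty tokens are filtered out.
```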
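find_linked_words() is what turns "@" marks into the mapping the normalizer consumes: load_notes() parses each notes line as `<word> <note>`, and a note containing `@mainword` links the word to its main form. A self-contained illustration, with made-up notes data in place of a real notes-en.txt:

```python
# How "@" marks in notes become the linked_words mapping. The notes dict
# below mimics load_notes() output ({word: {filename: note}}) with
# hypothetical entries.
import re

notes = {
    'better': {'notes-en.txt': 'comparative @good'},
    'worse':  {'notes-en.txt': 'comparative @bad'},
    'table':  {'notes-en.txt': 'a piece of furniture'},
}

linked_words = {}
for word, per_file in notes.items():
    for note in per_file.values():
        match = re.search(r'@(\S*)', note)
        if match and match.group(1):
            linked_words[word] = match.group(1)

print(linked_words)  # {'better': 'good', 'worse': 'bad'}
```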
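One portability caveat: compare_word_lines() and the sorted(..., cmp=...) call in filter_group_words() depend on Python 2's cmp machinery, which Python 3 removed. Because the comparison is simply group weight, then normalized word, then count, a future port could swap the comparator for a tuple key. The sketch below is an assumption about such a port, not code from this changeset:

```python
# Tuple-key equivalent of compare_word_lines() for Python 3, where sorted()
# no longer accepts cmp=. Assumes the same "  <count> <word>" line format
# the script reads on stdin.
import re

def sort_key(line, wgw, normalize):
    num, word = re.split(r'\s+', line.strip(), maxsplit=1)
    norm = normalize(word)
    # Three comparison levels: group weight, normalized word, raw count.
    return (wgw[norm], norm, int(num))

# usage:
#   for line in sorted(lines,
#                      key=lambda l: sort_key(l, wgw, normalizator.normalize),
#                      reverse=True):
#       sys.stdout.write(line)
```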