new-words
view new-words.py @ 39:a598e0d25784
add_notes (add_marks) + remove_notes (remove_marks) implemented in python
| author | Igor Chubin <igor@chub.in> | 
|---|---|
| date | Sun Jan 23 14:25:52 2011 +0100 (2011-01-23) | 
| parents | adbc809d3924 | 
| children | c3a50c0d2400 | 
 line source
     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     4 import codecs
     5 import logging
     6 import os
     7 import optparse
     8 import re
     9 import subprocess
    10 import sys
    11 import Stemmer
    13 config = {
    14     'config_directory': os.environ['HOME'] + '/.new-words',
    15     'language': 'en',
    16 }
    18 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
    20 class Normalizator:
    21     def __init__(self, language, linked_words={}):
    22         stemmer_algorithm = {
    23             'de' : 'german',
    24             'en' : 'english',
    25             'ru' : 'russian',
    26             'uk' : 'ukrainian',
    27         }
    28         self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
    29         self.linked_words = linked_words
    31     def normalize(self, word):
    32         word_chain = []
    33         while word in self.linked_words and not word in word_chain:
    34             word_chain.append(word)
    35             word = self.linked_words[word]
    36         return self.stemmer.stemWord(word.lower())
    38 parser = optparse.OptionParser()
    40 parser.add_option(
    41     "-a", "--no-marks",
    42     help="don't add marks (and don't save marks added by user)",
    43     action="store_true",
    44     dest="no_marks")
    46 parser.add_option(
    47     "-c", "--compressed",
    48     help="show compressed wordlist: one word per group",
    49     action="store_true",
    50     dest="compressed")
    52 parser.add_option(
    53     "-k", "--known-words",
    54     help="put higher words that are similar to the known words (only for English)",
    55     action="store_true",
    56     dest="compressed")
    58 parser.add_option(
    59     "-l", "--language",
    60     help="specify language of text",
    61     action="store",
    62     dest="language")
    64 parser.add_option(
    65     "-f", "--function",
    66     help="filter through subsystem [INTERNAL]",
    67     action="store",
    68     dest="function")
    70 parser.add_option(
    71     "-m", "--merge-tag",
    72     help="merge words tagged with specified tag into the main vocabulary",
    73     action="store",
    74     dest="merge_tag")
    76 parser.add_option(
    77     "-M", "--merge-tagged",
    78     help="merge words tagged with ANY tag into the main vocabulary",
    79     action="store_true",
    80     dest="merge_tagged")
    82 parser.add_option(
    83     "-n", "--non-interactive",
    84     help="non-interactive mode (don't run vi)",
    85     action="store_true",
    86     dest="non_interactive")
    88 parser.add_option(
    89     "-N", "--no-filter",
    90     help="switch off known words filtering",
    91     action="store_true",
    92     dest="no_filter")
    94 parser.add_option(
    95     "-p", "--pages",
    96     help="work with specified pages only (pages = start-stop/total )",
    97     action="store",
    98     dest="pages")
   100 parser.add_option(
   101     "-r", "--remove-tag",
   102     help="remove subvocabulary of specified tag",
   103     action="store",
   104     dest="remove_tag")
   106 parser.add_option(
   107     "-s", "--text-stats",
   108     help="show the text statistics (percentage of known words and so on) and exit",
   109     action="store_true",
   110     dest="text_stats")
   112 parser.add_option(
   113     "-S", "--voc-stats",
   114     help="show your vocabulary statistics (number of words and word groups)",
   115     action="store_true",
   116     dest="voc_stats")
   118 parser.add_option(
   119     "-t", "--tag",
   120     help="tag known words with tag",
   121     action="store",
   122     dest="tag")
   124 parser.add_option(
   125     "-T", "--show-tags",
   126     help="tag known words with tag",
   127     action="store_true",
   128     dest="show_tags")
   130 parser.add_option(
   131     "-2", "--two-words",
   132     help="find 2 words' sequences",
   133     action="store_true",
   134     dest="two_words")
   136 parser.add_option(
   137     "-3", "--three-words",
   138     help="find 3 words' sequences",
   139     action="store_true",
   140     dest="three_words")
   142 def readlines_from_file(filename):
   143     res = []
   144     with codecs.open(filename, "r", "utf-8") as f:
   145         for line in f.readlines():
   146             res += [line]
   147     return res
   149 def readlines_from_stdin():
   150     return codecs.getreader("utf-8")(sys.stdin).readlines()
   152 def words_from_line(line):
   153     line = line.rstrip('\n')
   154     #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
   155     #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
   156     return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
   158 def get_words(lines):
   159     """
   160     Returns hash of words in a file
   161     word => number
   162     """
   163     result = {}
   164     for line in lines:
   165         words = words_from_line(line)
   166         for word in words:
   167             result.setdefault(word, 0)
   168             result[word] += 1
   169     return result
   171 def load_vocabulary():
   172     return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
   174 def notes_filenames():
   175     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
   177 def load_notes(files):
   178     notes = {}
   179     for filename in files:
   180         with codecs.open(filename, "r", "utf-8") as f:
   181             for line in f.readlines():
   182                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   183                 notes.setdefault(word, {})
   184                 notes[word][filename] = note
   185     return notes
   187 def add_notes(lines, notes):
   188     notes_filename = notes_filenames()[0]
   189     result = []
   190     for line in lines:
   191         if line.startswith('#'):
   192             result += [line]
   193         else:
   194             match_object = re.search('^\s*\S+\s*(\S+)', line)
   195             if match_object:
   196                 word = match_object.group(1)
   197                 if word in notes:
   198                     logging.debug(word)
   199                     logging.debug(line)
   200                     if notes_filename in notes[word]:
   201                         line = line.rstrip('\n')
   202                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
   203                         logging.debug(line)
   204                         result += [line]
   205                 else:
   206                     result += [line]
   207             else:
   208                 result += [line]
   209     return result
def remove_notes(lines, notes_group):
    """
    Strip the trailing note column from vocabulary *lines*, collect the
    removed notes, and persist them via save_notes().

    lines       -- vocabulary lines (may be newline-terminated)
    notes_group -- mapping word -> {notes_filename: note}, as produced
                   by load_notes()

    Returns the lines with the note column removed.
    Side effect: rewrites the current language's notes file with the
    union of previously stored notes and the notes found in *lines*.
    """
    notes_filename = notes_filenames()[0]
    # Seed with the notes already stored for this particular notes file,
    # so words absent from *lines* keep their saved notes.
    notes = {}
    for k in notes_group.keys():
        if notes_filename in notes_group[k]:
            notes[k] = notes_group[k][notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        # Expected layout: <indent><freq><spaces><word><spaces><note...>.
        # NOTE(review): the leading (\s+) means a line starting in column 0
        # never matches and is passed through unchanged — presumably fine
        # because frequency columns are right-aligned; confirm.
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            # Keep everything up to and including the word; drop the note.
            result.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n"
                ]))
            # Remember the removed note (group 6) keyed by its word (group 4).
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line+"\n")

    save_notes(notes_filename, notes)
    return result
   237 def save_notes(filename, notes):
   238     lines = []
   239     saved_words = []
   240     with codecs.open(filename, "r", "utf-8") as f:
   241         for line in f.readlines():
   242             (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   243             if word in notes:
   244                 line = "%-29s %s\n" % (word, notes[word])
   245                 saved_words.append(word)
   246             lines.append(line)
   247     for word in [x for x in notes.keys() if not x in saved_words]:
   248         line = "%-29s %s\n" % (word, notes[word])
   249         lines.append(line)
   251     with codecs.open(filename, "w", "utf-8") as f:
   252         for line in lines:
   253             f.write(line)
   256 def print_words_sorted(words_freq):
   257     for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
   258         codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
   260 def substract_dictionary(dict1, dict2):
   261     """
   262     returns dict1 - dict2
   263     """
   264     result = {}
   265     for (k,v) in dict1.items():
   266         if not k in dict2:
   267             result[k] = v
   268     return result
   270 def dump_words(words, filename):
   271     with codecs.open(filename, "w+", "utf-8") as f:
   272         for word in words.keys():
   273             f.write(("%s\n"%word)*words[word])
   275 def error_message(text):
   276     print text
   278 def find_wordgroups_weights(lines, normalizator):
   279     weight = {}
   280     for line in lines:
   281         line = re.sub('^\s*', '', line.rstrip('\n'))
   282         (num, word) = re.split('\s+', line, maxsplit=1)
   283         normalized = normalizator.normalize(word)
   284         weight.setdefault(normalized, 0)
   285         weight[normalized] += int(num)
   286     return weight
   288 def find_linked_words(notes):
   289     linked_words = {}
   290     for word in notes.keys():
   291         for note in notes[word].values():
   292             if "@" in note:
   293                 result = re.search(r'\@(\S*)', note)
   294                 if result:
   295                     main_word = result.group(1)
   296                     if main_word:
   297                         linked_words[word] = main_word
   298     return linked_words
   301 def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
   302     line1 = re.sub('^\s*', '', line1.rstrip('\n'))
   303     (num1, word1) = re.split('\s+', line1, 1)
   304     line2 = re.sub('^\s*', '', line2.rstrip('\n'))
   305     (num2, word2) = re.split('\s+', line2, 1)
   307     normalized_word1 = normalizator.normalize(word1)
   308     normalized_word2 = normalizator.normalize(word2)
   310     cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
   311     if cmp_res != 0:
   312         return cmp_res
   313     else:
   314         cmp_res = cmp(normalized_word1, normalized_word2)
   315         if cmp_res != 0:
   316             return cmp_res
   317         else:
   318             return cmp(int(num1), int(num2))
   320 def filter_get_words(args):
   321     vocabulary = load_vocabulary()
   322     words = get_words(readlines_from_stdin())
   323     dump_words(words, args[0])
   324     words = substract_dictionary(words, vocabulary)
   325     print_words_sorted(words)
def filter_group_words(args):
    """
    Read '<count> <word>' frequency lines from stdin and print them to
    stdout sorted so that lines of the same normalized word group stay
    together, heaviest groups first (see find_wordgroups_weights and
    compare_word_lines).
    """
    lines = readlines_from_stdin()
    notes = load_notes(notes_filenames())
    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # Total frequency per normalized word group, used as primary sort key.
    wgw = find_wordgroups_weights(lines, normalizator)
    # NOTE(review): sorted(cmp=...) is Python-2-only syntax.
    for line in sorted(
                lines,
                cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
                reverse=True):
        codecs.getwriter("utf-8")(sys.stdout).write(line)
   340 def filter_add_notes(args):
   341     lines = readlines_from_file(args[0])
   342     notes = load_notes(notes_filenames())
   343     lines = add_notes(lines, notes)
   344     with codecs.open(args[0], "w", "utf-8") as f:
   345         for line in lines:
   346             f.write(line)
   348 def filter_remove_notes(args):
   349     lines = readlines_from_file(args[0])
   350     notes = load_notes(notes_filenames())
   351     lines = remove_notes(lines, notes)
   352     with codecs.open(args[0], "w", "utf-8") as f:
   353         for line in lines:
   354             f.write(line)
   356 (options, args) = parser.parse_args()
   357 if options.language:
   358     config['language'] = options.language
   360 if options.function:
   361     function_names = {
   362         'get_words' :   filter_get_words,
   363         'group_words' : filter_group_words,
   364         'add_notes' :   filter_add_notes,
   365         'remove_notes': filter_remove_notes,
   366     }
   367     if options.function in function_names:
   368         function_names[options.function](args)
   369     else:
   370         error_message("Unkown function %s.\nAvailable functions:\n%s" % (
   371             options.function, "".join(["   "+x for x in sorted(function_names.keys())])))
   372         sys.exit(1)
   377 #os.system("vim")
