new-words
view new-words.py @ 47:d708e2c1bad8
compressed wordlist support
| author | Igor Chubin <igor@chub.in> | 
|---|---|
| date | Mon Feb 07 21:21:17 2011 +0200 (2011-02-07) | 
| parents | 5f90e44eecfc | 
| children | 7194bdb56475 | 
 line source
     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     4 from __future__ import with_statement
     5 import codecs
     6 import logging
     7 import os
     8 import optparse
     9 import re
    10 import subprocess
    11 import sys
    12 import Stemmer
    13 try:
    14     import psyco
    15     psyco.full()
    16 except:
    17     pass
    19 config = {
    20     'config_directory': os.environ['HOME'] + '/.new-words',
    21     'language': 'en',
    22 }
    24 logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
    26 class Normalizator:
    27     def __init__(self, language, linked_words={}):
    28         stemmer_algorithm = {
    29             'de' : 'german',
    30             'en' : 'english',
    31             'ru' : 'russian',
    32             'uk' : 'ukrainian',
    33         }
    34         self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
    35         self.linked_words = linked_words
    37     def normalize(self, word):
    38         word_chain = []
    39         while word in self.linked_words and not word in word_chain:
    40             word_chain.append(word)
    41             word = self.linked_words[word]
    42         return self.stemmer.stemWord(word.lower())
    44     def best_word_from_group(self, wordpairs_group):
    45         """Returns the word that is the most relevant to the wordpairs_group.
    47         At the moment: returns the word with minimal length"""
    49         minimal_length = min(len(pair[1]) for pair in wordpairs_group)
    50         return list(x[1] for x in sorted(
    51             (x for x in wordpairs_group if len(x[1]) == minimal_length),
    52             key=lambda x:x[0],
    53             reverse=True))[0]
    55 parser = optparse.OptionParser()
    57 parser.add_option(
    58     "-a", "--no-marks",
    59     help="don't add marks (and don't save marks added by user)",
    60     action="store_true",
    61     dest="no_marks")
    63 parser.add_option(
    64     "-c", "--compressed",
    65     help="show compressed wordlist: one word per group",
    66     action="store_true",
    67     dest="compressed")
    69 parser.add_option(
    70     "-k", "--known-words",
    71     help="put higher words that are similar to the known words (only for English)",
    72     action="store_true",
    73     dest="compressed")
    75 parser.add_option(
    76     "-l", "--language",
    77     help="specify language of text",
    78     action="store",
    79     dest="language")
    81 parser.add_option(
    82     "-f", "--function",
    83     help="filter through subsystem [INTERNAL]",
    84     action="store",
    85     dest="function")
    87 parser.add_option(
    88     "-m", "--merge-tag",
    89     help="merge words tagged with specified tag into the main vocabulary",
    90     action="store",
    91     dest="merge_tag")
    93 parser.add_option(
    94     "-M", "--merge-tagged",
    95     help="merge words tagged with ANY tag into the main vocabulary",
    96     action="store_true",
    97     dest="merge_tagged")
    99 parser.add_option(
   100     "-n", "--non-interactive",
   101     help="non-interactive mode (don't run vi)",
   102     action="store_true",
   103     dest="non_interactive")
   105 parser.add_option(
   106     "-N", "--no-filter",
   107     help="switch off known words filtering",
   108     action="store_true",
   109     dest="no_filter")
   111 parser.add_option(
   112     "-p", "--pages",
   113     help="work with specified pages only (pages = start-stop/total )",
   114     action="store",
   115     dest="pages")
   117 parser.add_option(
   118     "-r", "--remove-tag",
   119     help="remove subvocabulary of specified tag",
   120     action="store",
   121     dest="remove_tag")
   123 parser.add_option(
   124     "-s", "--text-stats",
   125     help="show the text statistics (percentage of known words and so on) and exit",
   126     action="store_true",
   127     dest="text_stats")
   129 parser.add_option(
   130     "-S", "--voc-stats",
   131     help="show your vocabulary statistics (number of words and word groups)",
   132     action="store_true",
   133     dest="voc_stats")
   135 parser.add_option(
   136     "-t", "--tag",
   137     help="tag known words with tag",
   138     action="store",
   139     dest="tag")
   141 parser.add_option(
   142     "-T", "--show-tags",
   143     help="tag known words with tag",
   144     action="store_true",
   145     dest="show_tags")
   147 parser.add_option(
   148     "-2", "--two-words",
   149     help="find 2 words' sequences",
   150     action="store_true",
   151     dest="two_words")
   153 parser.add_option(
   154     "-3", "--three-words",
   155     help="find 3 words' sequences",
   156     action="store_true",
   157     dest="three_words")
   159 def readlines_from_file(filename):
   160     res = []
   161     with codecs.open(filename, "r", "utf-8") as f:
   162         for line in f.readlines():
   163             res += [line]
   164     return res
   166 def readlines_from_stdin():
   167     return codecs.getreader("utf-8")(sys.stdin).readlines()
   169 def words_from_line(line):
   170     line = line.rstrip('\n')
   171     #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
   172     #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
   173     return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
   175 def get_words(lines, group_by=[1]):
   176     """
   177     Returns hash of words in a file
   178     word => number
   179     """
   180     result = {}
   181     (a, b, c) = ("", "", "")
   182     for line in lines:
   183         words = words_from_line(line)
   184         for word in words:
   185             if re.match('[0-9]*$', word):
   186                 continue
   187             result.setdefault(word, 0)
   188             result[word] += 1
   189             if 2 in group_by and a != "" and b != "":
   190                 w = "%s_%s" % (a,b)
   191                 result.setdefault(w, 0)
   192                 result[w] += 1
   193             if 3 in group_by and not "" in [a,b,c]:
   194                 w = "%s_%s_%s" % (a,b,c)
   195                 result.setdefault(w, 0)
   196                 result[w] += 1
   197             (a,b,c) = (b, c, word)
   199     logging.debug(result)
   200     return result
   202 def load_vocabulary():
   203     return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
   205 def notes_filenames():
   206     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
   208 def load_notes(files):
   209     notes = {}
   210     for filename in files:
   211         with codecs.open(filename, "r", "utf-8") as f:
   212             for line in f.readlines():
   213                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   214                 notes.setdefault(word, {})
   215                 notes[word][filename] = note
   216     return notes
   218 def add_notes(lines, notes):
   219     notes_filename = notes_filenames()[0]
   220     result = []
   221     for line in lines:
   222         if line.startswith('#'):
   223             result += [line]
   224         else:
   225             match_object = re.search('^\s*\S+\s*(\S+)', line)
   226             if match_object:
   227                 word = match_object.group(1)
   228                 if word in notes:
   229                     if notes_filename in notes[word]:
   230                         line = line.rstrip('\n')
   231                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
   232                         result += [line]
   233                 else:
   234                     result += [line]
   235             else:
   236                 result += [line]
   237     return result
   239 def remove_notes(lines, notes_group):
   240     notes_filename = notes_filenames()[0]
   241     notes = {}
   242     for k in notes_group.keys():
   243         if notes_filename in notes_group[k]:
   244             notes[k] = notes_group[k][notes_filename]
   246     result = []
   247     for line in lines:
   248         line = line.rstrip('\n')
   249         match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
   250         if match_object:
   251             result.append("".join([
   252                 match_object.group(1),
   253                 match_object.group(2),
   254                 match_object.group(3),
   255                 match_object.group(4),
   256                 "\n"
   257                 ]))
   258             notes[match_object.group(4)] = match_object.group(6)
   259         else:
   260             result.append(line+"\n")
   262     save_notes(notes_filename, notes)
   263     return result
   265 def save_notes(filename, notes):
   266     lines = []
   267     saved_words = []
   268     with codecs.open(filename, "r", "utf-8") as f:
   269         for line in f.readlines():
   270             (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   271             if word in notes:
   272                 line = "%-29s %s\n" % (word, notes[word])
   273                 saved_words.append(word)
   274             lines.append(line)
   275     for word in [x for x in notes.keys() if not x in saved_words]:
   276         line = "%-29s %s\n" % (word, notes[word])
   277         lines.append(line)
   279     with codecs.open(filename, "w", "utf-8") as f:
   280         for line in lines:
   281             f.write(line)
   284 def substract_dictionary(dict1, dict2):
   285     """
   286     returns dict1 - dict2
   287     """
   288     result = {}
   289     for (k,v) in dict1.items():
   290         if not k in dict2:
   291             result[k] = v
   292     return result
   294 def dump_words(words, filename):
   295     with codecs.open(filename, "w+", "utf-8") as f:
   296         for word in words.keys():
   297             f.write(("%s\n"%word)*words[word])
   299 def error_message(text):
   300     print text
   302 def find_wordgroups_weights(word_pairs, normalizator):
   303     weight = {}
   304     for (num, word) in word_pairs:
   305         normalized = normalizator.normalize(word)
   306         weight.setdefault(normalized, 0)
   307         weight[normalized] += num
   308     return weight
   310 def find_linked_words(notes):
   311     linked_words = {}
   312     for word in notes.keys():
   313         for note in notes[word].values():
   314             if "@" in note:
   315                 result = re.search(r'\@(\S*)', note)
   316                 if result:
   317                     main_word = result.group(1)
   318                     if main_word:
   319                         linked_words[word] = main_word
   320     return linked_words
   322 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
   323     (num1, word1) = pair1
   324     (num2, word2) = pair2
   326     normalized_word1 = normalizator.normalize(word1)
   327     normalized_word2 = normalizator.normalize(word2)
   329     cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
   330     if cmp_res != 0:
   331         return cmp_res
   332     else:
   333         cmp_res = cmp(normalized_word1, normalized_word2)
   334         if cmp_res != 0:
   335             return cmp_res
   336         else:
   337             return cmp(int(num1), int(num2))
def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
    """Write the word list and/or statistics to stdout as UTF-8.

    word_pairs          -- pre-sorted sequence of (frequency, word) tuples
    stats               -- dict with 'language', 'percentage',
                           'percentage_unknown', 'total_known', 'total',
                           'groups', 'words', 'wps', 'uwps'
    normalizator        -- Normalizator used to group words by stem
    print_stats         -- print a one-line '#' stats header first
    stats_only          -- print only the statistics table, then return
    compressed_wordlist -- print one representative word per stem group
                           (with the group's summed frequency) instead of
                           every word
    """
    if stats_only:
        # Fixed-width statistics table: header row, then one data row.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
                ]]) + "\n")
        # NOTE(review): the last four format strings below are adjacent
        # literals (no commas), so they concatenate into a single element;
        # the -11d width matches the header's 10 chars + 1 joining space,
        # which keeps the columns aligned -- presumably intentional,
        # confirm before "fixing" by adding commas.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return

    if print_stats:
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Percentage milestones at which a "# <level>" marker line is emitted:
    # every 5% from just above the already-known percentage up to 90, then
    # every 1% from 90 to 101 (Python 2 list concatenation of two ranges).
    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    # State for stem-group accumulation: a group ends when the normalized
    # form of the current word differs from the previous one.
    old_normalized_word = None
    words_of_this_group = []
    for word_pair in word_pairs:

        normalized_word = normalizator.normalize(word_pair[1])
        if old_normalized_word and old_normalized_word != normalized_word:
            # A stem group just ended: in compressed mode emit one line
            # for the whole group (summed frequency, representative word).
            #codecs.getwriter("utf-8")(sys.stdout).write(
            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
            compressed_word_pair = (
                sum(x[0] for x in words_of_this_group),
                normalizator.best_word_from_group(words_of_this_group)
                )
            if compressed_wordlist:
                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
            words_of_this_group = []
            # NOTE(review): the final group is never flushed after the
            # loop, so in compressed mode the last stem group of the
            # input appears to be dropped -- confirm.

        old_normalized_word = normalized_word
        words_of_this_group.append(word_pair)

        if not compressed_wordlist:
            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)

        # Track cumulative known words and print a "# N" marker whenever
        # a milestone percentage is crossed (consuming passed milestones).
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
   404 def filter_add_notes(args):
   405     lines = readlines_from_file(args[0])
   406     notes = load_notes(notes_filenames())
   407     lines = add_notes(lines, notes)
   408     with codecs.open(args[0], "w", "utf-8") as f:
   409         for line in lines:
   410             f.write(line)
   412 def filter_remove_notes(args):
   413     lines = readlines_from_file(args[0])
   414     notes = load_notes(notes_filenames())
   415     lines = remove_notes(lines, notes)
   416     with codecs.open(args[0], "w", "utf-8") as f:
   417         for line in lines:
   418             f.write(line)
   420 def filter_get_words_group_words_add_stat(args):
   421     vocabulary = load_vocabulary()
   422     notes = load_notes(notes_filenames())
   423     lines = readlines_from_stdin()
   424     group_by = [1]
   425     if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
   426         group_by.append(2)
   427     if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
   428         group_by.append(3)
   429     words = get_words(lines, group_by)
   430     stats_only = False
   431     if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
   432         stats_only = True
   434     compressed_wordlist = False
   435     if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
   436         compressed_wordlist = True
   439     stats = {}
   440     stats['total'] = sum(words[x] for x in words.keys())
   441     if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
   442         words = substract_dictionary(words, vocabulary)
   444     stats['total_unknown'] = sum(words[x] for x in words.keys())
   445     stats['total_known'] = stats['total'] - stats['total_unknown']
   446     stats['percentage'] = 100.0*stats['total_known']/stats['total']
   447     stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
   448     stats['groups'] = 0
   449     stats['words'] = len(words)
   450     stats['sentences'] = 0  #FIXME
   451     stats['wps'] = 0        #FIXME
   452     stats['uwps'] = 0       #FIXME
   453     stats['language'] = config['language']
   455     linked_words = find_linked_words(notes)
   456     normalizator = Normalizator(config['language'], linked_words)
   458     words_with_freq = []
   459     for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
   460         words_with_freq.append((words[k], k))
   462     wgw = find_wordgroups_weights(words_with_freq, normalizator)
   463     if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
   464         words_with_freq = sorted(
   465                 words_with_freq,
   466                 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
   467                 reverse=True)
   469     print_words_sorted(
   470         words_with_freq,
   471         stats,
   472         normalizator,
   473         stats_only=stats_only,
   474         compressed_wordlist=compressed_wordlist
   475         )
   477 (options, args) = parser.parse_args()
   478 if options.language:
   479     config['language'] = options.language
   481 if options.function:
   482     function_names = {
   483         'add_notes' :   filter_add_notes,
   484         'remove_notes': filter_remove_notes,
   485         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
   486     }
   487     if options.function in function_names:
   488         function_names[options.function](args)
   489     else:
   490         error_message("Unkown function %s.\nAvailable functions:\n%s" % (
   491             options.function, "".join(["   "+x for x in sorted(function_names.keys())])))
   492         sys.exit(1)
   497 #os.system("vim")
