new-words
view new-words.py @ 45:5f90e44eecfc
new-words.py: turn words filtering and grouping on and off
| author | Igor Chubin <igor@chub.in> | 
|---|---|
| date | Fri Feb 04 06:18:50 2011 +0100 (2011-02-04) | 
| parents | 7eb1a8c3eade | 
| children | d708e2c1bad8 | 
 line source
     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     4 from __future__ import with_statement
     5 import codecs
     6 import logging
     7 import os
     8 import optparse
     9 import re
    10 import subprocess
    11 import sys
    12 import Stemmer
    13 try:
    14     import psyco
    15     psyco.full()
    16 except:
    17     pass
# Runtime configuration.  'language' defaults to English and may be
# overridden by the -l/--language command-line option further below;
# 'config_directory' holds the per-user vocabulary and notes files.
config = {
    'config_directory': os.environ['HOME'] + '/.new-words',
    'language': 'en',
}

# Debug log for the whole script (get_words() dumps its word table here).
logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
    26 class Normalizator:
    27     def __init__(self, language, linked_words={}):
    28         stemmer_algorithm = {
    29             'de' : 'german',
    30             'en' : 'english',
    31             'ru' : 'russian',
    32             'uk' : 'ukrainian',
    33         }
    34         self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
    35         self.linked_words = linked_words
    37     def normalize(self, word):
    38         word_chain = []
    39         while word in self.linked_words and not word in word_chain:
    40             word_chain.append(word)
    41             word = self.linked_words[word]
    42         return self.stemmer.stemWord(word.lower())
    44 parser = optparse.OptionParser()
    46 parser.add_option(
    47     "-a", "--no-marks",
    48     help="don't add marks (and don't save marks added by user)",
    49     action="store_true",
    50     dest="no_marks")
    52 parser.add_option(
    53     "-c", "--compressed",
    54     help="show compressed wordlist: one word per group",
    55     action="store_true",
    56     dest="compressed")
    58 parser.add_option(
    59     "-k", "--known-words",
    60     help="put higher words that are similar to the known words (only for English)",
    61     action="store_true",
    62     dest="compressed")
    64 parser.add_option(
    65     "-l", "--language",
    66     help="specify language of text",
    67     action="store",
    68     dest="language")
    70 parser.add_option(
    71     "-f", "--function",
    72     help="filter through subsystem [INTERNAL]",
    73     action="store",
    74     dest="function")
    76 parser.add_option(
    77     "-m", "--merge-tag",
    78     help="merge words tagged with specified tag into the main vocabulary",
    79     action="store",
    80     dest="merge_tag")
    82 parser.add_option(
    83     "-M", "--merge-tagged",
    84     help="merge words tagged with ANY tag into the main vocabulary",
    85     action="store_true",
    86     dest="merge_tagged")
    88 parser.add_option(
    89     "-n", "--non-interactive",
    90     help="non-interactive mode (don't run vi)",
    91     action="store_true",
    92     dest="non_interactive")
    94 parser.add_option(
    95     "-N", "--no-filter",
    96     help="switch off known words filtering",
    97     action="store_true",
    98     dest="no_filter")
   100 parser.add_option(
   101     "-p", "--pages",
   102     help="work with specified pages only (pages = start-stop/total )",
   103     action="store",
   104     dest="pages")
   106 parser.add_option(
   107     "-r", "--remove-tag",
   108     help="remove subvocabulary of specified tag",
   109     action="store",
   110     dest="remove_tag")
   112 parser.add_option(
   113     "-s", "--text-stats",
   114     help="show the text statistics (percentage of known words and so on) and exit",
   115     action="store_true",
   116     dest="text_stats")
   118 parser.add_option(
   119     "-S", "--voc-stats",
   120     help="show your vocabulary statistics (number of words and word groups)",
   121     action="store_true",
   122     dest="voc_stats")
   124 parser.add_option(
   125     "-t", "--tag",
   126     help="tag known words with tag",
   127     action="store",
   128     dest="tag")
   130 parser.add_option(
   131     "-T", "--show-tags",
   132     help="tag known words with tag",
   133     action="store_true",
   134     dest="show_tags")
   136 parser.add_option(
   137     "-2", "--two-words",
   138     help="find 2 words' sequences",
   139     action="store_true",
   140     dest="two_words")
   142 parser.add_option(
   143     "-3", "--three-words",
   144     help="find 3 words' sequences",
   145     action="store_true",
   146     dest="three_words")
   148 def readlines_from_file(filename):
   149     res = []
   150     with codecs.open(filename, "r", "utf-8") as f:
   151         for line in f.readlines():
   152             res += [line]
   153     return res
   155 def readlines_from_stdin():
   156     return codecs.getreader("utf-8")(sys.stdin).readlines()
   158 def words_from_line(line):
   159     line = line.rstrip('\n')
   160     #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
   161     #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
   162     return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
   164 def get_words(lines, group_by=[1]):
   165     """
   166     Returns hash of words in a file
   167     word => number
   168     """
   169     result = {}
   170     (a, b, c) = ("", "", "")
   171     for line in lines:
   172         words = words_from_line(line)
   173         for word in words:
   174             if re.match('[0-9]*$', word):
   175                 continue
   176             result.setdefault(word, 0)
   177             result[word] += 1
   178             if 2 in group_by and a != "" and b != "":
   179                 w = "%s_%s" % (a,b)
   180                 result.setdefault(w, 0)
   181                 result[w] += 1
   182             if 3 in group_by and not "" in [a,b,c]:
   183                 w = "%s_%s_%s" % (a,b,c)
   184                 result.setdefault(w, 0)
   185                 result[w] += 1
   186             (a,b,c) = (b, c, word)
   188     logging.debug(result)
   189     return result
   191 def load_vocabulary():
   192     return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
   194 def notes_filenames():
   195     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
   197 def load_notes(files):
   198     notes = {}
   199     for filename in files:
   200         with codecs.open(filename, "r", "utf-8") as f:
   201             for line in f.readlines():
   202                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   203                 notes.setdefault(word, {})
   204                 notes[word][filename] = note
   205     return notes
   207 def add_notes(lines, notes):
   208     notes_filename = notes_filenames()[0]
   209     result = []
   210     for line in lines:
   211         if line.startswith('#'):
   212             result += [line]
   213         else:
   214             match_object = re.search('^\s*\S+\s*(\S+)', line)
   215             if match_object:
   216                 word = match_object.group(1)
   217                 if word in notes:
   218                     #logging.debug(word)
   219                     #logging.debug(line)
   220                     if notes_filename in notes[word]:
   221                         line = line.rstrip('\n')
   222                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
   223                         #logging.debug(line)
   224                         result += [line]
   225                 else:
   226                     result += [line]
   227             else:
   228                 result += [line]
   229     return result
def remove_notes(lines, notes_group):
    """Strip trailing note text from wordlist lines and persist the notes.

    Lines of the form "<spaces><count><spaces><word><spaces><note>" are
    truncated after the word; the removed note is remembered under its word
    and written back to the primary notes file via save_notes().  Lines
    that do not match the pattern are kept unchanged.  Returns the
    rewritten list of lines (each with a trailing newline).
    """
    notes_filename = notes_filenames()[0]
    # Seed with the notes already stored for this file so that save_notes()
    # rewrites a complete set, not only the notes harvested from lines.
    notes = {}
    for k in notes_group.keys():
        if notes_filename in notes_group[k]:
            notes[k] = notes_group[k][notes_filename]

    result = []
    for line in lines:
        line = line.rstrip('\n')
        # Groups: (1) indent (2) frequency (3) gap (4) word (5) gap (6) note.
        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
        if match_object:
            # Reassemble the line without the note portion.
            result.append("".join([
                match_object.group(1),
                match_object.group(2),
                match_object.group(3),
                match_object.group(4),
                "\n"
                ]))
            notes[match_object.group(4)] = match_object.group(6)
        else:
            result.append(line+"\n")

    save_notes(notes_filename, notes)
    return result
   257 def save_notes(filename, notes):
   258     lines = []
   259     saved_words = []
   260     with codecs.open(filename, "r", "utf-8") as f:
   261         for line in f.readlines():
   262             (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
   263             if word in notes:
   264                 line = "%-29s %s\n" % (word, notes[word])
   265                 saved_words.append(word)
   266             lines.append(line)
   267     for word in [x for x in notes.keys() if not x in saved_words]:
   268         line = "%-29s %s\n" % (word, notes[word])
   269         lines.append(line)
   271     with codecs.open(filename, "w", "utf-8") as f:
   272         for line in lines:
   273             f.write(line)
   276 def substract_dictionary(dict1, dict2):
   277     """
   278     returns dict1 - dict2
   279     """
   280     result = {}
   281     for (k,v) in dict1.items():
   282         if not k in dict2:
   283             result[k] = v
   284     return result
   286 def dump_words(words, filename):
   287     with codecs.open(filename, "w+", "utf-8") as f:
   288         for word in words.keys():
   289             f.write(("%s\n"%word)*words[word])
   291 def error_message(text):
   292     print text
   294 def find_wordgroups_weights(word_pairs, normalizator):
   295     weight = {}
   296     for (num, word) in word_pairs:
   297         normalized = normalizator.normalize(word)
   298         weight.setdefault(normalized, 0)
   299         weight[normalized] += num
   300     return weight
   302 def find_linked_words(notes):
   303     linked_words = {}
   304     for word in notes.keys():
   305         for note in notes[word].values():
   306             if "@" in note:
   307                 result = re.search(r'\@(\S*)', note)
   308                 if result:
   309                     main_word = result.group(1)
   310                     if main_word:
   311                         linked_words[word] = main_word
   312     return linked_words
   314 def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
   315     (num1, word1) = pair1
   316     (num2, word2) = pair2
   318     normalized_word1 = normalizator.normalize(word1)
   319     normalized_word2 = normalizator.normalize(word2)
   321     cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
   322     if cmp_res != 0:
   323         return cmp_res
   324     else:
   325         cmp_res = cmp(normalized_word1, normalized_word2)
   326         if cmp_res != 0:
   327             return cmp_res
   328         else:
   329             return cmp(int(num1), int(num2))
def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    """Write the word list and/or statistics to stdout as UTF-8.

    word_pairs is a sequence of (frequency, word) tuples.  With
    stats_only, a two-line statistics table is printed and the function
    returns.  Otherwise an optional one-line stats header is printed,
    followed by one line per word pair; "# N" marker lines are emitted
    whenever the cumulative share of known words crosses the next level.

    NOTE(review): this function uses Python-2-only constructs
    (range()+range() concatenation, integer '/' division).
    """
    if stats_only:
        # Header row of the statistics table.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%-10s" % x for x in [
                "LANG",
                "KNOWN%",
                "UNKNOWN%",
                "KNOWN",
                "TOTAL",
                "WPS",
                "UWPS*10"
                ]]) + "\n")
        # Values row.  The last four format fields are deliberately (it
        # seems) string-concatenated rather than comma-separated: they use
        # width -11 instead of -10 to make up for the missing join space
        # — TODO confirm this was intended and not missing commas.
        codecs.getwriter("utf-8")(sys.stdout).write(
            " ".join([
                "%(language)-10s",
                "%(percentage)-10.2f",
                "%(percentage_unknown)-10.2f",
                "%(total_known)-11d"
                "%(total)-11d"
                "%(wps)-11d"
                "%(uwps)-11d"
                ]) % stats + "\n")
        return

    if print_stats:
        # One-line summary prepended to the word list as a '#' comment.
        codecs.getwriter("utf-8")(sys.stdout).write(
            "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

    # Thresholds for "# N" marker lines: every 5% from just above the
    # current known percentage up to 95, then every 1% from 90 to 101.
    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    known = int(stats['total_known'])
    total = int(stats['total'])
    current_level = 0
    for word_pair in word_pairs:
        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
        # Each printed word is assumed learned, raising the known count.
        known += word_pair[0]
        if 100.0*known/total >= level_lines[0]:
            current_level = level_lines[0]
            # Consume every level the cumulative percentage has passed,
            # remembering the last one crossed.
            while 100.0*known/total > level_lines[0]:
                current_level = level_lines[0]
                level_lines = level_lines[1:]
            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
   374 def filter_add_notes(args):
   375     lines = readlines_from_file(args[0])
   376     notes = load_notes(notes_filenames())
   377     lines = add_notes(lines, notes)
   378     with codecs.open(args[0], "w", "utf-8") as f:
   379         for line in lines:
   380             f.write(line)
   382 def filter_remove_notes(args):
   383     lines = readlines_from_file(args[0])
   384     notes = load_notes(notes_filenames())
   385     lines = remove_notes(lines, notes)
   386     with codecs.open(args[0], "w", "utf-8") as f:
   387         for line in lines:
   388             f.write(line)
def filter_get_words_group_words_add_stat(args):
    """Main pipeline: read text from stdin, count words, filter known ones,
    optionally group by stem, and print the sorted list with statistics.

    Behavior is driven by environment variables (set by the shell wrapper,
    presumably — TODO confirm): GROUP_WORDS_BY_TWO / GROUP_WORDS_BY_THREE
    enable 2-/3-word sequence counting, STAT_ONLY prints statistics only,
    FILTER_WORDS removes known vocabulary, WORDS_GROUPING sorts words so
    that same-stem groups appear together.  args is unused.

    NOTE(review): sorted(cmp=...) below is Python 2 only.
    """
    vocabulary = load_vocabulary()
    notes = load_notes(notes_filenames())
    lines = readlines_from_stdin()
    group_by = [1]
    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
        group_by.append(2)
    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
        group_by.append(3)
    words = get_words(lines, group_by)
    stats_only = False
    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
        stats_only = True

    # 'total' is counted before known-word filtering so the known/unknown
    # split can be derived from the difference below.
    stats = {}
    stats['total'] = sum(words[x] for x in words.keys())
    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
        words = substract_dictionary(words, vocabulary)

    stats['total_unknown'] = sum(words[x] for x in words.keys())
    stats['total_known'] = stats['total'] - stats['total_unknown']
    stats['percentage'] = 100.0*stats['total_known']/stats['total']
    stats['percentage_unknown'] = 100.0-100.0*stats['total_known']/stats['total']
    stats['groups'] = 0
    stats['words'] = len(words)
    stats['sentences'] = 0  #FIXME
    stats['wps'] = 0        #FIXME
    stats['uwps'] = 0       #FIXME
    stats['language'] = config['language']

    linked_words = find_linked_words(notes)
    normalizator = Normalizator(config['language'], linked_words)

    # Pre-sort by raw frequency, highest first.
    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    if 'WORDS_GROUPING' in os.environ and os.environ['WORDS_GROUPING'] == 'YES':
        words_with_freq = sorted(
                words_with_freq,
                cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
                reverse=True)

    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
   437 (options, args) = parser.parse_args()
   438 if options.language:
   439     config['language'] = options.language
   441 if options.function:
   442     function_names = {
   443         'add_notes' :   filter_add_notes,
   444         'remove_notes': filter_remove_notes,
   445         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
   446     }
   447     if options.function in function_names:
   448         function_names[options.function](args)
   449     else:
   450         error_message("Unkown function %s.\nAvailable functions:\n%s" % (
   451             options.function, "".join(["   "+x for x in sorted(function_names.keys())])))
   452         sys.exit(1)
   457 #os.system("vim")
