new-words

diff new-words.py @ 40:c3a50c0d2400

Functions for adding/removing notes + statistics now implemented in Python.

Option -O (old-style) is not supported anymore. If you need old-style new-words, use new-words.sh.
author Igor Chubin <igor@chub.in>
date Sun Jan 23 17:09:44 2011 +0100 (2011-01-23)
parents a598e0d25784
children 4629e08b0d87
line diff
     1.1 --- a/new-words.py	Sun Jan 23 14:25:52 2011 +0100
     1.2 +++ b/new-words.py	Sun Jan 23 17:09:44 2011 +0100
     1.3 @@ -1,6 +1,7 @@
     1.4  #!/usr/bin/env python
     1.5  # -*- coding: utf-8 -*-
     1.6  
     1.7 +from __future__ import with_statement
     1.8  import codecs
     1.9  import logging
    1.10  import os
    1.11 @@ -253,10 +254,6 @@
    1.12              f.write(line)
    1.13  
    1.14  
    1.15 -def print_words_sorted(words_freq):
    1.16 -    for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
    1.17 -        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
    1.18 -
    1.19  def substract_dictionary(dict1, dict2):
    1.20      """
    1.21      returns dict1 - dict2
    1.22 @@ -275,14 +272,12 @@
    1.23  def error_message(text):
    1.24      print text
    1.25  
    1.26 -def find_wordgroups_weights(lines, normalizator):
    1.27 +def find_wordgroups_weights(word_pairs, normalizator):
    1.28      weight = {}
    1.29 -    for line in lines:
    1.30 -        line = re.sub('^\s*', '', line.rstrip('\n'))
    1.31 -        (num, word) = re.split('\s+', line, maxsplit=1)
    1.32 +    for (num, word) in word_pairs:
    1.33          normalized = normalizator.normalize(word)
    1.34          weight.setdefault(normalized, 0)
    1.35 -        weight[normalized] += int(num)
    1.36 +        weight[normalized] += num
    1.37      return weight
    1.38  
    1.39  def find_linked_words(notes):
    1.40 @@ -297,12 +292,9 @@
    1.41                          linked_words[word] = main_word
    1.42      return linked_words
    1.43  
    1.44 -
    1.45 -def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
    1.46 -    line1 = re.sub('^\s*', '', line1.rstrip('\n'))
    1.47 -    (num1, word1) = re.split('\s+', line1, 1)
    1.48 -    line2 = re.sub('^\s*', '', line2.rstrip('\n'))
    1.49 -    (num2, word2) = re.split('\s+', line2, 1)
    1.50 +def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words):
    1.51 +    (num1, word1) = pair1
    1.52 +    (num2, word2) = pair2
    1.53  
    1.54      normalized_word1 = normalizator.normalize(word1)
    1.55      normalized_word2 = normalizator.normalize(word2)
    1.56 @@ -317,25 +309,28 @@
    1.57          else:
    1.58              return cmp(int(num1), int(num2))
    1.59  
    1.60 -def filter_get_words(args):
    1.61 -    vocabulary = load_vocabulary()
    1.62 -    words = get_words(readlines_from_stdin())
    1.63 -    dump_words(words, args[0])
    1.64 -    words = substract_dictionary(words, vocabulary)
    1.65 -    print_words_sorted(words)
    1.66 +def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    1.67 +    if stats_only:
    1.68 +        codecs.getwriter("utf-8")(sys.stdout).write("stat_only")
    1.69 +        return
    1.70  
    1.71 -def filter_group_words(args):
    1.72 -    lines = readlines_from_stdin()
    1.73 -    notes = load_notes(notes_filenames())
    1.74 -    linked_words = find_linked_words(notes)
    1.75 -    normalizator = Normalizator(config['language'], linked_words)
    1.76 +    if print_stats:
    1.77 +        codecs.getwriter("utf-8")(sys.stdout).write(
    1.78 +            "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
    1.79  
    1.80 -    wgw = find_wordgroups_weights(lines, normalizator)
    1.81 -    for line in sorted(
    1.82 -                lines,
    1.83 -                cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
    1.84 -                reverse=True):
    1.85 -        codecs.getwriter("utf-8")(sys.stdout).write(line)
    1.86 +    level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
    1.87 +    known = int(stats['total_known'])
    1.88 +    total = int(stats['total'])
    1.89 +    current_level = 0
    1.90 +    for word_pair in word_pairs:
    1.91 +        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
    1.92 +        known += word_pair[0]
    1.93 +        if 100.0*known/total >= level_lines[0]:
    1.94 +            current_level = level_lines[0]
    1.95 +            while 100.0*known/total > level_lines[0]:
    1.96 +                current_level = level_lines[0]
    1.97 +                level_lines = level_lines[1:]
    1.98 +            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
    1.99  
   1.100  def filter_add_notes(args):
   1.101      lines = readlines_from_file(args[0])
   1.102 @@ -353,16 +348,48 @@
   1.103          for line in lines:
   1.104              f.write(line)
   1.105  
   1.106 +def filter_get_words_group_words_add_stat(args):
   1.107 +    vocabulary = load_vocabulary()
   1.108 +    notes = load_notes(notes_filenames())
   1.109 +    lines = readlines_from_stdin()
   1.110 +    words = get_words(lines)
   1.111 +
   1.112 +    stats = {}
   1.113 +    stats['total'] = sum(words[x] for x in words.keys())
   1.114 +    words = substract_dictionary(words, vocabulary)
   1.115 +
   1.116 +    stats['total_unknown'] = sum(words[x] for x in words.keys())
   1.117 +    stats['total_known'] = stats['total'] - stats['total_unknown']
   1.118 +    stats['percentage'] = "%7.2f"%(100.0*stats['total_known']/stats['total'])
   1.119 +    stats['groups'] = 0
   1.120 +    stats['words'] = len(words)
   1.121 +    stats['sentences'] = 0 #FIXME
   1.122 +    stats['language'] = config['language']
   1.123 +
   1.124 +    linked_words = find_linked_words(notes)
   1.125 +    normalizator = Normalizator(config['language'], linked_words)
   1.126 +
   1.127 +    word_pairs = []
   1.128 +    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
   1.129 +        word_pairs.append((words[k], k))
   1.130 +
   1.131 +    wgw = find_wordgroups_weights(word_pairs, normalizator)
   1.132 +    word_pairs = sorted(
   1.133 +                word_pairs,
   1.134 +                cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
   1.135 +                reverse=True)
   1.136 +
   1.137 +    print_words_sorted(word_pairs, stats)
   1.138 +
   1.139  (options, args) = parser.parse_args()
   1.140  if options.language:
   1.141      config['language'] = options.language
   1.142  
   1.143  if options.function:
   1.144      function_names = {
   1.145 -        'get_words' :   filter_get_words,
   1.146 -        'group_words' : filter_group_words,
   1.147          'add_notes' :   filter_add_notes,
   1.148          'remove_notes': filter_remove_notes,
   1.149 +        'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
   1.150      }
   1.151      if options.function in function_names:
   1.152          function_names[options.function](args)