new-words

diff new-words.py @ 47:d708e2c1bad8

compressed wordlist support
author Igor Chubin <igor@chub.in>
date Mon Feb 07 21:21:17 2011 +0200 (2011-02-07)
parents 5f90e44eecfc
children 7194bdb56475
line diff
     1.1 --- a/new-words.py	Fri Feb 04 06:18:50 2011 +0100
     1.2 +++ b/new-words.py	Mon Feb 07 21:21:17 2011 +0200
     1.3 @@ -41,6 +41,17 @@
     1.4              word = self.linked_words[word]
     1.5          return self.stemmer.stemWord(word.lower())
     1.6  
     1.7 +    def best_word_from_group(self, wordpairs_group):
     1.8 +        """Returns the word that is the most relevant to the wordpairs_group.
     1.9 +
    1.10 +        At the moment: returns the word with minimal length"""
    1.11 +
    1.12 +        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
    1.13 +        return list(x[1] for x in sorted(
    1.14 +            (x for x in wordpairs_group if len(x[1]) == minimal_length),
    1.15 +            key=lambda x:x[0],
    1.16 +            reverse=True))[0]
    1.17 +
    1.18  parser = optparse.OptionParser()
    1.19  
    1.20  parser.add_option(
    1.21 @@ -215,12 +226,9 @@
    1.22              if match_object:
    1.23                  word = match_object.group(1)
    1.24                  if word in notes:
    1.25 -                    #logging.debug(word)
    1.26 -                    #logging.debug(line)
    1.27                      if notes_filename in notes[word]:
    1.28                          line = line.rstrip('\n')
    1.29                          line = "%-30s %s\n" % (line, notes[word][notes_filename])
    1.30 -                        #logging.debug(line)
    1.31                          result += [line]
    1.32                  else:
    1.33                      result += [line]
    1.34 @@ -328,7 +336,8 @@
    1.35          else:
    1.36              return cmp(int(num1), int(num2))
    1.37  
    1.38 -def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    1.39 +
    1.40 +def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
    1.41      if stats_only:
    1.42          codecs.getwriter("utf-8")(sys.stdout).write(
    1.43              " ".join([
    1.44 @@ -361,8 +370,29 @@
    1.45      known = int(stats['total_known'])
    1.46      total = int(stats['total'])
    1.47      current_level = 0
    1.48 +    old_normalized_word = None
    1.49 +    words_of_this_group = []
    1.50      for word_pair in word_pairs:
    1.51 -        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
    1.52 +
    1.53 +        normalized_word = normalizator.normalize(word_pair[1])
    1.54 +        if old_normalized_word and old_normalized_word != normalized_word:
    1.55 +            #codecs.getwriter("utf-8")(sys.stdout).write(
    1.56 +            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
    1.57 +            compressed_word_pair = (
    1.58 +                sum(x[0] for x in words_of_this_group),
    1.59 +                normalizator.best_word_from_group(words_of_this_group)
    1.60 +                )
    1.61 +            if compressed_wordlist:
    1.62 +                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
    1.63 +            words_of_this_group = []
    1.64 +
    1.65 +        old_normalized_word = normalized_word
    1.66 +        words_of_this_group.append(word_pair)
    1.67 +
    1.68 +        if not compressed_wordlist:
    1.69 +            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
    1.70 +
    1.71 +
    1.72          known += word_pair[0]
    1.73          if 100.0*known/total >= level_lines[0]:
    1.74              current_level = level_lines[0]
    1.75 @@ -401,6 +431,10 @@
    1.76      if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
    1.77          stats_only = True
    1.78  
    1.79 +    compressed_wordlist = False
    1.80 +    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
    1.81 +        compressed_wordlist = True
    1.82 +
    1.83  
    1.84      stats = {}
    1.85      stats['total'] = sum(words[x] for x in words.keys())
    1.86 @@ -432,7 +466,13 @@
    1.87                  cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
    1.88                  reverse=True)
    1.89  
    1.90 -    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
    1.91 +    print_words_sorted(
    1.92 +        words_with_freq,
    1.93 +        stats,
    1.94 +        normalizator,
    1.95 +        stats_only=stats_only,
    1.96 +        compressed_wordlist=compressed_wordlist
    1.97 +        )
    1.98  
    1.99  (options, args) = parser.parse_args()
   1.100  if options.language: