new-words

changeset 47:d708e2c1bad8

compressed wordlist support
author Igor Chubin <igor@chub.in>
date Mon Feb 07 21:21:17 2011 +0200 (2011-02-07)
parents bf0aa8e3c1ce
children 7194bdb56475
files new-words-py.sh new-words.py
line diff
     1.1 --- a/new-words-py.sh	Fri Feb 04 06:39:25 2011 +0100
     1.2 +++ b/new-words-py.sh	Mon Feb 07 21:21:17 2011 +0200
     1.3 @@ -141,9 +141,10 @@
     1.4  }
     1.5  get_words_group_words_add_stat()
     1.6  {
     1.7 -    STAT_ONLY="$STAT_ONLY" \
     1.8 +    COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
     1.9      GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
    1.10      GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
    1.11 +    STAT_ONLY="$STAT_ONLY" \
    1.12      WORDS_GROUPING="$WORDS_GROUPING" \
    1.13      FILTER_WORDS="$FILTER_WORDS" \
    1.14      $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
     2.1 --- a/new-words.py	Fri Feb 04 06:39:25 2011 +0100
     2.2 +++ b/new-words.py	Mon Feb 07 21:21:17 2011 +0200
     2.3 @@ -41,6 +41,17 @@
     2.4              word = self.linked_words[word]
     2.5          return self.stemmer.stemWord(word.lower())
     2.6  
     2.7 +    def best_word_from_group(self, wordpairs_group):
     2.8 +        """Returns the word that is the most relevant to the wordpairs_group.
     2.9 +
    2.10 +        At the moment: returns the word with minimal length; among equally short words, the one whose pair has the largest first element"""
    2.11 +
    2.12 +        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
    2.13 +        return list(x[1] for x in sorted(
    2.14 +            (x for x in wordpairs_group if len(x[1]) == minimal_length),
    2.15 +            key=lambda x:x[0],
    2.16 +            reverse=True))[0]
    2.17 +
    2.18  parser = optparse.OptionParser()
    2.19  
    2.20  parser.add_option(
    2.21 @@ -215,12 +226,9 @@
    2.22              if match_object:
    2.23                  word = match_object.group(1)
    2.24                  if word in notes:
    2.25 -                    #logging.debug(word)
    2.26 -                    #logging.debug(line)
    2.27                      if notes_filename in notes[word]:
    2.28                          line = line.rstrip('\n')
    2.29                          line = "%-30s %s\n" % (line, notes[word][notes_filename])
    2.30 -                        #logging.debug(line)
    2.31                          result += [line]
    2.32                  else:
    2.33                      result += [line]
    2.34 @@ -328,7 +336,8 @@
    2.35          else:
    2.36              return cmp(int(num1), int(num2))
    2.37  
    2.38 -def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
    2.39 +
    2.40 +def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
    2.41      if stats_only:
    2.42          codecs.getwriter("utf-8")(sys.stdout).write(
    2.43              " ".join([
    2.44 @@ -361,8 +370,29 @@
    2.45      known = int(stats['total_known'])
    2.46      total = int(stats['total'])
    2.47      current_level = 0
    2.48 +    old_normalized_word = None
    2.49 +    words_of_this_group = []
    2.50      for word_pair in word_pairs:
    2.51 -        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
    2.52 +
    2.53 +        normalized_word = normalizator.normalize(word_pair[1])
    2.54 +        if old_normalized_word and old_normalized_word != normalized_word:
    2.55 +            #codecs.getwriter("utf-8")(sys.stdout).write(
    2.56 +            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
    2.57 +            compressed_word_pair = (
    2.58 +                sum(x[0] for x in words_of_this_group),
    2.59 +                normalizator.best_word_from_group(words_of_this_group)
    2.60 +                )
    2.61 +            if compressed_wordlist:
    2.62 +                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
    2.63 +            words_of_this_group = []
    2.64 +
    2.65 +        old_normalized_word = normalized_word
    2.66 +        words_of_this_group.append(word_pair)
    2.67 +
    2.68 +        if not compressed_wordlist:
    2.69 +            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
    2.70 +
    2.71 +
    2.72          known += word_pair[0]
    2.73          if 100.0*known/total >= level_lines[0]:
    2.74              current_level = level_lines[0]
    2.75 @@ -401,6 +431,10 @@
    2.76      if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
    2.77          stats_only = True
    2.78  
    2.79 +    compressed_wordlist = False
    2.80 +    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
    2.81 +        compressed_wordlist = True
    2.82 +
    2.83  
    2.84      stats = {}
    2.85      stats['total'] = sum(words[x] for x in words.keys())
    2.86 @@ -432,7 +466,13 @@
    2.87                  cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
    2.88                  reverse=True)
    2.89  
    2.90 -    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
    2.91 +    print_words_sorted(
    2.92 +        words_with_freq,
    2.93 +        stats,
    2.94 +        normalizator,
    2.95 +        stats_only=stats_only,
    2.96 +        compressed_wordlist=compressed_wordlist
    2.97 +        )
    2.98  
    2.99  (options, args) = parser.parse_args()
   2.100  if options.language: