# HG changeset patch
# User Igor Chubin
# Date 1297106477 -7200
# Node ID d708e2c1bad814cc2a27aeb0dd0931d1dabd91dc
# Parent  bf0aa8e3c1ce0186a3d9502f74818fe205bd3d70
compressed wordlist support

diff -r bf0aa8e3c1ce -r d708e2c1bad8 new-words-py.sh
--- a/new-words-py.sh	Fri Feb 04 06:39:25 2011 +0100
+++ b/new-words-py.sh	Mon Feb 07 21:21:17 2011 +0200
@@ -141,9 +141,10 @@
 }
 
 get_words_group_words_add_stat() {
-    STAT_ONLY="$STAT_ONLY" \
+    COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
     GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
     GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
+    STAT_ONLY="$STAT_ONLY" \
     WORDS_GROUPING="$WORDS_GROUPING" \
     FILTER_WORDS="$FILTER_WORDS" \
     $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
diff -r bf0aa8e3c1ce -r d708e2c1bad8 new-words.py
--- a/new-words.py	Fri Feb 04 06:39:25 2011 +0100
+++ b/new-words.py	Mon Feb 07 21:21:17 2011 +0200
@@ -41,6 +41,17 @@
             word = self.linked_words[word]
         return self.stemmer.stemWord(word.lower())
 
+    def best_word_from_group(self, wordpairs_group):
+        """Returns the word that is the most relevant to the wordpairs_group.
+
+        At the moment: returns the word with minimal length"""
+
+        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
+        return list(x[1] for x in sorted(
+            (x for x in wordpairs_group if len(x[1]) == minimal_length),
+            key=lambda x:x[0],
+            reverse=True))[0]
+
 parser = optparse.OptionParser()
 
 parser.add_option(
@@ -215,12 +226,9 @@
             if match_object:
                 word = match_object.group(1)
                 if word in notes:
-                    #logging.debug(word)
-                    #logging.debug(line)
                     if notes_filename in notes[word]:
                         line = line.rstrip('\n')
                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
-                        #logging.debug(line)
                         result += [line]
                 else:
                     result += [line]
@@ -328,7 +336,8 @@
     else:
         return cmp(int(num1), int(num2))
 
-def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
+
+def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
     if stats_only:
         codecs.getwriter("utf-8")(sys.stdout).write(
             " ".join([
@@ -361,8 +370,29 @@
     known = int(stats['total_known'])
     total = int(stats['total'])
     current_level = 0
+    old_normalized_word = None
+    words_of_this_group = []
     for word_pair in word_pairs:
-        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+
+        normalized_word = normalizator.normalize(word_pair[1])
+        if old_normalized_word and old_normalized_word != normalized_word:
+            #codecs.getwriter("utf-8")(sys.stdout).write(
+            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
+            compressed_word_pair = (
+                sum(x[0] for x in words_of_this_group),
+                normalizator.best_word_from_group(words_of_this_group)
+                )
+            if compressed_wordlist:
+                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
+            words_of_this_group = []
+
+        old_normalized_word = normalized_word
+        words_of_this_group.append(word_pair)
+
+        if not compressed_wordlist:
+            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+
+
         known += word_pair[0]
         if 100.0*known/total >= level_lines[0]:
             current_level = level_lines[0]
@@ -401,6 +431,10 @@
     if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
         stats_only = True
 
+    compressed_wordlist = False
+    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
+        compressed_wordlist = True
+
     stats = {}
     stats['total'] = sum(words[x] for x in words.keys())
 
@@ -432,7 +466,13 @@
         cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
         reverse=True)
 
-    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
+    print_words_sorted(
+        words_with_freq,
+        stats,
+        normalizator,
+        stats_only=stats_only,
+        compressed_wordlist=compressed_wordlist
+        )
 
 (options, args) = parser.parse_args()
 if options.language:
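
Illustrative note, not part of the patch: the new compressed output collapses consecutive word pairs whose words normalize to the same stem into a single pair, summing the first element of each pair (the occurrence count) and keeping the word chosen by best_word_from_group(). A minimal stand-alone sketch of that selection rule, with made-up (count, word) data mirroring the logic added above:

    # Hypothetical data; reproduces the selection logic of best_word_from_group().
    wordpairs_group = [(3, "books"), (7, "book"), (1, "booked")]
    minimal_length = min(len(pair[1]) for pair in wordpairs_group)
    best = list(x[1] for x in sorted(
        (x for x in wordpairs_group if len(x[1]) == minimal_length),
        key=lambda x: x[0],
        reverse=True))[0]
    print(best)  # "book": shortest word in the group, ties broken by highest count

The behaviour is switched on through the COMPRESSED_WORDLIST environment variable, which new-words-py.sh now forwards and new-words.py checks against the value YES.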