# HG changeset patch
# User Igor Chubin <igor@chub.in>
# Date 1297278503 -7200
# Node ID 00286f6bfa8580737e9a35281e2cf06ff7fb5d14
# Parent  7194bdb564754800ff60b171f5ecf4991561eca3
experimental: when -c is specified, use dictionary for compression

diff -r 7194bdb56475 -r 00286f6bfa85 new-words.py
--- a/new-words.py	Tue Feb 08 20:35:38 2011 +0200
+++ b/new-words.py	Wed Feb 09 21:08:23 2011 +0200
@@ -3,6 +3,7 @@
 
 from __future__ import with_statement
 import codecs
+import difflib
 import logging
 import os
 import optparse
@@ -45,13 +46,72 @@
         """Returns the word that is the most relevant to the wordpairs_group.
 
         At the moment: returns the word with minimal length"""
+        
+        def f(x, y):
+            return difflib.SequenceMatcher(
+                        None, 
+                        #(x[-2:] == 'en' and x[:-2].lower() or x.lower()), 
+                        x.lower(),
+                        y.lower()).ratio()
 
         minimal_length = min(len(pair[1]) for pair in wordpairs_group)
-        return list(x[1] for x in sorted(
+        best_match = list(x[1] for x in sorted(
             (x for x in wordpairs_group if len(x[1]) == minimal_length),
             key=lambda x:x[0],
             reverse=True))[0]
 
+        suggestions = self.dictionary_suggestions(best_match)
+        if len(suggestions) == 1:
+            return best_match
+
+        #return best_match
+
+        verb = False
+        corrected_best_match = best_match
+        if best_match[-2:] == 'et':
+            word = best_match[:-1]+"n"
+            sugg = self.dictionary_suggestions(word)
+            if len(sugg) == 1:
+                return word
+            suggestions += sugg
+            corrected_best_match = word
+            corrected_best_match = best_match[:-2]
+            verb = True
+
+        if best_match[-1] == 't':
+            word = best_match[:-1]+"en"
+            sugg = self.dictionary_suggestions(word)
+            if len(sugg) == 1:
+                return word
+            suggestions += sugg
+            corrected_best_match = best_match[:-1]
+            verb = True
+
+        if corrected_best_match[0].lower() == corrected_best_match[0]:
+            suggestions = [ x for x in suggestions
+                if x[0].lower() == x[0] ]
+
+        if suggestions == []:
+            return best_match+"_"
+        return best_match+" "+(" ".join(
+                            sorted(
+                                suggestions,
+                                key = lambda x: f(x, corrected_best_match),
+                                reverse = True
+                                )
+                            )
+                        )
+
+    def dictionary_suggestions(self, word):
+        return [
+            x.decode('utf-8').rstrip('\n')
+            for x 
+                in subprocess.Popen(
+                    ["de-variants", word],
+                    stdout=subprocess.PIPE
+                ).stdout.readlines() ]
+
+
 parser = optparse.OptionParser()
 
 parser.add_option(
@@ -388,11 +448,11 @@
         if old_normalized_word and old_normalized_word != normalized_word:
             #codecs.getwriter("utf-8")(sys.stdout).write(
             #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
-            compressed_word_pair = (
-                sum(x[0] for x in words_of_this_group),
-                normalizator.best_word_from_group(words_of_this_group)
-                )
             if compressed_wordlist:
+                compressed_word_pair = (
+                    sum(x[0] for x in words_of_this_group),
+                    normalizator.best_word_from_group(words_of_this_group)
+                    )
                 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
                 printed_words += 1
             words_of_this_group = []