new-words

changeset 49:00286f6bfa85

experimental: when -c is specified, use the dictionary for compression
author Igor Chubin <igor@chub.in>
date Wed Feb 09 21:08:23 2011 +0200 (2011-02-09)
parents 7194bdb56475
children 4e931db74618
files new-words.py
line diff
     1.1 --- a/new-words.py	Tue Feb 08 20:35:38 2011 +0200
     1.2 +++ b/new-words.py	Wed Feb 09 21:08:23 2011 +0200
     1.3 @@ -3,6 +3,7 @@
     1.4  
     1.5  from __future__ import with_statement
     1.6  import codecs
     1.7 +import difflib
     1.8  import logging
     1.9  import os
    1.10  import optparse
    1.11 @@ -45,13 +46,72 @@
    1.12          """Returns the word that is the most relevant to the wordpairs_group.
    1.13  
    1.14          At the moment: returns the word with minimal length"""
    1.15 +        
    1.16 +        def f(x, y):
    1.17 +            return difflib.SequenceMatcher(
    1.18 +                        None, 
    1.19 +                        #(x[-2:] == 'en' and x[:-2].lower() or x.lower()), 
    1.20 +                        x.lower(),
    1.21 +                        y.lower()).ratio()
    1.22  
    1.23          minimal_length = min(len(pair[1]) for pair in wordpairs_group)
    1.24 -        return list(x[1] for x in sorted(
    1.25 +        best_match = list(x[1] for x in sorted(
    1.26              (x for x in wordpairs_group if len(x[1]) == minimal_length),
    1.27              key=lambda x:x[0],
    1.28              reverse=True))[0]
    1.29  
    1.30 +        suggestions = self.dictionary_suggestions(best_match)
    1.31 +        if len(suggestions) == 1:
    1.32 +            return best_match
    1.33 +
    1.34 +        #return best_match
    1.35 +
    1.36 +        verb = False
    1.37 +        corrected_best_match = best_match
    1.38 +        if best_match[-2:] == 'et':
    1.39 +            word = best_match[:-1]+"n"
    1.40 +            sugg = self.dictionary_suggestions(word)
    1.41 +            if len(sugg) == 1:
    1.42 +                return word
    1.43 +            suggestions += sugg
    1.44 +            corrected_best_match = word
    1.45 +            corrected_best_match = best_match[:-2]
    1.46 +            verb = True
    1.47 +
    1.48 +        if best_match[-1] == 't':
    1.49 +            word = best_match[:-1]+"en"
    1.50 +            sugg = self.dictionary_suggestions(word)
    1.51 +            if len(sugg) == 1:
    1.52 +                return word
    1.53 +            suggestions += sugg
    1.54 +            corrected_best_match = best_match[:-1]
    1.55 +            verb = True
    1.56 +
    1.57 +        if corrected_best_match[0].lower() == corrected_best_match[0]:
    1.58 +            suggestions = [ x for x in suggestions
    1.59 +                if x[0].lower() == x[0] ]
    1.60 +
    1.61 +        if suggestions == []:
    1.62 +            return best_match+"_"
    1.63 +        return best_match+" "+(" ".join(
    1.64 +                            sorted(
    1.65 +                                suggestions,
    1.66 +                                key = lambda x: f(x, corrected_best_match),
    1.67 +                                reverse = True
    1.68 +                                )
    1.69 +                            )
    1.70 +                        )
    1.71 +
    1.72 +    def dictionary_suggestions(self, word):
    1.73 +        return [
    1.74 +            x.decode('utf-8').rstrip('\n')
    1.75 +            for x 
    1.76 +                in subprocess.Popen(
    1.77 +                    ["de-variants", word],
    1.78 +                    stdout=subprocess.PIPE
    1.79 +                ).stdout.readlines() ]
    1.80 +
    1.81 +
    1.82  parser = optparse.OptionParser()
    1.83  
    1.84  parser.add_option(
    1.85 @@ -388,11 +448,11 @@
    1.86          if old_normalized_word and old_normalized_word != normalized_word:
    1.87              #codecs.getwriter("utf-8")(sys.stdout).write(
    1.88              #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
    1.89 -            compressed_word_pair = (
    1.90 -                sum(x[0] for x in words_of_this_group),
    1.91 -                normalizator.best_word_from_group(words_of_this_group)
    1.92 -                )
    1.93              if compressed_wordlist:
    1.94 +                compressed_word_pair = (
    1.95 +                    sum(x[0] for x in words_of_this_group),
    1.96 +                    normalizator.best_word_from_group(words_of_this_group)
    1.97 +                    )
    1.98                  codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
    1.99                  printed_words += 1
   1.100              words_of_this_group = []