# HG changeset patch # User Igor Chubin # Date 1297278503 -7200 # Node ID 00286f6bfa8580737e9a35281e2cf06ff7fb5d14 # Parent 7194bdb564754800ff60b171f5ecf4991561eca3 experimental: when -c specified, use dictionary for compression diff -r 7194bdb56475 -r 00286f6bfa85 new-words.py --- a/new-words.py Tue Feb 08 20:35:38 2011 +0200 +++ b/new-words.py Wed Feb 09 21:08:23 2011 +0200 @@ -3,6 +3,7 @@ from __future__ import with_statement import codecs +import difflib import logging import os import optparse @@ -45,13 +46,72 @@ """Returns the word that is the most relevant to the wordpairs_group. At the moment: returns the word with minimal length""" + + def f(x, y): + return difflib.SequenceMatcher( + None, + #(x[-2:] == 'en' and x[:-2].lower() or x.lower()), + x.lower(), + y.lower()).ratio() minimal_length = min(len(pair[1]) for pair in wordpairs_group) - return list(x[1] for x in sorted( + best_match = list(x[1] for x in sorted( (x for x in wordpairs_group if len(x[1]) == minimal_length), key=lambda x:x[0], reverse=True))[0] + suggestions = self.dictionary_suggestions(best_match) + if len(suggestions) == 1: + return best_match + + #return best_match + + verb = False + corrected_best_match = best_match + if best_match[-2:] == 'et': + word = best_match[:-1]+"n" + sugg = self.dictionary_suggestions(word) + if len(sugg) == 1: + return word + suggestions += sugg + corrected_best_match = word + corrected_best_match = best_match[:-2] + verb = True + + if best_match[-1] == 't': + word = best_match[:-1]+"en" + sugg = self.dictionary_suggestions(word) + if len(sugg) == 1: + return word + suggestions += sugg + corrected_best_match = best_match[:-1] + verb = True + + if corrected_best_match[0].lower() == corrected_best_match[0]: + suggestions = [ x for x in suggestions + if x[0].lower() == x[0] ] + + if suggestions == []: + return best_match+"_" + return best_match+" "+(" ".join( + sorted( + suggestions, + key = lambda x: f(x, corrected_best_match), + reverse = True + ) + ) + ) + + def dictionary_suggestions(self, word): + return [ + x.decode('utf-8').rstrip('\n') + for x + in subprocess.Popen( + ["de-variants", word], + stdout=subprocess.PIPE + ).stdout.readlines() ] + + parser = optparse.OptionParser() parser.add_option( @@ -388,11 +448,11 @@ if old_normalized_word and old_normalized_word != normalized_word: #codecs.getwriter("utf-8")(sys.stdout).write( # "### %s\n" % normalizator.best_word_from_group(words_of_this_group)) - compressed_word_pair = ( - sum(x[0] for x in words_of_this_group), - normalizator.best_word_from_group(words_of_this_group) - ) if compressed_wordlist: + compressed_word_pair = ( + sum(x[0] for x in words_of_this_group), + normalizator.best_word_from_group(words_of_this_group) + ) codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair) printed_words += 1 words_of_this_group = []