new-words
changeset 49:00286f6bfa85
experimental: when -c specified, use dictionary for compression
author | Igor Chubin <igor@chub.in> |
---|---|
date | Wed Feb 09 21:08:23 2011 +0200 (2011-02-09) |
parents | 7194bdb56475 |
children | 4e931db74618 |
files | new-words.py |
line diff
```
1.1   --- a/new-words.py Tue Feb 08 20:35:38 2011 +0200
1.2   +++ b/new-words.py Wed Feb 09 21:08:23 2011 +0200
1.3   @@ -3,6 +3,7 @@
1.4    
1.5    from __future__ import with_statement
1.6    import codecs
1.7   +import difflib
1.8    import logging
1.9    import os
1.10   import optparse
1.11  @@ -45,13 +46,72 @@
1.12           """Returns the word that is the most relevant to the wordpairs_group.
1.13   
1.14           At the moment: returns the word with minimal length"""
1.15  +
1.16  +        def f(x, y):
1.17  +            return difflib.SequenceMatcher(
1.18  +                None,
1.19  +                #(x[-2:] == 'en' and x[:-2].lower() or x.lower()),
1.20  +                x.lower(),
1.21  +                y.lower()).ratio()
1.22   
1.23           minimal_length = min(len(pair[1]) for pair in wordpairs_group)
1.24  -        return list(x[1] for x in sorted(
1.25  +        best_match = list(x[1] for x in sorted(
1.26               (x for x in wordpairs_group if len(x[1]) == minimal_length),
1.27               key=lambda x:x[0],
1.28               reverse=True))[0]
1.29   
1.30  +        suggestions = self.dictionary_suggestions(best_match)
1.31  +        if len(suggestions) == 1:
1.32  +            return best_match
1.33  +
1.34  +        #return best_match
1.35  +
1.36  +        verb = False
1.37  +        corrected_best_match = best_match
1.38  +        if best_match[-2:] == 'et':
1.39  +            word = best_match[:-1]+"n"
1.40  +            sugg = self.dictionary_suggestions(word)
1.41  +            if len(sugg) == 1:
1.42  +                return word
1.43  +            suggestions += sugg
1.44  +            corrected_best_match = word
1.45  +            corrected_best_match = best_match[:-2]
1.46  +            verb = True
1.47  +
1.48  +        if best_match[-1] == 't':
1.49  +            word = best_match[:-1]+"en"
1.50  +            sugg = self.dictionary_suggestions(word)
1.51  +            if len(sugg) == 1:
1.52  +                return word
1.53  +            suggestions += sugg
1.54  +            corrected_best_match = best_match[:-1]
1.55  +            verb = True
1.56  +
1.57  +        if corrected_best_match[0].lower() == corrected_best_match[0]:
1.58  +            suggestions = [ x for x in suggestions
1.59  +                if x[0].lower() == x[0] ]
1.60  +
1.61  +        if suggestions == []:
1.62  +            return best_match+"_"
1.63  +        return best_match+" "+(" ".join(
1.64  +                sorted(
1.65  +                    suggestions,
1.66  +                    key = lambda x: f(x, corrected_best_match),
1.67  +                    reverse = True
1.68  +                    )
1.69  +                )
1.70  +            )
1.71  +
1.72  +    def dictionary_suggestions(self, word):
1.73  +        return [
1.74  +            x.decode('utf-8').rstrip('\n')
1.75  +            for x
1.76  +            in subprocess.Popen(
1.77  +                ["de-variants", word],
1.78  +                stdout=subprocess.PIPE
1.79  +                ).stdout.readlines() ]
1.80  +
1.81  +
1.82   parser = optparse.OptionParser()
1.83   
1.84   parser.add_option(
1.85  @@ -388,11 +448,11 @@
1.86           if old_normalized_word and old_normalized_word != normalized_word:
1.87               #codecs.getwriter("utf-8")(sys.stdout).write(
1.88               # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
1.89  -            compressed_word_pair = (
1.90  -                sum(x[0] for x in words_of_this_group),
1.91  -                normalizator.best_word_from_group(words_of_this_group)
1.92  -                )
1.93               if compressed_wordlist:
1.94  +                compressed_word_pair = (
1.95  +                    sum(x[0] for x in words_of_this_group),
1.96  +                    normalizator.best_word_from_group(words_of_this_group)
1.97  +                    )
1.98                   codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
1.99                   printed_words += 1
1.100            words_of_this_group = []
```