new-words
diff new-words.py @ 47:d708e2c1bad8
compressed wordlist support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Mon Feb 07 21:21:17 2011 +0200 (2011-02-07) |
parents | 5f90e44eecfc |
children | 7194bdb56475 |
line diff
1.1 --- a/new-words.py Fri Feb 04 06:18:50 2011 +0100
1.2 +++ b/new-words.py Mon Feb 07 21:21:17 2011 +0200
1.3 @@ -41,6 +41,17 @@
1.4 word = self.linked_words[word]
1.5 return self.stemmer.stemWord(word.lower())
1.6
1.7 + def best_word_from_group(self, wordpairs_group):
1.8 + """Returns the word that is the most relevant to the wordpairs_group.
1.9 +
1.10 + At the moment: returns the word with minimal length"""
1.11 +
1.12 + minimal_length = min(len(pair[1]) for pair in wordpairs_group)
1.13 + return list(x[1] for x in sorted(
1.14 + (x for x in wordpairs_group if len(x[1]) == minimal_length),
1.15 + key=lambda x:x[0],
1.16 + reverse=True))[0]
1.17 +
1.18 parser = optparse.OptionParser()
1.19
1.20 parser.add_option(
1.21 @@ -215,12 +226,9 @@
1.22 if match_object:
1.23 word = match_object.group(1)
1.24 if word in notes:
1.25 - #logging.debug(word)
1.26 - #logging.debug(line)
1.27 if notes_filename in notes[word]:
1.28 line = line.rstrip('\n')
1.29 line = "%-30s %s\n" % (line, notes[word][notes_filename])
1.30 - #logging.debug(line)
1.31 result += [line]
1.32 else:
1.33 result += [line]
1.34 @@ -328,7 +336,8 @@
1.35 else:
1.36 return cmp(int(num1), int(num2))
1.37
1.38 -def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
1.39 +
1.40 +def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
1.41 if stats_only:
1.42 codecs.getwriter("utf-8")(sys.stdout).write(
1.43 " ".join([
1.44 @@ -361,8 +370,29 @@
1.45 known = int(stats['total_known'])
1.46 total = int(stats['total'])
1.47 current_level = 0
1.48 + old_normalized_word = None
1.49 + words_of_this_group = []
1.50 for word_pair in word_pairs:
1.51 - codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
1.52 +
1.53 + normalized_word = normalizator.normalize(word_pair[1])
1.54 + if old_normalized_word and old_normalized_word != normalized_word:
1.55 + #codecs.getwriter("utf-8")(sys.stdout).write(
1.56 + # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
1.57 + compressed_word_pair = (
1.58 + sum(x[0] for x in words_of_this_group),
1.59 + normalizator.best_word_from_group(words_of_this_group)
1.60 + )
1.61 + if compressed_wordlist:
1.62 + codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
1.63 + words_of_this_group = []
1.64 +
1.65 + old_normalized_word = normalized_word
1.66 + words_of_this_group.append(word_pair)
1.67 +
1.68 + if not compressed_wordlist:
1.69 + codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
1.70 +
1.71 +
1.72 known += word_pair[0]
1.73 if 100.0*known/total >= level_lines[0]:
1.74 current_level = level_lines[0]
1.75 @@ -401,6 +431,10 @@
1.76 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
1.77 stats_only = True
1.78
1.79 + compressed_wordlist = False
1.80 + if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
1.81 + compressed_wordlist = True
1.82 +
1.83
1.84 stats = {}
1.85 stats['total'] = sum(words[x] for x in words.keys())
1.86 @@ -432,7 +466,13 @@
1.87 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
1.88 reverse=True)
1.89
1.90 - print_words_sorted(words_with_freq, stats, stats_only=stats_only)
1.91 + print_words_sorted(
1.92 + words_with_freq,
1.93 + stats,
1.94 + normalizator,
1.95 + stats_only=stats_only,
1.96 + compressed_wordlist=compressed_wordlist
1.97 + )
1.98
1.99 (options, args) = parser.parse_args()
1.100 if options.language: