new-words
changeset 47:d708e2c1bad8
compressed wordlist support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Mon Feb 07 21:21:17 2011 +0200 (2011-02-07) |
parents | bf0aa8e3c1ce |
children | 7194bdb56475 |
files | new-words-py.sh new-words.py |
line diff
1.1 --- a/new-words-py.sh Fri Feb 04 06:39:25 2011 +0100
1.2 +++ b/new-words-py.sh Mon Feb 07 21:21:17 2011 +0200
1.3 @@ -141,9 +141,10 @@
1.4 }
1.5 get_words_group_words_add_stat()
1.6 {
1.7 - STAT_ONLY="$STAT_ONLY" \
1.8 + COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
1.9 GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
1.10 GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
1.11 + STAT_ONLY="$STAT_ONLY" \
1.12 WORDS_GROUPING="$WORDS_GROUPING" \
1.13 FILTER_WORDS="$FILTER_WORDS" \
1.14 $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
2.1 --- a/new-words.py Fri Feb 04 06:39:25 2011 +0100
2.2 +++ b/new-words.py Mon Feb 07 21:21:17 2011 +0200
2.3 @@ -41,6 +41,17 @@
2.4 word = self.linked_words[word]
2.5 return self.stemmer.stemWord(word.lower())
2.6
2.7 + def best_word_from_group(self, wordpairs_group):
2.8 + """Returns the word that is the most relevant to the wordpairs_group.
2.9 +
2.10 + At the moment: returns the word with minimal length"""
2.11 +
2.12 + minimal_length = min(len(pair[1]) for pair in wordpairs_group)
2.13 + return list(x[1] for x in sorted(
2.14 + (x for x in wordpairs_group if len(x[1]) == minimal_length),
2.15 + key=lambda x:x[0],
2.16 + reverse=True))[0]
2.17 +
2.18 parser = optparse.OptionParser()
2.19
2.20 parser.add_option(
2.21 @@ -215,12 +226,9 @@
2.22 if match_object:
2.23 word = match_object.group(1)
2.24 if word in notes:
2.25 - #logging.debug(word)
2.26 - #logging.debug(line)
2.27 if notes_filename in notes[word]:
2.28 line = line.rstrip('\n')
2.29 line = "%-30s %s\n" % (line, notes[word][notes_filename])
2.30 - #logging.debug(line)
2.31 result += [line]
2.32 else:
2.33 result += [line]
2.34 @@ -328,7 +336,8 @@
2.35 else:
2.36 return cmp(int(num1), int(num2))
2.37
2.38 -def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
2.39 +
2.40 +def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
2.41 if stats_only:
2.42 codecs.getwriter("utf-8")(sys.stdout).write(
2.43 " ".join([
2.44 @@ -361,8 +370,29 @@
2.45 known = int(stats['total_known'])
2.46 total = int(stats['total'])
2.47 current_level = 0
2.48 + old_normalized_word = None
2.49 + words_of_this_group = []
2.50 for word_pair in word_pairs:
2.51 - codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
2.52 +
2.53 + normalized_word = normalizator.normalize(word_pair[1])
2.54 + if old_normalized_word and old_normalized_word != normalized_word:
2.55 + #codecs.getwriter("utf-8")(sys.stdout).write(
2.56 + # "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
2.57 + compressed_word_pair = (
2.58 + sum(x[0] for x in words_of_this_group),
2.59 + normalizator.best_word_from_group(words_of_this_group)
2.60 + )
2.61 + if compressed_wordlist:
2.62 + codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
2.63 + words_of_this_group = []
2.64 +
2.65 + old_normalized_word = normalized_word
2.66 + words_of_this_group.append(word_pair)
2.67 +
2.68 + if not compressed_wordlist:
2.69 + codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
2.70 +
2.71 +
2.72 known += word_pair[0]
2.73 if 100.0*known/total >= level_lines[0]:
2.74 current_level = level_lines[0]
2.75 @@ -401,6 +431,10 @@
2.76 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
2.77 stats_only = True
2.78
2.79 + compressed_wordlist = False
2.80 + if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
2.81 + compressed_wordlist = True
2.82 +
2.83
2.84 stats = {}
2.85 stats['total'] = sum(words[x] for x in words.keys())
2.86 @@ -432,7 +466,13 @@
2.87 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
2.88 reverse=True)
2.89
2.90 - print_words_sorted(words_with_freq, stats, stats_only=stats_only)
2.91 + print_words_sorted(
2.92 + words_with_freq,
2.93 + stats,
2.94 + normalizator,
2.95 + stats_only=stats_only,
2.96 + compressed_wordlist=compressed_wordlist
2.97 + )
2.98
2.99 (options, args) = parser.parse_args()
2.100 if options.language: