# HG changeset patch
# User Igor Chubin
# Date 1297106477 -7200
# Node ID d708e2c1bad814cc2a27aeb0dd0931d1dabd91dc
# Parent  bf0aa8e3c1ce0186a3d9502f74818fe205bd3d70
compressed wordlist support

diff -r bf0aa8e3c1ce -r d708e2c1bad8 new-words-py.sh
--- a/new-words-py.sh	Fri Feb 04 06:39:25 2011 +0100
+++ b/new-words-py.sh	Mon Feb 07 21:21:17 2011 +0200
@@ -141,9 +141,10 @@
 }
 
 get_words_group_words_add_stat() {
-    STAT_ONLY="$STAT_ONLY" \
+    COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
     GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
     GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
+    STAT_ONLY="$STAT_ONLY" \
     WORDS_GROUPING="$WORDS_GROUPING" \
     FILTER_WORDS="$FILTER_WORDS" \
     $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
diff -r bf0aa8e3c1ce -r d708e2c1bad8 new-words.py
--- a/new-words.py	Fri Feb 04 06:39:25 2011 +0100
+++ b/new-words.py	Mon Feb 07 21:21:17 2011 +0200
@@ -41,6 +41,17 @@
             word = self.linked_words[word]
         return self.stemmer.stemWord(word.lower())
 
+    def best_word_from_group(self, wordpairs_group):
+        """Returns the word that is the most relevant to the wordpairs_group.
+
+        At the moment: returns the word with minimal length"""
+
+        minimal_length = min(len(pair[1]) for pair in wordpairs_group)
+        return list(x[1] for x in sorted(
+            (x for x in wordpairs_group if len(x[1]) == minimal_length),
+            key=lambda x:x[0],
+            reverse=True))[0]
+
 parser = optparse.OptionParser()
 
 parser.add_option(
@@ -215,12 +226,9 @@
             if match_object:
                 word = match_object.group(1)
                 if word in notes:
-                    #logging.debug(word)
-                    #logging.debug(line)
                     if notes_filename in notes[word]:
                         line = line.rstrip('\n')
                         line = "%-30s %s\n" % (line, notes[word][notes_filename])
-                        #logging.debug(line)
                         result += [line]
                 else:
                     result += [line]
@@ -328,7 +336,8 @@
     else:
         return cmp(int(num1), int(num2))
 
-def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False):
+
+def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
     if stats_only:
         codecs.getwriter("utf-8")(sys.stdout).write(
             " ".join([
@@ -361,8 +370,29 @@
     known = int(stats['total_known'])
     total = int(stats['total'])
     current_level = 0
+    old_normalized_word = None
+    words_of_this_group = []
     for word_pair in word_pairs:
-        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+
+        normalized_word = normalizator.normalize(word_pair[1])
+        if old_normalized_word and old_normalized_word != normalized_word:
+            #codecs.getwriter("utf-8")(sys.stdout).write(
+            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
+            compressed_word_pair = (
+                sum(x[0] for x in words_of_this_group),
+                normalizator.best_word_from_group(words_of_this_group)
+                )
+            if compressed_wordlist:
+                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
+            words_of_this_group = []
+
+        old_normalized_word = normalized_word
+        words_of_this_group.append(word_pair)
+
+        if not compressed_wordlist:
+            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+
+
         known += word_pair[0]
         if 100.0*known/total >= level_lines[0]:
             current_level = level_lines[0]
@@ -401,6 +431,10 @@
     if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
         stats_only = True
 
+    compressed_wordlist = False
+    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
+        compressed_wordlist = True
+
     stats = {}
     stats['total'] = sum(words[x] for x in words.keys())
 
@@ -432,7 +466,13 @@
         cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
         reverse=True)
 
-    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
+    print_words_sorted(
+        words_with_freq,
+        stats,
+        normalizator,
+        stats_only=stats_only,
+        compressed_wordlist=compressed_wordlist
+        )
 
 (options, args) = parser.parse_args()
 if options.language:
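
Illustrative note, not part of the patch: the new compressed output collapses consecutive word pairs whose words normalize to the same stem into a single pair, summing the first element of each pair (the occurrence count) and keeping the word chosen by best_word_from_group(). A minimal stand-alone sketch of that selection rule, with made-up (count, word) data mirroring the logic added above:

    # Hypothetical data; reproduces the selection logic of best_word_from_group().
    wordpairs_group = [(3, "books"), (7, "book"), (1, "booked")]
    minimal_length = min(len(pair[1]) for pair in wordpairs_group)
    best = list(x[1] for x in sorted(
        (x for x in wordpairs_group if len(x[1]) == minimal_length),
        key=lambda x: x[0],
        reverse=True))[0]
    print(best)  # "book": shortest word in the group, ties broken by highest count

The behaviour is switched on through the COMPRESSED_WORDLIST environment variable, which new-words-py.sh now forwards and new-words.py checks against the value YES.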