# HG changeset patch # User Igor Chubin # Date 1297190138 -7200 # Node ID 7194bdb564754800ff60b171f5ecf4991561eca3 # Parent d708e2c1bad814cc2a27aeb0dd0931d1dabd91dc new feature: -r and -R can specify number of words (or percentage) to show diff -r d708e2c1bad8 -r 7194bdb56475 new-words-py.sh --- a/new-words-py.sh Mon Feb 07 21:21:17 2011 +0200 +++ b/new-words-py.sh Tue Feb 08 20:35:38 2011 +0200 @@ -25,7 +25,9 @@ -T show list of active tags -m tag merge the words tagged with "tag" into the main vocabulary -M merge the words tagged with any tag into the main vocabulary - -r tag remove subvocabulary for the "tag" + -d tag delete subvocabulary for the "tag" + -r RANGE show only first RANGE words + -R RANGE show only words lower than RANGE percent -2 -3 find 2 and 3 words' sequences The language of the text can be specified also @@ -84,7 +86,7 @@ SHOW_VOC_STAT=NO COMPRESSED_WORDLIST=NO WORDS_GROUPING=YES -while getopts Gcl:sSkanNp:t:Tm:Mr:23 opt +while getopts Gcl:sSkanNp:t:Tm:Md:r:R:23 opt do case "$opt" in c) COMPRESSED_WORDLIST=YES;; @@ -101,7 +103,9 @@ T) TAGS_LIST_ONLY="YES";; m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";; M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";; - r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";; + d) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";; + r) SHOW_RANGE="$OPTARG";; + R) SHOW_RANGE_PERCENTAGE="$OPTARG";; 2) GROUP_WORDS_BY_TWO=YES;; 3) GROUP_WORDS_BY_THREE=YES;; \?) # unknown flag @@ -141,6 +145,8 @@ } get_words_group_words_add_stat() { + SHOW_RANGE="$SHOW_RANGE" \ + SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE" \ COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \ GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \ GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \ diff -r d708e2c1bad8 -r 7194bdb56475 new-words.py --- a/new-words.py Mon Feb 07 21:21:17 2011 +0200 +++ b/new-words.py Tue Feb 08 20:35:38 2011 +0200 @@ -115,10 +115,10 @@ dest="pages") parser.add_option( - "-r", "--remove-tag", - help="remove subvocabulary of specified tag", + "-d", "--delete-tag", + help="delete subvocabulary of specified tag", action="store", - dest="remove_tag") + dest="delete_tag") parser.add_option( "-s", "--text-stats", @@ -337,7 +337,16 @@ return cmp(int(num1), int(num2)) -def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False): +def print_words_sorted( + word_pairs, + stats, + normalizator, + print_stats=True, + stats_only=False, + compressed_wordlist=False, + show_range=0, + show_range_percentage=0, + ): if stats_only: codecs.getwriter("utf-8")(sys.stdout).write( " ".join([ @@ -372,6 +381,7 @@ current_level = 0 old_normalized_word = None words_of_this_group = [] + printed_words = 0 for word_pair in word_pairs: normalized_word = normalizator.normalize(word_pair[1]) @@ -384,6 +394,7 @@ ) if compressed_wordlist: codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair) + printed_words += 1 words_of_this_group = [] old_normalized_word = normalized_word @@ -391,6 +402,7 @@ if not compressed_wordlist: codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair) + printed_words += 1 known += word_pair[0] @@ -401,6 +413,11 @@ level_lines = level_lines[1:] codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level) + if show_range >0 and printed_words >= show_range: + break + if show_range_percentage >0 and 100.0*known/total >= show_range_percentage: + break + def filter_add_notes(args): lines = readlines_from_file(args[0]) notes = load_notes(notes_filenames()) @@ -422,6 +439,7 @@ notes = load_notes(notes_filenames()) lines = readlines_from_stdin() group_by = [1] + if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES': group_by.append(2) if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES': @@ -435,6 +453,17 @@ if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES': compressed_wordlist = True + show_range = os.environ.get('SHOW_RANGE', '') + if show_range != '': + show_range = int(show_range) + else: + show_range = 0 + show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '') + if show_range_percentage != '': + show_range_percentage = int(show_range_percentage) + else: + show_range_percentage = 0 + stats = {} stats['total'] = sum(words[x] for x in words.keys()) @@ -471,7 +500,9 @@ stats, normalizator, stats_only=stats_only, - compressed_wordlist=compressed_wordlist + compressed_wordlist=compressed_wordlist, + show_range=show_range, + show_range_percentage=show_range_percentage, ) (options, args) = parser.parse_args()