new-words
changeset 48:7194bdb56475
new feature: -r and -R can specify number of words (or percentage) to show
author | Igor Chubin <igor@chub.in> |
---|---|
date | Tue Feb 08 20:35:38 2011 +0200 (2011-02-08) |
parents | d708e2c1bad8 |
children | 00286f6bfa85 |
files | new-words-py.sh new-words.py |
line diff
```
1.1 --- a/new-words-py.sh Mon Feb 07 21:21:17 2011 +0200
1.2 +++ b/new-words-py.sh Tue Feb 08 20:35:38 2011 +0200
1.3 @@ -25,7 +25,9 @@
1.4  -T show list of active tags
1.5  -m tag merge the words tagged with "tag" into the main vocabulary
1.6  -M merge the words tagged with any tag into the main vocabulary
1.7 - -r tag remove subvocabulary for the "tag"
1.8 + -d tag delete subvocabulary for the "tag"
1.9 + -r RANGE show only first RANGE words
1.10 + -R RANGE show only words lower than RANGE percent
1.11  -2 -3 find 2 and 3 words' sequences
1.12
1.13  The language of the text can be specified also
1.14 @@ -84,7 +86,7 @@
1.15  SHOW_VOC_STAT=NO
1.16  COMPRESSED_WORDLIST=NO
1.17  WORDS_GROUPING=YES
1.18 -while getopts Gcl:sSkanNp:t:Tm:Mr:23 opt
1.19 +while getopts Gcl:sSkanNp:t:Tm:Md:r:R:23 opt
1.20  do
1.21  case "$opt" in
1.22  c) COMPRESSED_WORDLIST=YES;;
1.23 @@ -101,7 +103,9 @@
1.24  T) TAGS_LIST_ONLY="YES";;
1.25  m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
1.26  M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
1.27 - r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
1.28 + d) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
1.29 + r) SHOW_RANGE="$OPTARG";;
1.30 + R) SHOW_RANGE_PERCENTAGE="$OPTARG";;
1.31  2) GROUP_WORDS_BY_TWO=YES;;
1.32  3) GROUP_WORDS_BY_THREE=YES;;
1.33  \?) # unknown flag
1.34 @@ -141,6 +145,8 @@
1.35  }
1.36  get_words_group_words_add_stat()
1.37  {
1.38 + SHOW_RANGE="$SHOW_RANGE" \
1.39 + SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE" \
1.40  COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
1.41  GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
1.42  GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
```
```
2.1 --- a/new-words.py Mon Feb 07 21:21:17 2011 +0200
2.2 +++ b/new-words.py Tue Feb 08 20:35:38 2011 +0200
2.3 @@ -115,10 +115,10 @@
2.4  dest="pages")
2.5
2.6  parser.add_option(
2.7 - "-r", "--remove-tag",
2.8 - help="remove subvocabulary of specified tag",
2.9 + "-d", "--delete-tag",
2.10 + help="delete subvocabulary of specified tag",
2.11  action="store",
2.12 - dest="remove_tag")
2.13 + dest="delete_tag")
2.14
2.15  parser.add_option(
2.16  "-s", "--text-stats",
2.17 @@ -337,7 +337,16 @@
2.18  return cmp(int(num1), int(num2))
2.19
2.20
2.21 -def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
2.22 +def print_words_sorted(
2.23 + word_pairs,
2.24 + stats,
2.25 + normalizator,
2.26 + print_stats=True,
2.27 + stats_only=False,
2.28 + compressed_wordlist=False,
2.29 + show_range=0,
2.30 + show_range_percentage=0,
2.31 + ):
2.32  if stats_only:
2.33  codecs.getwriter("utf-8")(sys.stdout).write(
2.34  " ".join([
2.35 @@ -372,6 +381,7 @@
2.36  current_level = 0
2.37  old_normalized_word = None
2.38  words_of_this_group = []
2.39 + printed_words = 0
2.40  for word_pair in word_pairs:
2.41
2.42  normalized_word = normalizator.normalize(word_pair[1])
2.43 @@ -384,6 +394,7 @@
2.44  )
2.45  if compressed_wordlist:
2.46  codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
2.47 + printed_words += 1
2.48  words_of_this_group = []
2.49
2.50  old_normalized_word = normalized_word
2.51 @@ -391,6 +402,7 @@
2.52
2.53  if not compressed_wordlist:
2.54  codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
2.55 + printed_words += 1
2.56
2.57
2.58  known += word_pair[0]
2.59 @@ -401,6 +413,11 @@
2.60  level_lines = level_lines[1:]
2.61  codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
2.62
2.63 + if show_range >0 and printed_words >= show_range:
2.64 + break
2.65 + if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
2.66 + break
2.67 +
2.68  def filter_add_notes(args):
2.69  lines = readlines_from_file(args[0])
2.70  notes = load_notes(notes_filenames())
2.71 @@ -422,6 +439,7 @@
2.72  notes = load_notes(notes_filenames())
2.73  lines = readlines_from_stdin()
2.74  group_by = [1]
2.75 +
2.76  if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
2.77  group_by.append(2)
2.78  if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
2.79 @@ -435,6 +453,17 @@
2.80  if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
2.81  compressed_wordlist = True
2.82
2.83 + show_range = os.environ.get('SHOW_RANGE', '')
2.84 + if show_range != '':
2.85 + show_range = int(show_range)
2.86 + else:
2.87 + show_range = 0
2.88 + show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
2.89 + if show_range_percentage != '':
2.90 + show_range_percentage = int(show_range_percentage)
2.91 + else:
2.92 + show_range_percentage = 0
2.93 +
2.94
2.95  stats = {}
2.96  stats['total'] = sum(words[x] for x in words.keys())
2.97 @@ -471,7 +500,9 @@
2.98  stats,
2.99  normalizator,
2.100  stats_only=stats_only,
2.101 - compressed_wordlist=compressed_wordlist
2.102 + compressed_wordlist=compressed_wordlist,
2.103 + show_range=show_range,
2.104 + show_range_percentage=show_range_percentage,
2.105  )
2.106
2.107  (options, args) = parser.parse_args()
```