# HG changeset patch
# User Igor Chubin <igor@chub.in>
# Date 1297190138 -7200
# Node ID 7194bdb564754800ff60b171f5ecf4991561eca3
# Parent  d708e2c1bad814cc2a27aeb0dd0931d1dabd91dc
new feature: -r and -R specify the number of words (or the percentage) to show; deleting a tag's subvocabulary moves from -r to -d

diff -r d708e2c1bad8 -r 7194bdb56475 new-words-py.sh
--- a/new-words-py.sh	Mon Feb 07 21:21:17 2011 +0200
+++ b/new-words-py.sh	Tue Feb 08 20:35:38 2011 +0200
@@ -25,7 +25,9 @@
     -T          show list of active tags
     -m tag      merge the words tagged with "tag" into the main vocabulary
     -M          merge the words tagged with any tag into the main vocabulary 
-    -r tag      remove subvocabulary for the "tag"
+    -d tag      delete subvocabulary for the "tag"
+    -r RANGE    show only first RANGE words
+    -R RANGE    show only words lower than RANGE percent
     -2 -3       find 2 and 3 words' sequences
 
 The language of the text can be specified also
@@ -84,7 +86,7 @@
 SHOW_VOC_STAT=NO
 COMPRESSED_WORDLIST=NO
 WORDS_GROUPING=YES
-while getopts Gcl:sSkanNp:t:Tm:Mr:23 opt
+while getopts Gcl:sSkanNp:t:Tm:Md:r:R:23 opt
 do
     case "$opt" in
       c)  COMPRESSED_WORDLIST=YES;;
@@ -101,7 +103,9 @@
       T)  TAGS_LIST_ONLY="YES";;
       m)  DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
       M)  DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
-      r)  REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
+      d)  REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
+      r)  SHOW_RANGE="$OPTARG";;
+      R)  SHOW_RANGE_PERCENTAGE="$OPTARG";;
       2)  GROUP_WORDS_BY_TWO=YES;;
       3)  GROUP_WORDS_BY_THREE=YES;;
       \?)       # unknown flag
@@ -141,6 +145,8 @@
 }
 get_words_group_words_add_stat()
 {
+    SHOW_RANGE="$SHOW_RANGE" \
+    SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE" \
     COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
     GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
     GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
diff -r d708e2c1bad8 -r 7194bdb56475 new-words.py
--- a/new-words.py	Mon Feb 07 21:21:17 2011 +0200
+++ b/new-words.py	Tue Feb 08 20:35:38 2011 +0200
@@ -115,10 +115,10 @@
     dest="pages")
 
 parser.add_option(
-    "-r", "--remove-tag",
-    help="remove subvocabulary of specified tag",
+    "-d", "--delete-tag",
+    help="delete subvocabulary of specified tag",
     action="store",
-    dest="remove_tag")
+    dest="delete_tag")
 
 parser.add_option(
     "-s", "--text-stats",
@@ -337,7 +337,16 @@
             return cmp(int(num1), int(num2))
 
 
-def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
+def print_words_sorted(
+        word_pairs,
+        stats,
+        normalizator,
+        print_stats=True,
+        stats_only=False,
+        compressed_wordlist=False,
+        show_range=0,
+        show_range_percentage=0,
+        ):
     if stats_only:
         codecs.getwriter("utf-8")(sys.stdout).write(
             " ".join([
@@ -372,6 +381,7 @@
     current_level = 0
     old_normalized_word = None
     words_of_this_group = []
+    printed_words = 0
     for word_pair in word_pairs:
 
         normalized_word = normalizator.normalize(word_pair[1])
@@ -384,6 +394,7 @@
                 )
             if compressed_wordlist:
                 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
+                printed_words += 1
             words_of_this_group = []
 
         old_normalized_word = normalized_word
@@ -391,6 +402,7 @@
 
         if not compressed_wordlist:
             codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+            printed_words += 1
 
 
         known += word_pair[0]
@@ -401,6 +413,11 @@
                 level_lines = level_lines[1:]
             codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
 
+        if show_range >0 and printed_words >= show_range:
+            break
+        if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
+            break
+
 def filter_add_notes(args):
     lines = readlines_from_file(args[0])
     notes = load_notes(notes_filenames())
@@ -422,6 +439,7 @@
     notes = load_notes(notes_filenames())
     lines = readlines_from_stdin()
     group_by = [1]
+
     if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
         group_by.append(2)
     if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
@@ -435,6 +453,17 @@
     if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
         compressed_wordlist = True
 
+    show_range = os.environ.get('SHOW_RANGE', '')
+    if show_range != '':
+        show_range = int(show_range)
+    else:
+        show_range = 0
+    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
+    if show_range_percentage != '':
+        show_range_percentage = int(show_range_percentage)
+    else:
+        show_range_percentage = 0
+
 
     stats = {}
     stats['total'] = sum(words[x] for x in words.keys())
@@ -471,7 +500,9 @@
         stats,
         normalizator,
         stats_only=stats_only,
-        compressed_wordlist=compressed_wordlist
+        compressed_wordlist=compressed_wordlist,
+        show_range=show_range,
+        show_range_percentage=show_range_percentage,
         )
 
 (options, args) = parser.parse_args()