new-words

changeset 48:7194bdb56475
new feature: -r RANGE shows only the first RANGE words and -R RANGE shows only words below RANGE percent; deleting a tag's subvocabulary moves from -r to -d
author: Igor Chubin <igor@chub.in>
date: Tue Feb 08 20:35:38 2011 +0200 (2011-02-08)
parents: d708e2c1bad8
children: 00286f6bfa85
files: new-words-py.sh new-words.py
     1.1 --- a/new-words-py.sh	Mon Feb 07 21:21:17 2011 +0200
     1.2 +++ b/new-words-py.sh	Tue Feb 08 20:35:38 2011 +0200
     1.3 @@ -25,7 +25,9 @@
     1.4      -T          show list of active tags
     1.5      -m tag      merge the words tagged with "tag" into the main vocabulary
     1.6      -M          merge the words tagged with any tag into the main vocabulary 
     1.7 -    -r tag      remove subvocabulary for the "tag"
     1.8 +    -d tag      delete subvocabulary for the "tag"
     1.9 +    -r RANGE    show only first RANGE words
    1.10 +    -R RANGE    show only words lower than RANGE percent
    1.11      -2 -3       find 2 and 3 words' sequences
    1.12  
    1.13  The language of the text can be specified also
    1.14 @@ -84,7 +86,7 @@
    1.15  SHOW_VOC_STAT=NO
    1.16  COMPRESSED_WORDLIST=NO
    1.17  WORDS_GROUPING=YES
    1.18 -while getopts Gcl:sSkanNp:t:Tm:Mr:23 opt
    1.19 +while getopts Gcl:sSkanNp:t:Tm:Md:r:R:23 opt
    1.20  do
    1.21      case "$opt" in
    1.22        c)  COMPRESSED_WORDLIST=YES;;
    1.23 @@ -101,7 +103,9 @@
    1.24        T)  TAGS_LIST_ONLY="YES";;
    1.25        m)  DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
    1.26        M)  DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
    1.27 -      r)  REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
    1.28 +      d)  REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
    1.29 +      r)  SHOW_RANGE="$OPTARG";;
    1.30 +      R)  SHOW_RANGE_PERCENTAGE="$OPTARG";;
    1.31        2)  GROUP_WORDS_BY_TWO=YES;;
    1.32        3)  GROUP_WORDS_BY_THREE=YES;;
    1.33        \?)       # unknown flag
    1.34 @@ -141,6 +145,8 @@
    1.35  }
    1.36  get_words_group_words_add_stat()
    1.37  {
    1.38 +    SHOW_RANGE="$SHOW_RANGE" \
    1.39 +    SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE" \
    1.40      COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
    1.41      GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
    1.42      GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \

     2.1 --- a/new-words.py	Mon Feb 07 21:21:17 2011 +0200
     2.2 +++ b/new-words.py	Tue Feb 08 20:35:38 2011 +0200
     2.3 @@ -115,10 +115,10 @@
     2.4      dest="pages")
     2.5  
     2.6  parser.add_option(
     2.7 -    "-r", "--remove-tag",
     2.8 -    help="remove subvocabulary of specified tag",
     2.9 +    "-d", "--delete-tag",
    2.10 +    help="delete subvocabulary of specified tag",
    2.11      action="store",
    2.12 -    dest="remove_tag")
    2.13 +    dest="delete_tag")
    2.14  
    2.15  parser.add_option(
    2.16      "-s", "--text-stats",
    2.17 @@ -337,7 +337,16 @@
    2.18              return cmp(int(num1), int(num2))
    2.19  
    2.20  
    2.21 -def print_words_sorted(word_pairs, stats, normalizator, print_stats=True, stats_only=False, compressed_wordlist=False):
    2.22 +def print_words_sorted(
    2.23 +        word_pairs,
    2.24 +        stats,
    2.25 +        normalizator,
    2.26 +        print_stats=True,
    2.27 +        stats_only=False,
    2.28 +        compressed_wordlist=False,
    2.29 +        show_range=0,
    2.30 +        show_range_percentage=0,
    2.31 +        ):
    2.32      if stats_only:
    2.33          codecs.getwriter("utf-8")(sys.stdout).write(
    2.34              " ".join([
    2.35 @@ -372,6 +381,7 @@
    2.36      current_level = 0
    2.37      old_normalized_word = None
    2.38      words_of_this_group = []
    2.39 +    printed_words = 0
    2.40      for word_pair in word_pairs:
    2.41  
    2.42          normalized_word = normalizator.normalize(word_pair[1])
    2.43 @@ -384,6 +394,7 @@
    2.44                  )
    2.45              if compressed_wordlist:
    2.46                  codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
    2.47 +                printed_words += 1
    2.48              words_of_this_group = []
    2.49  
    2.50          old_normalized_word = normalized_word
    2.51 @@ -391,6 +402,7 @@
    2.52  
    2.53          if not compressed_wordlist:
    2.54              codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
    2.55 +            printed_words += 1
    2.56  
    2.57  
    2.58          known += word_pair[0]
    2.59 @@ -401,6 +413,11 @@
    2.60                  level_lines = level_lines[1:]
    2.61              codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
    2.62  
    2.63 +        if show_range >0 and printed_words >= show_range:
    2.64 +            break
    2.65 +        if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
    2.66 +            break
    2.67 +
    2.68  def filter_add_notes(args):
    2.69      lines = readlines_from_file(args[0])
    2.70      notes = load_notes(notes_filenames())
    2.71 @@ -422,6 +439,7 @@
    2.72      notes = load_notes(notes_filenames())
    2.73      lines = readlines_from_stdin()
    2.74      group_by = [1]
    2.75 +
    2.76      if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
    2.77          group_by.append(2)
    2.78      if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
    2.79 @@ -435,6 +453,17 @@
    2.80      if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
    2.81          compressed_wordlist = True
    2.82  
    2.83 +    show_range = os.environ.get('SHOW_RANGE', '')
    2.84 +    if show_range != '':
    2.85 +        show_range = int(show_range)
    2.86 +    else:
    2.87 +        show_range = 0
    2.88 +    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
    2.89 +    if show_range_percentage != '':
    2.90 +        show_range_percentage = int(show_range_percentage)
    2.91 +    else:
    2.92 +        show_range_percentage = 0
    2.93 +
    2.94  
    2.95      stats = {}
    2.96      stats['total'] = sum(words[x] for x in words.keys())
    2.97 @@ -471,7 +500,9 @@
    2.98          stats,
    2.99          normalizator,
   2.100          stats_only=stats_only,
   2.101 -        compressed_wordlist=compressed_wordlist
   2.102 +        compressed_wordlist=compressed_wordlist,
   2.103 +        show_range=show_range,
   2.104 +        show_range_percentage=show_range_percentage,
   2.105          )
   2.106  
   2.107  (options, args) = parser.parse_args()
author	Igor Chubin <igor@chub.in>
date	Tue Feb 08 20:35:38 2011 +0200 (2011-02-08)
parents	d708e2c1bad8
children	00286f6bfa85
files	new-words-py.sh new-words.py