new-words

changeset 50:4e931db74618

filtering by wordlist
author Igor Chubin <igor@chub.in>
date Fri Mar 25 22:35:44 2011 +0200 (2011-03-25)
parents 00286f6bfa85
children 74e05d4436ee
files new-words-py.sh new-words.py new-words.sh
line diff
     1.1 --- a/new-words-py.sh	Wed Feb 09 21:08:23 2011 +0200
     1.2 +++ b/new-words-py.sh	Fri Mar 25 22:35:44 2011 +0200
     1.3 @@ -12,6 +12,7 @@
     1.4  
     1.5      -h          print this screen
     1.6      -c          show compressed wordlist: one word per group
     1.7 +    -f file     show only words related to the words in the file
     1.8      -G          turn off word grouping
     1.9      -k          put higher words that are similar to the known words (only for English)
    1.10      -l lang     override language settings
    1.11 @@ -86,10 +87,12 @@
    1.12  SHOW_VOC_STAT=NO
    1.13  COMPRESSED_WORDLIST=NO
    1.14  WORDS_GROUPING=YES
    1.15 -while getopts Gcl:sSkanNp:t:Tm:Md:r:R:23 opt
    1.16 +ALLOWED_WORDS_FILENAME=''
    1.17 +while getopts Gcf:l:sSkanNp:t:Tm:Md:r:R:23 opt
    1.18  do
    1.19      case "$opt" in
    1.20        c)  COMPRESSED_WORDLIST=YES;;
    1.21 +      f)  ALLOWED_WORDS_FILENAME="$OPTARG";;
    1.22        G)  WORDS_GROUPING=NO;;
    1.23        s)  STAT_ONLY=YES;;
    1.24        S)  SHOW_VOC_STAT=YES;;
    1.25 @@ -153,6 +156,7 @@
    1.26      STAT_ONLY="$STAT_ONLY" \
    1.27      WORDS_GROUPING="$WORDS_GROUPING" \
    1.28      FILTER_WORDS="$FILTER_WORDS" \
    1.29 +    ALLOWED_WORDS_FILENAME="$ALLOWED_WORDS_FILENAME" \
    1.30      $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
    1.31  }
    1.32  
     2.1 --- a/new-words.py	Wed Feb 09 21:08:23 2011 +0200
     2.2 +++ b/new-words.py	Fri Mar 25 22:35:44 2011 +0200
     2.3 @@ -544,6 +544,20 @@
     2.4      linked_words = find_linked_words(notes)
     2.5      normalizator = Normalizator(config['language'], linked_words)
     2.6  
     2.7 +    # filter words by allowed_words_filter
     2.8 +    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
     2.9 +        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
    2.10 +        normalized_allowed_words = [
    2.11 +            normalizator.normalize(w.rstrip('\n')) 
    2.12 +            for w in readlines_from_file(allowed_words_filename)
    2.13 +        ]
    2.14 +
    2.15 +        result = {}
    2.16 +        for w, wn in words.iteritems():
    2.17 +            if normalizator.normalize(w) in normalized_allowed_words:
    2.18 +                result[w] = wn
    2.19 +        words = result
    2.20 +
    2.21      words_with_freq = []
    2.22      for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
    2.23          words_with_freq.append((words[k], k))
     3.1 --- a/new-words.sh	Wed Feb 09 21:08:23 2011 +0200
     3.2 +++ b/new-words.sh	Fri Mar 25 22:35:44 2011 +0200
     3.3 @@ -12,6 +12,7 @@
     3.4  
     3.5      -h          print this screen
     3.6      -c          show compressed wordlist: one word per group
     3.7 +    -f file     show only words that are related to the words from the file
     3.8      -k          put higher words that are similar to the known words (only for English)
     3.9      -l lang     override language settings
    3.10      -n          non-interactive mode (don't run vi)
    3.11 @@ -81,6 +82,7 @@
    3.12  FILTER_WORDS=YES
    3.13  SHOW_VOC_STAT=NO
    3.14  COMPRESSED_WORDLIST=NO
    3.15 +ALLOWED_WORDS_FILENAME=''
    3.16  while getopts cl:sSkanNp:t:Tm:Mr:23 opt
    3.17  do
    3.18      case "$opt" in