# HG changeset patch # User Igor Chubin # Date 1301085344 -7200 # Node ID 4e931db74618878956d60815edc4f37020572c78 # Parent 00286f6bfa8580737e9a35281e2cf06ff7fb5d14 filtering by wordlist diff -r 00286f6bfa85 -r 4e931db74618 new-words-py.sh --- a/new-words-py.sh Wed Feb 09 21:08:23 2011 +0200 +++ b/new-words-py.sh Fri Mar 25 22:35:44 2011 +0200 @@ -12,6 +12,7 @@ -h print this screen -c show compressed wordlist: one word per group + -f file show only words related to the words in the file -G turn off word grouping -k put higher words that are similar to the known words (only for English) -l lang override language settings @@ -86,10 +87,12 @@ SHOW_VOC_STAT=NO COMPRESSED_WORDLIST=NO WORDS_GROUPING=YES -while getopts Gcl:sSkanNp:t:Tm:Md:r:R:23 opt +ALLOWED_WORDS_FILENAME='' +while getopts Gcf:l:sSkanNp:t:Tm:Md:r:R:23 opt do case "$opt" in c) COMPRESSED_WORDLIST=YES;; + f) ALLOWED_WORDS_FILENAME="$OPTARG";; G) WORDS_GROUPING=NO;; s) STAT_ONLY=YES;; S) SHOW_VOC_STAT=YES;; @@ -153,6 +156,7 @@ STAT_ONLY="$STAT_ONLY" \ WORDS_GROUPING="$WORDS_GROUPING" \ FILTER_WORDS="$FILTER_WORDS" \ + ALLOWED_WORDS_FILENAME="$ALLOWED_WORDS_FILENAME" \ $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1" } diff -r 00286f6bfa85 -r 4e931db74618 new-words.py --- a/new-words.py Wed Feb 09 21:08:23 2011 +0200 +++ b/new-words.py Fri Mar 25 22:35:44 2011 +0200 @@ -544,6 +544,20 @@ linked_words = find_linked_words(notes) normalizator = Normalizator(config['language'], linked_words) + # filter words by allowed_words_filter + if os.environ.get('ALLOWED_WORDS_FILENAME', ''): + allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '') + normalized_allowed_words = [ + normalizator.normalize(w.rstrip('\n')) + for w in readlines_from_file(allowed_words_filename) + ] + + result = {} + for w, wn in words.iteritems(): + if normalizator.normalize(w) in normalized_allowed_words: + result[w] = wn + words = result + words_with_freq = [] for k in sorted(words.keys(), key=lambda k: words[k], reverse=True): words_with_freq.append((words[k], k)) diff -r 00286f6bfa85 -r 4e931db74618 new-words.sh --- a/new-words.sh Wed Feb 09 21:08:23 2011 +0200 +++ b/new-words.sh Fri Mar 25 22:35:44 2011 +0200 @@ -12,6 +12,7 @@ -h print this screen -c show compressed wordlist: one word per group + -f file show only words that are related to the words from the file -k put higher words that are similar to the known words (only for English) -l lang override language settings -n non-interactive mode (don't run vi) @@ -81,6 +82,7 @@ FILTER_WORDS=YES SHOW_VOC_STAT=NO COMPRESSED_WORDLIST=NO +ALLOWED_WORDS_FILENAME='' while getopts cl:sSkanNp:t:Tm:Mr:23 opt do case "$opt" in