new-words

changeset: 54:e25de9ea9184
summary:   new-words.py is almost ready
author:    Igor Chubin <igor@chub.in>
date:      Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents:   f583256b7ab1
children:  2a1a25e61872
files:     new-words-py.sh new-words.py
line diff
     1.1 --- a/new-words-py.sh	Mon Oct 31 20:21:20 2011 +0200
     1.2 +++ b/new-words-py.sh	Tue Nov 01 20:19:18 2011 +0100
     1.3 @@ -49,9 +49,6 @@
     1.4  
     1.5  NEW_WORDS_PY=/home/igor/hg/new-words/new-words.py
     1.6  WORK_DIR=~/.new-words/
     1.7 -TEMP1=`mktemp /tmp/new-words-temp1.XXXXXXXXXX`
     1.8 -TEMP2=`mktemp /tmp/new-words-temp2.XXXXXXXXXX`
     1.9 -export ORIGINAL_TEXT=`mktemp /tmp/new-words-orig.XXXXXXXXXX`
    1.10  editor=${EDITOR:-vim}
    1.11  
    1.12  # language detection
    1.13 @@ -133,74 +130,31 @@
    1.14    exit 0
    1.15  fi
    1.16  
    1.17 -text_from_url()
    1.18 -{
    1.19 -lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
    1.20 -}
    1.21 -
    1.22 -add_marks()
    1.23 -{
    1.24 -    $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1" 
    1.25 -}
    1.26 -remove_marks()  
    1.27 -{
    1.28 -    $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
    1.29 -}
    1.30  get_words_group_words_add_stat()
    1.31  {
    1.32 +    [ "$PART_TO_PROCESS" == "" ] || PART_TO_PROCESS="-p $PART_TO_PROCESS"
    1.33 +    [ "$ALLOWED_WORDS_FILENAME" = "" ] || ALLOWED_WORDS_FILENAME="-f $ALLOWED_WORDS_FILENAME"
    1.34 +    [ "$SHOW_RANGE_PERCENTAGE" = "" ] || SHOW_RANGE_PERCENTAGE="-R $SHOW_RANGE_PERCENTAGE"
    1.35 +    [ "$NON_INTERACTIVE_MODE" = YES ] && non_interactive="-n"
    1.36 +    [ "$STAT_ONLY" = YES ] && stat_only="-s"
    1.37 +    [ "$COMPRESSED_WORDLIST" = YES ] && compressed_wordlist="-c"
    1.38 +    [ "$FILTER_WORDS" = NO ] && filter_words="-N"
    1.39 +    [ "$GROUP_WORDS_BY_TWO" = YES ] && group_words_by_two="-2"
    1.40 +    [ "$GROUP_WORDS_BY_THREE" = YES ] && group_words_by_three="-3"
    1.41 +
    1.42      SHOW_RANGE="$SHOW_RANGE" \
    1.43 -    SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE" \
    1.44 -    COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
    1.45 -    GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
    1.46 -    GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
    1.47 -    STAT_ONLY="$STAT_ONLY" \
    1.48      WORDS_GROUPING="$WORDS_GROUPING" \
    1.49 -    FILTER_WORDS="$FILTER_WORDS" \
    1.50 -    ALLOWED_WORDS_FILENAME="$ALLOWED_WORDS_FILENAME" \
    1.51 -    $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
    1.52 -}
    1.53 -
    1.54 -part()
    1.55 -{
    1.56 -    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
    1.57 -    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.58 -#!/usr/bin/perl
    1.59 -
    1.60 -my @lines=<STDIN>;
    1.61 -my $lines=$#lines;
    1.62 -my $interval=$ARGV[0];
    1.63 -if (not $interval) {
    1.64 -    print @lines;
    1.65 -}
    1.66 -else {
    1.67 -    my ($start,$stop,$total);
    1.68 -    if ($interval =~ m@(.*)/(.*)@) {
    1.69 -        $start = $1;
    1.70 -        $total = $2;
    1.71 -    }
    1.72 -    else {
    1.73 -        $start=$interval;
    1.74 -        $total=0;
    1.75 -    }
    1.76 -    if ($start =~ m@(.*)-(.*)@) {
    1.77 -        $start = $1;
    1.78 -        $stop = $2;
    1.79 -    }
    1.80 -    if ($start =~ m@(.*)\+(.*)@) {
    1.81 -        $start = $1;
    1.82 -        $stop = $start+$2;
    1.83 -    }
    1.84 -
    1.85 -    $start=int($lines/$total*$start);
    1.86 -    $stop=int($lines/$total*$stop);
    1.87 -
    1.88 -    for($i=$start;$i<$stop;$i++){
    1.89 -        print $lines[$i];
    1.90 -    }
    1.91 -}
    1.92 -PERL_SCRIPT
    1.93 -    perl $PERL_SCRIPT_TEMP_NAME "$1"
    1.94 -    rm $PERL_SCRIPT_TEMP_NAME
    1.95 +    $NEW_WORDS_PY -l "$LANGUAGE" \
    1.96 +    $SHOW_RANGE_PERCENTAGE \
    1.97 +    $PART_TO_PROCESS \
    1.98 +    $ALLOWED_WORDS_FILENAME \
    1.99 +    $non_interactive \
   1.100 +    $stat_only \
   1.101 +    $compressed_wordlist \
   1.102 +    $filter_words \
   1.103 +    $group_words_by_two \
   1.104 +    $group_words_by_three \
   1.105 +    -X get_words_group_words_add_stat "$1"
   1.106  }
   1.107  
   1.108  if [ "$TAGS_LIST_ONLY" = "YES" ] 
   1.109 @@ -232,68 +186,30 @@
   1.110      exit 0
   1.111  fi
   1.112  
   1.113 -mkdir -p $WORK_DIR
   1.114 -oldpwd="$PWD"
   1.115 -cd $WORK_DIR
   1.116 -if [ "$MERGE_TAGGED_WORDS" = "YES" ]
   1.117 -then
   1.118 -    VOC_FILES=''
   1.119 -    for i in $MERGE_THIS_TAGS
   1.120 -    do
   1.121 -        f=`tag_file_name $i`
   1.122 -        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
   1.123 -    done
   1.124 -    if [ -z "$VOC_FILES" ]
   1.125 -    then 
   1.126 -        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
   1.127 -    else
   1.128 -        cat $VOC_FILES
   1.129 -    fi
   1.130 -elif [ "$MERGE_ALL_TAGGED" = "YES" ]
   1.131 -then
   1.132 -    cat ${LANGUAGE}_*.txt
   1.133 -elif echo "$1" | grep -q http: 
   1.134 -then 
   1.135 -    text_from_url "$1"
   1.136 -elif [ "$#" != 0 ]
   1.137 -then
   1.138 -    if echo $1 | grep -q ^/
   1.139 -    then
   1.140 -        cat "$1"
   1.141 -    else
   1.142 -        cat "$oldpwd/$1"
   1.143 -    fi
   1.144 -else 
   1.145 -    cat
   1.146 -fi \
   1.147 -   | part $PART_TO_PROCESS \
   1.148 -   | tee $ORIGINAL_TEXT \
   1.149 -   | \
   1.150 -    get_words_group_words_add_stat \
   1.151 -   | tee "$TEMP1" > "$TEMP2"
   1.152 +get_words_group_words_add_stat "$1"
   1.153  
   1.154 -if [ "$STAT_ONLY" = "YES" ]
   1.155 -then
   1.156 -    cat "$TEMP1"
   1.157 -elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
   1.158 -then
   1.159 -    cat "$TEMP1"
   1.160 -else
   1.161 -    if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ] 
   1.162 -    then
   1.163 -        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
   1.164 -        if [ "$editor" = vim ]
   1.165 -        then
   1.166 -            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
   1.167 -        else
   1.168 -            $editor "$TEMP2"
   1.169 -        fi
   1.170 -        remove_marks "$TEMP2"
   1.171 +#mkdir -p $WORK_DIR
   1.172 +#oldpwd="$PWD"
   1.173 +#cd $WORK_DIR
   1.174 +#if [ "$MERGE_TAGGED_WORDS" = "YES" ]
   1.175 +#then
   1.176 +#    VOC_FILES=''
   1.177 +#    for i in $MERGE_THIS_TAGS
   1.178 +#    do
   1.179 +#        f=`tag_file_name $i`
   1.180 +#        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
   1.181 +#    done
   1.182 +#    if [ -z "$VOC_FILES" ]
   1.183 +#    then 
   1.184 +#        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
   1.185 +#    else
   1.186 +#        cat $VOC_FILES
   1.187 +#    fi
   1.188 +#elif [ "$MERGE_ALL_TAGGED" = "YES" ]
   1.189 +#then
   1.190 +#    cat ${LANGUAGE}_*.txt
   1.191 +#else 
   1.192 +#    cat
   1.193 +#fi 
   1.194  
   1.195 -        vocabulary="$VOCABULARY"
   1.196 -        [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`"
   1.197 -        diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
   1.198 -    fi
   1.199 -fi
   1.200  
   1.201 -rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
     2.1 --- a/new-words.py	Mon Oct 31 20:21:20 2011 +0200
     2.2 +++ b/new-words.py	Tue Nov 01 20:19:18 2011 +0100
     2.3 @@ -11,6 +11,7 @@
     2.4  import subprocess
     2.5  import sys
     2.6  import Stemmer
     2.7 +import tempfile
     2.8  try:
     2.9      import psyco
    2.10      psyco.full()
    2.11 @@ -141,7 +142,13 @@
    2.12      dest="language")
    2.13  
    2.14  parser.add_option(
    2.15 -    "-f", "--function",
    2.16 +    "-f", "--allowed-words",
    2.17 +    help="file with list of allowed words (words that will be shown in the output)",
    2.18 +    action="store",
    2.19 +    dest="allowed_words")
    2.20 +
    2.21 +parser.add_option(
    2.22 +    "-X", "--function",
    2.23      help="filter through subsystem [INTERNAL]",
    2.24      action="store",
    2.25      dest="function")
    2.26 @@ -183,6 +190,12 @@
    2.27      dest="delete_tag")
    2.28  
    2.29  parser.add_option(
    2.30 +    "-R", "--show-range-percentage",
    2.31 +    help="show only words that cover specified percentage of the text, skip the rest",
    2.32 +    action="store",
    2.33 +    dest="show_range_percentage")
    2.34 +
    2.35 +parser.add_option(
    2.36      "-s", "--text-stats",
    2.37      help="show the text statistics (percentage of known words and so on) and exit",
    2.38      action="store_true",
    2.39 @@ -225,6 +238,16 @@
    2.40              res += [line]
    2.41      return res
    2.42  
    2.43 +def readlines_from_url(url):
    2.44 +    return [x.decode('utf-8') for x in
    2.45 +        subprocess.Popen(
    2.46 +            "lynx -dump '{url}' | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'".format(url=url),
    2.47 +            shell = True,
    2.48 +            stdout = subprocess.PIPE,
    2.49 +            stderr = subprocess.STDOUT
    2.50 +            ).communicate()[0].split('\n')
    2.51 +    ]
    2.52 +
    2.53  def readlines_from_stdin():
    2.54      return codecs.getreader("utf-8")(sys.stdin).readlines()
    2.55  
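
readlines_from_url() above shells out to lynx and strips bare URLs with a Perl one-liner, replacing the text_from_url() helper removed from the shell wrapper. A rough standalone equivalent, assuming lynx is installed and using a simplified regular expression in place of the Perl character class (the helper name is invented):

    import re
    import subprocess

    def fetch_text_lines(url):
        # lynx -dump renders the page as plain text; pass a list to avoid the shell
        dump = subprocess.Popen(["lynx", "-dump", url],
                                stdout=subprocess.PIPE).communicate()[0]
        # drop bare http:// links, roughly what the perl -p -e 's@...@@' filter does
        text = re.sub(r'http://\S*', '', dump.decode('utf-8'))
        return text.split('\n')
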
    2.56 @@ -261,8 +284,11 @@
    2.57      logging.debug(result)
    2.58      return result
    2.59  
    2.60 +def voc_filename():
    2.61 +    return "%s/%s.txt"%(config['config_directory'], config['language'])
    2.62 +
    2.63  def load_vocabulary():
    2.64 -    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
    2.65 +    return get_words(readlines_from_file(voc_filename()))
    2.66  
    2.67  def notes_filenames():
    2.68      return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
    2.69 @@ -409,8 +435,10 @@
    2.70          show_range=0,
    2.71          show_range_percentage=0,
    2.72          ):
    2.73 +    result = []
    2.74      if stats_only:
    2.75 -        codecs.getwriter("utf-8")(sys.stdout).write(
    2.76 +        #codecs.getwriter("utf-8")(sys.stdout).write(
    2.77 +        result.append(
    2.78              " ".join([
    2.79                  "%-10s" % x for x in [
    2.80                  "LANG",
    2.81 @@ -421,7 +449,7 @@
    2.82                  "WPS",
    2.83                  "UWPS*10"
    2.84                  ]]) + "\n")
    2.85 -        codecs.getwriter("utf-8")(sys.stdout).write(
    2.86 +        result.append(
    2.87              " ".join([
    2.88                  "%(language)-10s",
    2.89                  "%(percentage)-10.2f",
    2.90 @@ -431,10 +459,10 @@
    2.91                  "%(wps)-11d"
    2.92                  "%(uwps)-11d"
    2.93                  ]) % stats + "\n")
    2.94 -        return
    2.95 +        return "".join(result)
    2.96  
    2.97      if print_stats:
    2.98 -        codecs.getwriter("utf-8")(sys.stdout).write(
    2.99 +        result.append(
   2.100              "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
   2.101  
   2.102      level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
   2.103 @@ -448,14 +476,12 @@
   2.104  
   2.105          normalized_word = normalizator.normalize(word_pair[1])
   2.106          if old_normalized_word and old_normalized_word != normalized_word:
   2.107 -            #codecs.getwriter("utf-8")(sys.stdout).write(
   2.108 -            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
   2.109              if compressed_wordlist:
   2.110                  compressed_word_pair = (
   2.111                      sum(x[0] for x in words_of_this_group),
   2.112                      normalizator.best_word_from_group(words_of_this_group)
   2.113                      )
   2.114 -                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
   2.115 +                result.append("%10s %s\n" % compressed_word_pair)
   2.116                  printed_words += 1
   2.117              words_of_this_group = []
   2.118  
   2.119 @@ -463,7 +489,7 @@
   2.120          words_of_this_group.append(word_pair)
   2.121  
   2.122          if not compressed_wordlist:
   2.123 -            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
   2.124 +            result.append("%10s %s\n" % word_pair)
   2.125              printed_words += 1
   2.126  
   2.127  
   2.128 @@ -473,28 +499,14 @@
   2.129              while 100.0*known/total > level_lines[0]:
   2.130                  current_level = level_lines[0]
   2.131                  level_lines = level_lines[1:]
   2.132 -            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
   2.133 +            result.append("# %s\n" % current_level)
   2.134  
   2.135          if show_range >0 and printed_words >= show_range:
   2.136              break
   2.137          if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
   2.138              break
   2.139  
   2.140 -def filter_add_notes(args):
   2.141 -    lines = readlines_from_file(args[0])
   2.142 -    notes = load_notes(notes_filenames())
   2.143 -    lines = add_notes(lines, notes)
   2.144 -    with codecs.open(args[0], "w", "utf-8") as f:
   2.145 -        for line in lines:
   2.146 -            f.write(line)
   2.147 -
   2.148 -def filter_remove_notes(args):
   2.149 -    lines = readlines_from_file(args[0])
   2.150 -    notes = load_notes(notes_filenames())
   2.151 -    lines = remove_notes(lines, notes)
   2.152 -    with codecs.open(args[0], "w", "utf-8") as f:
   2.153 -        for line in lines:
   2.154 -            f.write(line)
   2.155 +    return result
   2.156  
   2.157  def parse_parts_description(parts_description):
   2.158      """
   2.159 @@ -503,8 +515,6 @@
   2.160       from-to/step
   2.161       from+delta/step
   2.162      """
   2.163 -    def incorrect_parts_description(pd):
   2.164 -        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
   2.165  
   2.166      try:
   2.167          (a, step) = parts_description.split("/", 1)
   2.168 @@ -525,7 +535,7 @@
   2.169          return (start, stop, step)
   2.170  
   2.171      except:
   2.172 -        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
   2.173 +        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
   2.174  
   2.175  
   2.176  def take_part(lines, part_description = None):
   2.177 @@ -536,27 +546,45 @@
   2.178      part_size = (1.0*n) / step
   2.179      result = []
   2.180      for i in range(n):
   2.181 -        if part_size * i >= start and part_size * i <= stop:
   2.182 -            result += lines[i]
   2.183 +        if i >= start * part_size and i <= stop * part_size:
   2.184 +            result += [lines[i]]
   2.185      return result
   2.186  
   2.187  def filter_get_words_group_words_add_stat(args):
   2.188      vocabulary = load_vocabulary()
   2.189      notes = load_notes(notes_filenames())
   2.190 -    lines = take_part(readlines_from_stdin(), config.get('pages', ''))
   2.191 +
   2.192 +    if len(args) > 0:
   2.193 +        if 'http://' in args[0]:
   2.194 +            input_lines = readlines_from_url(args[0])
   2.195 +        else:
   2.196 +            input_lines = readlines_from_file(args[0])
   2.197 +    else:
   2.198 +        input_lines = readlines_from_stdin()
   2.199 +
   2.200 +    if len(input_lines) == 0:
   2.201 +        print >> sys.stderr, "Nothing to do, standard input is empty, exiting."
   2.202 +        sys.exit(1)
   2.203 +
   2.204 +    lines = take_part(input_lines, config.get('pages', ''))
   2.205 +
   2.206 +    (_, original_text_tempfile) = tempfile.mkstemp(prefix='new-word')
   2.207 +    with codecs.open(original_text_tempfile, "w", "utf-8") as f:
   2.208 +        f.write("".join(lines))
   2.209 +
   2.210      group_by = [1]
   2.211  
   2.212 -    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
   2.213 +    if 'two_words' in config:
   2.214          group_by.append(2)
   2.215 -    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
   2.216 +    if 'three_words' in config:
   2.217          group_by.append(3)
   2.218      words = get_words(lines, group_by)
   2.219      stats_only = False
   2.220 -    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
   2.221 +    if 'text_stats' in config:
   2.222          stats_only = True
   2.223  
   2.224      compressed_wordlist = False
   2.225 -    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
   2.226 +    if 'compressed' in config:
   2.227          compressed_wordlist = True
   2.228  
   2.229      show_range = os.environ.get('SHOW_RANGE', '')
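
Two fixes hide in the take_part() part of the hunk above: the selection test now scales start and stop by part_size instead of scaling the index, and result += [lines[i]] appends whole lines where result += lines[i] used to extend the list character by character. A small worked example with assumed numbers (pretending parse_parts_description() returned start=1, stop=2, step=5):

    lines = ["l%d\n" % i for i in range(10)]   # 10 input lines
    start, stop, step = 1, 2, 5                # assumed parse result
    n = len(lines)
    part_size = (1.0 * n) / step               # 2.0 lines per part
    picked = [lines[i] for i in range(n)
              if i >= start * part_size and i <= stop * part_size]
    assert picked == ["l2\n", "l3\n", "l4\n"]
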
   2.230 @@ -564,16 +592,16 @@
   2.231          show_range = int(show_range)
   2.232      else:
   2.233          show_range = 0
   2.234 -    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
   2.235 -    if show_range_percentage != '':
   2.236 -        show_range_percentage = int(show_range_percentage)
   2.237 +
   2.238 +    if 'show_range_percentage' in config:
   2.239 +        show_range_percentage = int(config['show_range_percentage'])
   2.240      else:
   2.241          show_range_percentage = 0
   2.242  
   2.243  
   2.244      stats = {}
   2.245      stats['total'] = sum(words[x] for x in words.keys())
   2.246 -    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
   2.247 +    if not 'no_filter' in config:
   2.248          words = substract_dictionary(words, vocabulary)
   2.249  
   2.250      stats['total_unknown'] = sum(words[x] for x in words.keys())
   2.251 @@ -591,8 +619,8 @@
   2.252      normalizator = Normalizator(config['language'], linked_words)
   2.253  
   2.254      # filter words by allowed_words_filter
   2.255 -    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
   2.256 -        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
   2.257 +    if 'allowed_words' in config:
   2.258 +        allowed_words_filename = config['allowed_words']
   2.259          normalized_allowed_words = [
   2.260              normalizator.normalize(w.rstrip('\n')) 
   2.261              for w in readlines_from_file(allowed_words_filename)
   2.262 @@ -615,7 +643,7 @@
   2.263                  cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
   2.264                  reverse=True)
   2.265  
   2.266 -    print_words_sorted(
   2.267 +    output = print_words_sorted(
   2.268          words_with_freq,
   2.269          stats,
   2.270          normalizator,
   2.271 @@ -625,14 +653,87 @@
   2.272          show_range_percentage=show_range_percentage,
   2.273          )
   2.274  
   2.275 +
   2.276 +    if ('non_interactive' in config or 'text_stats' in config):
   2.277 +        codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
   2.278 +    else:
   2.279 +        (_, temp1) = tempfile.mkstemp(prefix='new-word')
   2.280 +        (_, temp2) = tempfile.mkstemp(prefix='new-word')
   2.281 +
   2.282 +        with codecs.open(temp1, "w", "utf-8") as f:
   2.283 +            f.write("".join(output))
   2.284 +        with codecs.open(temp2, "w", "utf-8") as f:
   2.285 +            f.write("".join(add_notes(output, notes)))
   2.286 +
   2.287 +        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
   2.288 +        os.system((
   2.289 +            "vim"
   2.290 +            " -c 'setlocal spell spelllang={language}'"
   2.291 +            " -c 'set keywordprg={language}'"
   2.292 +            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
   2.293 +            " {filename}"
   2.294 +            " < /dev/tty > /dev/tty"
   2.295 +            ).format(language=config['language'], filename=temp2))
   2.296 +
   2.297 +        lines = remove_notes(readlines_from_file(temp2), notes)
   2.298 +
   2.299 +        # compare lines_before and lines_after and return deleted words
   2.300 +        lines_before = output
   2.301 +        lines_after = lines
   2.302 +        deleted_words = []
   2.303 +
   2.304 +        for line in lines_before:
   2.305 +            if line not in lines_after:
   2.306 +                line = line.strip()
   2.307 +                if ' ' in line:
   2.308 +                    word = re.split('\s+', line, 1)[1]
   2.309 +                    if ' ' in word:
   2.310 +                        word = re.split('\s+', word, 1)[0]
   2.311 +                deleted_words.append(word)
   2.312 +
   2.313 +        with codecs.open(voc_filename(), "a", "utf-8") as f:
   2.314 +            f.write("\n".join(deleted_words + ['']))
   2.315 +
   2.316 +        os.unlink(temp1)
   2.317 +        os.unlink(temp2)
   2.318 +
   2.319 +    os.unlink(original_text_tempfile)
   2.320 +
   2.321  (options, args) = parser.parse_args()
   2.322  if options.language:
   2.323      config['language'] = options.language
   2.324  
   2.325 +if options.pages:
   2.326 +    config['pages'] = options.pages
   2.327 +else:
   2.328 +    config['pages'] = ""
   2.329 +
   2.330 +if options.allowed_words:
   2.331 +    config['allowed_words'] = options.allowed_words
   2.332 +
   2.333 +if options.show_range_percentage:
   2.334 +    config['show_range_percentage'] = options.show_range_percentage
   2.335 +
   2.336 +if options.non_interactive:
   2.337 +    config['non_interactive'] = True
   2.338 +
   2.339 +if options.text_stats:
   2.340 +    config['text_stats'] = True
   2.341 +
   2.342 +if options.compressed:
   2.343 +    config['compressed'] = True
   2.344 +
   2.345 +if options.no_filter:
   2.346 +    config['no_filter'] = True
   2.347 +
   2.348 +if options.two_words:
   2.349 +    config['two_words'] = True
   2.350 +
   2.351 +if options.three_words:
   2.352 +    config['three_words'] = True
   2.353 +
   2.354  if options.function:
   2.355      function_names = {
   2.356 -        'add_notes' :   filter_add_notes,
   2.357 -        'remove_notes': filter_remove_notes,
   2.358          'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
   2.359      }
   2.360      if options.function in function_names:
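
The interactive branch added at the end of filter_get_words_group_words_add_stat() replaces the old shell pipeline: it writes the word list to a temp file, opens it in vim, and treats every line the user deleted as a newly known word to append to the vocabulary file. A hedged, self-contained sketch of that "deleted lines become known words" step, with toy data and an explicit guard for lines that carry no frequency column:

    import re

    # word-list lines as print_words_sorted() produces them: "<freq> <word>\n"
    lines_before = ["        12 dog\n", "         3 cat\n", "         1 axolotl\n"]
    lines_after  = ["         1 axolotl\n"]    # the user deleted dog and cat in the editor

    deleted_words = []
    for line in lines_before:
        if line not in lines_after:
            line = line.strip()
            # second column is the word; fall back to the whole line if there is none
            word = re.split(r'\s+', line, 1)[1] if ' ' in line else line
            deleted_words.append(word)

    assert deleted_words == ["dog", "cat"]
    # new-words.py then appends these, one per line, to voc_filename()
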