# HG changeset patch
# User Igor Chubin <igor@chub.in>
# Date 1320175158 -3600
# Node ID e25de9ea918461c8a7ca83d9844b7ca16f239efd
# Parent  f583256b7ab150bcd0d54855b0e07ca8d22b2288
new-words.py is almost ready

diff -r f583256b7ab1 -r e25de9ea9184 new-words-py.sh
--- a/new-words-py.sh	Mon Oct 31 20:21:20 2011 +0200
+++ b/new-words-py.sh	Tue Nov 01 20:19:18 2011 +0100
@@ -49,9 +49,6 @@
 
 NEW_WORDS_PY=/home/igor/hg/new-words/new-words.py
 WORK_DIR=~/.new-words/
-TEMP1=`mktemp /tmp/new-words-temp1.XXXXXXXXXX`
-TEMP2=`mktemp /tmp/new-words-temp2.XXXXXXXXXX`
-export ORIGINAL_TEXT=`mktemp /tmp/new-words-orig.XXXXXXXXXX`
 editor=${EDITOR:-vim}
 
 # language detection
@@ -133,74 +130,31 @@
   exit 0
 fi
 
-text_from_url()
-{
-lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
-}
-
-add_marks()
-{
-    $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1" 
-}
-remove_marks()  
-{
-    $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
-}
 get_words_group_words_add_stat()
 {
+    [ "$PART_TO_PROCESS" == "" ] || PART_TO_PROCESS="-p $PART_TO_PROCESS"
+    [ "$ALLOWED_WORDS_FILENAME" = "" ] || ALLOWED_WORDS_FILENAME="-f $ALLOWED_WORDS_FILENAME"
+    [ "$SHOW_RANGE_PERCENTAGE" = "" ] || SHOW_RANGE_PERCENTAGE="-R $SHOW_RANGE_PERCENTAGE"
+    [ "$NON_INTERACTIVE_MODE" = YES ] && non_interactive="-n"
+    [ "$STAT_ONLY" = YES ] && stat_only="-s"
+    [ "$COMPRESSED_WORDLIST" = YES ] && compressed_wordlist="-c"
+    [ "$FILTER_WORDS" = NO ] && filter_words="-N"
+    [ "$GROUP_WORDS_BY_TWO" = YES ] && group_words_by_two="-2"
+    [ "$GROUP_WORDS_BY_THREE" = YES ] && group_words_by_three="-3"
+
     SHOW_RANGE="$SHOW_RANGE" \
-    SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE" \
-    COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \
-    GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
-    GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
-    STAT_ONLY="$STAT_ONLY" \
     WORDS_GROUPING="$WORDS_GROUPING" \
-    FILTER_WORDS="$FILTER_WORDS" \
-    ALLOWED_WORDS_FILENAME="$ALLOWED_WORDS_FILENAME" \
-    $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
-}
-
-part()
-{
-    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
-    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
-#!/usr/bin/perl
-
-my @lines=<STDIN>;
-my $lines=$#lines;
-my $interval=$ARGV[0];
-if (not $interval) {
-    print @lines;
-}
-else {
-    my ($start,$stop,$total);
-    if ($interval =~ m@(.*)/(.*)@) {
-        $start = $1;
-        $total = $2;
-    }
-    else {
-        $start=$interval;
-        $total=0;
-    }
-    if ($start =~ m@(.*)-(.*)@) {
-        $start = $1;
-        $stop = $2;
-    }
-    if ($start =~ m@(.*)\+(.*)@) {
-        $start = $1;
-        $stop = $start+$2;
-    }
-
-    $start=int($lines/$total*$start);
-    $stop=int($lines/$total*$stop);
-
-    for($i=$start;$i<$stop;$i++){
-        print $lines[$i];
-    }
-}
-PERL_SCRIPT
-    perl $PERL_SCRIPT_TEMP_NAME "$1"
-    rm $PERL_SCRIPT_TEMP_NAME
+    $NEW_WORDS_PY -l "$LANGUAGE" \
+    $SHOW_RANGE_PERCENTAGE \
+    $PART_TO_PROCESS \
+    $ALLOWED_WORDS_FILENAME \
+    $non_interactive \
+    $stat_only \
+    $compressed_wordlist \
+    $filter_words \
+    $group_words_by_two \
+    $group_words_by_three \
+    -X get_words_group_words_add_stat "$1"
 }
 
 if [ "$TAGS_LIST_ONLY" = "YES" ] 
@@ -232,68 +186,30 @@
     exit 0
 fi
 
-mkdir -p $WORK_DIR
-oldpwd="$PWD"
-cd $WORK_DIR
-if [ "$MERGE_TAGGED_WORDS" = "YES" ]
-then
-    VOC_FILES=''
-    for i in $MERGE_THIS_TAGS
-    do
-        f=`tag_file_name $i`
-        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
-    done
-    if [ -z "$VOC_FILES" ]
-    then 
-        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
-    else
-        cat $VOC_FILES
-    fi
-elif [ "$MERGE_ALL_TAGGED" = "YES" ]
-then
-    cat ${LANGUAGE}_*.txt
-elif echo "$1" | grep -q http: 
-then 
-    text_from_url "$1"
-elif [ "$#" != 0 ]
-then
-    if echo $1 | grep -q ^/
-    then
-        cat "$1"
-    else
-        cat "$oldpwd/$1"
-    fi
-else 
-    cat
-fi \
-   | part $PART_TO_PROCESS \
-   | tee $ORIGINAL_TEXT \
-   | \
-    get_words_group_words_add_stat \
-   | tee "$TEMP1" > "$TEMP2"
+get_words_group_words_add_stat "$1"
 
-if [ "$STAT_ONLY" = "YES" ]
-then
-    cat "$TEMP1"
-elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
-then
-    cat "$TEMP1"
-else
-    if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ] 
-    then
-        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
-        if [ "$editor" = vim ]
-        then
-            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
-        else
-            $editor "$TEMP2"
-        fi
-        remove_marks "$TEMP2"
+#mkdir -p $WORK_DIR
+#oldpwd="$PWD"
+#cd $WORK_DIR
+#if [ "$MERGE_TAGGED_WORDS" = "YES" ]
+#then
+#    VOC_FILES=''
+#    for i in $MERGE_THIS_TAGS
+#    do
+#        f=`tag_file_name $i`
+#        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
+#    done
+#    if [ -z "$VOC_FILES" ]
+#    then 
+#        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
+#    else
+#        cat $VOC_FILES
+#    fi
+#elif [ "$MERGE_ALL_TAGGED" = "YES" ]
+#then
+#    cat ${LANGUAGE}_*.txt
+#else 
+#    cat
+#fi 
 
-        vocabulary="$VOCABULARY"
-        [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`"
-        diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
-    fi
-fi
 
-rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
diff -r f583256b7ab1 -r e25de9ea9184 new-words.py
--- a/new-words.py	Mon Oct 31 20:21:20 2011 +0200
+++ b/new-words.py	Tue Nov 01 20:19:18 2011 +0100
@@ -11,6 +11,7 @@
 import subprocess
 import sys
 import Stemmer
+import tempfile
 try:
     import psyco
     psyco.full()
@@ -141,7 +142,13 @@
     dest="language")
 
 parser.add_option(
-    "-f", "--function",
+    "-f", "--allowed-words",
+    help="file with list of allowed words (words that will be shown in the output)",
+    action="store",
+    dest="allowed_words")
+
+parser.add_option(
+    "-X", "--function",
     help="filter through subsystem [INTERNAL]",
     action="store",
     dest="function")
@@ -183,6 +190,12 @@
     dest="delete_tag")
 
 parser.add_option(
+    "-R", "--show-range-percentage",
+    help="show only words that cover specified percentage of the text, skip the rest",
+    action="store",
+    dest="show_range_percentage")
+
+parser.add_option(
     "-s", "--text-stats",
     help="show the text statistics (percentage of known words and so on) and exit",
     action="store_true",
@@ -225,6 +238,16 @@
             res += [line]
     return res
 
+def readlines_from_url(url):
+    return [x.decode('utf-8') for x in
+        subprocess.Popen(
+            "lynx -dump '{url}' | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'".format(url=url),
+            shell = True,
+            stdout = subprocess.PIPE,
+            stderr = subprocess.STDOUT
+            ).communicate()[0].split('\n')
+    ]
+
 def readlines_from_stdin():
     return codecs.getreader("utf-8")(sys.stdin).readlines()
 
@@ -261,8 +284,11 @@
     logging.debug(result)
     return result
 
+def voc_filename():
+    return "%s/%s.txt"%(config['config_directory'], config['language'])
+
 def load_vocabulary():
-    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
+    return get_words(readlines_from_file(voc_filename()))
 
 def notes_filenames():
     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
@@ -409,8 +435,10 @@
         show_range=0,
         show_range_percentage=0,
         ):
+    result = []
     if stats_only:
-        codecs.getwriter("utf-8")(sys.stdout).write(
+        #codecs.getwriter("utf-8")(sys.stdout).write(
+        result.append(
             " ".join([
                 "%-10s" % x for x in [
                 "LANG",
@@ -421,7 +449,7 @@
                 "WPS",
                 "UWPS*10"
                 ]]) + "\n")
-        codecs.getwriter("utf-8")(sys.stdout).write(
+        result.append(
             " ".join([
                 "%(language)-10s",
                 "%(percentage)-10.2f",
@@ -431,10 +459,10 @@
                 "%(wps)-11d"
                 "%(uwps)-11d"
                 ]) % stats + "\n")
-        return
+        return "".join(result)
 
     if print_stats:
-        codecs.getwriter("utf-8")(sys.stdout).write(
+        result.append(
             "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
 
     level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
@@ -448,14 +476,12 @@
 
         normalized_word = normalizator.normalize(word_pair[1])
         if old_normalized_word and old_normalized_word != normalized_word:
-            #codecs.getwriter("utf-8")(sys.stdout).write(
-            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
             if compressed_wordlist:
                 compressed_word_pair = (
                     sum(x[0] for x in words_of_this_group),
                     normalizator.best_word_from_group(words_of_this_group)
                     )
-                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
+                result.append("%10s %s\n" % compressed_word_pair)
                 printed_words += 1
             words_of_this_group = []
 
@@ -463,7 +489,7 @@
         words_of_this_group.append(word_pair)
 
         if not compressed_wordlist:
-            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+            result.append("%10s %s\n" % word_pair)
             printed_words += 1
 
 
@@ -473,28 +499,14 @@
             while 100.0*known/total > level_lines[0]:
                 current_level = level_lines[0]
                 level_lines = level_lines[1:]
-            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
+            result.append("# %s\n" % current_level)
 
         if show_range >0 and printed_words >= show_range:
             break
         if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
             break
 
-def filter_add_notes(args):
-    lines = readlines_from_file(args[0])
-    notes = load_notes(notes_filenames())
-    lines = add_notes(lines, notes)
-    with codecs.open(args[0], "w", "utf-8") as f:
-        for line in lines:
-            f.write(line)
-
-def filter_remove_notes(args):
-    lines = readlines_from_file(args[0])
-    notes = load_notes(notes_filenames())
-    lines = remove_notes(lines, notes)
-    with codecs.open(args[0], "w", "utf-8") as f:
-        for line in lines:
-            f.write(line)
+    return result
 
 def parse_parts_description(parts_description):
     """
@@ -503,8 +515,6 @@
      from-to/step
      from+delta/step
     """
-    def incorrect_parts_description(pd):
-        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
 
     try:
         (a, step) = parts_description.split("/", 1)
@@ -525,7 +535,7 @@
         return (start, stop, step)
 
     except:
-        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
+        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
 
 
 def take_part(lines, part_description = None):
@@ -536,27 +546,45 @@
     part_size = (1.0*n) / step
     result = []
     for i in range(n):
-        if part_size * i >= start and part_size * i <= stop:
-            result += lines[i]
+        if i >= start * part_size and i <= stop * part_size:
+            result += [lines[i]]
     return result
 
 def filter_get_words_group_words_add_stat(args):
     vocabulary = load_vocabulary()
     notes = load_notes(notes_filenames())
-    lines = take_part(readlines_from_stdin(), config.get('pages', ''))
+
+    if len(args) > 0:
+        if 'http://' in args[0]:
+            input_lines = readlines_from_url(args[0])
+        else:
+            input_lines = readlines_from_file(args[0])
+    else:
+        input_lines = readlines_from_stdin()
+
+    if len(input_lines) == 0:
+        print >> sys.stderr, "Nothing to do, standard input is empty, exiting."
+        sys.exit(1)
+
+    lines = take_part(input_lines, config.get('pages', ''))
+
+    (_, original_text_tempfile) = tempfile.mkstemp(prefix='new-word')
+    with codecs.open(original_text_tempfile, "w", "utf-8") as f:
+        f.write("".join(lines))
+
     group_by = [1]
 
-    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
+    if 'two_words' in config:
         group_by.append(2)
-    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
+    if 'three_words' in config:
         group_by.append(3)
     words = get_words(lines, group_by)
     stats_only = False
-    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
+    if 'text_stats' in config:
         stats_only = True
 
     compressed_wordlist = False
-    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
+    if 'compressed' in config:
         compressed_wordlist = True
 
     show_range = os.environ.get('SHOW_RANGE', '')
@@ -564,16 +592,16 @@
         show_range = int(show_range)
     else:
         show_range = 0
-    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
-    if show_range_percentage != '':
-        show_range_percentage = int(show_range_percentage)
+
+    if 'show_range_percentage' in config:
+        show_range_percentage = int(config['show_range_percentage'])
     else:
         show_range_percentage = 0
 
 
     stats = {}
     stats['total'] = sum(words[x] for x in words.keys())
-    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
+    if not 'no_filter' in config:
         words = substract_dictionary(words, vocabulary)
 
     stats['total_unknown'] = sum(words[x] for x in words.keys())
@@ -591,8 +619,8 @@
     normalizator = Normalizator(config['language'], linked_words)
 
     # filter words by allowed_words_filter
-    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
-        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
+    if 'allowed_words' in config:
+        allowed_words_filename = config['allowed_words']
         normalized_allowed_words = [
             normalizator.normalize(w.rstrip('\n')) 
             for w in readlines_from_file(allowed_words_filename)
@@ -615,7 +643,7 @@
                 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
                 reverse=True)
 
-    print_words_sorted(
+    output = print_words_sorted(
         words_with_freq,
         stats,
         normalizator,
@@ -625,14 +653,87 @@
         show_range_percentage=show_range_percentage,
         )
 
+
+    if ('non_interactive' in config or 'text_stats' in config):
+        codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
+    else:
+        (_, temp1) = tempfile.mkstemp(prefix='new-word')
+        (_, temp2) = tempfile.mkstemp(prefix='new-word')
+
+        with codecs.open(temp1, "w", "utf-8") as f:
+            f.write("".join(output))
+        with codecs.open(temp2, "w", "utf-8") as f:
+            f.write("".join(add_notes(output, notes)))
+
+        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
+        os.system((
+            "vim"
+            " -c 'setlocal spell spelllang={language}'"
+            " -c 'set keywordprg={language}'"
+            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
+            " {filename}"
+            " < /dev/tty > /dev/tty"
+            ).format(language=config['language'], filename=temp2))
+
+        lines = remove_notes(readlines_from_file(temp2), notes)
+
+        # compare lines_before and lines_after and return deleted words
+        lines_before = output
+        lines_after = lines
+        deleted_words = []
+
+        for line in lines_before:
+            if line not in lines_after:
+                line = line.strip()
+                if ' ' in line:
+                    word = re.split('\s+', line, 1)[1]
+                    if ' ' in word:
+                        word = re.split('\s+', word, 1)[0]
+                deleted_words.append(word)
+
+        with codecs.open(voc_filename(), "a", "utf-8") as f:
+            f.write("\n".join(deleted_words + ['']))
+
+        os.unlink(temp1)
+        os.unlink(temp2)
+
+    os.unlink(original_text_tempfile)
+
 (options, args) = parser.parse_args()
 if options.language:
     config['language'] = options.language
 
+if options.pages:
+    config['pages'] = options.pages
+else:
+    config['pages'] = ""
+
+if options.allowed_words:
+    config['allowed_words'] = options.allowed_words
+
+if options.show_range_percentage:
+    config['show_range_percentage'] = options.show_range_percentage
+
+if options.non_interactive:
+    config['non_interactive'] = True
+
+if options.text_stats:
+    config['text_stats'] = True
+
+if options.compressed:
+    config['compressed'] = True
+
+if options.no_filter:
+    config['no_filter'] = True
+
+if options.two_words:
+    config['two_words'] = True
+
+if options.three_words:
+    config['three_words'] = True
+
 if options.function:
     function_names = {
-        'add_notes' :   filter_add_notes,
-        'remove_notes': filter_remove_notes,
         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
     }
     if options.function in function_names: