# HG changeset patch # User Igor Chubin # Date 1320175158 -3600 # Node ID e25de9ea918461c8a7ca83d9844b7ca16f239efd # Parent f583256b7ab150bcd0d54855b0e07ca8d22b2288 new-words.py is almost ready diff -r f583256b7ab1 -r e25de9ea9184 new-words-py.sh --- a/new-words-py.sh Mon Oct 31 20:21:20 2011 +0200 +++ b/new-words-py.sh Tue Nov 01 20:19:18 2011 +0100 @@ -49,9 +49,6 @@ NEW_WORDS_PY=/home/igor/hg/new-words/new-words.py WORK_DIR=~/.new-words/ -TEMP1=`mktemp /tmp/new-words-temp1.XXXXXXXXXX` -TEMP2=`mktemp /tmp/new-words-temp2.XXXXXXXXXX` -export ORIGINAL_TEXT=`mktemp /tmp/new-words-orig.XXXXXXXXXX` editor=${EDITOR:-vim} # language detection @@ -133,74 +130,31 @@ exit 0 fi -text_from_url() -{ -lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@' -} - -add_marks() -{ - $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1" -} -remove_marks() -{ - $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1" -} get_words_group_words_add_stat() { + [ "$PART_TO_PROCESS" == "" ] || PART_TO_PROCESS="-p $PART_TO_PROCESS" + [ "$ALLOWED_WORDS_FILENAME" = "" ] || ALLOWED_WORDS_FILENAME="-f $ALLOWED_WORDS_FILENAME" + [ "$SHOW_RANGE_PERCENTAGE" = "" ] || SHOW_RANGE_PERCENTAGE="-R $SHOW_RANGE_PERCENTAGE" + [ "$NON_INTERACTIVE_MODE" = YES ] && non_interactive="-n" + [ "$STAT_ONLY" = YES ] && stat_only="-s" + [ "$COMPRESSED_WORDLIST" = YES ] && compressed_wordlist="-c" + [ "$FILTER_WORDS" = NO ] && filter_words="-N" + [ "$GROUP_WORDS_BY_TWO" = YES ] && group_words_by_two="-2" + [ "$GROUP_WORDS_BY_THREE" = YES ] && group_words_by_three="-3" + SHOW_RANGE="$SHOW_RANGE" \ - SHOW_RANGE_PERCENTAGE="$SHOW_RANGE_PERCENTAGE" \ - COMPRESSED_WORDLIST="$COMPRESSED_WORDLIST" \ - GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \ - GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \ - STAT_ONLY="$STAT_ONLY" \ WORDS_GROUPING="$WORDS_GROUPING" \ - FILTER_WORDS="$FILTER_WORDS" \ - ALLOWED_WORDS_FILENAME="$ALLOWED_WORDS_FILENAME" \ - $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1" -} - -part() -{ - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX` - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME -#!/usr/bin/perl - -my @lines=; -my $lines=$#lines; -my $interval=$ARGV[0]; -if (not $interval) { - print @lines; -} -else { - my ($start,$stop,$total); - if ($interval =~ m@(.*)/(.*)@) { - $start = $1; - $total = $2; - } - else { - $start=$interval; - $total=0; - } - if ($start =~ m@(.*)-(.*)@) { - $start = $1; - $stop = $2; - } - if ($start =~ m@(.*)\+(.*)@) { - $start = $1; - $stop = $start+$2; - } - - $start=int($lines/$total*$start); - $stop=int($lines/$total*$stop); - - for($i=$start;$i<$stop;$i++){ - print $lines[$i]; - } -} -PERL_SCRIPT - perl $PERL_SCRIPT_TEMP_NAME "$1" - rm $PERL_SCRIPT_TEMP_NAME + $NEW_WORDS_PY -l "$LANGUAGE" \ + $SHOW_RANGE_PERCENTAGE \ + $PART_TO_PROCESS \ + $ALLOWED_WORDS_FILENAME \ + $non_interactive \ + $stat_only \ + $compressed_wordlist \ + $filter_words \ + $group_words_by_two \ + $group_words_by_three \ + -X get_words_group_words_add_stat "$1" } if [ "$TAGS_LIST_ONLY" = "YES" ] @@ -232,68 +186,30 @@ exit 0 fi -mkdir -p $WORK_DIR -oldpwd="$PWD" -cd $WORK_DIR -if [ "$MERGE_TAGGED_WORDS" = "YES" ] -then - VOC_FILES='' - for i in $MERGE_THIS_TAGS - do - f=`tag_file_name $i` - [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f" - done - if [ -z "$VOC_FILES" ] - then - echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr - else - cat $VOC_FILES - fi -elif [ "$MERGE_ALL_TAGGED" = "YES" ] -then - cat ${LANGUAGE}_*.txt -elif echo "$1" | grep -q http: -then - text_from_url "$1" -elif [ "$#" != 0 ] -then - if echo $1 | grep -q ^/ - then - cat "$1" - else - cat "$oldpwd/$1" - fi -else - cat -fi \ - | part $PART_TO_PROCESS \ - | tee $ORIGINAL_TEXT \ - | \ - get_words_group_words_add_stat \ - | tee "$TEMP1" > "$TEMP2" +get_words_group_words_add_stat "$1" -if [ "$STAT_ONLY" = "YES" ] -then - cat "$TEMP1" -elif [ "$NON_INTERACTIVE_MODE" = "YES" ] -then - cat "$TEMP1" -else - if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ] - then - [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2" - if [ "$editor" = vim ] - then - vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty - else - $editor "$TEMP2" - fi - remove_marks "$TEMP2" +#mkdir -p $WORK_DIR +#oldpwd="$PWD" +#cd $WORK_DIR +#if [ "$MERGE_TAGGED_WORDS" = "YES" ] +#then +# VOC_FILES='' +# for i in $MERGE_THIS_TAGS +# do +# f=`tag_file_name $i` +# [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f" +# done +# if [ -z "$VOC_FILES" ] +# then +# echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr +# else +# cat $VOC_FILES +# fi +#elif [ "$MERGE_ALL_TAGGED" = "YES" ] +#then +# cat ${LANGUAGE}_*.txt +#else +# cat +#fi - vocabulary="$VOCABULARY" - [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`" - diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary" - fi -fi -rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT" diff -r f583256b7ab1 -r e25de9ea9184 new-words.py --- a/new-words.py Mon Oct 31 20:21:20 2011 +0200 +++ b/new-words.py Tue Nov 01 20:19:18 2011 +0100 @@ -11,6 +11,7 @@ import subprocess import sys import Stemmer +import tempfile try: import psyco psyco.full() @@ -141,7 +142,13 @@ dest="language") parser.add_option( - "-f", "--function", + "-f", "--allowed-words", + help="file with list of allowed words (words that will be shown in the output)", + action="store", + dest="allowed_words") + +parser.add_option( + "-X", "--function", help="filter through subsystem [INTERNAL]", action="store", dest="function") @@ -183,6 +190,12 @@ dest="delete_tag") parser.add_option( + "-R", "--show-range-percentage", + help="show only words that cover specified percentage of the text, skip the rest", + action="store", + dest="show_range_percentage") + +parser.add_option( "-s", "--text-stats", help="show the text statistics (percentage of known words and so on) and exit", action="store_true", @@ -225,6 +238,16 @@ res += [line] return res +def readlines_from_url(url): + return [x.decode('utf-8') for x in + subprocess.Popen( + "lynx -dump '{url}' | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'".format(url=url), + shell = True, + stdout = subprocess.PIPE, + stderr = subprocess.STDOUT + ).communicate()[0].split('\n') + ] + def readlines_from_stdin(): return codecs.getreader("utf-8")(sys.stdin).readlines() @@ -261,8 +284,11 @@ logging.debug(result) return result +def voc_filename(): + return "%s/%s.txt"%(config['config_directory'], config['language']) + def load_vocabulary(): - return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language']))) + return get_words(readlines_from_file(voc_filename())) def notes_filenames(): return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])] @@ -409,8 +435,10 @@ show_range=0, show_range_percentage=0, ): + result = [] if stats_only: - codecs.getwriter("utf-8")(sys.stdout).write( + #codecs.getwriter("utf-8")(sys.stdout).write( + result.append( " ".join([ "%-10s" % x for x in [ "LANG", @@ -421,7 +449,7 @@ "WPS", "UWPS*10" ]]) + "\n") - codecs.getwriter("utf-8")(sys.stdout).write( + result.append( " ".join([ "%(language)-10s", "%(percentage)-10.2f", @@ -431,10 +459,10 @@ "%(wps)-11d" "%(uwps)-11d" ]) % stats + "\n") - return + return "".join(result) if print_stats: - codecs.getwriter("utf-8")(sys.stdout).write( + result.append( "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats) level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102) @@ -448,14 +476,12 @@ normalized_word = normalizator.normalize(word_pair[1]) if old_normalized_word and old_normalized_word != normalized_word: - #codecs.getwriter("utf-8")(sys.stdout).write( - # "### %s\n" % normalizator.best_word_from_group(words_of_this_group)) if compressed_wordlist: compressed_word_pair = ( sum(x[0] for x in words_of_this_group), normalizator.best_word_from_group(words_of_this_group) ) - codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair) + result.append("%10s %s\n" % compressed_word_pair) printed_words += 1 words_of_this_group = [] @@ -463,7 +489,7 @@ words_of_this_group.append(word_pair) if not compressed_wordlist: - codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair) + result.append("%10s %s\n" % word_pair) printed_words += 1 @@ -473,28 +499,14 @@ while 100.0*known/total > level_lines[0]: current_level = level_lines[0] level_lines = level_lines[1:] - codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level) + result.append("# %s\n" % current_level) if show_range >0 and printed_words >= show_range: break if show_range_percentage >0 and 100.0*known/total >= show_range_percentage: break -def filter_add_notes(args): - lines = readlines_from_file(args[0]) - notes = load_notes(notes_filenames()) - lines = add_notes(lines, notes) - with codecs.open(args[0], "w", "utf-8") as f: - for line in lines: - f.write(line) - -def filter_remove_notes(args): - lines = readlines_from_file(args[0]) - notes = load_notes(notes_filenames()) - lines = remove_notes(lines, notes) - with codecs.open(args[0], "w", "utf-8") as f: - for line in lines: - f.write(line) + return result def parse_parts_description(parts_description): """ @@ -503,8 +515,6 @@ from-to/step from+delta/step """ - def incorrect_parts_description(pd): - raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd) try: (a, step) = parts_description.split("/", 1) @@ -525,7 +535,7 @@ return (start, stop, step) except: - raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd) + raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description) def take_part(lines, part_description = None): @@ -536,27 +546,45 @@ part_size = (1.0*n) / step result = [] for i in range(n): - if part_size * i >= start and part_size * i <= stop: - result += lines[i] + if i >= start * part_size and i <= stop * part_size: + result += [lines[i]] return result def filter_get_words_group_words_add_stat(args): vocabulary = load_vocabulary() notes = load_notes(notes_filenames()) - lines = take_part(readlines_from_stdin(), config.get('pages', '')) + + if len(args) > 0: + if 'http://' in args[0]: + input_lines = readlines_from_url(args[0]) + else: + input_lines = readlines_from_file(args[0]) + else: + input_lines = readlines_from_stdin() + + if len(input_lines) == 0: + print >> sys.stderr, "Nothing to do, standard input is empty, exiting." + sys.exit(1) + + lines = take_part(input_lines, config.get('pages', '')) + + (_, original_text_tempfile) = tempfile.mkstemp(prefix='new-word') + with codecs.open(original_text_tempfile, "w", "utf-8") as f: + f.write("".join(lines)) + group_by = [1] - if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES': + if 'two_words' in config: group_by.append(2) - if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES': + if 'three_words' in config: group_by.append(3) words = get_words(lines, group_by) stats_only = False - if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES': + if 'text_stats' in config: stats_only = True compressed_wordlist = False - if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES': + if 'compressed' in config: compressed_wordlist = True show_range = os.environ.get('SHOW_RANGE', '') @@ -564,16 +592,16 @@ show_range = int(show_range) else: show_range = 0 - show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '') - if show_range_percentage != '': - show_range_percentage = int(show_range_percentage) + + if 'show_range_percentage' in config: + show_range_percentage = int(config['show_range_percentage']) else: show_range_percentage = 0 stats = {} stats['total'] = sum(words[x] for x in words.keys()) - if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES': + if not 'no_filter' in config: words = substract_dictionary(words, vocabulary) stats['total_unknown'] = sum(words[x] for x in words.keys()) @@ -591,8 +619,8 @@ normalizator = Normalizator(config['language'], linked_words) # filter words by allowed_words_filter - if os.environ.get('ALLOWED_WORDS_FILENAME', ''): - allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '') + if 'allowed_words' in config: + allowed_words_filename = config['allowed_words'] normalized_allowed_words = [ normalizator.normalize(w.rstrip('\n')) for w in readlines_from_file(allowed_words_filename) @@ -615,7 +643,7 @@ cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words), reverse=True) - print_words_sorted( + output = print_words_sorted( words_with_freq, stats, normalizator, @@ -625,14 +653,87 @@ show_range_percentage=show_range_percentage, ) + + if ('non_interactive' in config or 'text_stats' in config): + codecs.getwriter("utf-8")(sys.stdout).write("".join(output)) + else: + (_, temp1) = tempfile.mkstemp(prefix='new-word') + (_, temp2) = tempfile.mkstemp(prefix='new-word') + + with codecs.open(temp1, "w", "utf-8") as f: + f.write("".join(output)) + with codecs.open(temp2, "w", "utf-8") as f: + f.write("".join(add_notes(output, notes))) + + os.putenv('ORIGINAL_TEXT', original_text_tempfile) + os.system(( + "vim" + " -c 'setlocal spell spelllang={language}'" + " -c 'set keywordprg={language}'" + " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'" + " {filename}" + " < /dev/tty > /dev/tty" + ).format(language=config['language'], filename=temp2)) + + lines = remove_notes(readlines_from_file(temp2), notes) + + # compare lines_before and lines_after and return deleted words + lines_before = output + lines_after = lines + deleted_words = [] + + for line in lines_before: + if line not in lines_after: + line = line.strip() + if ' ' in line: + word = re.split('\s+', line, 1)[1] + if ' ' in word: + word = re.split('\s+', word, 1)[0] + deleted_words.append(word) + + with codecs.open(voc_filename(), "a", "utf-8") as f: + f.write("\n".join(deleted_words + [''])) + + os.unlink(temp1) + os.unlink(temp2) + + os.unlink(original_text_tempfile) + (options, args) = parser.parse_args() if options.language: config['language'] = options.language +if options.pages: + config['pages'] = options.pages +else: + config['pages'] = "" + +if options.allowed_words: + config['allowed_words'] = options.allowed_words + +if options.show_range_percentage: + config['show_range_percentage'] = options.show_range_percentage + +if options.non_interactive: + config['non_interactive'] = True + +if options.text_stats: + config['text_stats'] = True + +if options.compressed: + config['compressed'] = True + +if options.no_filter: + config['no_filter'] = True + +if options.two_words: + config['two_words'] = True + +if options.three_words: + config['three_words'] = True + if options.function: function_names = { - 'add_notes' : filter_add_notes, - 'remove_notes': filter_remove_notes, 'get_words_group_words_add_stat': filter_get_words_group_words_add_stat, } if options.function in function_names: