# HG changeset patch
# User Igor Chubin
# Date 1295736151 -3600
# Node ID adbc809d39242cc78d735fafc32f3adcbe1c2519
# Parent  be6336e98b3c34bb8b165103cdde16d12a00f934
Transition to Python started

new-words-py.sh is a wrapper around the new-words.py
version, which is not finished yet.

diff -r be6336e98b3c -r adbc809d3924 new-words-py.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/new-words-py.sh	Sat Jan 22 23:42:31 2011 +0100
@@ -0,0 +1,825 @@
+#!/bin/bash
+
+show_usage()
+{
+cat <<HELP >/dev/stderr
+
+USAGE:
+
+    new-words [ -l lang ] [ -s ] [ ARG ]
+
+SWITCHES:
+
+    -h          print this screen
+    -c          show compressed wordlist: one word per group
+    -k          rank words that are similar to known words higher (only for English)
+    -l lang     override language settings
+    -n          non-interactive mode (don't run vi)
+    -N          turn off known-words filtering
+    -a          don't add marks (and don't save marks added by the user)
+    -p pages    work with the specified pages only (pages = start-stop/total)
+    -s          show the text statistics (percentage of known words and so on) and exit
+    -S          show your vocabulary statistics (number of words and word groups)
+    -t tag      tag known words with tag
+    -T          show the list of active tags
+    -m tag      merge the words tagged with "tag" into the main vocabulary
+    -M          merge the words tagged with any tag into the main vocabulary
+    -r tag      remove the subvocabulary for "tag"
+    -2 -3       find two- and three-word sequences
+
+The language of the text can also be specified via the name of the
+program new-words (the corresponding link must be created beforehand).
+For example, these calls are equivalent:
+
+    de-words URL
+    new-words -l de URL
+
+HELP
+}
+
+if [ "$1" = "-h" ]
+then
+    show_usage
+    exit 0
+fi
+
+NEW_WORDS_PY=/home/igor/hg/new-words/new-words.py
+WORK_DIR=~/.new-words/
+TEMP1=`mktemp /tmp/new-words-temp1.XXXXXXXXXX`
+TEMP2=`mktemp /tmp/new-words-temp2.XXXXXXXXXX`
+export ORIGINAL_TEXT=`mktemp /tmp/new-words-orig.XXXXXXXXXX`
+editor=${EDITOR:-vim}
+
+# language detection
+
+LANGUAGE=en
+my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
+for arg
+do
+    if echo "$arg" | grep -q http://...wikipedia.org/wiki/
+    then
+        LANGUAGE="`echo $arg | sed s@http://@@ | sed s@.wikipedia.*@@`"
+    fi
+done
+[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
+
+#----------------------------------------------------
+# command line options processing
+
+STAT_ONLY=NO
+NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
+DONT_ADD_MARKS=NO
+NON_INTERACTIVE_MODE=NO
+PART_TO_PROCESS=''
+GROUP_WORDS_BY_THREE=NO
+GROUP_WORDS_BY_TWO=NO
+TAG_NAME=''
+MERGE_THIS_TAGS=''
+TAGS_LIST_ONLY=NO
+MERGE_TAGGED_WORDS=NO
+MERGE_ALL_TAGGED=NO
+DONT_ADD_MARKLINES=NO
+FILTER_WORDS=YES
+SHOW_VOC_STAT=NO
+COMPRESSED_WORDLIST=NO
+OLD_STYLE="NO"
+while getopts Ocl:sSkanNp:t:Tm:Mr:23 opt
+do
+    case "$opt" in
+        O) OLD_STYLE=YES;;
+        c) COMPRESSED_WORDLIST=YES;;
+        s) STAT_ONLY=YES;;
+        S) SHOW_VOC_STAT=YES;;
+        k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
+        l) LANGUAGE="$OPTARG";;
+        a) DONT_ADD_MARKS=YES;;
+        n) NON_INTERACTIVE_MODE=YES;;
+        N) FILTER_WORDS=NO;;
+        p) PART_TO_PROCESS="$OPTARG";;
+        t) TAG_NAME="$OPTARG";;
+        T) TAGS_LIST_ONLY="YES";;
+        m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
+        M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
+        r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
+        2) GROUP_WORDS_BY_TWO=YES;;
+        3) GROUP_WORDS_BY_THREE=YES;;
+        \?) # unknown flag
+            show_usage
+            exit 1;;
+    esac
+done
+shift `expr $OPTIND - 1`
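The option set above is what new-words.py will gradually take over. The Python side of this patch already builds an optparse parser (second hunk below); purely as an illustration, and not part of the patch, this is how a few of the getopts switches map onto optparse (the long option names here are invented for the sketch):

    import optparse

    # Hypothetical mapping of a few switches from the getopts string
    # "Ocl:sSkanNp:t:Tm:Mr:23"; new-words.py defines its own, fuller parser.
    parser = optparse.OptionParser()
    parser.add_option("-l", "--language", action="store", dest="language",
                      help="override language settings")
    parser.add_option("-s", "--text-stats", action="store_true", dest="stat_only",
                      help="show the text statistics and exit")
    parser.add_option("-n", "--non-interactive", action="store_true",
                      dest="non_interactive", help="don't run the editor")
    (options, args) = parser.parse_args()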
+
+if [ "$1" = "-l" ]
+then
+    LANGUAGE="$2"
+    shift 2
+fi
+
+VOCABULARY=${LANGUAGE}.txt
+NOTES_FILE=notes-${LANGUAGE}.txt
+
+if [ "${SHOW_VOC_STAT}" = "YES" ]
+then
+    $0 -l "${LANGUAGE}" -N -n ${WORK_DIR}/${VOCABULARY} | head -1 | awk '{print $5}' | tr -d "<>"
+    exit 0
+fi
+
+#----------------------------------------------------
+
+get_words()
+{
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f get_words "$1"
+    else
+        get_words_OLD "$@"
+    fi
+}
+
+get_words_OLD()
+{
+    export FILTER_WORDS
+tr ' ' '\n' | sed 's/--/ /g' \
+| sed "s/'/__APOSTROPHE__/g" \
+| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
+| sed "s/__APOSTROPHE__/'/g" \
+| tr ' ' '\n' \
+| tee "$1" \
+| grep_v_english_perl \
+| sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
+}
+
+add_stat()
+{
+    if [ "$DONT_ADD_MARKLINES" = "YES" ]
+    then
+        cat
+        return
+    fi
+    before="$1"
+    after=${before}2
+    cat > "$after"
+    total="`wc -w $1 | awk '{print $1}'`"
+    total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
+    total_known="`echo $total-$total_unknown|bc`"
+    percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
+    #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
+    sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
+
+    if [ "$STAT_ONLY" = "YES" ]
+    then
+        echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"
+        echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "
+        rm $after
+        return 0
+    else
+        groups="`echo $(grep '# groups' $after | awk '{print $3}')`"
+        words="`echo $(grep -v '^#' $after | wc -l)`"
+        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
+    fi
+
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+my $total=shift(@ARGV);
+my $total_known=shift(@ARGV);
+my $s=0;
+my $mark_line=int($total_known*100/$total/5)*5;
+if ($mark_line>=90) {
+    $mark_line=int($total_known*100/$total)+1;
+} else { $mark_line +=5; };
+while(<>)
+{
+    next if /^#\s*groups\s*/;
+    print;
+    /^\s*([0-9]*)\s*/;
+    $s+=$1;
+    if (($total_known+$s)*100/$total>=$mark_line) {
+        print "# $mark_line\n";
+        if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
+    }
+}
+PERL_SCRIPT
+    perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
+    rm $PERL_SCRIPT_TEMP_NAME
+    rm $after
+}
+
+two_and_three_words()
+{
+    if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
+    then
+        cat
+    else
+        cat
+
+        export GROUP_WORDS_BY_THREE
+        export GROUP_WORDS_BY_TWO
+        PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
+        cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+#!/usr/bin/perl
+local $/;
+$words=<>;
+$words=~ s@[!?;,:#1-9".]@ @g;
+$words =~ s@\s+@ @g;
+@words = split /\s+/, $words;
+for ($i=0; $i<$#words-3;$i++) {
+    my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
+    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
+        print "${a}_${b}_${c}\n";
+    };
+    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
+        print "${a}_${b}\n";
+    };
+}
+PERL_SCRIPT
+        perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
+        rm $PERL_SCRIPT_TEMP_NAME
+    fi
+}
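two_and_three_words() appends underscore-joined pairs and triples to the word stream, so frequent collocations surface in the same list as single words. A rough Python equivalent of the embedded Perl, not part of this patch (the character class is copied from the Perl, quirks such as the 1-9 digit range included):

    import re

    def word_sequences(text, by_two=False, by_three=False):
        # strip punctuation the way the Perl filter does, then emit
        # adjacent pairs/triples joined with underscores
        words = re.sub(r'[!?;,:#1-9".]', ' ', text).split()
        for i in range(len(words) - 2):
            a, b, c = words[i], words[i + 1], words[i + 2]
            if by_three and a and b and c:
                yield "%s_%s_%s" % (a, b, c)
            if by_two and a and b:
                yield "%s_%s" % (a, b)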
+
+grep_v_english()
+{
+[ -e "$VOCABULARY" ] || touch "$VOCABULARY"
+eval $(cat $VOCABULARY | tr -d "'" | xargs -n10 echo | tr ' ' '|' | sed 's/^/egrep -xv "RRRRRRR|/' | sed 's/$/"/' | tr '\n' '|')cat
+}
+
+grep_v_english_perl()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+    if ($ENV{FILTER_WORDS} eq "NO") {
+        while(<>) { print; }
+        exit(0);
+    }
+$voc_files=$ENV{VOC_FILES};
+$voc_files=~s@^ @@;
+for $voc_file (split /\s+/,$voc_files) {
+    if (open(VOC, $voc_file)) {
+        while (<VOC>) {
+            chomp;
+            #s/'//g;
+            $voc{$_}="1";
+        }
+    }
+}
+while(<>) {
+    chomp;
+    if (not defined($voc{$_})) { print "$_\n"; }
+}
+PERL_SCRIPT
+    [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
+    export VOCABULARY VOC_FILES
+    VOC_FILES=$VOCABULARY
+    for i in $TAG_NAME
+    do
+        VOC_FILES="${VOC_FILES} `tag_file_name $i`"
+    done
+    perl $PERL_SCRIPT_TEMP_NAME
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+group_words()
+{
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f group_words "$1"
+    else
+        group_words_OLD "$@"
+    fi
+}
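get_words and group_words are the two filters that have been ported so far: unless -O forces the old code path, they pipe the stream through new-words.py via the internal -f switch. new-words.py already imports subprocess, though it does not use it yet; if the same delegation were ever needed from Python, it could look like this sketch (the helper name is invented):

    import subprocess

    NEW_WORDS_PY = "/home/igor/hg/new-words/new-words.py"

    def run_filter(function, language, text):
        # equivalent of: $NEW_WORDS_PY -l "$LANGUAGE" -f <function>
        p = subprocess.Popen([NEW_WORDS_PY, "-l", language, "-f", function],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        out, _ = p.communicate(text)
        return out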
+
+group_words_OLD()
+{
+    #if [ "$LANGUAGE" != "en" ]
+    #then
+    #    cat
+    #    return
+    #fi
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+#!/usr/bin/perl
+
+use Encode;
+use utf8;
+use Lingua::Stem::Snowball qw(stem);
+
+eval {
+# http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
+    require String::Similarity;
+    String::Similarity->import();
+};
+unless($@)
+{
+    our $HAVE_String_Similarity=1;
+}
+
+sub load_notes_dict()
+{
+    my %dict;
+    if (open(NOTES, $ENV{NOTES_FILE})) {
+        while(<NOTES>) {
+            $_ = decode( "utf8", $_);
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            $dict{$a}=$b;
+        }
+    }
+    return %dict;
+}
+
+sub similar($$){
+    my $a=shift;
+    my $b=shift;
+    if ($HAVE_String_Similarity) {
+        return similarity($a, $b);
+    }
+    else {
+        return 0;
+    }
+}
+
+sub normalize_without_linked($)
+{
+    if    ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
+    elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
+    elsif ( $ENV{LANGUAGE} eq "uk" ) { return normalize_ukrainian(shift); }
+    elsif ( $ENV{LANGUAGE} eq "io" ) { return normalize_esperanto(shift); }
+    else { return shift ; }
+}
+
+sub normalize_with_linked($)
+{
+    my $word = normalize_without_linked(shift);
+    #return $word;
+    if ($linked_words{$word}) {
+        return $linked_words{$word};
+    }
+    else {
+        return $word;
+    }
+}
+
+sub normalize($)
+{
+    return normalize_with_linked(shift);
+}
+
+sub normalize_ukrainian($)
+{
+    $_=lc(shift);
+    s/[юіоеуаи]$//g;
+    return $_;
+}
+
+sub normalize_esperanto($)
+{
+    $_=lc(shift);
+# verbs
+    s/i$//; s/is$//; s/os$//; s/as$//; s/us$//;
+
+# nouns
+    s/j?n?$//;
+
+    return $_;
+}
+
+sub normalize_german($)
+{
+    @stems = stem('de', \@_);
+    return $stems[0];
+}
+
+sub normalize_german_($)
+{
+    $_=lc(shift);
+
+    s/heit$//; s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//;
+    s/en$//; s/er$//;
+
+    s/lich$//; s/ig$//;
+    s/al$//; s/isch$//;
+    s/ell$//; s/haft$//;
+
+    s/bar$//; s/sam$//; s/lich$//;
+
+    @prefixes=qw(
+        ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
+        be emp ent er ge miss ver zer durch über um unter wieder);
+    @prefixes=();
+    for $pref (@prefixes) {
+        s/^$pref//;
+    }
+
+    return $_;
+}
+
+sub normalize_english($)
+{
+    $_=lc(shift);
+
+    s/s$//;
+
+    s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//;
+
+    s/ed$//;
+    s/en$//;
+    s/er$//;
+    s/est$//;
+    s/ing$//;
+
+    s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
+    s/ify$//; s/fy$//; s/ly$//;
+    s/ise$//; s/ize$//;
+
+    s/e$//;
+    return $_;
+}
+
+sub compare($$)
+{
+    my $a=shift;
+    my $b=shift;
+    $a =~ s/^\s*//;
+    $b =~ s/^\s*//;
+    my ($a1, $a2)= split /\s+/,$a,2;
+    my ($b1, $b2)= split /\s+/,$b,2;
+
+    my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
+
+    if ($cmp) {
+        return $cmp;
+    }
+    else {
+        if (normalize($a2) ne normalize($b2)) {
+            return normalize($a2) cmp normalize($b2);
+        }
+        else {
+            return $a1 <=> $b1;
+        }
+    }
+}
+
+sub log_($)
+{
+    return;
+    open(LOG, ">>", "/tmp/log1");
+    print LOG $_[0];
+    close(LOG);
+}
+
+sub find_linked_words($)
+{
+    my %linked_words;
+    my $dict = shift;
+    log_("1");
+    log_(join(" ", keys(%$dict)));
+
+    for $key (keys(%$dict)) {
+        $val = $dict->{$key};
+        log_($key."\n");
+        if ($val =~ /\@([a-z]*)/) {
+            $linked_words{normalize($key)} = normalize($1);
+            log_(normalize($key)." = ".normalize($1)."\n");
+        }
+    }
+    return %linked_words;
+}
+
+sub lc_length($)
+{
+    my $a= shift;
+    $a =~ s/[a-z]//g;
+    return length($a);
+}
+
+our %dict = load_notes_dict();
+our %linked_words = find_linked_words(\%dict);
+
+our %Vocabulary;
+open(VOC, $ENV{VOCABULARY})
+    or die "Can't open VOCABULARY";
+while (<VOC>) {
+    chomp;
+    #s/'//g;
+    $Vocabulary{normalize($_)}="1";
+}
+close(VOC);
+
+binmode STDIN,":utf8";
+@lines=<STDIN>;
+for $L (@lines) {
+    chomp($L);
+    #$L = decode( "utf8", $L);
+    $l=$L;
+    $l =~ s/^\s*//;
+    my ($a, $b)=split(/\s+/,$l,2);
+    $group_weight{normalize($b)}+=$a;
+}
+if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
+    for $k (keys %group_weight) {
+        if (defined($Vocabulary{$k})) {
+            $group_weight{$k} *= 2;
+        }
+    }
+}
+@lines2 = sort { compare($b,$a) } @lines;
+binmode STDOUT, ":utf8";
+print "# groups ".scalar(keys(%group_weight))."\n";
+if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
+    my $sum = 0;
+    my $min = 9999;
+    for $L (@lines2) {
+        chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
+        $group_name = normalize($b);
+        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
+            #print (" "x(7-length($sum))),"$sum $main_word\n";
+            print +(" "x(7-length($sum))),"$sum $main_word\n";
+            $sum = $a;
+            $min = length($b) + 2*lc_length($b);
+            $main_word = $b;
+        }
+        else {
+            $sum += $a;
+            if ($min > length($b) + 2*lc_length($b)) {
+                $min = length($b) + 2*lc_length($b);
+                $main_word = $b;
+            }
+        }
+        $prev_group_name = $group_name;
+    }
+}
+else {
+    for $l (@lines2) {
+        print "$l\n";
+    }
+}
+PERL_SCRIPT
+    export VOCABULARY
+    export NEED_TO_USE_VOCABULARY_WHEN_SORT
+    export LANGUAGE
+    export COMPRESSED_WORDLIST
+    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
+    export NOTES_FILE
+    perl $PERL_SCRIPT_TEMP_NAME
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+text_from_url()
+{
+lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
+}
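On the Python side, the hand-written normalize_* suffix strippers above are replaced by Snowball stemming: the Normalizator class in the second hunk wraps PyStemmer's Stemmer module, much as normalize_german() already wraps Lingua::Stem::Snowball. A minimal sketch of what that buys, assuming PyStemmer is installed (the exact stems are illustrative):

    import Stemmer  # PyStemmer, the module new-words.py imports below

    stemmer = Stemmer.Stemmer('english')
    for word in ('following', 'follows', 'followed'):
        print stemmer.stemWord(word)   # one stem, e.g. 'follow', per group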
+
+add_marks()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+use Encode;
+
+sub load_notes_dict()
+{
+    my %dict;
+    if (open(NOTES, $ENV{NOTES_FILE})) {
+        while(<NOTES>) {
+            $_ = decode( "utf8", $_);
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            $dict{$a}=$b;
+        }
+    }
+    return %dict;
+}
+
+%dict = load_notes_dict();
+
+$file = $ARGV[0];
+if (open(F, $file)) {
+    @lines=<F>;
+    close(F);
+    for (@lines) {$_ = decode( "utf8", $_);};
+
+    if (open(F, ">$file")) {
+        binmode F, ":utf8";
+        for (@lines) {
+            m/\s+\S+\s+(\S+)/;
+            $name=$1;
+            if (not /^#/ and defined($dict{$name})) {
+                chomp;
+                $mark=$dict{$name};
+                $space=" "x(30-length($_));
+                print F "$_$space$mark\n";
+            }
+            else {
+                print F "$_";
+            }
+        }
+        close(F);
+    }
+}
+PERL_SCRIPT
+    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
+    export NOTES_FILE
+    perl $PERL_SCRIPT_TEMP_NAME "$1"
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+remove_marks()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+$file = $ARGV[0];
+our %dict;
+if (open(F, $file)) {
+    @lines=<F>;
+    close(F);
+
+    if (open(F, ">$file")) {
+        for (@lines) {
+            chomp;
+            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
+                my $name=$4;
+                my $comment=$6;
+                $dict{$name}=$comment;
+                print F "$1$2$3$4\n";
+            }
+            else {
+                print F "$_\n";
+            }
+        }
+    }
+}
+if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
+    @lines=<NOTES>;
+    close(NOTES);
+
+    if (open(NOTES, ">".$ENV{NOTES_FILE})) {
+        for (@lines) {
+            chomp;
+            s/^\s+//;
+            my ($a,$b)=split /\s+/,$_,2;
+            if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
+                print NOTES "$_\n";
+                if (defined($dict{$a})) { delete($dict{$a}); }
+            }
+        }
+        for (keys %dict) {
+            $mark=$dict{$_};
+            $space=" "x(30-length($_));
+            print NOTES "$_$space$mark\n";
+        }
+    }
+}
+PERL_SCRIPT
+    [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
+    export NOTES_FILE
+    export DONT_ADD_MARKS
+    perl $PERL_SCRIPT_TEMP_NAME "$1"
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+part()
+{
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
+    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+#!/usr/bin/perl
+
+my @lines=<STDIN>;
+my $lines=$#lines;
+my $interval=$ARGV[0];
+if (not $interval) {
+    print @lines;
+}
+else {
+    my ($start,$stop,$total);
+    if ($interval =~ m@(.*)/(.*)@) {
+        $start = $1;
+        $total = $2;
+    }
+    else {
+        $start=$interval;
+        $total=0;
+    }
+    if ($start =~ m@(.*)-(.*)@) {
+        $start = $1;
+        $stop = $2;
+    }
+    if ($start =~ m@(.*)\+(.*)@) {
+        $start = $1;
+        $stop = $start+$2;
+    }
+
+    $start=int($lines/$total*$start);
+    $stop=int($lines/$total*$stop);
+
+    for($i=$start;$i<$stop;$i++){
+        print $lines[$i];
+    }
+}
+PERL_SCRIPT
+    perl $PERL_SCRIPT_TEMP_NAME "$1"
+    rm $PERL_SCRIPT_TEMP_NAME
+}
+
+if [ "$TAGS_LIST_ONLY" = "YES" ]
+then
+    cd "${WORK_DIR}"
+    echo ${LANGUAGE}_*.txt | tr ' ' '\n' | grep -v '*' | sed 's/[^_]*_//;s/.txt$//'
+    exit 0
+fi
+
+tag_file_name()
+{
+    echo "${LANGUAGE}_${1}.txt"
+}
+
+if [ "$REMOVE_TAG" = "YES" ]
+then
+    cd "${WORK_DIR}"
+    for i in $TAG_NAME
+    do
+        echo "$i" | grep -q '[/*?]' && continue
+        f="`tag_file_name $i`"
+        if [ -e "$f" ]
+        then
+            rm -f "$f" && echo Tag "'$i'" removed
+        else
+            echo Unknown tag "'$i'"
+        fi
+    done
+    exit 0
+fi
+
+mkdir -p $WORK_DIR
+oldpwd="$PWD"
+cd $WORK_DIR
+if [ "$MERGE_TAGGED_WORDS" = "YES" ]
+then
+    VOC_FILES=''
+    for i in $MERGE_THIS_TAGS
+    do
+        f=`tag_file_name $i`
+        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
+    done
+    if [ -z "$VOC_FILES" ]
+    then
+        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
+    else
+        cat $VOC_FILES
+    fi
+elif [ "$MERGE_ALL_TAGGED" = "YES" ]
+then
+    cat ${LANGUAGE}_*.txt
+elif echo "$1" | grep -q http:
+then
+    text_from_url "$1"
+elif [ "$#" != 0 ]
+then
+    if echo $1 | grep -q ^/
+    then
+        cat "$1"
+    else
+        cat "$oldpwd/$1"
+    fi
+else
+    cat
+fi \
+    | part $PART_TO_PROCESS \
+    | tee $ORIGINAL_TEXT \
+    | two_and_three_words \
+    | get_words ${TEMP1}-full \
+    | group_words \
+    | add_stat ${TEMP1}-full \
+    | tee "$TEMP1" > "$TEMP2"
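part() implements the -p switch: an interval such as 2-3/10 selects the corresponding slice of the input lines. The same arithmetic in Python, as a sketch that handles only the N-M/T form (the Perl above also accepts N+K/T):

    def select_part(lines, interval):
        # "2-3/10" means: lines from 2/10 through 3/10 of the text
        if not interval:
            return lines
        spec, _, total = interval.partition('/')
        start, _, stop = spec.partition('-')
        start, stop, total = int(start), int(stop), int(total)
        first = len(lines) * start // total
        last = len(lines) * stop // total
        return lines[first:last]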
+
+if [ "$STAT_ONLY" = "YES" ]
+then
+    cat "$TEMP1"
+elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
+then
+    cat "$TEMP1"
+else
+    if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ]
+    then
+        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
+        if [ "$editor" = vim ]
+        then
+            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
+        else
+            $editor "$TEMP2"
+        fi
+        remove_marks "$TEMP2"
+
+        vocabulary="$VOCABULARY"
+        [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`"
+        diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
+    fi
+fi
+
+rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
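add_marks and remove_marks are still Perl-only: before the editor runs they pad each word line to column 30 and append the saved note, and afterwards they collect edited notes back into the notes file. The Python rewrite below does not cover this yet; the add side could start out like this sketch (not part of the patch):

    def add_marks(lines, notes):
        # notes maps word -> annotation, as parsed from notes-LANG.txt;
        # the word is the second whitespace-separated field of each line
        result = []
        for line in lines:
            fields = line.split()
            word = fields[1] if len(fields) > 1 else None
            if not line.startswith('#') and word in notes:
                stripped = line.rstrip('\n')
                pad = ' ' * max(1, 30 - len(stripped))
                result.append(stripped + pad + notes[word] + '\n')
            else:
                result.append(line)
        return result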
diff -r be6336e98b3c -r adbc809d3924 new-words.py
--- a/new-words.py	Fri Jan 21 15:59:45 2011 +0200
+++ b/new-words.py	Sat Jan 22 23:42:31 2011 +0100
@@ -1,6 +1,39 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import codecs
+import logging
+import os
 import optparse
+import re
+import subprocess
+import sys
+import Stemmer
+
+config = {
+    'config_directory': os.environ['HOME'] + '/.new-words',
+    'language': 'en',
+}
+
+logging.basicConfig(filename='/tmp/new-words-py.log', level=logging.DEBUG)
+
+class Normalizator:
+    def __init__(self, language, linked_words={}):
+        stemmer_algorithm = {
+            'de' : 'german',
+            'en' : 'english',
+            'ru' : 'russian',
+            'uk' : 'ukrainian',
+        }
+        self.stemmer = Stemmer.Stemmer(stemmer_algorithm[language])
+        self.linked_words = linked_words
+
+    def normalize(self, word):
+        word_chain = []
+        while word in self.linked_words and not word in word_chain:
+            word_chain.append(word)
+            word = self.linked_words[word]
+        return self.stemmer.stemWord(word.lower())
 
 parser = optparse.OptionParser()
 
@@ -29,6 +62,12 @@
     dest="language")
 
 parser.add_option(
+    "-f", "--function",
+    help="filter through subsystem [INTERNAL]",
+    action="store",
+    dest="function")
+
+parser.add_option(
     "-m", "--merge-tag",
     help="merge words tagged with specified tag into the main vocabulary",
     action="store",
     dest="merge_tag")
@@ -100,31 +139,156 @@
     action="store_true",
     dest="three_words")
 
+def readlines_from_file(filename):
+    res = []
+    with codecs.open(filename, "r", "utf-8") as f:
+        for line in f.readlines():
+            res += [line]
+    return res
+
+def readlines_from_stdin():
+    return codecs.getreader("utf-8")(sys.stdin).readlines()
+
+def words_from_line(line):
+    line = line.rstrip('\n')
+    #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
+    #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
+    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
+
+def get_words(lines):
+    """
+    Returns hash of words in a file
+    word => number
+    """
+    result = {}
+    for line in lines:
+        words = words_from_line(line)
+        for word in words:
+            result.setdefault(word, 0)
+            result[word] += 1
+    return result
+
+def load_vocabulary():
+    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
+
+def notes_filenames():
+    return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
+
+def load_notes(files):
+    notes = {}
+    for filename in files:
+        with open(filename) as f:
+            for line in f.readlines():
+                (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
+                notes.setdefault(word, {})
+                notes[word][filename] = note
+    return notes
+
+def print_words_sorted(words_freq):
+    for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
+
+def substract_dictionary(dict1, dict2):
+    """
+    returns dict1 - dict2
+    """
+    result = {}
+    for (k,v) in dict1.items():
+        if not k in dict2:
+            result[k] = v
+    return result
+
+def dump_words(words, filename):
+    with codecs.open(filename, "w+", "utf-8") as f:
+        for word in words.keys():
+            f.write(("%s\n"%word)*words[word])
+
+def error_message(text):
+    print text
+
+def find_wordgroups_weights(lines, normalizator):
+    weight = {}
+    for line in lines:
+        line = re.sub('^\s*', '', line.rstrip('\n'))
+        (num, word) = re.split('\s+', line, maxsplit=1)
+        normalized = normalizator.normalize(word)
+        weight.setdefault(normalized, 0)
+        weight[normalized] += int(num)
+    return weight
+
+def find_linked_words(notes):
+    linked_words = {}
+    for word in notes.keys():
+        for note in notes[word].values():
+            if "@" in note:
+                logging.debug("%s %s" % (word, note))
+                result = re.search(r'\@(\S*)', note)
+                if result:
+                    main_word = result.group(1)
+                    logging.debug("%s %s" % (word, main_word))
+                    if main_word:
+                        linked_words[word] = main_word
+    return linked_words
+
+def compare_word_lines(line1, line2, wgw, normalizator, linked_words):
+    line1 = re.sub('^\s*', '', line1.rstrip('\n'))
+    (num1, word1) = re.split('\s+', line1, 1)
+    line2 = re.sub('^\s*', '', line2.rstrip('\n'))
+    (num2, word2) = re.split('\s+', line2, 1)
+
+    normalized_word1 = normalizator.normalize(word1)
+    normalized_word2 = normalizator.normalize(word2)
+
+    cmp_res = cmp(wgw[normalized_word1], wgw[normalized_word2])
+    if cmp_res != 0:
+        return cmp_res
+    else:
+        cmp_res = cmp(normalized_word1, normalized_word2)
+        if cmp_res != 0:
+            return cmp_res
+        else:
+            return cmp(int(num1), int(num2))
+
+def filter_get_words(args):
+    vocabulary = load_vocabulary()
+    words = get_words(readlines_from_stdin())
+    dump_words(words, args[0])
+    words = substract_dictionary(words, vocabulary)
+    print_words_sorted(words)
+
+def filter_group_words(args):
+    lines = readlines_from_stdin()
+    notes = load_notes(notes_filenames())
+    linked_words = find_linked_words(notes)
+    logging.debug(linked_words)
+    normalizator = Normalizator(config['language'], linked_words)
+
+    wgw = find_wordgroups_weights(lines, normalizator)
+    for line in sorted(
+        lines,
+        cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words),
+        reverse=True):
+        codecs.getwriter("utf-8")(sys.stdout).write(line)
+
 (options, args) = parser.parse_args()
+if options.language:
+    config['language'] = options.language
 
-def get_words():
-    pass
+if options.function:
+    function_names = {
+        'get_words' : filter_get_words,
+        'group_words' : filter_group_words,
+    }
+    if options.function in function_names:
+        function_names[options.function](args)
+    else:
+        error_message("Unknown function %s.\nAvailable functions:\n%s" % (
+            options.function, "".join([" "+x for x in sorted(function_names.keys())])))
+        sys.exit(1)
 
-def add_stat():
-    pass
-def two_and_three_words():
-    pass
-def grep_v_english():
-    pass
-def group_words():
-    pass
+#os.system("vim")
-def add_marks():
-    pass
-
-def remove_marks():
-    pass
-
-def text_from_url():
-    pass
-
-def part():
-    pass
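filter_group_words reproduces the ordering of the old Perl compare(): lines are sorted by the total weight of their word group, then by the normalized form, then by the individual count, all descending. A standalone illustration of that ordering, using sorted() with a key function instead of the cmp callback above:

    lines = [(3, 'following', 'follow'),
             (2, 'follows',   'follow'),
             (4, 'linux',     'linux')]   # (count, word, normalized form)

    weight = {}
    for num, word, norm in lines:
        weight[norm] = weight.get(norm, 0) + num

    # 'follow' weighs 3+2=5 and 'linux' only 4, so the whole 'follow'
    # group sorts first; within it 'following' (3) precedes 'follows' (2).
    ordered = sorted(lines, key=lambda t: (weight[t[2]], t[2], t[0]),
                     reverse=True)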