# HG changeset patch
# User Igor Chubin
# Date 1292144646 -3600
# Node ID 720a701b2ba9d296b3674bc046a084c2cf2195db
# Parent 753fb84437aa0b09fee9831149d0148161982b50
# Review notes (added during review; not part of the original changeset):
# * -N sets FILTER_WORDS=NO. get_words() exports the variable, and the Perl
#   helper that reads VOC_FILES then copies its input to its output unchanged
#   and exits before loading any vocabulary file.
# * -S sets SHOW_VOC_STAT=YES. The script re-runs itself ($0) with
#   -l <language> -N -n on ${WORK_DIR}/${VOCABULARY}, prints the fifth
#   whitespace-separated field (awk $5) of the first output line with "<" and
#   ">" removed, and exits 0. Presumably that field is the <groups/words>
#   item described below; this holds only if the language and percentage
#   values contain no spaces -- confirm.
# * The statistics line gains a trailing <$groups/$words> item: groups is
#   field 3 of the line containing "# groups" in $after, words is the number
#   of lines in $after that do not start with "#".
# * The Perl code that sorts @lines now prints
#   "# groups <number of keys in %group_weight>" before the sorted lines, and
#   the Perl while(<>) loop that follows the $mark_line setup skips lines
#   matching /^#\s*groups\s*/.
# NOTE(review): this copy had lost its line breaks and indentation. Line
# breaks were restored so that every hunk matches the line counts in its "@@"
# header. Indentation, the position of blank context lines, and the line
# split before the added "next if" line in the -160,6 hunk are
# reconstructed and may differ from the target file -- verify against
# new-words.sh before applying.
-S and -N keys

diff -r 753fb84437aa -r 720a701b2ba9 new-words.sh
--- a/new-words.sh Fri Nov 05 20:07:46 2010 +0100
+++ b/new-words.sh Sun Dec 12 10:04:06 2010 +0100
@@ -14,9 +14,11 @@
     -k put higher words that are similar to the known words (only for English)
     -l lang override language settings
     -n non-interactive mode (don't run vi)
+    -N turn off known words filtering
     -a don't add marks (and don't save marks added by user)
     -p pages work with specified pages only (pages = start-stop/total )
     -s show the text statistics (percentage of known words and so on) and exit
+    -S show your vocabulary statistics (number of words and word groups)
     -t tag tag known words with tag
     -T show list of active tags
     -m tag merge the words tagged with "tag" into the main vocabulary
@@ -75,14 +77,18 @@
 MERGE_TAGGED_WORDS=NO
 MERGE_ALL_TAGGED=NO
 DONT_ADD_MARKLINES=NO
-while getopts l:skanp:t:Tm:Mr:23 opt
+FILTER_WORDS=YES
+SHOW_VOC_STAT=NO
+while getopts l:sSkanNp:t:Tm:Mr:23 opt
 do
     case "$opt" in
         s) STAT_ONLY=YES;;
+        S) SHOW_VOC_STAT=YES;;
         k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
         l) LANGUAGE="$OPTARG";;
         a) DONT_ADD_MARKS=YES;;
         n) NON_INTERACTIVE_MODE=YES;;
+        N) FILTER_WORDS=NO;;
         p) PART_TO_PROCESS="$OPTARG";;
         t) TAG_NAME="$OPTARG";;
         T) TAGS_LIST_ONLY="YES";;
@@ -107,10 +113,17 @@
 VOCABULARY=${LANGUAGE}.txt
 NOTES_FILE=notes-${LANGUAGE}.txt
 
+if [ "${SHOW_VOC_STAT}" = "YES" ]
+then
+    $0 -l "${LANGUAGE}" -N -n ${WORK_DIR}/${VOCABULARY} | head -1 | awk '{print $5}' | tr -d "<>"
+    exit 0
+fi
+
 #----------------------------------------------------
 
 get_words()
 {
+    export FILTER_WORDS
     tr ' ' '\n' | sed 's/--/ /g' \
     | sed "s/'/__APOSTROPHE__/g" \
     | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
@@ -146,7 +159,9 @@
         rm $after
         return 0
     else
-        echo "# $LANGUAGE, $percentage, <$total_known/$total>"
+        groups="$(grep '# groups' $after | awk '{print $3}')"
+        words="$(grep -v '^#' $after | wc -l)"
+        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
     fi
 
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
@@ -160,6 +175,7 @@
 } else { $mark_line +=5; };
 while(<>)
 {
+    next if /^#\s*groups\s*/;
     print;
     /^\s*([0-9]*)\s*/;
     $s+=$1;
@@ -217,6 +233,10 @@
 {
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+    if ($ENV{FILTER_WORDS} eq "NO") {
+        while(<>) { print; }
+        exit(0);
+    }
 $voc_files=$ENV{VOC_FILES};
 $voc_files=~s@^ @@;
 for $voc_file (split /\s+/,$voc_files) {
@@ -478,6 +498,7 @@
 }
 @lines2 = sort { compare($b,$a) } @lines;
 binmode STDOUT, ":utf8";
+print "# groups ".scalar(keys(%group_weight))."\n";
 for $l (@lines2) {
     print "$l\n";
 }