# HG changeset patch
# User Igor Chubin <igor@chub.in>
# Date 1292144646 -3600
# Node ID 720a701b2ba9d296b3674bc046a084c2cf2195db
# Parent  753fb84437aa0b09fee9831149d0148161982b50
Add -S (show vocabulary statistics) and -N (turn off known-words filtering) options

diff -r 753fb84437aa -r 720a701b2ba9 new-words.sh
--- a/new-words.sh	Fri Nov 05 20:07:46 2010 +0100
+++ b/new-words.sh	Sun Dec 12 10:04:06 2010 +0100
@@ -14,9 +14,11 @@
     -k          put higher words that are similar to the known words (only for English)
     -l lang     override language settings
     -n          non-interactive mode (don't run vi)
+    -N          turn off known words filtering
     -a          don't add marks (and don't save marks added by user)
     -p pages    work with specified pages only (pages = start-stop/total )
     -s          show the text statistics (percentage of known words and so on) and exit
+    -S          show your vocabulary statistics (number of words and word groups)
     -t tag      tag known words with tag
     -T          show list of active tags
     -m tag      merge the words tagged with "tag" into the main vocabulary
@@ -75,14 +77,18 @@
 MERGE_TAGGED_WORDS=NO
 MERGE_ALL_TAGGED=NO
 DONT_ADD_MARKLINES=NO
-while getopts l:skanp:t:Tm:Mr:23 opt
+FILTER_WORDS=YES
+SHOW_VOC_STAT=NO
+while getopts l:sSkanNp:t:Tm:Mr:23 opt
 do
     case "$opt" in
       s)  STAT_ONLY=YES;;
+      S)  SHOW_VOC_STAT=YES;;
       k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
       l)  LANGUAGE="$OPTARG";;
       a)  DONT_ADD_MARKS=YES;;
       n)  NON_INTERACTIVE_MODE=YES;;
+      N)  FILTER_WORDS=NO;;
       p)  PART_TO_PROCESS="$OPTARG";;
       t)  TAG_NAME="$OPTARG";;
       T)  TAGS_LIST_ONLY="YES";;
@@ -107,10 +113,17 @@
 VOCABULARY=${LANGUAGE}.txt
 NOTES_FILE=notes-${LANGUAGE}.txt
 
+if [ "${SHOW_VOC_STAT}" = "YES" ]  # -S: print vocabulary statistics and exit
+then  # re-run this script on the vocabulary file with filtering off (-N); field 5 of the first output line is expected to be "<groups/words>"
+  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -n 1 | awk '{print $5}' | tr -d "<>"
+  exit 0
+fi
+
 #----------------------------------------------------
 
 get_words()
 {
+    export FILTER_WORDS
 tr ' ' '\n' | sed 's/--/ /g' \
 | sed "s/'/__APOSTROPHE__/g" \
 | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
@@ -146,7 +159,9 @@
         rm $after
         return 0
     else 
-        echo "# $LANGUAGE, $percentage, <$total_known/$total>"
+        groups="$(grep '# groups' $after | awk '{print $3}')"
+        words="$(grep -v '^#' $after | wc -l)"
+        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
     fi
 
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
@@ -160,6 +175,7 @@
 } else { $mark_line +=5; };
 while(<>)
 {
+    next if /^#\s*groups\s*/;
     print;
     /^\s*([0-9]*)\s*/;
     $s+=$1;
@@ -217,6 +233,10 @@
 {
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+    if ($ENV{FILTER_WORDS} eq "NO") {
+        while(<>) { print; }
+        exit(0);
+    }
 $voc_files=$ENV{VOC_FILES};
 $voc_files=~s@^ @@;
 for $voc_file (split /\s+/,$voc_files) {
@@ -478,6 +498,7 @@
 }
 @lines2 = sort { compare($b,$a) } @lines;
 binmode STDOUT, ":utf8";
+print "# groups ".scalar(keys(%group_weight))."\n";
 for $l (@lines2) {
     print "$l\n";
 }