# HG changeset patch
# User Igor Chubin <igor@chub.in>
# Date 1274392941 -10800
# Node ID 4a10c0f4510cb4c9387190b0f42abfbcdd197a26
# Parent  d1eb7dc37feb7fb9b4b4e2a42176b9a71ffa6e7e
Add apostrophe support, some speed improvements, and support for two- and three-word combinations

diff -r d1eb7dc37feb -r 4a10c0f4510c new-words.sh
--- a/new-words.sh	Mon May 17 14:48:34 2010 +0300
+++ b/new-words.sh	Fri May 21 01:02:21 2010 +0300
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 
 show_usage()
 {
@@ -17,6 +17,7 @@
     -m          don't add marks (and don't save marks added by user)
     -p pages    work with specified pages only (pages = start-stop/total )
     -s          show the text statistics (percentage of known words and so on) and exit
+    -2 -3       find 2 and 3 words' sequences
 
 The language of the text can be specified also
 by name of the program new-words (correspondent link must be created before).
@@ -61,7 +62,9 @@
 DONT_ADD_MARKS=NO
 NON_INTERACTIVE_MODE=NO
 PART_TO_PROCESS=''
-while getopts l:skmnp: opt
+GROUP_WORDS_BY_THREE=NO
+GROUP_WORDS_BY_TWO=NO
+while getopts l:skmnp:23 opt
 do
     case "$opt" in
       s)  STAT_ONLY=YES;;
@@ -70,6 +73,8 @@
       m)  DONT_ADD_MARKS=YES;;
       n)  NON_INTERACTIVE_MODE=YES;;
       p)  PART_TO_PROCESS="$OPTARG";;
+      2)  GROUP_WORDS_BY_TWO=YES;;
+      3)  GROUP_WORDS_BY_THREE=YES;;
       \?)       # unknown flag
           show_usage
           exit 1;;
@@ -92,10 +97,9 @@
 {
 tr ' ' '\n' | sed 's/--/ /g' \
 | sed "s/'/__APOSTROPHE__/g" \
-| tr  '—·-' '-----' \
-| tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' '                           ' \
+| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/                        /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/'\
+| sed "s/__APOSTROPHE__/'/g" \
 | tr ' ' '\n' \
-| grep -x '[[:alpha:]'"'"'-]*' \
 | tee "$1" \
 | grep_v_english_perl \
 | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
@@ -149,6 +153,39 @@
     rm $after
 }
 
+# Copy stdin to stdout unchanged; when -2 and/or -3 was given, also append
+# "a_b" / "a_b_c" combinations of adjacent words read from "$ORIGINAL_TEXT"
+# (presumably the file written by the preceding tee stage of the pipeline).
+two_and_three_words()
+{
+    cat
+    [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ] && return 0
+    export GROUP_WORDS_BY_THREE GROUP_WORDS_BY_TWO
+    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX` || return 1
+    cat <<'PERL_SCRIPT' > "$PERL_SCRIPT_TEMP_NAME"
+#!/usr/bin/perl
+# Slurp the whole text, blank out punctuation and digits, then emit each
+# run of adjacent words joined with "_" (one combination per line).
+local $/;
+$words=<>;
+$words=~ s@[!?;,:#0-9".]@ @g;
+@words = split ' ', $words;   # ' ' mode also drops the empty leading field
+# Visit every index that starts a pair; the last such index has no third
+# word, so $c is undef there and only the pair is printed.
+for ($i=0; $i<$#words; $i++) {
+    my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
+    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
+        print "${a}_${b}_${c}\n";
+    };
+    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
+        print "${a}_${b}\n";
+    };
+}
+PERL_SCRIPT
+    perl "$PERL_SCRIPT_TEMP_NAME" "$ORIGINAL_TEXT"
+    rm -f "$PERL_SCRIPT_TEMP_NAME"
+}
+
 grep_v_english()
 {
 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
@@ -488,6 +525,7 @@
 fi \
    | part $PART_TO_PROCESS \
    | tee $ORIGINAL_TEXT \
+   | two_and_three_words \
    | get_words ${TEMP1}-full \
    | group_words \
    | add_stat ${TEMP1}-full \