new-words

changeset 26:4a10c0f4510c

Apostrophe support, some speed improvements, and support for two- and three-word combinations
author Igor Chubin <igor@chub.in>
date Fri May 21 01:02:21 2010 +0300 (2010-05-21)
parents d1eb7dc37feb
children 0a80b2fa3ed8
files new-words.sh
line diff
     1.1 --- a/new-words.sh	Mon May 17 14:48:34 2010 +0300
     1.2 +++ b/new-words.sh	Fri May 21 01:02:21 2010 +0300
     1.3 @@ -1,4 +1,4 @@
     1.4 -#!/bin/sh
     1.5 +#!/bin/bash
     1.6  
     1.7  show_usage()
     1.8  {
     1.9 @@ -17,6 +17,7 @@
    1.10      -m          don't add marks (and don't save marks added by user)
    1.11      -p pages    work with specified pages only (pages = start-stop/total )
    1.12      -s          show the text statistics (percentage of known words and so on) and exit
    1.13 +    -2 -3       find 2 and 3 words' sequences
    1.14  
    1.15  The language of the text can be specified also
    1.16  by name of the program new-words (correspondent link must be created before).
    1.17 @@ -61,7 +62,9 @@
    1.18  DONT_ADD_MARKS=NO
    1.19  NON_INTERACTIVE_MODE=NO
    1.20  PART_TO_PROCESS=''
    1.21 -while getopts l:skmnp: opt
    1.22 +GROUP_WORDS_BY_THREE=NO
    1.23 +GROUP_WORDS_BY_TWO=NO
    1.24 +while getopts l:skmnp:23 opt
    1.25  do
    1.26      case "$opt" in
    1.27        s)  STAT_ONLY=YES;;
    1.28 @@ -70,6 +73,8 @@
    1.29        m)  DONT_ADD_MARKS=YES;;
    1.30        n)  NON_INTERACTIVE_MODE=YES;;
    1.31        p)  PART_TO_PROCESS="$OPTARG";;
    1.32 +      2)  GROUP_WORDS_BY_TWO=YES;;
    1.33 +      3)  GROUP_WORDS_BY_THREE=YES;;
    1.34        \?)       # unknown flag
    1.35            show_usage
    1.36            exit 1;;
    1.37 @@ -92,10 +97,9 @@
    1.38  {
    1.39  tr ' ' '\n' | sed 's/--/ /g' \
    1.40  | sed "s/'/__APOSTROPHE__/g" \
    1.41 -| tr  '—·-' '-----' \
    1.42 -| tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' '                           ' \
    1.43 +| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/                        /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/'\
    1.44 +| sed "s/__APOSTROPHE__/'/g" \
    1.45  | tr ' ' '\n' \
    1.46 -| grep -x '[[:alpha:]'"'"'-]*' \
    1.47  | tee "$1" \
    1.48  | grep_v_english_perl \
    1.49  | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
    1.50 @@ -149,6 +153,39 @@
    1.51      rm $after
    1.52  }
    1.53  
    1.54 +# Pass stdin through unchanged; when -2 and/or -3 was given, also append
    1.55 +# "a_b" / "a_b_c" groups of adjacent words taken from the file $ORIGINAL_TEXT.
    1.56 +two_and_three_words()
    1.57 +{
    1.58 +    cat
    1.59 +    if [ "$GROUP_WORDS_BY_THREE" = NO ] && [ "$GROUP_WORDS_BY_TWO" = NO ]
    1.60 +    then
    1.61 +        return
    1.62 +    fi
    1.63 +
    1.64 +    # The Perl script below reads the two flags from its environment.
    1.65 +    export GROUP_WORDS_BY_THREE GROUP_WORDS_BY_TWO
    1.66 +    # "perl -" takes the program from the here-document, so no temporary
    1.67 +    # script file has to be created in /tmp (or leaked when perl fails).
    1.68 +    perl - "$ORIGINAL_TEXT" <<'PERL_SCRIPT'
    1.69 +local $/;                          # slurp the whole text in one read
    1.70 +my $text = <>;
    1.71 +$text = "" unless defined $text;   # unreadable/empty input: emit nothing
    1.72 +$text =~ s@[!?;,:#1-9".]@ @g;      # NOTE(review): "0" is not in 1-9 and survives - confirm intent
    1.73 +my @words = split " ", $text;      # awk-style split: no empty leading field
    1.74 +# Run up to the last pair; the old bound ($#words-3) dropped the final groups.
    1.75 +for my $i (0 .. $#words - 1) {
    1.76 +    my ($first, $second, $third) = ($words[$i], $words[$i+1], $words[$i+2]);
    1.77 +    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($first && $second && $third)) {
    1.78 +        print "${first}_${second}_${third}\n";
    1.79 +    }
    1.80 +    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($first && $second)) {
    1.81 +        print "${first}_${second}\n";
    1.82 +    }
    1.83 +}
    1.84 +PERL_SCRIPT
    1.85 +}
    1.86 +
    1.87  grep_v_english()
    1.88  {
    1.89  [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    1.90 @@ -488,6 +525,7 @@
    1.91  fi \
    1.92     | part $PART_TO_PROCESS \
    1.93     | tee $ORIGINAL_TEXT \
    1.94 +   | two_and_three_words \
    1.95     | get_words ${TEMP1}-full \
    1.96     | group_words \
    1.97     | add_stat ${TEMP1}-full \