# HG changeset patch # User Igor Chubin # Date 1274392941 -10800 # Node ID 4a10c0f4510cb4c9387190b0f42abfbcdd197a26 # Parent d1eb7dc37feb7fb9b4b4e2a42176b9a71ffa6e7e apostrophe support, some improvements in speed, two and three words combination support diff -r d1eb7dc37feb -r 4a10c0f4510c new-words.sh --- a/new-words.sh Mon May 17 14:48:34 2010 +0300 +++ b/new-words.sh Fri May 21 01:02:21 2010 +0300 @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash show_usage() { @@ -17,6 +17,7 @@ -m don't add marks (and don't save marks added by user) -p pages work with specified pages only (pages = start-stop/total ) -s show the text statistics (percentage of known words and so on) and exit + -2 -3 find 2 and 3 words' sequences The language of the text can be specified also by name of the program new-words (correspondent link must be created before). @@ -61,7 +62,9 @@ DONT_ADD_MARKS=NO NON_INTERACTIVE_MODE=NO PART_TO_PROCESS='' -while getopts l:skmnp: opt +GROUP_WORDS_BY_THREE=NO +GROUP_WORDS_BY_TWO=NO +while getopts l:skmnp:23 opt do case "$opt" in s) STAT_ONLY=YES;; @@ -70,6 +73,8 @@ m) DONT_ADD_MARKS=YES;; n) NON_INTERACTIVE_MODE=YES;; p) PART_TO_PROCESS="$OPTARG";; + 2) GROUP_WORDS_BY_TWO=YES;; + 3) GROUP_WORDS_BY_THREE=YES;; \?) # unknown flag show_usage exit 1;; @@ -92,10 +97,9 @@ { tr ' ' '\n' | sed 's/--/ /g' \ | sed "s/'/__APOSTROPHE__/g" \ -| tr '—·-' '-----' \ -| tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' ' ' \ +| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/'\ +| sed "s/__APOSTROPHE__/'/g" \ | tr ' ' '\n' \ -| grep -x '[[:alpha:]'"'"'-]*' \ | tee "$1" \ | grep_v_english_perl \ | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn @@ -149,6 +153,39 @@ rm $after } +two_and_three_words() +{ + if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ] + then + cat + else + cat + + export GROUP_WORDS_BY_THREE + export GROUP_WORDS_BY_TWO + PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX` + cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME +#!/usr/bin/perl +local $/; +$words=<>; +$words=~ s@[!?;,:#1-9".]@ @g; +$words =~ s@\s+@ @g; +@words = split /\s+/, $words; +for ($i=0; $i<$#words-3;$i++) { + my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]); + if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) { + print "${a}_${b}_${c}\n"; + }; + if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) { + print "${a}_${b}\n"; + }; +} +PERL_SCRIPT + perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT" + rm $PERL_SCRIPT_TEMP_NAME + fi +} + grep_v_english() { [ -e "$VOCABULARY" ] || touch "$VOCABULARY" @@ -488,6 +525,7 @@ fi \ | part $PART_TO_PROCESS \ | tee $ORIGINAL_TEXT \ + | two_and_three_words \ | get_words ${TEMP1}-full \ | group_words \ | add_stat ${TEMP1}-full \