new-words
changeset 26:4a10c0f4510c
apostrophe support, some improvements in speed, two and three words combination support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri May 21 01:02:21 2010 +0300 (2010-05-21) |
parents | d1eb7dc37feb |
children | 0a80b2fa3ed8 |
files | new-words.sh |
line diff
```diff
--- a/new-words.sh Mon May 17 14:48:34 2010 +0300
+++ b/new-words.sh Fri May 21 01:02:21 2010 +0300
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 
 show_usage()
 {
@@ -17,6 +17,7 @@
 -m don't add marks (and don't save marks added by user)
 -p pages work with specified pages only (pages = start-stop/total )
 -s show the text statistics (percentage of known words and so on) and exit
+ -2 -3 find 2 and 3 words' sequences
 
 The language of the text can be specified also
 by name of the program new-words (correspondent link must be created before).
@@ -61,7 +62,9 @@
 DONT_ADD_MARKS=NO
 NON_INTERACTIVE_MODE=NO
 PART_TO_PROCESS=''
-while getopts l:skmnp: opt
+GROUP_WORDS_BY_THREE=NO
+GROUP_WORDS_BY_TWO=NO
+while getopts l:skmnp:23 opt
 do
 case "$opt" in
 s) STAT_ONLY=YES;;
@@ -70,6 +73,8 @@
 m) DONT_ADD_MARKS=YES;;
 n) NON_INTERACTIVE_MODE=YES;;
 p) PART_TO_PROCESS="$OPTARG";;
+ 2) GROUP_WORDS_BY_TWO=YES;;
+ 3) GROUP_WORDS_BY_THREE=YES;;
 \?) # unknown flag
 show_usage
 exit 1;;
@@ -92,10 +97,9 @@
 {
 tr ' ' '\n' | sed 's/--/ /g' \
 | sed "s/'/__APOSTROPHE__/g" \
-| tr '—·-' '-----' \
-| tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' ' ' \
+| perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/'\
+| sed "s/__APOSTROPHE__/'/g" \
 | tr ' ' '\n' \
-| grep -x '[[:alpha:]'"'"'-]*' \
 | tee "$1" \
 | grep_v_english_perl \
 | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
@@ -149,6 +153,39 @@
 rm $after
 }
 
+two_and_three_words()
+{
+ if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
+ then
+ cat
+ else
+ cat
+
+ export GROUP_WORDS_BY_THREE
+ export GROUP_WORDS_BY_TWO
+ PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
+ cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
+#!/usr/bin/perl
+local $/;
+$words=<>;
+$words=~ s@[!?;,:#1-9".]@ @g;
+$words =~ s@\s+@ @g;
+@words = split /\s+/, $words;
+for ($i=0; $i<$#words-3;$i++) {
+ my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
+ if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
+ print "${a}_${b}_${c}\n";
+ };
+ if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
+ print "${a}_${b}\n";
+ };
+}
+PERL_SCRIPT
+ perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
+ rm $PERL_SCRIPT_TEMP_NAME
+ fi
+}
+
 grep_v_english()
 {
 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
@@ -488,6 +525,7 @@
 fi \
 | part $PART_TO_PROCESS \
 | tee $ORIGINAL_TEXT \
+ | two_and_three_words \
 | get_words ${TEMP1}-full \
 | group_words \
 | add_stat ${TEMP1}-full \
```