new-words
changeset 33:720a701b2ba9
-S and -N keys
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Dec 12 10:04:06 2010 +0100 (2010-12-12) |
parents | 753fb84437aa |
children | 3827cce83602 |
files | new-words.sh |
line diff
```
1.1 --- a/new-words.sh Fri Nov 05 20:07:46 2010 +0100
1.2 +++ b/new-words.sh Sun Dec 12 10:04:06 2010 +0100
1.3 @@ -14,9 +14,11 @@
1.4 -k put higher words that are similar to the known words (only for English)
1.5 -l lang override language settings
1.6 -n non-interactive mode (don't run vi)
1.7 + -N turn off known words filtering
1.8 -a don't add marks (and don't save marks added by user)
1.9 -p pages work with specified pages only (pages = start-stop/total )
1.10 -s show the text statistics (percentage of known words and so on) and exit
1.11 + -S show your vocabulary statistics (number of words and word groups)
1.12 -t tag tag known words with tag
1.13 -T show list of active tags
1.14 -m tag merge the words tagged with "tag" into the main vocabulary
1.15 @@ -75,14 +77,18 @@
1.16 MERGE_TAGGED_WORDS=NO
1.17 MERGE_ALL_TAGGED=NO
1.18 DONT_ADD_MARKLINES=NO
1.19 -while getopts l:skanp:t:Tm:Mr:23 opt
1.20 +FILTER_WORDS=YES
1.21 +SHOW_VOC_STAT=NO
1.22 +while getopts l:sSkanNp:t:Tm:Mr:23 opt
1.23 do
1.24 case "$opt" in
1.25 s) STAT_ONLY=YES;;
1.26 + S) SHOW_VOC_STAT=YES;;
1.27 k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
1.28 l) LANGUAGE="$OPTARG";;
1.29 a) DONT_ADD_MARKS=YES;;
1.30 n) NON_INTERACTIVE_MODE=YES;;
1.31 + N) FILTER_WORDS=NO;;
1.32 p) PART_TO_PROCESS="$OPTARG";;
1.33 t) TAG_NAME="$OPTARG";;
1.34 T) TAGS_LIST_ONLY="YES";;
1.35 @@ -107,10 +113,17 @@
1.36 VOCABULARY=${LANGUAGE}.txt
1.37 NOTES_FILE=notes-${LANGUAGE}.txt
1.38
1.39 +if [ "${SHOW_VOC_STAT}" = "YES" ]
1.40 +then
1.41 + $0 -l "${LANGUAGE}" -N -n ${WORK_DIR}/${VOCABULARY} | head -1 | awk '{print $5}' | tr -d "<>"
1.42 + exit 0
1.43 +fi
1.44 +
1.45 #----------------------------------------------------
1.46
1.47 get_words()
1.48 {
1.49 + export FILTER_WORDS
1.50 tr ' ' '\n' | sed 's/--/ /g' \
1.51 | sed "s/'/__APOSTROPHE__/g" \
1.52 | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
1.53 @@ -146,7 +159,9 @@
1.54 rm $after
1.55 return 0
1.56 else
1.57 - echo "# $LANGUAGE, $percentage, <$total_known/$total>"
1.58 + groups="$(grep '# groups' $after | awk '{print $3}')"
1.59 + words="$(grep -v '^#' $after | wc -l)"
1.60 + echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
1.61 fi
1.62
1.63 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
1.64 @@ -160,6 +175,7 @@
1.65 } else { $mark_line +=5; };
1.66 while(<>)
1.67 {
1.68 + next if /^#\s*groups\s*/;
1.69 print;
1.70 /^\s*([0-9]*)\s*/;
1.71 $s+=$1;
1.72 @@ -217,6 +233,10 @@
1.73 {
1.74 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
1.75 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
1.76 + if ($ENV{FILTER_WORDS} eq "NO") {
1.77 + while(<>) { print; }
1.78 + exit(0);
1.79 + }
1.80 $voc_files=$ENV{VOC_FILES};
1.81 $voc_files=~s@^ @@;
1.82 for $voc_file (split /\s+/,$voc_files) {
1.83 @@ -478,6 +498,7 @@
1.84 }
1.85 @lines2 = sort { compare($b,$a) } @lines;
1.86 binmode STDOUT, ":utf8";
1.87 +print "# groups ".scalar(keys(%group_weight))."\n";
1.88 for $l (@lines2) {
1.89 print "$l\n";
1.90 }
```