new-words
changeset 16:c65ffd60cc18
ключ -s: Информация об известных словах в тексте
Может использоваться в пакетном режиме,
например, для того чтобы из множества файлов
выбрать тот, в котором процент известных слов наибольший.
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Apr 04 19:03:30 2010 +0300 (2010-04-04) |
parents | c6efd17741aa |
children | 35eeaf2620ce |
files | new-words.sh |
line diff
1.1 --- a/new-words.sh Sun Apr 04 14:12:35 2010 +0300
1.2 +++ b/new-words.sh Sun Apr 04 19:03:30 2010 +0300
1.3 @@ -1,6 +1,15 @@
1.4 #!/bin/sh
1.5 cat <<HELP > /dev/null
1.6
1.7 +USAGE:
1.8 +
1.9 + new-words [ -l lang ] [ -s ] [ ARG ]
1.10 +
1.11 +SWITCHES:
1.12 +
1.13 + -s show text statistics and exit
1.14 + -l lang override language settings
1.15 +
1.16 Поддержка нескольких языков:
1.17
1.18 new-words -l lang URL
1.19 @@ -37,6 +46,13 @@
1.20 VOCABULARY=${LANGUAGE}.txt
1.21 NOTES_FILE=notes-${LANGUAGE}.txt
1.22
1.23 +STAT_ONLY=NO
1.24 +if [ "$1" = "-s" ]
1.25 +then
1.26 + STAT_ONLY=YES
1.27 + shift
1.28 +fi
1.29 +
1.30 get_words()
1.31 {
1.32 tr ' ' '\n' | sed 's/--/ /g' \
1.33 @@ -59,13 +75,22 @@
1.34 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
1.35 total_known="`echo $total-$total_unknown|bc`"
1.36 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
1.37 - echo "# $LANGUAGE, $percentage, <$total_known/$total>"
1.38 + if [ "$STAT_ONLY" = "YES" ]
1.39 + then
1.40 + echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL"
1.41 + echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total"
1.42 + return 0
1.43 + else
1.44 + echo "# $LANGUAGE, $percentage, <$total_known/$total>"
1.45 + fi
1.46 +
1.47 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
1.48 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
1.49 my $total=shift(@ARGV);
1.50 my $total_known=shift(@ARGV);
1.51 my $s=0;
1.52 -my $mark_line=int($total_known*100/$total/5)*5+5;
1.53 +my $mark_line=int($total_known*100/$total/5)*5;
1.54 +if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
1.55 while(<>)
1.56 {
1.57 print;
1.58 @@ -154,7 +179,7 @@
1.59 s/ing$//;
1.60
1.61 s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
1.62 - s/ation$//; s/ness$//; s/ally$//;
1.63 + s/ation$//; s/ness$//; s/ship$//; s/ally$//;
1.64 s/ment$//; s/ify$//; s/ity$//; s/fy$//; s/ly$//;
1.65 s/ise$//; s/ize$//;
1.66
1.67 @@ -327,15 +352,20 @@
1.68 | add_stat ${TEMP1}-full \
1.69 | tee "$TEMP1" > "$TEMP2"
1.70
1.71 -add_marks "$TEMP2"
1.72 -if [ "$editor" = vim ]
1.73 +if [ "$STAT_ONLY" = "YES" ]
1.74 then
1.75 - vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty
1.76 + cat "$TEMP1"
1.77 else
1.78 - echo 2
1.79 - $editor "$TEMP2"
1.80 + add_marks "$TEMP2"
1.81 + if [ "$editor" = vim ]
1.82 + then
1.83 + vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty
1.84 + else
1.85 + echo 2
1.86 + $editor "$TEMP2"
1.87 + fi
1.88 + remove_marks "$TEMP2"
1.89 fi
1.90 -remove_marks "$TEMP2"
1.91
1.92 diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY"
1.93 rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"