new-words

diff new-words.sh @ 18:7e3a52db54ad

В статистику добавлены среднее количество слов в предложении (WPS) и среднее количество неизвестных слов в предложении, умноженное на 10 (UWPS*10).

Пример вывода статистики:

LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10
en 89.8 10.2 167021 185840 21 21
author Igor Chubin <igor@chub.in>
date Mon Apr 05 21:34:55 2010 +0300 (2010-04-05)
parents 35eeaf2620ce
children 416394a87d9f
line diff
     1.1 --- a/new-words.sh	Sun Apr 04 20:11:22 2010 +0300
     1.2 +++ b/new-words.sh	Mon Apr 05 21:34:55 2010 +0300
     1.3 @@ -75,10 +75,14 @@
     1.4      total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
     1.5      total_known="`echo $total-$total_unknown|bc`"
     1.6      percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
     1.7 +    #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
     1.8 +    sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
     1.9 +
    1.10 +
    1.11      if [ "$STAT_ONLY" = "YES" ]
    1.12      then
    1.13 -        echo "LANG  KNOWN%  UNKNOWN%  KNOWN    TOTAL"
    1.14 -        echo "$LANGUAGE    $percentage    `echo \(100-$percentage\) | bc -l`      $total_known    $total"
    1.15 +        echo "LANG  KNOWN%  UNKNOWN%  KNOWN     TOTAL     WPS  UWPS*10"
    1.16 +        echo "$LANGUAGE    $percentage    `echo \(100-$percentage\) | bc -l`      $total_known    $total    `echo $total/$sentences|bc`   `echo 10*$total_unknown/$sentences|bc` "
    1.17          rm $after
    1.18          return 0
    1.19      else