# HG changeset patch # User Igor Chubin # Date 1270397010 -10800 # Node ID c65ffd60cc182d25a5ed8337fcd21cc7f9021f67 # Parent c6efd17741aaf1761e6a0eb00a28ea4893838624 ключ -s: Информация об известных словах в тексте Может использоваться в пакетном режиме, например, для того чтобы из множества файлов выбрать тот, в котором процент известных слов наибольший diff -r c6efd17741aa -r c65ffd60cc18 new-words.sh --- a/new-words.sh Sun Apr 04 14:12:35 2010 +0300 +++ b/new-words.sh Sun Apr 04 19:03:30 2010 +0300 @@ -1,6 +1,15 @@ #!/bin/sh cat < /dev/null +USAGE: + + new-words [ -l lang ] [ -s ] [ ARG ] + +SWITCHES: + + -s show text statistics and exit + -l lang override language settings + Поддержка нескольких языков: new-words -l lang URL @@ -37,6 +46,13 @@ VOCABULARY=${LANGUAGE}.txt NOTES_FILE=notes-${LANGUAGE}.txt +STAT_ONLY=NO +if [ "$1" = "-s" ] +then + STAT_ONLY=YES + shift +fi + get_words() { tr ' ' '\n' | sed 's/--/ /g' \ @@ -59,13 +75,22 @@ total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`" total_known="`echo $total-$total_unknown|bc`" percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`" - echo "# $LANGUAGE, $percentage, <$total_known/$total>" + if [ "$STAT_ONLY" = "YES" ] + then + echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL" + echo "$LANGUAGE $percentage `echo \(100-$percentage\) | bc -l` $total_known $total" + return 0 + else + echo "# $LANGUAGE, $percentage, <$total_known/$total>" + fi + PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME my $total=shift(@ARGV); my $total_known=shift(@ARGV); my $s=0; -my $mark_line=int($total_known*100/$total/5)*5+5; +my $mark_line=int($total_known*100/$total/5)*5; +if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; }; while(<>) { print; @@ -154,7 +179,7 @@ s/ing$//; s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//; - s/ation$//; s/ness$//; s/ally$//; + s/ation$//; s/ness$//; s/ship$//; s/ally$//; s/ment$//; s/ify$//; s/ity$//; s/fy$//; s/ly$//; s/ise$//; s/ize$//; @@ -327,15 +352,20 @@ | add_stat ${TEMP1}-full \ | tee "$TEMP1" > "$TEMP2" -add_marks "$TEMP2" -if [ "$editor" = vim ] +if [ "$STAT_ONLY" = "YES" ] then - vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty + cat "$TEMP1" else - echo 2 - $editor "$TEMP2" + add_marks "$TEMP2" + if [ "$editor" = vim ] + then + vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty + else + echo 2 + $editor "$TEMP2" + fi + remove_marks "$TEMP2" fi -remove_marks "$TEMP2" diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY" rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"