# HG changeset patch # User igor@book.xt.vpn # Date 1267561707 -7200 # Node ID c703b8898696cc1feb5411ac69881b0505b04a98 # Parent 68722cd6faff470c41bb7e8853ae6f4c6d2fd3a8 Комментарий-министатистика, дефис в словах, автоматический выбор языка для википедии. * Комментарий с процентом известных слов, * Автоматическое определение языка по названию страницы в википедии (только двухбуквенный), * Слова с дефисом. diff -r 68722cd6faff -r c703b8898696 new-words.sh --- a/new-words.sh Fri Feb 26 21:41:27 2010 +0200 +++ b/new-words.sh Tue Mar 02 22:28:27 2010 +0200 @@ -1,5 +1,4 @@ #!/bin/sh - cat < /dev/null Поддержка нескольких языков: @@ -23,6 +22,10 @@ LANGUAGE=en my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `" +if echo "$1" | grep -q http://...wikipedia.org/wiki/ +then +LANGUAGE="`echo $1 | sed s@http://@@ | sed s@.wikipedia.*@@`" +fi [ "${my_name}" = "new" ] || LANGUAGE="$my_name" if [ "$1" = "-l" ] then @@ -36,12 +39,30 @@ get_words() { tr ' ' '\n' | sed 's/--/ /g' \ -| tr -d '*\r,.-:#@()+=—<>$;"?!|·[]^%&'"'" \ -| tr ' ' '\n' | grep_v_english_perl \ -| grep -x '[[:alnum:]]*' \ +| sed "s/'/__APOSTROPHE__/g" \ +| tr '—·-' '-----' \ +| tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' ' ' \ +| tr ' ' '\n' \ +| grep -x '[[:alpha:]'"'"'-]*' \ +| tee "$1" \ +| grep_v_english_perl \ | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn } +add_stat() +{ + before="$1" + after=${before}2 + cat > "$after" + total="`wc -w $1 | awk '{print $1}'`" + total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`" + total_known="`echo $total-$total_unknown|bc`" + percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`" + echo "# $LANGUAGE, $percentage, <$total_known/$total>" + cat "$after" + rm $after +} + grep_v_english() { [ -e "$VOCABULARY" ] || touch "$VOCABULARY" @@ -56,7 +77,7 @@ or die "Can't open VOCABULARY"; while (){ chomp; - s/'//g; + #s/'//g; $voc{$_}="1"; } while(<>) { @@ -97,7 +118,7 @@ for (@lines) { m/\s+\S+\s+(\S+)/; $name=$1; - if (defined($dict{$name})) { + if (not /^#/ and defined($dict{$name})) { chomp; $mark=$dict{$name}; $space=" "x(30-length($_)); @@ -130,7 +151,7 @@ if (open(F, ">$file")) { for (@lines) { chomp; - if (m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) { + if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) { my $name=$4; my $comment=$6; $dict{$name}=$comment; @@ -174,12 +195,12 @@ cd $WORK_DIR if echo "$1" | grep -q http: then - text_from_url "$1" | get_words | tee "$TEMP1" > "$TEMP2" + text_from_url "$1" | get_words ${TEMP1}-full | add_stat ${TEMP1}-full| tee "$TEMP1" > "$TEMP2" elif [ "$#" != 0 ] then - cat "$1" | get_words | tee "$TEMP1" > "$TEMP2" + cat "$1" | get_words ${TEMP1}-full | add_stat ${TEMP1}-full | tee "$TEMP1" > "$TEMP2" else - get_words | tee "$TEMP1" > "$TEMP2" + get_words ${TEMP1}-full| add_stat ${TEMP1}-full | tee "$TEMP1" > "$TEMP2" fi add_marks "$TEMP2" @@ -193,4 +214,4 @@ remove_marks "$TEMP2" diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY" -rm -f "$TEMP1" "$TEMP2" +rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full"