new-words
changeset 3:c703b8898696
Комментарий-министатистика, дефис в словах, автоматический выбор языка для википедии.
* Комментарий с процентом известных слов,
* Автоматическое определение языка по названию страницы в википедии (только двухбуквенный),
* Слова с дефисом.
author | igor@book.xt.vpn |
---|---|
date | Tue Mar 02 22:28:27 2010 +0200 (2010-03-02) |
parents | 68722cd6faff |
children | 0d44e794175b |
files | new-words.sh |
line diff
1.1 --- a/new-words.sh Fri Feb 26 21:41:27 2010 +0200
1.2 +++ b/new-words.sh Tue Mar 02 22:28:27 2010 +0200
1.3 @@ -1,5 +1,4 @@
1.4 #!/bin/sh
1.5 -
1.6 cat <<HELP > /dev/null
1.7
1.8 Поддержка нескольких языков:
1.9 @@ -23,6 +22,10 @@
1.10
1.11 LANGUAGE=en
1.12 my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
1.13 +if echo "$1" | grep -q http://...wikipedia.org/wiki/
1.14 +then
1.15 +LANGUAGE="`echo $1 | sed s@http://@@ | sed s@.wikipedia.*@@`"
1.16 +fi
1.17 [ "${my_name}" = "new" ] || LANGUAGE="$my_name"
1.18 if [ "$1" = "-l" ]
1.19 then
1.20 @@ -36,12 +39,30 @@
1.21 get_words()
1.22 {
1.23 tr ' ' '\n' | sed 's/--/ /g' \
1.24 -| tr -d '*\r,.-:#@()+=—<>$;"?!|·[]^%&'"'" \
1.25 -| tr ' ' '\n' | grep_v_english_perl \
1.26 -| grep -x '[[:alnum:]]*' \
1.27 +| sed "s/'/__APOSTROPHE__/g" \
1.28 +| tr '—·-' '-----' \
1.29 +| tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' ' ' \
1.30 +| tr ' ' '\n' \
1.31 +| grep -x '[[:alpha:]'"'"'-]*' \
1.32 +| tee "$1" \
1.33 +| grep_v_english_perl \
1.34 | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
1.35 }
1.36
1.37 +add_stat()
1.38 +{
1.39 + before="$1"
1.40 + after=${before}2
1.41 + cat > "$after"
1.42 + total="`wc -w $1 | awk '{print $1}'`"
1.43 + total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
1.44 + total_known="`echo $total-$total_unknown|bc`"
1.45 + percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
1.46 + echo "# $LANGUAGE, $percentage, <$total_known/$total>"
1.47 + cat "$after"
1.48 + rm $after
1.49 +}
1.50 +
1.51 grep_v_english()
1.52 {
1.53 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
1.54 @@ -56,7 +77,7 @@
1.55 or die "Can't open VOCABULARY";
1.56 while (<VOC>){
1.57 chomp;
1.58 - s/'//g;
1.59 + #s/'//g;
1.60 $voc{$_}="1";
1.61 }
1.62 while(<>) {
1.63 @@ -97,7 +118,7 @@
1.64 for (@lines) {
1.65 m/\s+\S+\s+(\S+)/;
1.66 $name=$1;
1.67 - if (defined($dict{$name})) {
1.68 + if (not /^#/ and defined($dict{$name})) {
1.69 chomp;
1.70 $mark=$dict{$name};
1.71 $space=" "x(30-length($_));
1.72 @@ -130,7 +151,7 @@
1.73 if (open(F, ">$file")) {
1.74 for (@lines) {
1.75 chomp;
1.76 - if (m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
1.77 + if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
1.78 my $name=$4;
1.79 my $comment=$6;
1.80 $dict{$name}=$comment;
1.81 @@ -174,12 +195,12 @@
1.82 cd $WORK_DIR
1.83 if echo "$1" | grep -q http:
1.84 then
1.85 - text_from_url "$1" | get_words | tee "$TEMP1" > "$TEMP2"
1.86 + text_from_url "$1" | get_words ${TEMP1}-full | add_stat ${TEMP1}-full| tee "$TEMP1" > "$TEMP2"
1.87 elif [ "$#" != 0 ]
1.88 then
1.89 - cat "$1" | get_words | tee "$TEMP1" > "$TEMP2"
1.90 + cat "$1" | get_words ${TEMP1}-full | add_stat ${TEMP1}-full | tee "$TEMP1" > "$TEMP2"
1.91 else
1.92 - get_words | tee "$TEMP1" > "$TEMP2"
1.93 + get_words ${TEMP1}-full| add_stat ${TEMP1}-full | tee "$TEMP1" > "$TEMP2"
1.94 fi
1.95
1.96 add_marks "$TEMP2"
1.97 @@ -193,4 +214,4 @@
1.98 remove_marks "$TEMP2"
1.99
1.100 diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY"
1.101 -rm -f "$TEMP1" "$TEMP2"
1.102 +rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full"