new-words

changeset 3:c703b8898696

Комментарий с мини-статистикой, дефис в словах, автоматический выбор языка для Википедии.

* Комментарий с процентом известных слов,
* Автоматическое определение языка по адресу (URL) страницы в Википедии (только двухбуквенные коды языков),
* Слова с дефисом.
author igor@book.xt.vpn
date Tue Mar 02 22:28:27 2010 +0200 (2010-03-02)
parents 68722cd6faff
children 0d44e794175b
files new-words.sh
line diff
     1.1 --- a/new-words.sh	Fri Feb 26 21:41:27 2010 +0200
     1.2 +++ b/new-words.sh	Tue Mar 02 22:28:27 2010 +0200
     1.3 @@ -1,5 +1,4 @@
     1.4  #!/bin/sh
     1.5 -
     1.6  cat <<HELP > /dev/null
     1.7  
     1.8  Поддержка нескольких языков:
     1.9 @@ -23,6 +22,10 @@
    1.10  
    1.11  LANGUAGE=en
    1.12  my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
    1.13 +if echo "$1" | grep -q http://...wikipedia.org/wiki/
    1.14 +then
    1.15 +LANGUAGE="`echo $1 | sed s@http://@@ | sed s@.wikipedia.*@@`"
    1.16 +fi
    1.17  [ "${my_name}" = "new" ] || LANGUAGE="$my_name"
    1.18  if [ "$1" = "-l" ]
    1.19  then
    1.20 @@ -36,12 +39,30 @@
    1.21  get_words()
    1.22  {
    1.23  tr ' ' '\n' | sed 's/--/ /g' \
    1.24 -| tr -d '*\r,.-:#@()+=—<>$;"?!|·[]^%&'"'" \
    1.25 -| tr ' ' '\n' | grep_v_english_perl \
    1.26 -| grep -x '[[:alnum:]]*' \
    1.27 +| sed "s/'/__APOSTROPHE__/g" \
    1.28 +| tr  '—·-' '-----' \
    1.29 +| tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' '                           ' \
    1.30 +| tr ' ' '\n' \
    1.31 +| grep -x '[[:alpha:]'"'"'-]*' \
    1.32 +| tee "$1" \
    1.33 +| grep_v_english_perl \
    1.34  | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
    1.35  }
    1.36  
    1.37 +add_stat()
    1.38 +{
    1.39 +    before="$1"
    1.40 +    after=${before}2
    1.41 +    cat > "$after"
    1.42 +    total="`wc -w $1 | awk '{print $1}'`"
    1.43 +    total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
    1.44 +    total_known="`echo $total-$total_unknown|bc`"
    1.45 +    percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
    1.46 +    echo "# $LANGUAGE, $percentage, <$total_known/$total>"
    1.47 +    cat "$after"
    1.48 +    rm $after
    1.49 +}
    1.50 +
    1.51  grep_v_english()
    1.52  {
    1.53  [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    1.54 @@ -56,7 +77,7 @@
    1.55   or die "Can't open VOCABULARY";
    1.56  while (<VOC>){
    1.57      chomp;
    1.58 -    s/'//g;
    1.59 +    #s/'//g;
    1.60      $voc{$_}="1";
    1.61  }
    1.62  while(<>) {
    1.63 @@ -97,7 +118,7 @@
    1.64          for (@lines) {
    1.65              m/\s+\S+\s+(\S+)/;
    1.66              $name=$1;
    1.67 -            if (defined($dict{$name})) {
    1.68 +            if (not /^#/ and defined($dict{$name})) {
    1.69                  chomp;
    1.70                  $mark=$dict{$name};
    1.71                  $space=" "x(30-length($_));
    1.72 @@ -130,7 +151,7 @@
    1.73      if (open(F, ">$file")) {
    1.74          for (@lines) {
    1.75              chomp;
    1.76 -            if (m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
    1.77 +            if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
    1.78                  my $name=$4;
    1.79                  my $comment=$6;
    1.80                  $dict{$name}=$comment;
    1.81 @@ -174,12 +195,12 @@
    1.82  cd $WORK_DIR
    1.83  if echo "$1" | grep -q http: 
    1.84  then 
    1.85 -    text_from_url "$1" | get_words | tee "$TEMP1" > "$TEMP2"
    1.86 +    text_from_url "$1" | get_words ${TEMP1}-full | add_stat ${TEMP1}-full| tee "$TEMP1" > "$TEMP2"
    1.87  elif [ "$#" != 0 ]
    1.88  then
    1.89 -    cat "$1" | get_words | tee "$TEMP1" > "$TEMP2"
    1.90 +    cat "$1" | get_words ${TEMP1}-full | add_stat ${TEMP1}-full | tee "$TEMP1" > "$TEMP2"
    1.91  else 
    1.92 -    get_words | tee "$TEMP1" > "$TEMP2"
    1.93 +    get_words ${TEMP1}-full| add_stat ${TEMP1}-full | tee "$TEMP1" > "$TEMP2"
    1.94  fi
    1.95  
    1.96  add_marks "$TEMP2"
    1.97 @@ -193,4 +214,4 @@
    1.98  remove_marks "$TEMP2"
    1.99  
   1.100  diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY"
   1.101 -rm -f "$TEMP1" "$TEMP2"
   1.102 +rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full"