new-words

changeset 20:23f949c03f95
Usage information, prototype oneliners script, auto-raising of words that are similar to known words
author: Igor Chubin <igor@chub.in>
date: Wed Apr 28 21:00:38 2010 +0300 (2010-04-28)
parents: 416394a87d9f
children: 190d4ac6b07c
files: new-words.sh oneliners.sh
     1.1 --- a/new-words.sh	Tue Apr 20 21:15:19 2010 +0400
     1.2 +++ b/new-words.sh	Wed Apr 28 21:00:38 2010 +0300
     1.3 @@ -1,5 +1,8 @@
     1.4  #!/bin/sh
     1.5 -cat <<HELP > /dev/null
     1.6 +
     1.7 +show_usage()
     1.8 +{
     1.9 +cat <<HELP > /dev/stderr
    1.10  
    1.11  USAGE: 
    1.12  
    1.13 @@ -7,22 +10,26 @@
    1.14  
    1.15  SWITCHES: 
    1.16  
    1.17 -    -s          show text statistics and exit
    1.18 +    -h          print this screen
    1.19 +    -k          put higher words that are similar to the known words (only for English)
    1.20      -l lang     override language settings
    1.21 +    -s          show the text statistics (percentage of known words and so on) and exit
    1.22  
    1.23 -Поддержка нескольких языков:
    1.24 +The language of the text can be specified also
    1.25 +by name of the program new-words (correspondent link must be created before).
    1.26 +For example, these calls are equivalent:
    1.27  
    1.28 -    new-words -l lang URL
    1.29 -
    1.30 -Например, для немецких текстов:
    1.31 -
    1.32 +    de-words URL
    1.33      new-words -l de URL
    1.34  
    1.35 -Или, предварительно создав соответствующую ссылку:
    1.36 +HELP
    1.37 +}
    1.38  
    1.39 -    de-words URL
    1.40 -
    1.41 -HELP
    1.42 +if [ "$1" = "-h" ]
    1.43 +then
    1.44 +    show_usage
    1.45 +    exit 0
    1.46 +fi
    1.47  
    1.48  WORK_DIR=~/.new-words/
    1.49  TEMP1=`mktemp /tmp/new-words-XXXXXXXXXX-temp1`
    1.50 @@ -53,6 +60,13 @@
    1.51      shift
    1.52  fi
    1.53  
    1.54 +NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
    1.55 +if [ "$1" = "-k" ]
    1.56 +then
    1.57 +    NEED_TO_USE_VOCABULARY_WHEN_SORT=YES
    1.58 +    shift
    1.59 +fi
    1.60 +
    1.61  get_words()
    1.62  {
    1.63  tr ' ' '\n' | sed 's/--/ /g' \
    1.64 @@ -219,7 +233,17 @@
    1.65      }
    1.66  }
    1.67  
    1.68 -@lines=<>;
    1.69 +our %Vocabulary;
    1.70 +open(VOC, $ENV{VOCABULARY})
    1.71 + or die "Can't open VOCABULARY";
    1.72 +while (<VOC>){
    1.73 +    chomp;
    1.74 +    #s/'//g;
    1.75 +    $Vocabulary{normalize($_)}="1";
    1.76 +}
    1.77 +close(VOC);
    1.78 +
    1.79 +@lines=<STDIN>;
    1.80  for $L (@lines) {
    1.81      chomp($L);
    1.82      $l=$L;
    1.83 @@ -227,11 +251,20 @@
    1.84      my ($a, $b)=split(/\s+/,$l,2);
    1.85      $group_weight{normalize($b)}+=$a;
    1.86  }
    1.87 +if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
    1.88 +    for $k (keys %group_weight) {
    1.89 +        if (defined($Vocabulary{$k})) {
    1.90 +            $group_weight{$k} *= 2;
    1.91 +        }
    1.92 +    }
    1.93 +}
    1.94  @lines2 = sort { compare($b,$a) } @lines;
    1.95  for $l (@lines2) {
    1.96      print "$l\n";
    1.97  }
    1.98  PERL_SCRIPT
    1.99 +    export VOCABULARY
   1.100 +    export NEED_TO_USE_VOCABULARY_WHEN_SORT
   1.101      perl $PERL_SCRIPT_TEMP_NAME
   1.102      rm $PERL_SCRIPT_TEMP_NAME
   1.103  }

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/oneliners.sh	Wed Apr 28 21:00:38 2010 +0300
     2.3 @@ -0,0 +1,11 @@
     2.4 +temp=/tmp/oneliner$$
     2.5 +while read l
     2.6 +do
     2.7 +    dict -h dictd.xdsl.by -d mueller24 "$l" 2> /dev/null | perl -n -e 'print if ($y and not /^\s*$/) ; $y=1 if /'$l'/;' | grep -v _Syn | grep -v _Ant > $temp
     2.8 +    if [ `wc -l $temp | awk '{print $1}'` = 1 ]
     2.9 +    then
    2.10 +        echo -n $l
    2.11 +        cat $temp
    2.12 +    fi
    2.13 +done
    2.14 +
author	Igor Chubin <igor@chub.in>
date	Wed Apr 28 21:00:38 2010 +0300 (2010-04-28)
parents	416394a87d9f
children	190d4ac6b07c
files	new-words.sh oneliners.sh