new-words
diff new-words.sh @ 20:23f949c03f95
Usage information, one-liners prototype script, auto-raising of words that are similar to known words
author | Igor Chubin <igor@chub.in> |
---|---|
date | Wed Apr 28 21:00:38 2010 +0300 (2010-04-28) |
parents | 416394a87d9f |
children | 190d4ac6b07c |
line diff
```
1.1 --- a/new-words.sh Tue Apr 20 21:15:19 2010 +0400
1.2 +++ b/new-words.sh Wed Apr 28 21:00:38 2010 +0300
1.3 @@ -1,5 +1,8 @@
1.4 #!/bin/sh
1.5 -cat <<HELP > /dev/null
1.6 +
1.7 +show_usage()
1.8 +{
1.9 +cat <<HELP > /dev/stderr
1.10
1.11 USAGE:
1.12
1.13 @@ -7,22 +10,26 @@
1.14
1.15 SWITCHES:
1.16
1.17 - -s show text statistics and exit
1.18 + -h print this screen
1.19 + -k put higher words that are similar to the known words (only for English)
1.20 -l lang override language settings
1.21 + -s show the text statistics (percentage of known words and so on) and exit
1.22
1.23 -Поддержка нескольких языков:
1.24 +The language of the text can be specified also
1.25 +by name of the program new-words (correspondent link must be created before).
1.26 +For example, these calls are equivalent:
1.27
1.28 - new-words -l lang URL
1.29 -
1.30 -Например, для немецких текстов:
1.31 -
1.32 + de-words URL
1.33 new-words -l de URL
1.34
1.35 -Или, предварительно создав соответствующую ссылку:
1.36 +HELP
1.37 +}
1.38
1.39 - de-words URL
1.40 -
1.41 -HELP
1.42 +if [ "$1" = "-h" ]
1.43 +then
1.44 + show_usage
1.45 + exit 0
1.46 +fi
1.47
1.48 WORK_DIR=~/.new-words/
1.49 TEMP1=`mktemp /tmp/new-words-XXXXXXXXXX-temp1`
1.50 @@ -53,6 +60,13 @@
1.51 shift
1.52 fi
1.53
1.54 +NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
1.55 +if [ "$1" = "-k" ]
1.56 +then
1.57 + NEED_TO_USE_VOCABULARY_WHEN_SORT=YES
1.58 + shift
1.59 +fi
1.60 +
1.61 get_words()
1.62 {
1.63 tr ' ' '\n' | sed 's/--/ /g' \
1.64 @@ -219,7 +233,17 @@
1.65 }
1.66 }
1.67
1.68 -@lines=<>;
1.69 +our %Vocabulary;
1.70 +open(VOC, $ENV{VOCABULARY})
1.71 + or die "Can't open VOCABULARY";
1.72 +while (<VOC>){
1.73 + chomp;
1.74 + #s/'//g;
1.75 + $Vocabulary{normalize($_)}="1";
1.76 +}
1.77 +close(VOC);
1.78 +
1.79 +@lines=<STDIN>;
1.80 for $L (@lines) {
1.81 chomp($L);
1.82 $l=$L;
1.83 @@ -227,11 +251,20 @@
1.84 my ($a, $b)=split(/\s+/,$l,2);
1.85 $group_weight{normalize($b)}+=$a;
1.86 }
1.87 +if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
1.88 + for $k (keys %group_weight) {
1.89 + if (defined($Vocabulary{$k})) {
1.90 + $group_weight{$k} *= 2;
1.91 + }
1.92 + }
1.93 +}
1.94 @lines2 = sort { compare($b,$a) } @lines;
1.95 for $l (@lines2) {
1.96 print "$l\n";
1.97 }
1.98 PERL_SCRIPT
1.99 + export VOCABULARY
1.100 + export NEED_TO_USE_VOCABULARY_WHEN_SORT
1.101 perl $PERL_SCRIPT_TEMP_NAME
1.102 rm $PERL_SCRIPT_TEMP_NAME
1.103 }
```