new-words
view new-words.sh @ 22:46e987f4636d
part.pl script + german normalization support
| author | Igor Chubin <igor@chub.in> | 
|---|---|
| date | Sun May 16 18:20:18 2010 +0300 (2010-05-16) | 
| parents | 190d4ac6b07c | 
| children | 4b9d13c78de2 | 
 line source
     1 #!/bin/sh
     3 show_usage()
     4 {
     5 cat <<HELP > /dev/stderr
     7 USAGE: 
     9     new-words [ -l lang ] [ -s ] [ ARG ] 
    11 SWITCHES: 
    13     -h          print this screen
    14     -k          put higher words that are similar to the known words (only for English)
    15     -l lang     override language settings
    16     -n          non-interactive mode (don't run vi)
    17     -m          don't add marks (and don't save marks added by user)
    18     -s          show the text statistics (percentage of known words and so on) and exit
    20 The language of the text can be specified also
    21 by name of the program new-words (correspondent link must be created before).
    22 For example, these calls are equivalent:
    24     de-words URL
    25     new-words -l de URL
    27 HELP
    28 }
    30 if [ "$1" = "-h" ]
    31 then
    32     show_usage
    33     exit 0
    34 fi
    36 WORK_DIR=~/.new-words/
    37 TEMP1=`mktemp /tmp/new-words-XXXXXXXXXX-temp1`
    38 TEMP2=`mktemp /tmp/new-words-XXXXXXXXXX-temp2`
    39 export ORIGINAL_TEXT=`mktemp /tmp/new-words-XXXXXXXXXX-orig`
    40 editor=${EDITOR:-vim}
    42 #----------------------------------------------------
    43 # command line options processing
    45 LANGUAGE=en
    46 my_name="`echo $0 | sed s@.*/@@ | sed s/-.*// `"
    47 for arg
    48 do
    49     if echo "$arg" | grep -q http://...wikipedia.org/wiki/
    50     then
    51     LANGUAGE="`echo $arg | sed s@http://@@ | sed s@.wikipedia.*@@`"
    52     fi
    53 done
    54 [ "${my_name}" = "new" ] || LANGUAGE="$my_name"
    55 if [ "$1" = "-l" ]
    56 then
    57     LANGUAGE="$2"
    58     VOCABULARY="$LANGUAGE".txt
    59     shift 2
    60 fi
    61 VOCABULARY=${LANGUAGE}.txt
    62 NOTES_FILE=notes-${LANGUAGE}.txt
    64 STAT_ONLY=NO
    65 if [ "$1" = "-s" ]
    66 then
    67     STAT_ONLY=YES
    68     shift
    69 fi
    71 NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
    72 if [ "$1" = "-k" ]
    73 then
    74     NEED_TO_USE_VOCABULARY_WHEN_SORT=YES
    75     shift
    76 fi
    78 DONT_ADD_MARKS=NO
    79 if [ "$1" = "-m" ]
    80 then
    81     DONT_ADD_MARKS=YES
    82     shift
    83 fi
    85 NON_INTERACTIVE_MODE=NO
    86 if [ "$1" = "-n" ]
    87 then
    88     NON_INTERACTIVE_MODE=YES
    89     shift
    90 fi
    92 #----------------------------------------------------
    94 get_words()
    95 {
    96 tr ' ' '\n' | sed 's/--/ /g' \
    97 | sed "s/'/__APOSTROPHE__/g" \
    98 | tr  '—·-' '-----' \
    99 | tr '*\r,.:#@()+=—<>$;"?!|·[]^%&' '                           ' \
   100 | tr ' ' '\n' \
   101 | grep -x '[[:alpha:]'"'"'-]*' \
   102 | tee "$1" \
   103 | grep_v_english_perl \
   104 | sort | uniq -c | awk '{if ($2!="") print;}' | sort -rn
   105 }
   107 add_stat()
   108 {
   109     before="$1"
   110     after=${before}2
   111     cat > "$after"
   112     total="`wc -w $1 | awk '{print $1}'`"
   113     total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"
   114     total_known="`echo $total-$total_unknown|bc`"
   115     percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.\(.\).*/.\1/'`"
   116     #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"
   117     sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"
   120     if [ "$STAT_ONLY" = "YES" ]
   121     then
   122         echo "LANG  KNOWN%  UNKNOWN%  KNOWN     TOTAL     WPS  UWPS*10"
   123         echo "$LANGUAGE    $percentage    `echo \(100-$percentage\) | bc -l`      $total_known    $total    `echo $total/$sentences|bc`   `echo 10*$total_unknown/$sentences|bc` "
   124         rm $after
   125         return 0
   126     else 
   127         echo "# $LANGUAGE, $percentage, <$total_known/$total>"
   128     fi
   130     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   131     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   132 my $total=shift(@ARGV);
   133 my $total_known=shift(@ARGV);
   134 my $s=0;
   135 my $mark_line=int($total_known*100/$total/5)*5;
   136 if ($mark_line>=90) { 
   137     $mark_line=int($total_known*100/$total)+1;
   138 } else { $mark_line +=5; };
   139 while(<>)
   140 {
   141     print;
   142     /^\s*([0-9]*)\s*/;
   143     $s+=$1;
   144     if (($total_known+$s)*100/$total>=$mark_line) {
   145         print "# $mark_line\n";
   146         if ($mark_line>=90) { $mark_line+=1; } else { $mark_line +=5; };
   147     }
   148 }
   149 PERL_SCRIPT
   150     perl $PERL_SCRIPT_TEMP_NAME "$total" "$total_known" "$after"
   151     rm $PERL_SCRIPT_TEMP_NAME
   152     rm $after
   153 }
   155 grep_v_english()
   156 {
   157 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
   158 eval $(cat $VOCABULARY | tr -d "'" | xargs -n10 echo | tr ' ' '|' | sed 's/^/egrep -xv "RRRRRRR|/' | sed 's/$/"/' | tr '\n' '|')cat
   159 }
   161 grep_v_english_perl()
   162 {
   163     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   164     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   165 open(VOC, $ENV{VOCABULARY})
   166  or die "Can't open VOCABULARY";
   167 while (<VOC>){
   168     chomp;
   169     #s/'//g;
   170     $voc{$_}="1";
   171 }
   172 while(<>) {
   173     chomp;
   174     if (not defined($voc{$_})) { print "$_\n"; }
   175 }
   176 PERL_SCRIPT
   177     [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
   178     export VOCABULARY 
   179     perl $PERL_SCRIPT_TEMP_NAME
   180     rm $PERL_SCRIPT_TEMP_NAME
   181 }
   183 group_words()
   184 {
   185     #if [ "$LANGUAGE" != "en" ]
   186     #then
   187     #    cat 
   188     #    return
   189     #fi
   190     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-group-words-XXXXXXXX`
   191     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   192 #!/usr/bin/perl
   194 eval {
   195 # http://stackoverflow.com/questions/251694/how-can-i-check-if-i-have-a-perl-module-before-using-it
   196     require String::Similarity;
   197     String::Similarity->import();
   198 };
   199 unless($@)
   200 {
   201     our $HAVE_String_Similarity=1;
   202 }
   204 sub similar($$){
   205     my $a=shift;
   206     my $b=shift;
   207     if ($HAVE_String_Similarity) {
   208         return $Similarity{"$a $b"};
   209     } 
   210     else {
   211         return 0;
   212     }
   213 }
   215 sub normalize($)
   216 {
   217     if   ( $ENV{LANGUAGE} eq "en" ) { return normalize_english(shift); }
   218     elsif ( $ENV{LANGUAGE} eq "de" ) { return normalize_german(shift); }
   219     else { return shift ; }
   220 }
   222 sub normalize_german($)
   223 {
   224     $_=lc(shift);
   226     s/heit$//;  s/keit$//; s/tum$//; s/ung$//; s/nis$//;s/schaft$//; s/ist$//; 
   227     s/en$//; s/er$//;
   229     s/lich$//; s/ig$//;
   230     s/al$//; s/isch$//;
   231     s/ell$//; s/haft$//;
   233     s/bar$//; s/sam$//; s/lich$//;
   235     @prefixes=qw(
   236         ab an auf aus bei dazwischen ein fest heraus her hinaus hin los mit nach voraus vorbei vor weg weiter zurück zusammen zu
   237         be emp ent er ge miss ver zer durch über um unter wieder);
   238     for $pref (@prefixes) {
   239         s/^$pref//;
   240     }
   243     return $_;
   244 }
   246 sub normalize_english($)
   247 {
   248     $_=lc(shift);
   250     s/s$//;
   252     s/ation$//;  s/ness$//; s/ship$//; s/ally$//; s/ance$//;s/ity$//; s/ment$//; 
   254     s/ed$//;
   255     s/en$//;
   256     s/er$//;
   257     s/est$//;
   258     s/ing$//;
   260     s/ism$//; s/ist$//; s/ful$//; s/able$//; s/ably$//;
   261     s/ify$//; s/fy$//; s/ly$//;
   262     s/ise$//; s/ize$//;
   264     s/e$//;
   265     return $_;
   266 }
   269 sub compare($$)
   270 {
   271     my $a=shift;
   272     my $b=shift;
   273     $a =~ s/^\s*//;
   274     $b =~ s/^\s*//;
   275     my ($a1, $a2)= split /\s+/,$a,2;
   276     my ($b1, $b2)= split /\s+/,$b,2;
   278     my $cmp = $group_weight{normalize($a2)} <=> $group_weight{normalize($b2)};
   280     if ($cmp) {
   281         return $cmp;
   282     }
   283     else {
   284         if (normalize($a2) ne normalize($b2)) {
   285             return normalize($a2) cmp normalize($b2);
   286         }
   287         else {
   288             return $a1 <=> $b1;
   289         }
   290     }
   291 }
   293 our %Vocabulary;
   294 open(VOC, $ENV{VOCABULARY})
   295  or die "Can't open VOCABULARY";
   296 while (<VOC>){
   297     chomp;
   298     #s/'//g;
   299     $Vocabulary{normalize($_)}="1";
   300 }
   301 close(VOC);
   303 @lines=<STDIN>;
   304 for $L (@lines) {
   305     chomp($L);
   306     $l=$L;
   307     $l =~ s/^\s*//;
   308     my ($a, $b)=split(/\s+/,$l,2);
   309     $group_weight{normalize($b)}+=$a;
   310 }
   311 if ($ENV{NEED_TO_USE_VOCABULARY_WHEN_SORT} eq "YES") {
   312     for $k (keys %group_weight) {
   313         if (defined($Vocabulary{$k})) {
   314             $group_weight{$k} *= 2;
   315         }
   316     }
   317 }
   318 @lines2 = sort { compare($b,$a) } @lines;
   319 for $l (@lines2) {
   320     print "$l\n";
   321 }
   322 PERL_SCRIPT
   323     export VOCABULARY
   324     export NEED_TO_USE_VOCABULARY_WHEN_SORT
   325     export LANGUAGE
   326     perl $PERL_SCRIPT_TEMP_NAME
   327     rm $PERL_SCRIPT_TEMP_NAME
   328 }
   330 text_from_url()
   331 {
   332 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
   333 }
   335 add_marks()
   336 {
   337     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   338     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   339 $file = $ARGV[0];
   340 our $dict;
   341 if (open(NOTES, $ENV{NOTES_FILE})) {
   342     while(<NOTES>) {
   343         chomp;
   344         s/^\s+//;
   345         my ($a,$b)=split /\s+/,$_,2;
   346         $dict{$a}=$b;
   347     }
   348 }
   349 if (open(F, $file)) {
   350     @lines=<F>;
   351     close(F);
   353     if (open(F, ">$file")) {
   354         for (@lines) {
   355             m/\s+\S+\s+(\S+)/;
   356             $name=$1;
   357             if (not /^#/ and defined($dict{$name})) {
   358                 chomp;
   359                 $mark=$dict{$name};
   360                 $space=" "x(30-length($_));
   361                 print F "$_$space$mark\n";
   362             }
   363             else {
   364                 print F "$_";
   365             }
   366         }
   367         close(F);
   368     }
   369 }
   370 PERL_SCRIPT
   371     [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
   372     export NOTES_FILE
   373     perl $PERL_SCRIPT_TEMP_NAME "$1"
   374     rm $PERL_SCRIPT_TEMP_NAME
   375 }
   377 remove_marks()
   378 {
   379     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
   380     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
   381 $file = $ARGV[0];
   382 our %dict;
   383 if (open(F, $file)) {
   384     @lines=<F>;
   385     close(F);
   387     if (open(F, ">$file")) {
   388         for (@lines) {
   389             chomp;
   390             if (not /^#/ and m/(\s+)(\S+)(\s+)(\S+)(\s+)(.*)/) {
   391                 my $name=$4;
   392                 my $comment=$6;
   393                 $dict{$name}=$comment;
   394                 print F "$1$2$3$4\n";
   395             }
   396             else {
   397                 print F "$_\n";
   398             }
   399         }
   400     }
   401 }
   402 if (($ENV{DONT_ADD_MARKS} ne "YES") and open(NOTES, $ENV{NOTES_FILE})) {
   403     @lines=<NOTES>;
   404     close(NOTES);
   406     if (open(NOTES, ">".$ENV{NOTES_FILE})) {
   407         for (@lines) {
   408             chomp;
   409             s/^\s+//;
   410             my ($a,$b)=split /\s+/,$_,2;
   411             if (not defined($dict{$a}) || ($dict{$a} eq $b)) {
   412                 print NOTES "$_\n";
   413                 if (defined($dict{$a})) { unset($dict{$a}); }
   414             }
   415         }
   416         for (keys %dict) {
   417             $mark=$dict{$_};
   418             $space=" "x(30-length($_));
   419             print NOTES "$_$space$mark\n";
   420         }
   421     }
   422 }
   423 PERL_SCRIPT
   424     [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
   425     export NOTES_FILE
   426     export DONT_ADD_MARKS
   427     perl $PERL_SCRIPT_TEMP_NAME "$1"
   428     rm $PERL_SCRIPT_TEMP_NAME
   429 }
   431 mkdir -p $WORK_DIR
   432 oldpwd="$PWD"
   433 cd $WORK_DIR
   434 if echo "$1" | grep -q http: 
   435 then 
   436     text_from_url "$1"
   437 elif [ "$#" != 0 ]
   438 then
   439     if echo $1 | grep -q ^/
   440     then
   441         cat "$1"
   442     else
   443         cat "$oldpwd/$1"
   444     fi
   445 else 
   446     cat
   447 fi \
   448    | tee $ORIGINAL_TEXT \
   449    | get_words ${TEMP1}-full \
   450    | group_words \
   451    | add_stat ${TEMP1}-full \
   452    | tee "$TEMP1" > "$TEMP2"
   454 if [ "$STAT_ONLY" = "YES" ]
   455 then
   456     cat "$TEMP1"
   457 elif [ "$NON_INTERACTIVE_MODE" = "YES" ]
   458 then
   459     cat "$TEMP1"
   460 else
   461     [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
   462     if [ "$editor" = vim ]
   463     then
   464         vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=' "$TEMP2" < /dev/tty > /dev/tty
   465     else
   466         echo 2
   467         $editor "$TEMP2"
   468     fi
   469     remove_marks "$TEMP2"
   470 fi
   472 diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY"
   473 rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"
