new-words

changeset 36:f95804355b0f

compressed wordlist support
author Igor Chubin <igor@chub.in>
date Sat Jan 01 19:47:39 2011 +0100 (2011-01-01)
parents f06e8b0ee41a
children be6336e98b3c
files new-words.sh
line diff
     1.1 --- a/new-words.sh	Thu Dec 30 21:06:02 2010 +0100
     1.2 +++ b/new-words.sh	Sat Jan 01 19:47:39 2011 +0100
     1.3 @@ -11,6 +11,7 @@
     1.4  SWITCHES: 
     1.5  
     1.6      -h          print this screen
     1.7 +    -c          show compressed wordlist: one word per group
     1.8      -k          put higher words that are similar to the known words (only for English)
     1.9      -l lang     override language settings
    1.10      -n          non-interactive mode (don't run vi)
    1.11 @@ -79,9 +80,11 @@
    1.12  DONT_ADD_MARKLINES=NO
    1.13  FILTER_WORDS=YES
    1.14  SHOW_VOC_STAT=NO
    1.15 -while getopts l:sSkanNp:t:Tm:Mr:23 opt
    1.16 +COMPRESSED_WORDLIST=NO
    1.17 +while getopts cl:sSkanNp:t:Tm:Mr:23 opt
    1.18  do
    1.19      case "$opt" in
    1.20 +      c)  COMPRESSED_WORDLIST=YES;;
    1.21        s)  STAT_ONLY=YES;;
    1.22        S)  SHOW_VOC_STAT=YES;;
    1.23        k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    1.24 @@ -466,6 +469,13 @@
    1.25      return %linked_words;
    1.26  }
    1.27  
    1.28 +sub lc_length($)
    1.29 +{
    1.30 +    my $a= shift;
    1.31 +    $a =~ s/[a-z]//g;
    1.32 +    return length($a);
    1.33 +}
    1.34 +
    1.35  our %dict = load_notes_dict();
    1.36  our %linked_words = find_linked_words(\%dict);
    1.37  
    1.38 @@ -499,13 +509,39 @@
    1.39  @lines2 = sort { compare($b,$a) } @lines;
    1.40  binmode STDOUT, ":utf8";
    1.41  print "# groups ".scalar(keys(%group_weight))."\n";
    1.42 -for $l (@lines2) {
    1.43 -    print "$l\n";
    1.44 +if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
    1.45 +    my $sum = 0;
    1.46 +    my $min = 9999;
    1.47 +    for $L (@lines2) {
    1.48 +        chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
    1.49 +        $group_name = normalize($b);
    1.50 +        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
    1.51 +            #print (" "x(7-length($sum))),"$sum $main_word\n";
    1.52 +            print +(" "x(7-length($sum))),"$sum $main_word\n";
    1.53 +            $sum = $a;
    1.54 +            $min = length($b) + 2*lc_length($b);
    1.55 +            $main_word = $b;
    1.56 +        }
    1.57 +        else {
    1.58 +            $sum += $a;
    1.59 +            if ($min > length($b) + 2*lc_length($b)) {
    1.60 +                $min = length($b) + 2*lc_length($b);
    1.61 +                $main_word = $b;
    1.62 +            }
    1.63 +        }
    1.64 +        $prev_group_name = $group_name;
    1.65 +    }
    1.66 +}
    1.67 +else {
    1.68 +    for $l (@lines2) {
    1.69 +        print "$l\n";
    1.70 +    }
    1.71  }
    1.72  PERL_SCRIPT
    1.73      export VOCABULARY
    1.74      export NEED_TO_USE_VOCABULARY_WHEN_SORT
    1.75      export LANGUAGE
    1.76 +    export COMPRESSED_WORDLIST
    1.77      [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
    1.78      export NOTES_FILE
    1.79      perl $PERL_SCRIPT_TEMP_NAME