new-words
changeset 36:f95804355b0f
compressed wordlist support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sat Jan 01 19:47:39 2011 +0100 (2011-01-01) |
parents | f06e8b0ee41a |
children | be6336e98b3c |
files | new-words.sh |
line diff
```
1.1 --- a/new-words.sh Thu Dec 30 21:06:02 2010 +0100
1.2 +++ b/new-words.sh Sat Jan 01 19:47:39 2011 +0100
1.3 @@ -11,6 +11,7 @@
1.4 SWITCHES:
1.5
1.6 -h print this screen
1.7 + -c show compressed wordlist: one word per group
1.8 -k put higher words that are similar to the known words (only for English)
1.9 -l lang override language settings
1.10 -n non-interactive mode (don't run vi)
1.11 @@ -79,9 +80,11 @@
1.12 DONT_ADD_MARKLINES=NO
1.13 FILTER_WORDS=YES
1.14 SHOW_VOC_STAT=NO
1.15 -while getopts l:sSkanNp:t:Tm:Mr:23 opt
1.16 +COMPRESSED_WORDLIST=NO
1.17 +while getopts cl:sSkanNp:t:Tm:Mr:23 opt
1.18 do
1.19 case "$opt" in
1.20 + c) COMPRESSED_WORDLIST=YES;;
1.21 s) STAT_ONLY=YES;;
1.22 S) SHOW_VOC_STAT=YES;;
1.23 k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
1.24 @@ -466,6 +469,13 @@
1.25 return %linked_words;
1.26 }
1.27
1.28 +sub lc_length($)
1.29 +{
1.30 + my $a= shift;
1.31 + $a =~ s/[a-z]//g;
1.32 + return length($a);
1.33 +}
1.34 +
1.35 our %dict = load_notes_dict();
1.36 our %linked_words = find_linked_words(\%dict);
1.37
1.38 @@ -499,13 +509,39 @@
1.39 @lines2 = sort { compare($b,$a) } @lines;
1.40 binmode STDOUT, ":utf8";
1.41 print "# groups ".scalar(keys(%group_weight))."\n";
1.42 -for $l (@lines2) {
1.43 - print "$l\n";
1.44 +if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
1.45 + my $sum = 0;
1.46 + my $min = 9999;
1.47 + for $L (@lines2) {
1.48 + chomp($L); $l=$L; $l =~ s/^(\s*)//; my $spaces = $1; my ($a, $b)=split(/\s+/,$l,2);
1.49 + $group_name = normalize($b);
1.50 + if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
1.51 + #print (" "x(7-length($sum))),"$sum $main_word\n";
1.52 + print +(" "x(7-length($sum))),"$sum $main_word\n";
1.53 + $sum = $a;
1.54 + $min = length($b) + 2*lc_length($b);
1.55 + $main_word = $b;
1.56 + }
1.57 + else {
1.58 + $sum += $a;
1.59 + if ($min > length($b) + 2*lc_length($b)) {
1.60 + $min = length($b) + 2*lc_length($b);
1.61 + $main_word = $b;
1.62 + }
1.63 + }
1.64 + $prev_group_name = $group_name;
1.65 + }
1.66 +}
1.67 +else {
1.68 + for $l (@lines2) {
1.69 + print "$l\n";
1.70 + }
1.71 }
1.72 PERL_SCRIPT
1.73 export VOCABULARY
1.74 export NEED_TO_USE_VOCABULARY_WHEN_SORT
1.75 export LANGUAGE
1.76 + export COMPRESSED_WORDLIST
1.77 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
1.78 export NOTES_FILE
1.79 perl $PERL_SCRIPT_TEMP_NAME
```