# HG changeset patch
# User Igor Chubin
# Date 1293907659 -3600
# Node ID f95804355b0f0d8019544f9e6601c8d28004d9ca
# Parent f06e8b0ee41afab39f9085eddeb616051beba138
compressed wordlist support

diff -r f06e8b0ee41a -r f95804355b0f new-words.sh
--- a/new-words.sh	Thu Dec 30 21:06:02 2010 +0100
+++ b/new-words.sh	Sat Jan 01 19:47:39 2011 +0100
@@ -11,6 +11,7 @@
 
 SWITCHES:
     -h          print this screen
+    -c          show compressed wordlist: one word per group
     -k          put higher words that are similar to the known words (only for English)
     -l lang     override language settings
     -n          non-interactive mode (don't run vi)
@@ -79,9 +80,11 @@
 DONT_ADD_MARKLINES=NO
 FILTER_WORDS=YES
 SHOW_VOC_STAT=NO
-while getopts l:sSkanNp:t:Tm:Mr:23 opt
+COMPRESSED_WORDLIST=NO
+while getopts cl:sSkanNp:t:Tm:Mr:23 opt
 do
     case "$opt" in
+      c)  COMPRESSED_WORDLIST=YES;;
       s)  STAT_ONLY=YES;;
       S)  SHOW_VOC_STAT=YES;;
       k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
@@ -466,6 +469,15 @@
     return %linked_words;
 }
 
+# Return the number of characters of the given word that are NOT lowercase
+# ASCII letters (a-z).
+sub lc_length($)
+{
+    my $word = shift;
+    $word =~ s/[a-z]//g;
+    return length($word);
+}
+
 our %dict = load_notes_dict();
 our %linked_words = find_linked_words(\%dict);
 
@@ -499,13 +511,51 @@
 @lines2 = sort { compare($b,$a) } @lines;
 binmode STDOUT, ":utf8";
 print "# groups ".scalar(keys(%group_weight))."\n";
-for $l (@lines2) {
-    print "$l\n";
+if ($ENV{COMPRESSED_WORDLIST} eq "YES") {
+    # Compressed mode: print one line per group of words -- the summed
+    # count of the group followed by the word chosen to represent it.
+    # Assumes the sort above leaves members of a group adjacent -- TODO confirm.
+    my $sum = 0;
+    my $min = 9999;
+    for $L (@lines2) {
+        chomp($L);
+        $l = $L;
+        $l =~ s/^\s+//;
+        my ($count, $word) = split(/\s+/, $l, 2);
+        $group_name = normalize($word);
+        if ($group_name ne $prev_group_name and $prev_group_name ne '' ) {
+            # A new group starts here: emit the finished one.
+            print +(" "x(7-length($sum))),"$sum $main_word\n";
+            $sum = $count;
+            $min = length($word) + 2*lc_length($word);
+            $main_word = $word;
+        }
+        else {
+            $sum += $count;
+            # Prefer the shortest word; characters outside a-z cost extra.
+            my $score = length($word) + 2*lc_length($word);
+            if ($min > $score) {
+                $min = $score;
+                $main_word = $word;
+            }
+        }
+        $prev_group_name = $group_name;
+    }
+    # Emit the last group: the loop only prints a group once the next one starts.
+    if (@lines2) {
+        print +(" "x(7-length($sum))),"$sum $main_word\n";
+    }
+}
+else {
+    for $l (@lines2) {
+        print "$l\n";
+    }
 }
 PERL_SCRIPT
 export VOCABULARY
 export NEED_TO_USE_VOCABULARY_WHEN_SORT
 export LANGUAGE
+export COMPRESSED_WORDLIST
 [ -e "$NOTES_FILE" ] || touch "$NOTES_FILE"
 export NOTES_FILE
 perl $PERL_SCRIPT_TEMP_NAME