new-words

changeset 33:720a701b2ba9
Add -S (show vocabulary statistics) and -N (turn off known words filtering) keys
author: Igor Chubin <igor@chub.in>
date: Sun Dec 12 10:04:06 2010 +0100 (2010-12-12)
parents: 753fb84437aa
children: 3827cce83602
files: new-words.sh
     1.1 --- a/new-words.sh	Fri Nov 05 20:07:46 2010 +0100
     1.2 +++ b/new-words.sh	Sun Dec 12 10:04:06 2010 +0100
     1.3 @@ -14,9 +14,11 @@
     1.4      -k          put higher words that are similar to the known words (only for English)
     1.5      -l lang     override language settings
     1.6      -n          non-interactive mode (don't run vi)
     1.7 +    -N          turn off known words filtering
     1.8      -a          don't add marks (and don't save marks added by user)
     1.9      -p pages    work with specified pages only (pages = start-stop/total )
    1.10      -s          show the text statistics (percentage of known words and so on) and exit
    1.11 +    -S          show your vocabulary statistics (number of words and word groups)
    1.12      -t tag      tag known words with tag
    1.13      -T          show list of active tags
    1.14      -m tag      merge the words tagged with "tag" into the main vocabulary
    1.15 @@ -75,14 +77,18 @@
    1.16  MERGE_TAGGED_WORDS=NO
    1.17  MERGE_ALL_TAGGED=NO
    1.18  DONT_ADD_MARKLINES=NO
    1.19 -while getopts l:skanp:t:Tm:Mr:23 opt
    1.20 +FILTER_WORDS=YES
    1.21 +SHOW_VOC_STAT=NO
    1.22 +while getopts l:sSkanNp:t:Tm:Mr:23 opt
    1.23  do
    1.24      case "$opt" in
    1.25        s)  STAT_ONLY=YES;;
    1.26 +      S)  SHOW_VOC_STAT=YES;;
    1.27        k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    1.28        l)  LANGUAGE="$OPTARG";;
    1.29        a)  DONT_ADD_MARKS=YES;;
    1.30        n)  NON_INTERACTIVE_MODE=YES;;
    1.31 +      N)  FILTER_WORDS=NO;;
    1.32        p)  PART_TO_PROCESS="$OPTARG";;
    1.33        t)  TAG_NAME="$OPTARG";;
    1.34        T)  TAGS_LIST_ONLY="YES";;
    1.35 @@ -107,10 +113,17 @@
    1.36  VOCABULARY=${LANGUAGE}.txt
    1.37  NOTES_FILE=notes-${LANGUAGE}.txt
    1.38  
    1.39 +if [ "${SHOW_VOC_STAT}" = "YES" ]
    1.40 +then
    1.41 +  $0 -l "${LANGUAGE}" -N -n ${WORK_DIR}/${VOCABULARY} | head -1 | awk '{print $5}' | tr -d "<>"
    1.42 +  exit 0
    1.43 +fi
    1.44 +
    1.45  #----------------------------------------------------
    1.46  
    1.47  get_words()
    1.48  {
    1.49 +    export FILTER_WORDS
    1.50  tr ' ' '\n' | sed 's/--/ /g' \
    1.51  | sed "s/'/__APOSTROPHE__/g" \
    1.52  | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/'\
    1.53 @@ -146,7 +159,9 @@
    1.54          rm $after
    1.55          return 0
    1.56      else 
    1.57 -        echo "# $LANGUAGE, $percentage, <$total_known/$total>"
    1.58 +        groups="$(grep '# groups' $after | awk '{print $3}')"
    1.59 +        words="$(grep -v '^#' $after | wc -l)"
    1.60 +        echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"
    1.61      fi
    1.62  
    1.63      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
    1.64 @@ -160,6 +175,7 @@
    1.65  } else { $mark_line +=5; };
    1.66  while(<>)
    1.67  {
    1.68 +    next if /^#\s*groups\s*/;
    1.69      print;
    1.70      /^\s*([0-9]*)\s*/;
    1.71      $s+=$1;
    1.72 @@ -217,6 +233,10 @@
    1.73  {
    1.74      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
    1.75      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.76 +    if ($ENV{FILTER_WORDS} eq "NO") {
    1.77 +        while(<>) { print; }
    1.78 +        exit(0);
    1.79 +    }
    1.80  $voc_files=$ENV{VOC_FILES};
    1.81  $voc_files=~s@^ @@;
    1.82  for $voc_file (split /\s+/,$voc_files) {
    1.83 @@ -478,6 +498,7 @@
    1.84  }
    1.85  @lines2 = sort { compare($b,$a) } @lines;
    1.86  binmode STDOUT, ":utf8";
    1.87 +print "# groups ".scalar(keys(%group_weight))."\n";
    1.88  for $l (@lines2) {
    1.89      print "$l\n";
    1.90  }
author	Igor Chubin <igor@chub.in>
date	Sun Dec 12 10:04:06 2010 +0100 (2010-12-12)
parents	753fb84437aa
children	3827cce83602
files	new-words.sh