new-words

changeset 27:0a80b2fa3ed8

initial tagging support
author Igor Chubin <igor@chub.in>
date Fri May 21 19:53:24 2010 +0300 (2010-05-21)
parents 4a10c0f4510c
children 7db7bbf96fad
files new-words.sh
line diff
     1.1 --- a/new-words.sh	Fri May 21 01:02:21 2010 +0300
     1.2 +++ b/new-words.sh	Fri May 21 19:53:24 2010 +0300
     1.3 @@ -14,9 +14,14 @@
     1.4      -k          put higher words that are similar to the known words (only for English)
     1.5      -l lang     override language settings
     1.6      -n          non-interactive mode (don't run vi)
     1.7 -    -m          don't add marks (and don't save marks added by user)
     1.8 +    -a          don't add marks (and don't save marks added by user)
     1.9      -p pages    work with specified pages only (pages = start-stop/total )
    1.10      -s          show the text statistics (percentage of known words and so on) and exit
    1.11 +    -t tag      tag known words with tag
    1.12 +    -T          show list of active tags
    1.13 +    -m tag      merge the words tagged with "tag" into the main vocabulary
    1.14 +    -M          merge the words tagged with any tag into the main vocabulary 
    1.15 +    -r tag      remove subvocabulary for the "tag"
    1.16      -2 -3       find 2 and 3 words' sequences
    1.17  
    1.18  The language of the text can be specified also
    1.19 @@ -64,15 +69,26 @@
    1.20  PART_TO_PROCESS=''
    1.21  GROUP_WORDS_BY_THREE=NO
    1.22  GROUP_WORDS_BY_TWO=NO
    1.23 -while getopts l:skmnp:23 opt
    1.24 +TAG_NAME=''
    1.25 +MERGE_THIS_TAGS=''
    1.26 +TAGS_LIST_ONLY=NO
    1.27 +MERGE_TAGGED_WORDS=NO
    1.28 +MERGE_ALL_TAGGED=NO
    1.29 +DONT_ADD_MARKLINES=NO
    1.30 +while getopts l:skanp:t:Tm:Mr:23 opt
    1.31  do
    1.32      case "$opt" in
    1.33        s)  STAT_ONLY=YES;;
    1.34        k)  NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    1.35        l)  LANGUAGE="$OPTARG";;
    1.36 -      m)  DONT_ADD_MARKS=YES;;
    1.37 +      a)  DONT_ADD_MARKS=YES;;
    1.38        n)  NON_INTERACTIVE_MODE=YES;;
    1.39        p)  PART_TO_PROCESS="$OPTARG";;
    1.40 +      t)  TAG_NAME="$OPTARG";;
    1.41 +      T)  TAGS_LIST_ONLY="YES";;
    1.42 +      m)  DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
    1.43 +      M)  DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
    1.44 +      r)  REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
    1.45        2)  GROUP_WORDS_BY_TWO=YES;;
    1.46        3)  GROUP_WORDS_BY_THREE=YES;;
    1.47        \?)       # unknown flag
    1.48 @@ -107,6 +123,11 @@
    1.49  
    1.50  add_stat()
    1.51  {
    1.52 +    if [ "$DONT_ADD_MARKLINES" = "YES" ]
    1.53 +    then
    1.54 +        cat
    1.55 +        return
    1.56 +    fi
    1.57      before="$1"
    1.58      after=${before}2
    1.59      cat > "$after"
    1.60 @@ -196,12 +217,16 @@
    1.61  {
    1.62      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
    1.63      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.64 -open(VOC, $ENV{VOCABULARY})
    1.65 - or die "Can't open VOCABULARY";
    1.66 -while (<VOC>){
    1.67 -    chomp;
    1.68 -    #s/'//g;
    1.69 -    $voc{$_}="1";
    1.70 +$voc_files=$ENV{VOC_FILES};
    1.71 +$voc_files=~s@^ @@;
    1.72 +for $voc_file (split /\s+/,$voc_files) {
    1.73 +    if (open(VOC, $voc_file)) {
    1.74 +        while (<VOC>){
    1.75 +            chomp;
    1.76 +            #s/'//g;
    1.77 +            $voc{$_}="1";
    1.78 +        }
    1.79 +    }
    1.80  }
    1.81  while(<>) {
    1.82      chomp;
    1.83 @@ -209,7 +234,12 @@
    1.84  }
    1.85  PERL_SCRIPT
    1.86      [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
    1.87 -    export VOCABULARY 
    1.88 +    export VOCABULARY VOC_FILES
    1.89 +    VOC_FILES=$VOCABULARY
    1.90 +    for i in $TAG_NAME
    1.91 +    do
    1.92 +        VOC_FILES="${VOC_FILES} `tag_file_name $i`"
    1.93 +    done
    1.94      perl $PERL_SCRIPT_TEMP_NAME
    1.95      rm $PERL_SCRIPT_TEMP_NAME
    1.96  }
    1.97 @@ -506,10 +536,56 @@
    1.98      rm $PERL_SCRIPT_TEMP_NAME
    1.99  }
   1.100  
   1.101 +if [ "$TAGS_LIST_ONLY" = "YES" ] 
   1.102 +then
   1.103 +    cd "${WORK_DIR}"
   1.104 +    echo ${LANGUAGE}_*.txt | tr ' ' '\n'  | grep -v '*' | sed 's/[^_]*_//;s/.txt$//'
   1.105 +    exit 0
   1.106 +fi
   1.107 +
   1.108 +tag_file_name()
   1.109 +{
   1.110 +    echo "${LANGUAGE}_${1}.txt"
   1.111 +}
   1.112 +
   1.113 +if [ "$REMOVE_TAG" = "YES" ]
   1.114 +then
   1.115 +    cd "${WORK_DIR}"
   1.116 +    for i in $TAG_NAME 
   1.117 +    do
   1.118 +        echo "$TAGNAME" | grep -q '[/*?]' && continue
   1.119 +        f="`tag_file_name $i`"
   1.120 +        if [ -e "$f" ] 
   1.121 +        then
   1.122 +            rm -f "$f" && echo Tag "'$i'" removed
   1.123 +        else
   1.124 +            echo Unknown tag "'$i'"
   1.125 +        fi
   1.126 +    done
   1.127 +    exit 0
   1.128 +fi
   1.129 +
   1.130  mkdir -p $WORK_DIR
   1.131  oldpwd="$PWD"
   1.132  cd $WORK_DIR
   1.133 -if echo "$1" | grep -q http: 
   1.134 +if [ "$MERGE_TAGGED_WORDS" = "YES" ]
   1.135 +then
   1.136 +    VOC_FILES=''
   1.137 +    for i in $MERGE_THIS_TAGS
   1.138 +    do
   1.139 +        f=`tag_file_name $i`
   1.140 +        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
   1.141 +    done
   1.142 +    if [ -z "$VOC_FILES" ]
   1.143 +    then 
   1.144 +        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
   1.145 +    else
   1.146 +        cat $VOC_FILES
   1.147 +    fi
   1.148 +elif [ "$MERGE_ALL_TAGGED" = "YES" ]
   1.149 +then
   1.150 +    cat ${LANGUAGE}_*.txt
   1.151 +elif echo "$1" | grep -q http: 
   1.152  then 
   1.153      text_from_url "$1"
   1.154  elif [ "$#" != 0 ]
   1.155 @@ -538,16 +614,21 @@
   1.156  then
   1.157      cat "$TEMP1"
   1.158  else
   1.159 -    [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
   1.160 -    if [ "$editor" = vim ]
   1.161 +    if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ] 
   1.162      then
   1.163 -        vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
   1.164 -    else
   1.165 -        echo 2
   1.166 -        $editor "$TEMP2"
   1.167 +        [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
   1.168 +        if [ "$editor" = vim ]
   1.169 +        then
   1.170 +            vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
   1.171 +        else
   1.172 +            $editor "$TEMP2"
   1.173 +        fi
   1.174 +        remove_marks "$TEMP2"
   1.175 +
   1.176 +        vocabulary="$VOCABULARY"
   1.177 +        [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`"
   1.178 +        diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
   1.179      fi
   1.180 -    remove_marks "$TEMP2"
   1.181  fi
   1.182  
   1.183 -diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY"
   1.184  rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"