new-words
changeset 27:0a80b2fa3ed8
initial tagging support
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri May 21 19:53:24 2010 +0300 (2010-05-21) |
parents | 4a10c0f4510c |
children | 7db7bbf96fad |
files | new-words.sh |
line diff
1.1 --- a/new-words.sh Fri May 21 01:02:21 2010 +0300
1.2 +++ b/new-words.sh Fri May 21 19:53:24 2010 +0300
1.3 @@ -14,9 +14,14 @@
1.4 -k put higher words that are similar to the known words (only for English)
1.5 -l lang override language settings
1.6 -n non-interactive mode (don't run vi)
1.7 - -m don't add marks (and don't save marks added by user)
1.8 + -a don't add marks (and don't save marks added by user)
1.9 -p pages work with specified pages only (pages = start-stop/total )
1.10 -s show the text statistics (percentage of known words and so on) and exit
1.11 + -t tag tag known words with tag
1.12 + -T show list of active tags
1.13 + -m tag merge the words tagged with "tag" into the main vocabulary
1.14 + -M merge the words tagged with any tag into the main vocabulary
1.15 + -r tag remove subvocabulary for the "tag"
1.16 -2 -3 find 2 and 3 words' sequences
1.17
1.18 The language of the text can be specified also
1.19 @@ -64,15 +69,26 @@
1.20 PART_TO_PROCESS=''
1.21 GROUP_WORDS_BY_THREE=NO
1.22 GROUP_WORDS_BY_TWO=NO
1.23 -while getopts l:skmnp:23 opt
1.24 +TAG_NAME=''
1.25 +MERGE_THIS_TAGS=''
1.26 +TAGS_LIST_ONLY=NO
1.27 +MERGE_TAGGED_WORDS=NO
1.28 +MERGE_ALL_TAGGED=NO
1.29 +DONT_ADD_MARKLINES=NO
1.30 +while getopts l:skanp:t:Tm:Mr:23 opt
1.31 do
1.32 case "$opt" in
1.33 s) STAT_ONLY=YES;;
1.34 k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
1.35 l) LANGUAGE="$OPTARG";;
1.36 - m) DONT_ADD_MARKS=YES;;
1.37 + a) DONT_ADD_MARKS=YES;;
1.38 n) NON_INTERACTIVE_MODE=YES;;
1.39 p) PART_TO_PROCESS="$OPTARG";;
1.40 + t) TAG_NAME="$OPTARG";;
1.41 + T) TAGS_LIST_ONLY="YES";;
1.42 + m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
1.43 + M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
1.44 + r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
1.45 2) GROUP_WORDS_BY_TWO=YES;;
1.46 3) GROUP_WORDS_BY_THREE=YES;;
1.47 \?) # unknown flag
1.48 @@ -107,6 +123,11 @@
1.49
1.50 add_stat()
1.51 {
1.52 + if [ "$DONT_ADD_MARKLINES" = "YES" ]
1.53 + then
1.54 + cat
1.55 + return
1.56 + fi
1.57 before="$1"
1.58 after=${before}2
1.59 cat > "$after"
1.60 @@ -196,12 +217,16 @@
1.61 {
1.62 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
1.63 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
1.64 -open(VOC, $ENV{VOCABULARY})
1.65 - or die "Can't open VOCABULARY";
1.66 -while (<VOC>){
1.67 - chomp;
1.68 - #s/'//g;
1.69 - $voc{$_}="1";
1.70 +$voc_files=$ENV{VOC_FILES};
1.71 +$voc_files=~s@^ @@;
1.72 +for $voc_file (split /\s+/,$voc_files) {
1.73 + if (open(VOC, $voc_file)) {
1.74 + while (<VOC>){
1.75 + chomp;
1.76 + #s/'//g;
1.77 + $voc{$_}="1";
1.78 + }
1.79 + }
1.80 }
1.81 while(<>) {
1.82 chomp;
1.83 @@ -209,7 +234,12 @@
1.84 }
1.85 PERL_SCRIPT
1.86 [ -e "$VOCABULARY" ] || touch "$VOCABULARY"
1.87 - export VOCABULARY
1.88 + export VOCABULARY VOC_FILES
1.89 + VOC_FILES=$VOCABULARY
1.90 + for i in $TAG_NAME
1.91 + do
1.92 + VOC_FILES="${VOC_FILES} `tag_file_name $i`"
1.93 + done
1.94 perl $PERL_SCRIPT_TEMP_NAME
1.95 rm $PERL_SCRIPT_TEMP_NAME
1.96 }
1.97 @@ -506,10 +536,56 @@
1.98 rm $PERL_SCRIPT_TEMP_NAME
1.99 }
1.100
1.101 +if [ "$TAGS_LIST_ONLY" = "YES" ]
1.102 +then
1.103 + cd "${WORK_DIR}"
1.104 + echo ${LANGUAGE}_*.txt | tr ' ' '\n' | grep -v '*' | sed 's/[^_]*_//;s/.txt$//'
1.105 + exit 0
1.106 +fi
1.107 +
1.108 +tag_file_name()
1.109 +{
1.110 + echo "${LANGUAGE}_${1}.txt"
1.111 +}
1.112 +
1.113 +if [ "$REMOVE_TAG" = "YES" ]
1.114 +then
1.115 + cd "${WORK_DIR}"
1.116 + for i in $TAG_NAME
1.117 + do
1.118 + echo "$TAGNAME" | grep -q '[/*?]' && continue
1.119 + f="`tag_file_name $i`"
1.120 + if [ -e "$f" ]
1.121 + then
1.122 + rm -f "$f" && echo Tag "'$i'" removed
1.123 + else
1.124 + echo Unknown tag "'$i'"
1.125 + fi
1.126 + done
1.127 + exit 0
1.128 +fi
1.129 +
1.130 mkdir -p $WORK_DIR
1.131 oldpwd="$PWD"
1.132 cd $WORK_DIR
1.133 -if echo "$1" | grep -q http:
1.134 +if [ "$MERGE_TAGGED_WORDS" = "YES" ]
1.135 +then
1.136 + VOC_FILES=''
1.137 + for i in $MERGE_THIS_TAGS
1.138 + do
1.139 + f=`tag_file_name $i`
1.140 + [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
1.141 + done
1.142 + if [ -z "$VOC_FILES" ]
1.143 + then
1.144 + echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
1.145 + else
1.146 + cat $VOC_FILES
1.147 + fi
1.148 +elif [ "$MERGE_ALL_TAGGED" = "YES" ]
1.149 +then
1.150 + cat ${LANGUAGE}_*.txt
1.151 +elif echo "$1" | grep -q http:
1.152 then
1.153 text_from_url "$1"
1.154 elif [ "$#" != 0 ]
1.155 @@ -538,16 +614,21 @@
1.156 then
1.157 cat "$TEMP1"
1.158 else
1.159 - [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
1.160 - if [ "$editor" = vim ]
1.161 + if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ]
1.162 then
1.163 - vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
1.164 - else
1.165 - echo 2
1.166 - $editor "$TEMP2"
1.167 + [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
1.168 + if [ "$editor" = vim ]
1.169 + then
1.170 + vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
1.171 + else
1.172 + $editor "$TEMP2"
1.173 + fi
1.174 + remove_marks "$TEMP2"
1.175 +
1.176 + vocabulary="$VOCABULARY"
1.177 + [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`"
1.178 + diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
1.179 fi
1.180 - remove_marks "$TEMP2"
1.181 fi
1.182
1.183 -diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY"
1.184 rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"