new-words

annotate new-words-py.sh @ 43:d532e7b52ab2

-s key support in new-words.py

Now new-words-py.sh -s works in the same way as new-words.sh.
(WPS and UWPS fields are not calculated correctly yet).
author Igor Chubin <igor@chub.in>
date Fri Jan 28 12:40:58 2011 +0200 (2011-01-28)
parents c3a50c0d2400
children 7eb1a8c3eade
rev   line source
igor@38 1 #!/bin/bash
igor@38 2
#######################################
# Print the command-line help screen.
# Arguments: none
# Outputs:   the help text, on stderr (nothing is written to stdout)
#######################################
show_usage() {
  # >&2 duplicates the current stderr instead of re-opening /dev/stderr, which
  # is not available or writable everywhere. The delimiter is quoted so the
  # help text is always emitted literally.
  cat >&2 <<'HELP'

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}
igor@38 39
# Help shortcut: when the very first argument is exactly -h, print the usage
# screen and leave with a success status before anything else is set up.
case "$1" in
  -h)
    show_usage
    exit 0
    ;;
esac
igor@38 45
# Helper program that every *_marks / get_words_* wrapper delegates to.
# The historical location stays the default; an environment value overrides it.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
# Directory holding the vocabulary and tag files.
WORK_DIR=~/.new-words/

# Scratch files. Each mktemp is checked on its own so a failure is not masked.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
# Exported separately: "export VAR=$(cmd)" would hide the status of cmd.
export ORIGINAL_TEXT

# The script has several early "exit" paths; the EXIT trap makes sure the
# scratch files are removed on every one of them.
trap 'rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"' EXIT

editor=${EDITOR:-vim}
igor@38 52
# language detection
#
# The default is "en". An argument containing a Wikipedia article URL
# (http://XX.wikipedia.org/wiki/...) switches the language to the part of the
# URL between "http://" and ".wikipedia". The invocation name takes precedence
# over both: the basename of $0 up to its first "-" becomes the language
# unless that prefix is "new" (so de-words selects "de").

LANGUAGE=en
# Parameter expansion instead of "echo $0 | sed ...": no word splitting or
# globbing of the path, and no extra processes.
my_name=${0##*/}
my_name=${my_name%%-*}
for arg; do
  if printf '%s\n' "$arg" | grep -q 'http://...wikipedia.org/wiki/'; then
    LANGUAGE=$(printf '%s\n' "$arg" | sed 's@http://@@' | sed 's@.wikipedia.*@@')
  fi
done
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
igor@38 65
#----------------------------------------------------
# command line options processing
#
# Every flag variable gets an explicit default here so that nothing can leak
# in from the caller's environment. REMOVE_TAG used to be left uninitialised.

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
REMOVE_TAG=NO
while getopts cl:sSkanNp:t:Tm:Mr:23h opt; do
  case "$opt" in
    c) COMPRESSED_WORDLIST=YES ;;
    s) STAT_ONLY=YES ;;
    S) SHOW_VOC_STAT=YES ;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
    l) LANGUAGE="$OPTARG" ;;
    a) DONT_ADD_MARKS=YES ;;
    n) NON_INTERACTIVE_MODE=YES ;;
    N) FILTER_WORDS=NO ;;
    p) PART_TO_PROCESS="$OPTARG" ;;
    t) TAG_NAME="$OPTARG" ;;
    T) TAGS_LIST_ONLY="YES" ;;
    # -m and -r accumulate space-separated tag lists; -m also picks up a tag
    # given by an earlier -t.
    m)
      DONT_ADD_MARKLINES="YES"
      MERGE_TAGGED_WORDS="YES"
      MERGE_THIS_TAGS="$TAG_NAME $OPTARG"
      ;;
    M)
      DONT_ADD_MARKLINES="YES"
      MERGE_ALL_TAGGED="YES"
      ;;
    r)
      REMOVE_TAG="YES"
      TAG_NAME="$TAG_NAME $OPTARG"
      ;;
    2) GROUP_WORDS_BY_TWO=YES ;;
    3) GROUP_WORDS_BY_THREE=YES ;;
    # -h is documented in the help screen; accept it in any position.
    h)
      show_usage
      exit 0
      ;;
    \?) # unknown flag
      show_usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))
igor@38 110
# A "-l lang" pair still sitting at the front of the positional arguments
# after option parsing also selects the language.
if [ "$1" = "-l" ]; then
  LANGUAGE="$2"
  shift 2
fi

# File names derived from the selected language.
VOCABULARY="${LANGUAGE}.txt"
NOTES_FILE="notes-${LANGUAGE}.txt"

# -S: re-run this script non-interactively (-n) with known-words filtering
# off (-N) on the vocabulary file itself, then print the fifth field of the
# first output line with "<" and ">" removed.
if [ "${SHOW_VOC_STAT}" = "YES" ]; then
  # $0 and the vocabulary path are quoted so paths with spaces survive.
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
    | head -1 \
    | awk '{print $5}' \
    | tr -d "<>"
  exit 0
fi
igor@38 125
igor@38 126 two_and_three_words()
igor@38 127 {
igor@38 128 if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
igor@38 129 then
igor@38 130 cat
igor@38 131 else
igor@38 132 cat
igor@38 133
igor@38 134 export GROUP_WORDS_BY_THREE
igor@38 135 export GROUP_WORDS_BY_TWO
igor@38 136 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
igor@38 137 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
igor@38 138 #!/usr/bin/perl
igor@38 139 local $/;
igor@38 140 $words=<>;
igor@38 141 $words=~ s@[!?;,:#1-9".]@ @g;
igor@38 142 $words =~ s@\s+@ @g;
igor@38 143 @words = split /\s+/, $words;
igor@38 144 for ($i=0; $i<$#words-3;$i++) {
igor@38 145 my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
igor@38 146 if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
igor@38 147 print "${a}_${b}_${c}\n";
igor@38 148 };
igor@38 149 if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
igor@38 150 print "${a}_${b}\n";
igor@38 151 };
igor@38 152 }
igor@38 153 PERL_SCRIPT
igor@38 154 perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
igor@38 155 rm $PERL_SCRIPT_TEMP_NAME
igor@38 156 fi
igor@38 157 }
igor@38 158
igor@38 159 text_from_url()
igor@38 160 {
igor@38 161 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
igor@38 162 }
igor@38 163
#######################################
# Run the helper program's "add_notes" function on a file.
# Globals:   NEW_WORDS_PY, LANGUAGE (read)
# Arguments: $1 - file handed to the helper
#######################################
add_marks() {
  local target=$1
  # NEW_WORDS_PY is expanded unquoted, exactly as it always has been.
  $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$target"
}
#######################################
# Run the helper program's "remove_notes" function on a file.
# Globals:   NEW_WORDS_PY, LANGUAGE (read)
# Arguments: $1 - file handed to the helper
#######################################
remove_marks() {
  local target=$1
  # NEW_WORDS_PY is expanded unquoted, exactly as it always has been.
  $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$target"
}
#######################################
# Run the helper program's "get_words_group_words_add_stat" function.
# Globals:   NEW_WORDS_PY, LANGUAGE (read)
# Arguments: $1 - passed through as the helper's last argument (an empty
#                 string when the function is called without arguments)
#######################################
get_words_group_words_add_stat() {
  local target=$1
  # NEW_WORDS_PY is expanded unquoted, exactly as it always has been.
  $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$target"
}
igor@38 176
igor@38 177 part()
igor@38 178 {
igor@38 179 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX`
igor@38 180 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
igor@38 181 #!/usr/bin/perl
igor@38 182
igor@38 183 my @lines=<STDIN>;
igor@38 184 my $lines=$#lines;
igor@38 185 my $interval=$ARGV[0];
igor@38 186 if (not $interval) {
igor@38 187 print @lines;
igor@38 188 }
igor@38 189 else {
igor@38 190 my ($start,$stop,$total);
igor@38 191 if ($interval =~ m@(.*)/(.*)@) {
igor@38 192 $start = $1;
igor@38 193 $total = $2;
igor@38 194 }
igor@38 195 else {
igor@38 196 $start=$interval;
igor@38 197 $total=0;
igor@38 198 }
igor@38 199 if ($start =~ m@(.*)-(.*)@) {
igor@38 200 $start = $1;
igor@38 201 $stop = $2;
igor@38 202 }
igor@38 203 if ($start =~ m@(.*)\+(.*)@) {
igor@38 204 $start = $1;
igor@38 205 $stop = $start+$2;
igor@38 206 }
igor@38 207
igor@38 208 $start=int($lines/$total*$start);
igor@38 209 $stop=int($lines/$total*$stop);
igor@38 210
igor@38 211 for($i=$start;$i<$stop;$i++){
igor@38 212 print $lines[$i];
igor@38 213 }
igor@38 214 }
igor@38 215 PERL_SCRIPT
igor@38 216 perl $PERL_SCRIPT_TEMP_NAME "$1"
igor@38 217 rm $PERL_SCRIPT_TEMP_NAME
igor@38 218 }
igor@38 219
# -T: print the names of the tags that exist for the current language, one
# per line, and exit. A tag is any file "<LANGUAGE>_<tag>.txt" in WORK_DIR.
if [ "$TAGS_LIST_ONLY" = "YES" ]; then
  # Without a work directory there are no tags. Never fall through and glob
  # whatever directory the script happens to be running in.
  [ -d "${WORK_DIR}" ] || exit 0
  cd "${WORK_DIR}" || exit 1
  for tag_file in "${LANGUAGE}"_*.txt; do
    # An unmatched glob stays literal; skip it.
    [ -e "$tag_file" ] || continue
    # Strip exactly the language prefix and the extension. This also works
    # for file names that contain spaces.
    tag=${tag_file#"${LANGUAGE}_"}
    printf '%s\n' "${tag%.txt}"
  done
  exit 0
fi
igor@38 226
#######################################
# Build the file name that stores the words of one tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
#######################################
tag_file_name() {
  local tag=$1
  printf '%s_%s.txt\n' "$LANGUAGE" "$tag"
}
igor@38 231
# -r: delete the sub-vocabulary file of every requested tag, then exit.
# TAG_NAME holds a space-separated list of tag names at this point.
if [ "$REMOVE_TAG" = "YES" ]; then
  cd "${WORK_DIR}" || exit 1
  # Split the list into words without letting the shell glob-expand them.
  read -r -a tags_to_remove <<< "$TAG_NAME"
  for i in "${tags_to_remove[@]}"; do
    # Silently skip names containing "/", "*" or "?" so that a tag can never
    # address a file outside the tag naming scheme. The old check tested the
    # undefined $TAGNAME and therefore never triggered.
    case "$i" in
      *[/*?]*) continue ;;
    esac
    f=$(tag_file_name "$i")
    if [ -e "$f" ]; then
      rm -f -- "$f" && echo Tag "'$i'" removed
    else
      echo Unknown tag "'$i'"
    fi
  done
  exit 0
fi
igor@38 248
# Main processing pipeline.
#
# Exactly one input source is chosen, in this order: the files of the tags
# named with -m, all tag files of the language (-M), the text of a URL, a
# file given as argument (relative names are resolved against the directory
# the script was started from), or stdin. The text is then cut down by
# part(), saved to ORIGINAL_TEXT, extended by two_and_three_words(), sent
# through the helper program and written to both TEMP1 and TEMP2.
mkdir -p "$WORK_DIR"
oldpwd="$PWD"
# Everything below works with names relative to WORK_DIR; stop if it is not
# reachable instead of reading and writing files in the wrong directory.
cd "$WORK_DIR" || exit 1
if [ "$MERGE_TAGGED_WORDS" = "YES" ]; then
  VOC_FILES=()
  # MERGE_THIS_TAGS is a space-separated list; split it without globbing.
  read -r -a merge_tags <<< "$MERGE_THIS_TAGS"
  for i in "${merge_tags[@]}"; do
    f=$(tag_file_name "$i")
    [ -e "$f" ] && VOC_FILES+=("$f")
  done
  if [ "${#VOC_FILES[@]}" -eq 0 ]; then
    echo Unknown tags: "${merge_tags[@]}" >&2
  else
    cat -- "${VOC_FILES[@]}"
  fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]; then
  cat -- "${LANGUAGE}"_*.txt
elif [[ "$1" == *http:* ]]; then
  text_from_url "$1"
elif [ "$#" != 0 ]; then
  if [[ "$1" == /* ]]; then
    cat "$1"
  else
    cat "$oldpwd/$1"
  fi
else
  cat
fi \
  | part "$PART_TO_PROCESS" \
  | tee "$ORIGINAL_TEXT" \
  | two_and_three_words \
  | STAT_ONLY="$STAT_ONLY" get_words_group_words_add_stat \
  | tee "$TEMP1" > "$TEMP2"
igor@38 288
# Present the result.
#
# -s and -n just print the processed list. Otherwise, when the list is not
# empty, it is opened in the editor (with marks added unless -a was given);
# afterwards every line that differs between the untouched copy (TEMP1) and
# the edited copy (TEMP2) contributes its third diff field to the main
# vocabulary, or to the tag file when -t was used.
if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]; then
  cat "$TEMP1"
elif [ -f "$TEMP2" ] && [ "$(wc -l < "$TEMP2")" -ne 0 ]; then
  [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
  if [ "$editor" = vim ]; then
    vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
  else
    # Unquoted on purpose: EDITOR may carry its own arguments.
    $editor "$TEMP2"
  fi
  remove_marks "$TEMP2"

  vocabulary="$VOCABULARY"
  # TAG_NAME stays unquoted so only its first word names the tag, matching
  # how -m and -r split tag lists on spaces.
  [ -n "$TAG_NAME" ] && vocabulary=$(tag_file_name $TAG_NAME)
  # "NF >= 3" drops diff hunk headers ("3d2") and "---" separators, which
  # used to append an empty entry to the vocabulary.
  diff "$TEMP1" "$TEMP2" | awk 'NF >= 3 {print $3}' | sort -u >> "$vocabulary"
fi

# "--" keeps "${TEMP1}-full" from being read as an option when TEMP1 is empty.
rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"