new-words
annotate new-words-py.sh @ 43:d532e7b52ab2
-s key support in new-words.py
Now new-words-py.sh -s works in the same way as new-words.sh.
(WPS and UWPS fields are not calculated correctly yet).
Now new-words-py.sh -s works in the same way as new-words.sh.
(WPS and UWPS fields are not calculated correctly yet).
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri Jan 28 12:40:58 2011 +0200 (2011-01-28) |
parents | c3a50c0d2400 |
children | 7eb1a8c3eade |
rev | line source |
---|---|
igor@38 | 1 #!/bin/bash |
igor@38 | 2 |
#######################################
# Print the command-line help screen.
# Arguments: none
# Outputs:   the help text on stderr; nothing on stdout
#######################################
show_usage() {
  # Write to the already-open stderr descriptor (>&2) rather than re-opening
  # /dev/stderr: re-opening fails when the device cannot be reopened and
  # truncates the target when stderr is redirected to a regular file.
  # The delimiter is quoted so the help text is emitted literally.
  cat >&2 <<'HELP'

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}
igor@38 | 39 |
# Fast path: a leading -h prints the help screen and ends the run successfully.
if [[ "$1" == "-h" ]]; then
  show_usage
  exit 0
fi
igor@38 | 45 |
# Python backend invoked by add_marks / remove_marks /
# get_words_group_words_add_stat.
NEW_WORDS_PY=/home/igor/hg/new-words/new-words.py
# Per-user directory holding the vocabulary and tag files.
WORK_DIR=~/.new-words/

# Scratch files: TEMP1/TEMP2 receive the processed word list, ORIGINAL_TEXT
# receives the raw input text (exported so child processes can read it).
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT

# Remove the scratch files on every exit path; the early exits (usage error,
# -S, -T, -r) would otherwise leave them behind in /tmp.
trap 'rm -f -- "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"' EXIT

# Editor used in interactive mode; falls back to vim when EDITOR is unset.
editor=${EDITOR:-vim}
igor@38 | 52 |
# language detection
#
# Within this section the precedence is, lowest to highest:
#   default "en"  <  two-letter Wikipedia subdomain in an argument
#                 <  program name prefix (e.g. "de" for de-words).

LANGUAGE=en

# Basename of the program up to the first '-'.
my_name=${0##*/}
my_name=${my_name%%-*}

# Accept both http:// and https:// Wikipedia article URLs.
wiki_re='https?://(..)\.wikipedia\.org/wiki/'
for arg; do
  if [[ "$arg" =~ $wiki_re ]]; then
    LANGUAGE=${BASH_REMATCH[1]}
  fi
done

# Any program name prefix other than "new" names the language directly.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
igor@38 | 65 |
#----------------------------------------------------
# command line options processing

# Defaults for every switch; the getopts loop below overrides them.
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
# Initialised explicitly so that a REMOVE_TAG value inherited from the
# environment cannot select the tag-removal branch.
REMOVE_TAG=NO

while getopts hcl:sSkanNp:t:Tm:Mr:23 opt; do
  case "$opt" in
    # -h is accepted in any position, not only as the first argument.
    h)
      show_usage
      exit 0
      ;;
    c) COMPRESSED_WORDLIST=YES ;;
    s) STAT_ONLY=YES ;;
    S) SHOW_VOC_STAT=YES ;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
    l) LANGUAGE="$OPTARG" ;;
    a) DONT_ADD_MARKS=YES ;;
    n) NON_INTERACTIVE_MODE=YES ;;
    N) FILTER_WORDS=NO ;;
    p) PART_TO_PROCESS="$OPTARG" ;;
    t) TAG_NAME="$OPTARG" ;;
    T) TAGS_LIST_ONLY="YES" ;;
    # NOTE(review): the merge list is built from TAG_NAME, not from the
    # previous MERGE_THIS_TAGS, so repeated -m flags do not accumulate.
    # Confirm whether that is intended.
    m)
      DONT_ADD_MARKLINES="YES"
      MERGE_TAGGED_WORDS="YES"
      MERGE_THIS_TAGS="$TAG_NAME $OPTARG"
      ;;
    M)
      DONT_ADD_MARKLINES="YES"
      MERGE_ALL_TAGGED="YES"
      ;;
    # Repeated -r flags accumulate into a space-separated tag list.
    r)
      REMOVE_TAG="YES"
      TAG_NAME="$TAG_NAME $OPTARG"
      ;;
    2) GROUP_WORDS_BY_TWO=YES ;;
    3) GROUP_WORDS_BY_THREE=YES ;;
    \?) # unknown flag
      show_usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

# A "-l lang" pair that follows the parsed options is honoured as well.
if [ "$1" = "-l" ]; then
  LANGUAGE="$2"
  shift 2
fi
igor@38 | 116 |
# File names derived from the selected language.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: re-run this script non-interactively (-n) with known-word filtering
# off (-N) over the vocabulary file itself, then print field 5 of the first
# output line with the angle brackets stripped.
# "$0" and the path are quoted so locations containing spaces keep working.
if [[ "${SHOW_VOC_STAT}" == "YES" ]]; then
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
    | head -1 \
    | awk '{print $5}' \
    | tr -d "<>"
  exit 0
fi
igor@38 | 125 |
igor@38 | 126 two_and_three_words() |
igor@38 | 127 { |
igor@38 | 128 if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ] |
igor@38 | 129 then |
igor@38 | 130 cat |
igor@38 | 131 else |
igor@38 | 132 cat |
igor@38 | 133 |
igor@38 | 134 export GROUP_WORDS_BY_THREE |
igor@38 | 135 export GROUP_WORDS_BY_TWO |
igor@38 | 136 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX` |
igor@38 | 137 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME |
igor@38 | 138 #!/usr/bin/perl |
igor@38 | 139 local $/; |
igor@38 | 140 $words=<>; |
igor@38 | 141 $words=~ s@[!?;,:#1-9".]@ @g; |
igor@38 | 142 $words =~ s@\s+@ @g; |
igor@38 | 143 @words = split /\s+/, $words; |
igor@38 | 144 for ($i=0; $i<$#words-3;$i++) { |
igor@38 | 145 my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]); |
igor@38 | 146 if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) { |
igor@38 | 147 print "${a}_${b}_${c}\n"; |
igor@38 | 148 }; |
igor@38 | 149 if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) { |
igor@38 | 150 print "${a}_${b}\n"; |
igor@38 | 151 }; |
igor@38 | 152 } |
igor@38 | 153 PERL_SCRIPT |
igor@38 | 154 perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT" |
igor@38 | 155 rm $PERL_SCRIPT_TEMP_NAME |
igor@38 | 156 fi |
igor@38 | 157 } |
igor@38 | 158 |
#######################################
# Fetch a web page as plain text.
# Arguments: $1 - URL to download
# Outputs:   the lynx text dump of the page on stdout, with the first
#            http:// or https:// link on each line removed
#######################################
text_from_url() {
  # "https?" also strips https links, which the http-only pattern left in
  # the text.
  lynx -dump "$1" | perl -p -e 's@https?://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
}
igor@38 | 163 |
#######################################
# Run the Python backend's add_notes function on a file.
# Globals:   NEW_WORDS_PY (read), LANGUAGE (read)
# Arguments: $1 - file passed to the backend
#######################################
add_marks() {
  # Quoted so a backend path containing spaces is still a single command.
  "$NEW_WORDS_PY" -l "$LANGUAGE" -f add_notes "$1"
}
#######################################
# Run the Python backend's remove_notes function on a file.
# Globals:   NEW_WORDS_PY (read), LANGUAGE (read)
# Arguments: $1 - file passed to the backend
#######################################
remove_marks() {
  # Quoted so a backend path containing spaces is still a single command.
  "$NEW_WORDS_PY" -l "$LANGUAGE" -f remove_notes "$1"
}
#######################################
# Run the Python backend's get_words_group_words_add_stat function.
# Globals:   NEW_WORDS_PY (read), LANGUAGE (read)
# Arguments: $1 - passed through to the backend as its last argument
#            (an empty string when the caller supplies none)
#######################################
get_words_group_words_add_stat() {
  # Quoted so a backend path containing spaces is still a single command.
  "$NEW_WORDS_PY" -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
}
igor@38 | 176 |
igor@38 | 177 part() |
igor@38 | 178 { |
igor@38 | 179 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-part-XXXXXXXX` |
igor@38 | 180 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME |
igor@38 | 181 #!/usr/bin/perl |
igor@38 | 182 |
igor@38 | 183 my @lines=<STDIN>; |
igor@38 | 184 my $lines=$#lines; |
igor@38 | 185 my $interval=$ARGV[0]; |
igor@38 | 186 if (not $interval) { |
igor@38 | 187 print @lines; |
igor@38 | 188 } |
igor@38 | 189 else { |
igor@38 | 190 my ($start,$stop,$total); |
igor@38 | 191 if ($interval =~ m@(.*)/(.*)@) { |
igor@38 | 192 $start = $1; |
igor@38 | 193 $total = $2; |
igor@38 | 194 } |
igor@38 | 195 else { |
igor@38 | 196 $start=$interval; |
igor@38 | 197 $total=0; |
igor@38 | 198 } |
igor@38 | 199 if ($start =~ m@(.*)-(.*)@) { |
igor@38 | 200 $start = $1; |
igor@38 | 201 $stop = $2; |
igor@38 | 202 } |
igor@38 | 203 if ($start =~ m@(.*)\+(.*)@) { |
igor@38 | 204 $start = $1; |
igor@38 | 205 $stop = $start+$2; |
igor@38 | 206 } |
igor@38 | 207 |
igor@38 | 208 $start=int($lines/$total*$start); |
igor@38 | 209 $stop=int($lines/$total*$stop); |
igor@38 | 210 |
igor@38 | 211 for($i=$start;$i<$stop;$i++){ |
igor@38 | 212 print $lines[$i]; |
igor@38 | 213 } |
igor@38 | 214 } |
igor@38 | 215 PERL_SCRIPT |
igor@38 | 216 perl $PERL_SCRIPT_TEMP_NAME "$1" |
igor@38 | 217 rm $PERL_SCRIPT_TEMP_NAME |
igor@38 | 218 } |
igor@38 | 219 |
# -T: list the active tags, i.e. the <tag> part of every "<LANGUAGE>_<tag>.txt"
# file in the work directory, one per line.
if [[ "$TAGS_LIST_ONLY" == "YES" ]]; then
  # Stop instead of listing whatever directory we happen to be in.
  cd "${WORK_DIR}" || exit 1
  for tag_file in "${LANGUAGE}"_*.txt; do
    # An unmatched glob stays literal; skip it.
    [[ -e "$tag_file" ]] || continue
    tag=${tag_file#*_}
    printf '%s\n' "${tag%.txt}"
  done
  exit 0
fi
igor@38 | 226 |
#######################################
# Build the file name that stores the words of one tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
#######################################
tag_file_name() {
  local tag=$1
  printf '%s_%s.txt\n' "$LANGUAGE" "$tag"
}
igor@38 | 231 |
# -r: delete the sub-vocabulary file of every requested tag, then exit.
if [[ "$REMOVE_TAG" == "YES" ]]; then
  # Never run rm relative to an unexpected directory.
  cd "${WORK_DIR}" || exit 1

  # Split the space-separated tag list without pathname expansion, so a tag
  # such as "*" reaches the safety check below as typed.
  read -r -a tags_to_remove <<< "$TAG_NAME"

  for i in "${tags_to_remove[@]}"; do
    # Skip names containing a path separator or glob characters. The check
    # must look at the tag itself; testing the undefined $TAGNAME never
    # matched, which made this guard a no-op.
    [[ "$i" == *[/*?]* ]] && continue
    f=$(tag_file_name "$i")
    if [[ -e "$f" ]]; then
      rm -f -- "$f" && echo Tag "'$i'" removed
    else
      echo Unknown tag "'$i'"
    fi
  done
  exit 0
fi
igor@38 | 248 |
# ---------------------------------------------------------------------------
# Main run: choose the input, push it through the processing pipeline, then
# either print the result or let the user edit it and record the changes.
# ---------------------------------------------------------------------------

mkdir -p "$WORK_DIR"
oldpwd="$PWD" # relative file arguments are resolved against the start dir
# Vocabulary files are addressed relative to WORK_DIR, so do not continue
# from some other directory.
cd "$WORK_DIR" || exit 1

# Input selection, in order: tagged words named by -m, all tagged words (-M),
# a URL, a file argument, or stdin.
if [ "$MERGE_TAGGED_WORDS" = "YES" ]; then
  VOC_FILES=''
  # Unquoted on purpose: MERGE_THIS_TAGS is a space-separated tag list.
  for i in $MERGE_THIS_TAGS; do
    f=$(tag_file_name "$i")
    [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
  done
  if [ -z "$VOC_FILES" ]; then
    echo Unknown tags: $MERGE_THIS_TAGS >&2
  else
    # Unquoted on purpose: VOC_FILES is a space-separated file list.
    cat $VOC_FILES
  fi
elif [ "$MERGE_ALL_TAGGED" = "YES" ]; then
  cat "${LANGUAGE}"_*.txt
elif [[ "$1" =~ https?: ]]; then
  # Both http: and https: arguments are fetched as web pages.
  text_from_url "$1"
elif [ "$#" != 0 ]; then
  if [[ "$1" == /* ]]; then
    cat "$1"
  else
    cat "$oldpwd/$1"
  fi
else
  cat
fi \
  | part "$PART_TO_PROCESS" \
  | tee "$ORIGINAL_TEXT" \
  | two_and_three_words \
  | STAT_ONLY="$STAT_ONLY" get_words_group_words_add_stat \
  | tee "$TEMP1" > "$TEMP2"
# TEMP1 and TEMP2 now hold identical copies of the backend output; TEMP2 is
# the copy that gets edited, TEMP1 stays as the reference for the diff below.

if [ "$STAT_ONLY" = "YES" ] || [ "$NON_INTERACTIVE_MODE" = "YES" ]; then
  cat "$TEMP1"
elif (( $(wc -l < "$TEMP2") != 0 )); then
  [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2"
  if [ "$editor" = vim ]; then
    # The pipeline may have consumed stdin, so vim is attached to the
    # terminal explicitly.
    vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty
  else
    # Unquoted on purpose: EDITOR may carry arguments (e.g. "emacs -nw").
    $editor "$TEMP2"
  fi
  remove_marks "$TEMP2"

  # The third field of every diff line between the reference and the edited
  # copy is appended to the main vocabulary, or to the tag file when -t was
  # given.
  vocabulary="$VOCABULARY"
  [ -n "$TAG_NAME" ] && vocabulary=$(tag_file_name "$TAG_NAME")
  diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary"
fi

rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"