new-words
annotate new-words-py.sh @ 68:846240941452
added -C key: compress to lines; fixed bug with #90-line
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Sep 23 16:07:29 2012 +0300 (2012-09-23) |
parents | e25de9ea9184 |
children |
rev | line source |
---|---|
#!/bin/bash

# This wrapper is retired: it only tells the user to run new-words.py
# directly and then stops with a failure status, so nothing after the
# `exit 1` is ever executed.
# The notice is written to stderr (not stdout) so that a caller piping or
# capturing the script's output does not mistake it for word-list data.
cat <<EOF >&2
Please, use the script no more.
You can execute new-words.py directly.
EOF
exit 1

# Print the command-line help to stderr.
#
# Takes no arguments and prints nothing on stdout.
# The text is written with `>&2` (duplicate the existing descriptor) rather
# than `> /dev/stderr`: re-opening /dev/stderr for writing can truncate a
# file that stderr has been redirected to, and fails when the descriptor
# cannot be re-opened.
show_usage()
{
    cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-f file show only words related to the words in the file
-G turn off word grouping
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-d tag delete subvocabulary for the "tag"
-r RANGE show only first RANGE words
-R RANGE show only words lower than RANGE percent
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}

# A leading -h is handled before any other processing: print the usage
# text and stop successfully.
if [ "$1" = "-h" ]
then
    show_usage
    exit 0
fi

# Path of the Python implementation this wrapper delegates to.
# The default is the original author's checkout; export NEW_WORDS_PY to
# point at a different location without editing this file.
NEW_WORDS_PY="${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}"
# Directory holding the vocabulary files (note the trailing slash).
WORK_DIR=~/.new-words/
# Editor to use: $EDITOR when set and non-empty, otherwise vim.
editor=${EDITOR:-vim}

# language detection

# Print the language code implied by the program name and the arguments.
#
# Arguments: $1  - the name the script was invoked as (normally "$0")
#            $2… - the script's command-line arguments
# Output:    the language code on stdout
#
# Rules, in increasing priority:
#   1. the default is "en";
#   2. an argument containing a Wikipedia article URL
#      (http:// or https://, "<code>.wikipedia.org/wiki/") selects <code>;
#      when several arguments match, the last one wins;
#   3. if the program name up to its first "-" is not "new"
#      (e.g. a link named "de-words"), that prefix is the language.
detect_language()
{
    local prog_prefix arg
    local detected=en
    local wiki_re='https?://([[:alpha:]-]+)\.wikipedia\.org/wiki/'

    # basename of the invocation path, cut at the first "-"
    prog_prefix="${1##*/}"
    prog_prefix="${prog_prefix%%-*}"
    shift

    for arg
    do
        if [[ "$arg" =~ $wiki_re ]]
        then
            detected="${BASH_REMATCH[1]}"
        fi
    done

    [ "$prog_prefix" = "new" ] || detected="$prog_prefix"
    printf '%s\n' "$detected"
}

my_name="${0##*/}"
my_name="${my_name%%-*}"
LANGUAGE="$(detect_language "$0" "$@")"

#----------------------------------------------------
# command line options processing

# Defaults for every option-controlled setting. All of them are assigned
# here, including REMOVE_TAG / SHOW_RANGE / SHOW_RANGE_PERCENTAGE, so that
# a variable of the same name exported in the caller's environment cannot
# silently switch on a behaviour (REMOVE_TAG=YES triggers tag deletion).
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
WORDS_GROUPING=YES
ALLOWED_WORDS_FILENAME=''
REMOVE_TAG=NO
SHOW_RANGE=''
SHOW_RANGE_PERCENTAGE=''

while getopts Gcf:l:sSkanNp:t:Tm:Md:r:R:23 opt
do
    case "$opt" in
        c) COMPRESSED_WORDLIST=YES;;
        f) ALLOWED_WORDS_FILENAME="$OPTARG";;
        G) WORDS_GROUPING=NO;;
        s) STAT_ONLY=YES;;
        S) SHOW_VOC_STAT=YES;;
        k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
        l) LANGUAGE="$OPTARG";;
        a) DONT_ADD_MARKS=YES;;
        n) NON_INTERACTIVE_MODE=YES;;
        N) FILTER_WORDS=NO;;
        p) PART_TO_PROCESS="$OPTARG";;
        t) TAG_NAME="$OPTARG";;
        T) TAGS_LIST_ONLY="YES";;
        # -m appends its argument to whatever -t has set so far
        m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
        M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
        # -d accumulates: TAG_NAME becomes a space-separated list of tags
        d) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
        r) SHOW_RANGE="$OPTARG";;
        R) SHOW_RANGE_PERCENTAGE="$OPTARG";;
        2) GROUP_WORDS_BY_TWO=YES;;
        3) GROUP_WORDS_BY_THREE=YES;;
        \?) # unknown flag (getopts also reports a missing argument this way)
            show_usage
            exit 1;;
    esac
done
# Drop the parsed options so that $1 is the first non-option argument.
shift $((OPTIND - 1))

# Also accept "-l LANG" as the first argument left over after getopts
# stopped (for example when it follows a non-option separator).
if [ "$1" = "-l" ]
then
    LANGUAGE="$2"
    shift 2
fi

# Per-language file names (relative to WORK_DIR).
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: vocabulary statistics. The script is re-run on the vocabulary file
# itself with known-word filtering off (-N) and without the editor (-n);
# the fifth whitespace-separated field of the first output line is printed
# with any "<" / ">" characters removed.
# "$0" and the path are quoted so an install or home directory containing
# spaces does not split into several arguments.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" | head -1 | awk '{print $5}' | tr -d "<>"
    exit 0
fi

# Run new-words.py on one input with the options collected from the
# command line.
#
# Globals read: NEW_WORDS_PY, LANGUAGE, SHOW_RANGE, SHOW_RANGE_PERCENTAGE,
#               PART_TO_PROCESS, ALLOWED_WORDS_FILENAME,
#               NON_INTERACTIVE_MODE, STAT_ONLY, COMPRESSED_WORDLIST,
#               FILTER_WORDS, GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE,
#               WORDS_GROUPING
# Arguments:    $1 - the text source, passed through as the last argument
#                    (passed even when empty)
# Returns:      the exit status of new-words.py
#
# The argument list is built in a local array, so the global settings are
# left untouched (the function can be called more than once) and values
# containing spaces, such as a file name given with -f, stay one argument.
get_words_group_words_add_stat()
{
    local -a py_args=(-l "$LANGUAGE")

    # options that carry a value
    [ -z "$SHOW_RANGE" ]             || py_args+=(-r "$SHOW_RANGE")
    [ -z "$SHOW_RANGE_PERCENTAGE" ]  || py_args+=(-R "$SHOW_RANGE_PERCENTAGE")
    [ -z "$PART_TO_PROCESS" ]        || py_args+=(-p "$PART_TO_PROCESS")
    [ -z "$ALLOWED_WORDS_FILENAME" ] || py_args+=(-f "$ALLOWED_WORDS_FILENAME")

    # plain switches
    [ "$NON_INTERACTIVE_MODE" = YES ] && py_args+=(-n)
    [ "$STAT_ONLY" = YES ]            && py_args+=(-s)
    [ "$COMPRESSED_WORDLIST" = YES ]  && py_args+=(-c)
    [ "$FILTER_WORDS" = NO ]          && py_args+=(-N)
    [ "$GROUP_WORDS_BY_TWO" = YES ]   && py_args+=(-2)
    [ "$GROUP_WORDS_BY_THREE" = YES ] && py_args+=(-3)
    [ "$WORDS_GROUPING" = NO ]        && py_args+=(-G)

    # -X presumably names the action new-words.py should perform
    # (NOTE(review): confirm against new-words.py).
    "$NEW_WORDS_PY" "${py_args[@]}" -X get_words_group_words_add_stat "$1"
}

# Print the names of the tags that exist for the current language, one per
# line: every file WORK_DIR/<LANGUAGE>_<tag>.txt yields "<tag>" (the text
# after the first "_" with the ".txt" suffix removed).
#
# Globals read: WORK_DIR, LANGUAGE
# Returns:      non-zero when WORK_DIR cannot be entered
#
# The body runs in a subshell so the directory change does not leak into
# the caller.
list_tags()
(
    cd "${WORK_DIR}" || exit 1
    for tag_file in "${LANGUAGE}"_*.txt
    do
        # an unmatched glob stays as the literal pattern; skip it
        [ -e "$tag_file" ] || continue
        tag="${tag_file#*_}"
        printf '%s\n' "${tag%.txt}"
    done
)

# -T: only list the active tags, then stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]
then
    list_tags || exit 1
    exit 0
fi

# Print the name of the vocabulary file that stores the words of one tag.
#
# Globals read: LANGUAGE
# Arguments:    $1 - tag name
# Output:       "<LANGUAGE>_<tag>.txt" on stdout
tag_file_name()
{
    local tag="$1"
    printf '%s_%s.txt\n' "$LANGUAGE" "$tag"
}

# Delete the sub-vocabulary files of the given tags.
#
# Globals read: WORK_DIR (and LANGUAGE, through tag_file_name)
# Arguments:    $1 - whitespace-separated list of tag names
# Output:       one line per processed tag: "Tag '<t>' removed" or
#               "Unknown tag '<t>'"
# Returns:      non-zero when WORK_DIR cannot be entered
#
# Tag names containing "/", "*" or "?" are skipped silently so a tag can
# never address a file outside WORK_DIR or act as a wildcard. The check is
# made on the tag being processed (it previously tested an unset variable
# and so never rejected anything).
#
# The body runs in a subshell: the directory change and `set -f` stay local.
remove_tags()
(
    cd "${WORK_DIR}" || exit 1
    # Split the list on whitespace only; with globbing off, a "*" in the
    # list is kept literally instead of expanding to file names.
    set -f
    for tag in $1
    do
        case "$tag" in
            *[/*?]*) continue;;
        esac
        tag_file="$(tag_file_name "$tag")"
        if [ -e "$tag_file" ]
        then
            rm -f -- "$tag_file" && echo Tag "'$tag'" removed
        else
            echo Unknown tag "'$tag'"
        fi
    done
)

# -d: remove the requested tags, then stop.
if [ "$REMOVE_TAG" = "YES" ]
then
    remove_tags "$TAG_NAME" || exit 1
    exit 0
fi

# Default action: process the first argument left after option parsing.
get_words_group_words_add_stat "${1}"

# Disabled legacy code, kept for reference: it used to emit the tagged
# vocabularies selected with -m / -M (or pass stdin through otherwise).
#mkdir -p $WORK_DIR
#oldpwd="$PWD"
#cd $WORK_DIR
#if [ "$MERGE_TAGGED_WORDS" = "YES" ]
#then
#    VOC_FILES=''
#    for i in $MERGE_THIS_TAGS
#    do
#        f=`tag_file_name $i`
#        [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
#    done
#    if [ -z "$VOC_FILES" ]
#    then
#        echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
#    else
#        cat $VOC_FILES
#    fi
#elif [ "$MERGE_ALL_TAGGED" = "YES" ]
#then
#    cat ${LANGUAGE}_*.txt
#else
#    cat
#fi
