new-words

annotate new-words-py.sh @ 54:e25de9ea9184

new-words.py is almost ready
author Igor Chubin <igor@chub.in>
date Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents 4e931db74618
children 2a1a25e61872
rev   line source
igor@38 1 #!/bin/bash
igor@38 2
#######################################
# Print the command-line help screen.
# Arguments: none
# Outputs:   the help text on stderr; nothing on stdout
#######################################
show_usage()
{
# ">&2" duplicates the descriptor that is already open. Re-opening
# /dev/stderr by path can fail (for example when stderr is a socket) and
# truncates the target when stderr was redirected to a regular file.
# The quoted delimiter keeps the text literal (no expansion inside it).
cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-f file show only words related to the words in the file
-G turn off word grouping
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-d tag delete subvocabulary for the "tag"
-r RANGE show only first RANGE words
-R RANGE show only words lower than RANGE percent
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}
igor@38 43
# Answer a leading -h before any other set-up is done.
if [ "$1" = "-h" ]; then
  show_usage
  exit 0
fi

# Path of the Python program that does the real work. The default is the
# original hard-coded location; a NEW_WORDS_PY value from the environment
# takes precedence.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}
WORK_DIR=~/.new-words/
editor=${EDITOR:-vim}

# language detection

#######################################
# Extract the language code from a Wikipedia article URL.
# Arguments: $1 - a command-line argument (URL, file name, ...)
# Outputs:   the host-name prefix before ".wikipedia.org" (e.g. "de")
# Returns:   0 when $1 contains such a URL, 1 otherwise
#######################################
wikipedia_language()
{
  # Kept in a variable so the regex is not subject to quoting rules of [[ =~ ]].
  local wiki_url_re='https?://([^./]+)\.wikipedia\.org/wiki/'
  [[ "$1" =~ $wiki_url_re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

LANGUAGE=en

# Program name without directory and without everything from the first "-":
# "/usr/bin/de-words" -> "de", "new-words" -> "new".
my_name=${0##*/}
my_name=${my_name%%-*}

# A Wikipedia URL among the arguments selects the language; the last one wins.
for arg
do
  if detected=$(wikipedia_language "$arg"); then
    LANGUAGE=$detected
  fi
done

# A name other than new-* overrides whatever was detected from the URLs.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"
igor@38 66
igor@38 67 #----------------------------------------------------
igor@38 68 # command line options processing
igor@38 69
# Defaults for the switches; the getopts loop below overwrites them.
STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO
TAG_NAME=''
MERGE_THIS_TAGS=''
TAGS_LIST_ONLY=NO
MERGE_TAGGED_WORDS=NO
MERGE_ALL_TAGGED=NO
DONT_ADD_MARKLINES=NO
FILTER_WORDS=YES
SHOW_VOC_STAT=NO
COMPRESSED_WORDLIST=NO
WORDS_GROUPING=YES
ALLOWED_WORDS_FILENAME=''
# Give REMOVE_TAG an explicit default so that a value inherited from the
# caller's environment cannot select the tag-removal branch.
REMOVE_TAG=NO
# NOTE(review): SHOW_RANGE and SHOW_RANGE_PERCENTAGE get no default here, so
# values exported by the caller pass through - confirm whether that is wanted.

while getopts Gcf:hl:sSkanNp:t:Tm:Md:r:R:23 opt
do
  case "$opt" in
    c) COMPRESSED_WORDLIST=YES;;
    f) ALLOWED_WORDS_FILENAME="$OPTARG";;
    G) WORDS_GROUPING=NO;;
    # -h is documented in the help screen, so accept it in any position.
    h) show_usage
       exit 0;;
    s) STAT_ONLY=YES;;
    S) SHOW_VOC_STAT=YES;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    l) LANGUAGE="$OPTARG";;
    a) DONT_ADD_MARKS=YES;;
    n) NON_INTERACTIVE_MODE=YES;;
    N) FILTER_WORDS=NO;;
    p) PART_TO_PROCESS="$OPTARG";;
    t) TAG_NAME="$OPTARG";;
    T) TAGS_LIST_ONLY="YES";;
    # NOTE(review): the merge list is built from TAG_NAME, not from the
    # previous MERGE_THIS_TAGS - confirm that this is intended.
    m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
    M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
    # Repeated -d options accumulate into a space-separated list.
    d) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
    r) SHOW_RANGE="$OPTARG";;
    R) SHOW_RANGE_PERCENTAGE="$OPTARG";;
    2) GROUP_WORDS_BY_TWO=YES;;
    3) GROUP_WORDS_BY_THREE=YES;;
    \?) # unknown flag
       show_usage
       exit 1;;
  esac
done
# Drop the parsed options; "$1" is now the first non-option argument.
shift $((OPTIND - 1))
igor@38 117
# "-l lang" is also accepted as the first argument after the regular options.
if [ "$1" = "-l" ]; then
  # Without a value "shift 2" would fail and leave "-l" in place as the input.
  if [ $# -lt 2 ]; then
    show_usage
    exit 1
  fi
  LANGUAGE="$2"
  shift 2
fi

VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

if [ "${SHOW_VOC_STAT}" = "YES" ]; then
  # Run this script again on the vocabulary file itself (no filtering,
  # non-interactive) and print the fifth field of its first output line
  # with the angle brackets removed.
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
    | head -1 \
    | awk '{print $5}' \
    | tr -d "<>"
  exit 0
fi
igor@38 132
#######################################
# Run the Python back end with the switches selected on the command line.
# Globals:   NEW_WORDS_PY, LANGUAGE, SHOW_RANGE, WORDS_GROUPING,
#            SHOW_RANGE_PERCENTAGE, PART_TO_PROCESS, ALLOWED_WORDS_FILENAME,
#            NON_INTERACTIVE_MODE, STAT_ONLY, COMPRESSED_WORDLIST,
#            FILTER_WORDS, GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE
#            (all read, none modified)
# Arguments: $1 - the input argument handed over to the back end
# Outputs:   whatever the back end writes
# Returns:   the exit status of the back end
#######################################
get_words_group_words_add_stat()
{
  # The switches are collected in a local array: every value stays a single
  # word even when it contains blanks, and the globals are left untouched so
  # the function can be called more than once.
  local -a py_args=()

  [ -z "$SHOW_RANGE_PERCENTAGE" ] || py_args+=(-R "$SHOW_RANGE_PERCENTAGE")
  [ -z "$PART_TO_PROCESS" ] || py_args+=(-p "$PART_TO_PROCESS")
  [ -z "$ALLOWED_WORDS_FILENAME" ] || py_args+=(-f "$ALLOWED_WORDS_FILENAME")
  [ "$NON_INTERACTIVE_MODE" != YES ] || py_args+=(-n)
  [ "$STAT_ONLY" != YES ] || py_args+=(-s)
  [ "$COMPRESSED_WORDLIST" != YES ] || py_args+=(-c)
  [ "$FILTER_WORDS" != NO ] || py_args+=(-N)
  [ "$GROUP_WORDS_BY_TWO" != YES ] || py_args+=(-2)
  [ "$GROUP_WORDS_BY_THREE" != YES ] || py_args+=(-3)

  # SHOW_RANGE and WORDS_GROUPING are handed over as environment variables,
  # everything else as command-line switches.
  SHOW_RANGE="$SHOW_RANGE" \
  WORDS_GROUPING="$WORDS_GROUPING" \
  "$NEW_WORDS_PY" -l "$LANGUAGE" \
    "${py_args[@]}" \
    -X get_words_group_words_add_stat "$1"
}
igor@38 159
#######################################
# List the tags that have a sub-vocabulary for the current language.
# A tag "x" is stored as ${LANGUAGE}_x.txt inside WORK_DIR.
# Globals:   WORK_DIR, LANGUAGE (read)
# Arguments: none
# Outputs:   one tag name per line on stdout; nothing when there are none
#######################################
list_tags()
{
  local tag_file tag
  for tag_file in "${WORK_DIR%/}/${LANGUAGE}"_*.txt; do
    # An unmatched glob stays literal; skip it.
    [ -e "$tag_file" ] || continue
    tag=${tag_file##*/}
    # Strip exactly the "<language>_" prefix, so a language code that itself
    # contains an underscore is handled correctly.
    tag=${tag#"${LANGUAGE}_"}
    printf '%s\n' "${tag%.txt}"
  done
}

if [ "$TAGS_LIST_ONLY" = "YES" ]; then
  list_tags
  exit 0
fi
igor@38 166
#######################################
# Build the file name of the sub-vocabulary that belongs to a tag.
# Globals:   LANGUAGE (read)
# Arguments: $1 - tag name
# Outputs:   "<LANGUAGE>_<tag>.txt" on stdout
#######################################
tag_file_name()
{
  # printf prints the name verbatim regardless of echo's option and
  # backslash handling.
  printf '%s\n' "${LANGUAGE}_${1}.txt"
}
igor@38 171
#######################################
# Delete the sub-vocabulary files of the tags listed in TAG_NAME.
# Globals:   TAG_NAME (read) - blank-separated tag names
#            WORK_DIR (read) - directory holding the vocabulary files
# Arguments: none
# Outputs:   one status line per tag on stdout; rejected tags on stderr
# Returns:   1 when WORK_DIR cannot be entered, 0 otherwise
#######################################
remove_tags()
{
  local -a tags=()
  local tag f

  # Split on blanks without glob expansion (an unquoted $TAG_NAME in a
  # "for" list would expand "*" against the current directory).
  read -r -a tags <<< "$TAG_NAME"

  # Never delete anything relative to an unexpected directory.
  cd "${WORK_DIR}" || return 1

  for tag in "${tags[@]}"; do
    # Reject names that could leave WORK_DIR or act as a glob. The check has
    # to look at the loop variable itself.
    case "$tag" in
      */* | *\** | *\?*)
        echo Invalid tag "'$tag'" >&2
        continue;;
    esac
    f=$(tag_file_name "$tag")
    if [ -e "$f" ]; then
      rm -f -- "$f" && echo Tag "'$tag'" removed
    else
      echo Unknown tag "'$tag'"
    fi
  done
  return 0
}

if [ "$REMOVE_TAG" = "YES" ]; then
  remove_tags || exit 1
  exit 0
fi
igor@38 188
# Main action: hand the first remaining command-line argument to
# get_words_group_words_add_stat.
igor@54 189 get_words_group_words_add_stat "$1"
igor@38 190
# Everything below is commented out. It selected the vocabulary files to
# read according to MERGE_TAGGED_WORDS / MERGE_ALL_TAGGED; while it stays
# disabled, nothing visible in this script reads the flags set by -m and -M.
igor@54 191 #mkdir -p $WORK_DIR
igor@54 192 #oldpwd="$PWD"
igor@54 193 #cd $WORK_DIR
igor@54 194 #if [ "$MERGE_TAGGED_WORDS" = "YES" ]
igor@54 195 #then
igor@54 196 # VOC_FILES=''
igor@54 197 # for i in $MERGE_THIS_TAGS
igor@54 198 # do
igor@54 199 # f=`tag_file_name $i`
igor@54 200 # [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"
igor@54 201 # done
igor@54 202 # if [ -z "$VOC_FILES" ]
igor@54 203 # then
igor@54 204 # echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr
igor@54 205 # else
igor@54 206 # cat $VOC_FILES
igor@54 207 # fi
igor@54 208 #elif [ "$MERGE_ALL_TAGGED" = "YES" ]
igor@54 209 #then
igor@54 210 # cat ${LANGUAGE}_*.txt
igor@54 211 #else
igor@54 212 # cat
igor@54 213 #fi
igor@38 214
igor@38 215