new-words: e25de9ea9184 new-words-py.sh

new-words

view new-words-py.sh @ 54:e25de9ea9184

new-words.py is almost ready

author	Igor Chubin <igor@chub.in>
date	Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents	4e931db74618
children	2a1a25e61872

line source

1 #!/bin/bash

# Print the command-line help text on standard error.
# Arguments: none
# Outputs:   the help text, written to stderr (nothing on stdout)
show_usage()
{
  # ">&2" duplicates the already-open stderr descriptor.  Re-opening
  # /dev/stderr by name would truncate the target when stderr is redirected
  # to a regular file, wiping diagnostics written earlier.
  # The quoted delimiter keeps the text literal (no $ or backtick expansion).
  cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-f file show only words related to the words in the file
-G turn off word grouping
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-d tag delete subvocabulary for the "tag"
-r RANGE show only first RANGE words
-R RANGE show only words lower than RANGE percent
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}

# A leading -h is served before any other start-up work is done.
if [ "$1" = "-h" ]; then
  show_usage
  exit 0
fi

# Path of the Python back end that does the actual word processing.
# A value supplied through the environment wins; the historical hard-coded
# location remains the default.
NEW_WORDS_PY=${NEW_WORDS_PY:-/home/igor/hg/new-words/new-words.py}

# Directory that holds the vocabulary file and the per-tag files.
WORK_DIR=~/.new-words/

# Editor command: $EDITOR if set and non-empty, otherwise vim.
editor=${EDITOR:-vim}

# language detection

# Print the language code found in a Wikipedia article URL.
# Arguments: $1 - any command-line argument
# Outputs:   the sub-domain in front of ".wikipedia.org/wiki/" (a mobile
#            ".m" label is skipped), e.g. "de" for
#            https://de.wikipedia.org/wiki/Haus
# Returns:   0 if $1 contains such a URL (http or https), 1 otherwise
wikipedia_language()
{
  local url_re='https?://([[:alnum:]-]+)(\.m)?\.wikipedia\.org/wiki/'
  [[ "$1" =~ $url_re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

LANGUAGE=en

# Program name without directory and without everything from the first "-"
# on: "de-words" gives "de", "new-words" gives "new".
my_name=${0##*/}
my_name=${my_name%%-*}

# Every argument is inspected; the last Wikipedia URL found decides.
for arg
do
  if detected=$(wikipedia_language "$arg"); then
    LANGUAGE=$detected
  fi
done

# A program name other than new-* overrides whatever was detected above.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"

#----------------------------------------------------
# command line options processing

# Defaults.  Every option variable is initialised here so that a value
# inherited from the environment can never be mistaken for a switch.
STAT_ONLY=NO                          # -s
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO   # -k
DONT_ADD_MARKS=NO                     # -a
NON_INTERACTIVE_MODE=NO               # -n
PART_TO_PROCESS=''                    # -p pages
GROUP_WORDS_BY_THREE=NO               # -3
GROUP_WORDS_BY_TWO=NO                 # -2
TAG_NAME=''                           # -t tag (also extended by -d)
MERGE_THIS_TAGS=''                    # -m tag
TAGS_LIST_ONLY=NO                     # -T
MERGE_TAGGED_WORDS=NO                 # -m
MERGE_ALL_TAGGED=NO                   # -M
DONT_ADD_MARKLINES=NO                 # set by -m and -M
FILTER_WORDS=YES                      # -N turns it off
SHOW_VOC_STAT=NO                      # -S
COMPRESSED_WORDLIST=NO                # -c
WORDS_GROUPING=YES                    # -G turns it off
ALLOWED_WORDS_FILENAME=''             # -f file
REMOVE_TAG=NO                         # -d
SHOW_RANGE=''                         # -r RANGE
SHOW_RANGE_PERCENTAGE=''              # -R RANGE

while getopts hGcf:l:sSkanNp:t:Tm:Md:r:R:23 opt
do
  case "$opt" in
    # -h is honoured at any position, not only as the very first argument.
    h) show_usage
       exit 0;;
    c) COMPRESSED_WORDLIST=YES;;
    f) ALLOWED_WORDS_FILENAME="$OPTARG";;
    G) WORDS_GROUPING=NO;;
    s) STAT_ONLY=YES;;
    S) SHOW_VOC_STAT=YES;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
    l) LANGUAGE="$OPTARG";;
    a) DONT_ADD_MARKS=YES;;
    n) NON_INTERACTIVE_MODE=YES;;
    N) FILTER_WORDS=NO;;
    p) PART_TO_PROCESS="$OPTARG";;
    t) TAG_NAME="$OPTARG";;
    T) TAGS_LIST_ONLY="YES";;
    m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
    M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
    # Tags to delete are accumulated, space separated, in TAG_NAME.
    d) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
    r) SHOW_RANGE="$OPTARG";;
    R) SHOW_RANGE_PERCENTAGE="$OPTARG";;
    2) GROUP_WORDS_BY_TWO=YES;;
    3) GROUP_WORDS_BY_THREE=YES;;
    \?) # unknown flag
       show_usage
       exit 1;;
  esac
done

# Drop the parsed options; "$1" is now the first non-option argument.
shift $((OPTIND - 1))

117

# A "-l LANG" pair may still be at the front of the remaining arguments
# (getopts stops parsing at "--"); accept it here as well.
if [ "$1" = "-l" ]; then
  LANGUAGE="$2"
  shift 2
fi

# File names derived from the final language setting.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

# -S: run this script again on the vocabulary file itself, without
# known-words filtering (-N) and non-interactively (-n), then print the
# fifth field of the first output line with any "<" and ">" removed.
if [ "${SHOW_VOC_STAT}" = "YES" ]; then
  # Script path and vocabulary path are quoted so that directories with
  # spaces in their names do not split into several arguments.
  "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
    | head -1 \
    | awk '{print $5}' \
    | tr -d "<>"
  exit 0
fi

132

# Run the Python back end (NEW_WORDS_PY) with "-X get_words_group_words_add_stat".
# Globals read: NEW_WORDS_PY, LANGUAGE, SHOW_RANGE, WORDS_GROUPING,
#               SHOW_RANGE_PERCENTAGE, PART_TO_PROCESS, ALLOWED_WORDS_FILENAME,
#               NON_INTERACTIVE_MODE, STAT_ONLY, COMPRESSED_WORDLIST,
#               FILTER_WORDS, GROUP_WORDS_BY_TWO, GROUP_WORDS_BY_THREE
# Arguments:    $1 - handed to the back end unchanged as its last argument
# Returns:      the exit status of the back end
get_words_group_words_add_stat()
{
  local -a py_args=()

  # Options that carry a value.  Each value is its own array element, so a
  # value containing spaces stays a single argument.  The globals themselves
  # are left untouched, which keeps repeated calls identical.
  [ -z "$SHOW_RANGE_PERCENTAGE" ] || py_args+=(-R "$SHOW_RANGE_PERCENTAGE")
  [ -z "$PART_TO_PROCESS" ] || py_args+=(-p "$PART_TO_PROCESS")
  [ -z "$ALLOWED_WORDS_FILENAME" ] || py_args+=(-f "$ALLOWED_WORDS_FILENAME")

  # Plain on/off switches.
  [ "$NON_INTERACTIVE_MODE" != YES ] || py_args+=(-n)
  [ "$STAT_ONLY" != YES ] || py_args+=(-s)
  [ "$COMPRESSED_WORDLIST" != YES ] || py_args+=(-c)
  [ "$FILTER_WORDS" != NO ] || py_args+=(-N)
  [ "$GROUP_WORDS_BY_TWO" != YES ] || py_args+=(-2)
  [ "$GROUP_WORDS_BY_THREE" != YES ] || py_args+=(-3)

  # SHOW_RANGE and WORDS_GROUPING travel through the environment of the
  # back end, not through its command line.
  SHOW_RANGE="$SHOW_RANGE" \
  WORDS_GROUPING="$WORDS_GROUPING" \
    "$NEW_WORDS_PY" -l "$LANGUAGE" \
    "${py_args[@]}" \
    -X get_words_group_words_add_stat "$1"
}

159

# Print the tags that have a file "<LANGUAGE>_<tag>.txt" in WORK_DIR,
# one tag per line.
# Globals read: WORK_DIR, LANGUAGE
# Returns:      0 on success (also when there are no tags),
#               1 if WORK_DIR cannot be entered
list_active_tags()
{
  local tag_file tag

  cd "${WORK_DIR}" || {
    echo "Cannot enter ${WORK_DIR}" >&2
    return 1
  }
  for tag_file in "${LANGUAGE}"_*.txt
  do
    # An unmatched glob stays literal; skip it.
    [ -e "$tag_file" ] || continue
    # Strip exactly the "<LANGUAGE>_" prefix and the ".txt" suffix.
    tag=${tag_file#"${LANGUAGE}_"}
    printf '%s\n' "${tag%.txt}"
  done
}

# -T: list the active tags and stop.
if [ "$TAGS_LIST_ONLY" = "YES" ]; then
  list_active_tags || exit 1
  exit 0
fi

166

# Build the name of the file that stores the sub-vocabulary of a tag.
# Globals read: LANGUAGE
# Arguments:    $1 - tag name
# Outputs:      "<LANGUAGE>_<tag>.txt" on stdout
tag_file_name()
{
  printf '%s_%s.txt\n' "${LANGUAGE}" "${1}"
}

171

# Delete the tag files of every tag listed (space separated) in TAG_NAME.
# Globals read: WORK_DIR, TAG_NAME (LANGUAGE via tag_file_name)
# Outputs:      "Tag '<t>' removed" or "Unknown tag '<t>'" on stdout;
#               rejected tag names are reported on stderr
# Returns:      1 if WORK_DIR cannot be entered, otherwise 0
remove_tags()
{
  local -a tags=()
  local tag f

  cd "${WORK_DIR}" || {
    echo "Cannot enter ${WORK_DIR}" >&2
    return 1
  }

  # Split on whitespace without pathname expansion, so a "*" or "?" in a
  # tag can never turn into a list of file names.  read returns non-zero
  # at end of input; the array is filled regardless.
  read -r -d '' -a tags <<< "$TAG_NAME"

  for tag in "${tags[@]}"
  do
    # Check the tag being processed: path separators and glob characters
    # are not allowed in a tag name.
    case "$tag" in
      *[/*?]*)
        echo "Invalid tag '$tag'" >&2
        continue;;
    esac
    f=$(tag_file_name "$tag")
    if [ -e "$f" ]; then
      rm -f -- "$f" && echo "Tag '$tag' removed"
    else
      echo "Unknown tag '$tag'"
    fi
  done
  return 0
}

# -d: remove the requested tags and stop.
if [ "$REMOVE_TAG" = "YES" ]; then
  remove_tags || exit 1
  exit 0
fi

188

# Default action: hand the first remaining argument to the back-end wrapper.
get_words_group_words_add_stat "${1}"

190

191 #mkdir -p $WORK_DIR

192 #oldpwd="$PWD"

193 #cd $WORK_DIR

194 #if [ "$MERGE_TAGGED_WORDS" = "YES" ]

195 #then

196 # VOC_FILES=''

197 # for i in $MERGE_THIS_TAGS

198 # do

199 # f=`tag_file_name $i`

200 # [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f"

201 # done

202 # if [ -z "$VOC_FILES" ]

203 # then

204 # echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr

205 # else

206 # cat $VOC_FILES

207 # fi

208 #elif [ "$MERGE_ALL_TAGGED" = "YES" ]

209 #then

210 # cat ${LANGUAGE}_*.txt

211 #else

212 # cat

213 #fi

214

215