new-words: 7a7a88277c08 new-words.sh

new-words

view new-words.sh @ 59:7a7a88277c08

experimental script oneliners.sh moved to misc/

author	Igor Chubin <igor@chub.in>
date	Thu Nov 03 16:10:58 2011 +0100 (2011-11-03)
parents	f95804355b0f
children

line source

1 #!/bin/bash

# Print the command-line help of new-words.
# Arguments: none
# Outputs:   the usage text on standard error; nothing on standard output
show_usage()
{
    # The delimiter is quoted so the help text is taken literally.
    # ">&2" duplicates the already-open stderr descriptor. Re-opening the
    # /dev/stderr path instead would truncate a log file that stderr has been
    # redirected to, discarding diagnostics printed earlier (for example the
    # getopts message that precedes this help on an unknown switch).
    cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-c show compressed wordlist: one word per group
-f file show only words that are related to the words from the file
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-N turn off known words filtering
-a don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-S show your vocabulary statistics (number of words and word groups)
-t tag tag known words with tag
-T show list of active tags
-m tag merge the words tagged with "tag" into the main vocabulary
-M merge the words tagged with any tag into the main vocabulary
-r tag remove subvocabulary for the "tag"
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}

# Help request: when the first argument is exactly "-h", print the usage
# text and finish successfully.
case "$1" in
    -h)
        show_usage
        exit 0
        ;;
esac

# Working locations:
#   WORK_DIR       the .new-words directory in the user's home
#   TEMP1, TEMP2   scratch files created with mktemp
#   ORIGINAL_TEXT  scratch file whose path is exported to child processes
#   editor         $EDITOR when it is set and non-empty, otherwise vim
WORK_DIR=~/.new-words/

# Stop right away when a scratch file cannot be created; continuing with an
# empty path would make later redirections fail in confusing ways.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || {
    echo "new-words: cannot create a temporary file" >&2
    exit 1
}
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || {
    echo "new-words: cannot create a temporary file" >&2
    exit 1
}
# Assignment and export are separate so a mktemp failure is not masked by
# the exit status of "export".
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || {
    echo "new-words: cannot create a temporary file" >&2
    exit 1
}
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# language detection
#
# LANGUAGE is chosen from, in increasing priority:
#   1. the default "en";
#   2. the language subdomain of a Wikipedia article URL passed as an
#      argument (when several arguments match, the last one wins);
#   3. the program name: a link named e.g. "de-words" selects "de", while
#      the plain name "new-words" leaves the choice made above untouched.

# Print the language subdomain of the Wikipedia article URL in $1, or print
# nothing when $1 holds no such URL. Both http:// and https:// are accepted
# and the subdomain may have any length (en, de, ceb, zh-yue, ...).
language_from_wikipedia_url()
{
    local url_pattern='https?://([[:alnum:]-]+)\.wikipedia\.org/wiki/'
    if [[ "$1" =~ $url_pattern ]]
    then
        printf '%s\n' "${BASH_REMATCH[1]}"
    fi
}

# Print the piece of the program path $1 that follows the last "/" and
# precedes the first "-" ("/usr/bin/de-words" gives "de").
program_name_prefix()
{
    local base_name=${1##*/}
    printf '%s\n' "${base_name%%-*}"
}

LANGUAGE=en
my_name="$(program_name_prefix "$0")"
for arg
do
    wikipedia_language="$(language_from_wikipedia_url "$arg")"
    if [ -n "$wikipedia_language" ]
    then
        LANGUAGE="$wikipedia_language"
    fi
done
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"

#----------------------------------------------------
# command line options processing

# Defaults; each one is switched by the option named on its line.
STAT_ONLY=NO                          # -s
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO   # -k
DONT_ADD_MARKS=NO                     # -a
NON_INTERACTIVE_MODE=NO               # -n
PART_TO_PROCESS=''                    # -p pages
GROUP_WORDS_BY_THREE=NO               # -3
GROUP_WORDS_BY_TWO=NO                 # -2
TAG_NAME=''                           # -t tag, -r tag
MERGE_THIS_TAGS=''                    # -m tag
TAGS_LIST_ONLY=NO                     # -T
MERGE_TAGGED_WORDS=NO                 # -m tag
MERGE_ALL_TAGGED=NO                   # -M
DONT_ADD_MARKLINES=NO                 # -m tag, -M
FILTER_WORDS=YES                      # -N
SHOW_VOC_STAT=NO                      # -S
COMPRESSED_WORDLIST=NO                # -c
ALLOWED_WORDS_FILENAME=''             # -f file
REMOVE_TAG=''                         # -r tag; reset here so a value inherited
                                      # from the environment cannot enable it

# Parse the switches found in the given arguments into the variables above.
# Arguments: the script's command-line arguments ("$@")
# Globals:   writes the option variables listed above, LANGUAGE, opt, OPTIND
# Exits:     with status 1, after printing the usage text, when getopts
#            reports an unknown switch or a missing option value
parse_command_line()
{
    # "f:" is part of the option string because the help text documents
    # "-f file"; without it the switch was rejected as unknown.
    while getopts cf:l:sSkanNp:t:Tm:Mr:23 opt
    do
        case "$opt" in
            c) COMPRESSED_WORDLIST=YES;;
            f) ALLOWED_WORDS_FILENAME="$OPTARG";;
            s) STAT_ONLY=YES;;
            S) SHOW_VOC_STAT=YES;;
            k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
            l) LANGUAGE="$OPTARG";;
            a) DONT_ADD_MARKS=YES;;
            n) NON_INTERACTIVE_MODE=YES;;
            N) FILTER_WORDS=NO;;
            p) PART_TO_PROCESS="$OPTARG";;
            t) TAG_NAME="$OPTARG";;
            T) TAGS_LIST_ONLY="YES";;
            # NOTE(review): the merge list is built from TAG_NAME, not from the
            # previous MERGE_THIS_TAGS value - kept as is, confirm the intent.
            m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
            M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
            r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
            2) GROUP_WORDS_BY_TWO=YES;;
            3) GROUP_WORDS_BY_THREE=YES;;
            \?) # unknown flag
                show_usage
                exit 1;;
        esac
    done
}

parse_command_line "$@"
# Drop the parsed switches; what remains are the non-option arguments.
shift $((OPTIND - 1))

111

# A "-l lang" pair that is still the first argument at this point also
# selects the language (getopts stops at "--", so the pair can survive it).
if [ "$1" = "-l" ]
then
    # A bare "-l" without a value used to leave LANGUAGE empty and make
    # "shift 2" fail; treat it like any other malformed command line.
    if [ "$#" -lt 2 ]
    then
        show_usage
        exit 1
    fi
    LANGUAGE="$2"
    shift 2
fi

# File names derived from the selected language.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

120

# -S: print the vocabulary statistics and stop.
# The script runs itself on the vocabulary file in non-interactive mode (-n)
# with known-word filtering turned off (-N), then prints the fifth field of
# the first output line with the "<" and ">" characters removed.
# "$0" and the file path are quoted so that locations containing whitespace
# are passed as single arguments.
if [ "${SHOW_VOC_STAT}" = "YES" ]
then
    "$0" -l "${LANGUAGE}" -N -n "${WORK_DIR}/${VOCABULARY}" \
        | head -n 1 \
        | awk '{print $5}' \
        | tr -d "<>"
    exit 0
fi

126

127 #----------------------------------------------------

128

# Turn the text on standard input into a word frequency table.
# Arguments: $1 - file that receives a copy of the cleaned word stream
# Globals:   FILTER_WORDS is exported (presumably for grep_v_english_perl,
#            which is defined elsewhere - confirm)
# Outputs:   "count word" lines on stdout, highest count first
get_words()
{
    export FILTER_WORDS

    # Stages, in order:
    #   1. put every space-separated token on its own line;
    #   2. turn "--" into a space and hide apostrophes behind a placeholder;
    #   3. perl: decode UTF-8, blank out the listed punctuation and keep only
    #      lines made of letters, spaces, apostrophes, "_" and "-";
    #   4. restore the apostrophes and split on spaces once more;
    #   5. save a copy of the stream in "$1";
    #   6. pass it through grep_v_english_perl;
    #   7. count identical lines, drop entries without a word, sort by count.
    tr ' ' '\n' |
        sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" |
        perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/' |
        sed "s/__APOSTROPHE__/'/g" |
        tr ' ' '\n' |
        tee "$1" |
        grep_v_english_perl |
        sort |
        uniq -c |
        awk '$2 != ""' |
        sort -rn
}

141

142 add_stat()

143 {

144 if [ "$DONT_ADD_MARKLINES" = "YES" ]

145 then

146 cat

147 return

148 fi

149 before="$1"

150 after=${before}2

151 cat > "$after"

152 total="`wc -w $1 | awk '{print $1}'`"

153 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"

154 total_known="`echo $total-$total_unknown|bc`"

155 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.$.$.*/.\1/'`"

156 #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"

157 sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"

158

159

160 if [ "$STAT_ONLY" = "YES" ]

161 then

162 echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"

163 echo "$LANGUAGE $percentage `echo $100-$percentage$ | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "

164 rm $after

165 return 0

166 else

167 groups="`echo $(grep '# groups' $after | awk '{print $3}')`"

168 words="`echo $(grep -v '^#' $after | wc -l)`"

169 echo "# $LANGUAGE, $percentage, <$total_known/$total>, <$groups/$words>"

170 fi

171

172 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`

173 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME

174 my $total=shift(@ARGV);

175 my $total_known=shift(@ARGV);