new-words: 48ca8248e9cc new-words.sh

new-words

view new-words.sh @ 31:48ca8248e9cc

+esperanto normalization

author	Igor Chubin <igor@chub.in>
date	Tue Aug 17 21:35:57 2010 +0200 (2010-08-17)
parents	c631833fa2be
children	753fb84437aa

line source

1 #!/bin/bash

# Print the command-line help for new-words on standard error.
# Takes no arguments and does not exit; the caller chooses the exit status.
show_usage()
{
    # Write through the already-open descriptor (>&2) instead of re-opening
    # /dev/stderr: re-opening truncates a file that stderr was redirected to
    # and fails outright when the device node is not writable.
    # The delimiter is quoted so nothing in the help text is ever expanded.
    cat <<'HELP' >&2

USAGE:

    new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

    -h          print this screen
    -k          put higher words that are similar to the known words (only for English)
    -l lang     override language settings
    -n          non-interactive mode (don't run vi)
    -a          don't add marks (and don't save marks added by user)
    -p pages    work with specified pages only (pages = start-stop/total )
    -s          show the text statistics (percentage of known words and so on) and exit
    -t tag      tag known words with tag
    -T          show list of active tags
    -m tag      merge the words tagged with "tag" into the main vocabulary
    -M          merge the words tagged with any tag into the main vocabulary
    -r tag      remove subvocabulary for the "tag"
    -2 -3       find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

    de-words URL
    new-words -l de URL

HELP
}

# Answer a leading -h straight away, before any temporary files are created.
# Only -h as the very first argument is recognised at this point.
if [ "$1" = "-h" ]; then
    show_usage
    exit 0
fi

WORK_DIR=~/.new-words/

# Scratch files; stop immediately if one cannot be created instead of
# carrying on with an empty file name.
TEMP1=$(mktemp /tmp/new-words-temp1.XXXXXXXXXX) || exit 1
TEMP2=$(mktemp /tmp/new-words-temp2.XXXXXXXXXX) || exit 1
# Assigned and exported in two steps so a mktemp failure is not masked by
# the exit status of `export`.
ORIGINAL_TEXT=$(mktemp /tmp/new-words-orig.XXXXXXXXXX) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

# language detection

# Print the language subdomain of a Wikipedia article URL.
#   $1 - candidate argument, e.g. http://de.wikipedia.org/wiki/Berlin
# Prints the subdomain ("de") and returns 0 when $1 contains such a URL;
# prints nothing and returns 1 otherwise. Both http:// and https:// match.
wikipedia_language()
{
    # Kept in a variable so the shell does not interpret the regex itself.
    local url_re='https?://([^/.]+)\.wikipedia\.org/wiki/'
    if [[ "$1" =~ $url_re ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
        return 0
    fi
    return 1
}

LANGUAGE=en

# A Wikipedia URL among the arguments selects its language; if there are
# several, the last one wins.
for arg
do
    if url_language=$(wikipedia_language "$arg"); then
        LANGUAGE="$url_language"
    fi
done

# Program name without directory and without everything from the first '-':
# "new-words" -> "new", "de-words" -> "de".
my_name=${0##*/}
my_name=${my_name%%-*}

# Any name other than new-* is taken as the language and overrides the
# URL-based guess above.
[ "${my_name}" = "new" ] || LANGUAGE="$my_name"

#----------------------------------------------------
# command line options processing

# Defaults; each one is switched by the option named next to it.
STAT_ONLY=NO                            # -s
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO     # -k
DONT_ADD_MARKS=NO                       # -a
NON_INTERACTIVE_MODE=NO                 # -n
PART_TO_PROCESS=''                      # -p pages
GROUP_WORDS_BY_THREE=NO                 # -3
GROUP_WORDS_BY_TWO=NO                   # -2
TAG_NAME=''                             # -t tag (also extended by -r tag)
MERGE_THIS_TAGS=''                      # -m tag
TAGS_LIST_ONLY=NO                       # -T
MERGE_TAGGED_WORDS=NO                   # -m tag
MERGE_ALL_TAGGED=NO                     # -M
DONT_ADD_MARKLINES=NO                   # set by -m and -M
# Cleared explicitly so a REMOVE_TAG inherited from the environment cannot
# act as if -r had been given; only -r sets it to YES.
REMOVE_TAG=''                           # -r tag

# Parse the option switches given as arguments and set the globals above
# (plus LANGUAGE for -l). Leaves OPTIND pointing at the first non-option
# argument so the caller can shift the options away.
# Prints the usage and exits 0 on -h, exits 1 on an unknown switch.
parse_options()
{
    local opt
    OPTIND=1
    while getopts hl:skanp:t:Tm:Mr:23 opt
    do
        case "$opt" in
            h) show_usage
               exit 0;;
            s) STAT_ONLY=YES;;
            k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;;
            l) LANGUAGE="$OPTARG";;
            a) DONT_ADD_MARKS=YES;;
            n) NON_INTERACTIVE_MODE=YES;;
            p) PART_TO_PROCESS="$OPTARG";;
            t) TAG_NAME="$OPTARG";;
            T) TAGS_LIST_ONLY="YES";;
            # NOTE(review): this prepends TAG_NAME (the -t value), not the
            # previous MERGE_THIS_TAGS, so repeated -m keeps only the last
            # tag - confirm whether that is intended.
            m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";;
            M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";;
            r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";;
            2) GROUP_WORDS_BY_TWO=YES;;
            3) GROUP_WORDS_BY_THREE=YES;;
            \?) # unknown flag
               show_usage
               exit 1;;
        esac
    done
}

parse_options "$@"
# Drop the parsed switches; what remains is the optional ARG.
shift $((OPTIND - 1))

100

# Accept one more "-l lang" pair that is still in front of the remaining
# arguments after the option switches have been shifted away.
if [ "$1" = "-l" ]; then
    # Without a value, `shift 2` would fail silently, LANGUAGE would become
    # empty and "-l" itself would be left as the input argument.
    if [ "$#" -lt 2 ]; then
        echo "new-words: option -l requires an argument" >&2
        exit 1
    fi
    LANGUAGE="$2"
    shift 2
fi

# File names derived from the final language choice.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

109

110 #----------------------------------------------------

111

# Turn the text on stdin into a frequency list on stdout: one "COUNT WORD"
# line per distinct word (uniq -c format), highest count first.
#   $1 - file that receives the complete token stream, one token per line,
#        as it is before grep_v_english_perl filters it
# grep_v_english_perl is defined outside this section; presumably it drops
# already-known words - confirm against its definition.
get_words()
{
    # 1. one space-separated token per line, "--" becomes a space
    # 2. apostrophes are replaced by a placeholder for the perl step
    # 3. perl decodes UTF-8, maps the listed punctuation to spaces and keeps
    #    only lines made of letters, spaces, apostrophes, '_' and '-'
    # 4. apostrophes are restored and tokens are split on spaces again
    # 5. the stream is saved to "$1", filtered, counted and sorted; the awk
    #    step discards the count line that has no word next to it
    tr ' ' '\n' \
    | sed -e 's/--/ /g' -e "s/'/__APOSTROPHE__/g" \
    | perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:] '"'"'_-]*$/' \
    | sed "s/__APOSTROPHE__/'/g" \
    | tr ' ' '\n' \
    | tee -- "$1" \
    | grep_v_english_perl \
    | sort | uniq -c | awk '$2 != ""' | sort -rn
}

123

124 add_stat()

125 {

126 if [ "$DONT_ADD_MARKLINES" = "YES" ]

127 then

128 cat

129 return

130 fi

131 before="$1"

132 after=${before}2

133 cat > "$after"

134 total="`wc -w $1 | awk '{print $1}'`"

135 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"

136 total_known="`echo $total-$total_unknown|bc`"

137 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.$.$.*/.\1/'`"

138 #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"

139 sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"

140

141

142 if [ "$STAT_ONLY" = "YES" ]

143 then

144 echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"

145 echo "$LANGUAGE $percentage `echo $100-$percentage$ | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "

146 rm $after

147 return 0

148 else

149 echo "# $LANGUAGE, $percentage, <$total_known/$total>"

150 fi

151

152 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`

153 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME

154 my $total=shift(@ARGV);

155 my $total_known=shift(@ARGV);

156 my $s=0;

157 my $mark_line=int($total_known*100/$total/5)*5;