new-words: 4a10c0f4510c new-words.sh

new-words

view new-words.sh @ 26:4a10c0f4510c

Apostrophe support, some speed improvements, and support for two- and three-word combinations

author	Igor Chubin <igor@chub.in>
date	Fri May 21 01:02:21 2010 +0300 (2010-05-21)
parents	d1eb7dc37feb
children	0a80b2fa3ed8

line source

1 #!/bin/bash

# show_usage
#
# Print the command-line help text to standard error.
#
# Arguments: none
# Outputs:   help text on stderr; nothing on stdout
#
# The text is written with ">&2" (duplicating the already-open descriptor)
# rather than "> /dev/stderr": re-opening the path truncates a log file that
# stderr has been redirected to, and can fail outright when the underlying
# terminal is not owned by the current user.
show_usage() {
  # Quoted delimiter: the help text is emitted literally, with no expansion.
  cat <<'HELP' >&2

USAGE:

new-words [ -l lang ] [ -s ] [ ARG ]

SWITCHES:

-h print this screen
-k put higher words that are similar to the known words (only for English)
-l lang override language settings
-n non-interactive mode (don't run vi)
-m don't add marks (and don't save marks added by user)
-p pages work with specified pages only (pages = start-stop/total )
-s show the text statistics (percentage of known words and so on) and exit
-2 -3 find 2 and 3 words' sequences

The language of the text can be specified also
by name of the program new-words (correspondent link must be created before).
For example, these calls are equivalent:

de-words URL
new-words -l de URL

HELP
}

# A leading -h is handled before anything else so that asking for help
# does not create temporary files.
if [[ "$1" == "-h" ]]; then
  show_usage
  exit 0
fi

WORK_DIR=~/.new-words/

# Scratch files. Abort early if any of them cannot be created instead of
# carrying on with an empty file name.
TEMP1=$(mktemp /tmp/new-words-XXXXXXXXXX-temp1) || exit 1
TEMP2=$(mktemp /tmp/new-words-XXXXXXXXXX-temp2) || exit 1
ORIGINAL_TEXT=$(mktemp /tmp/new-words-XXXXXXXXXX-orig) || exit 1
export ORIGINAL_TEXT
editor=${EDITOR:-vim}

#----------------------------------------------------
# language detection
#
# Precedence, lowest to highest:
#   1. default "en"
#   2. language code of a Wikipedia article URL given as an argument
#   3. program name prefix (e.g. "de-words" -> "de"), unless it is "new"
#   4. explicit -l option (handled in the option processing below)

# detect_wikipedia_language ARG
#
# If ARG contains a Wikipedia article URL (http or https) whose host starts
# with a two-character language code, print that code and return 0.
# Otherwise print nothing and return non-zero.
detect_wikipedia_language() {
  local url_re='https?://(..)\.wikipedia\.org/wiki/'
  [[ "$1" =~ $url_re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

LANGUAGE=en

# Program name without directory, cut at the first dash: "de-words" -> "de".
my_name=${0##*/}
my_name=${my_name%%-*}

for arg; do
  if wiki_language=$(detect_wikipedia_language "$arg"); then
    LANGUAGE=$wiki_language
  fi
done

if [[ "$my_name" != "new" ]]; then
  LANGUAGE=$my_name
fi

#----------------------------------------------------
# command line options processing

STAT_ONLY=NO
NEED_TO_USE_VOCABULARY_WHEN_SORT=NO
DONT_ADD_MARKS=NO
NON_INTERACTIVE_MODE=NO
PART_TO_PROCESS=''
GROUP_WORDS_BY_THREE=NO
GROUP_WORDS_BY_TWO=NO

while getopts l:skmnp:23 opt; do
  case "$opt" in
    s) STAT_ONLY=YES ;;
    k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES ;;
    l) LANGUAGE="$OPTARG" ;;
    m) DONT_ADD_MARKS=YES ;;
    n) NON_INTERACTIVE_MODE=YES ;;
    p) PART_TO_PROCESS="$OPTARG" ;;
    2) GROUP_WORDS_BY_TWO=YES ;;
    3) GROUP_WORDS_BY_THREE=YES ;;
    \?) # unknown flag
      show_usage
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

# A "-l lang" pair that follows the getopts-parsed options is accepted too.
if [[ "$1" == "-l" ]]; then
  LANGUAGE="$2"
  shift 2
fi

# Per-language file names, derived from the final language setting.
VOCABULARY=${LANGUAGE}.txt
NOTES_FILE=notes-${LANGUAGE}.txt

94 #----------------------------------------------------

# get_words WORD_LIST_FILE
#
# Read text on stdin and print a frequency table ("count word"), most
# frequent first, of the words that survive the grep_v_english_perl filter.
#
# Arguments: $1 - file that receives a copy of the full word stream (one
#                 word per line) as it looks before that filter is applied
# Outputs:   "count word" lines on stdout, sorted by descending count
#
# Pipeline stages, in order:
#   1. put every space-separated token on its own line; turn "--" into a space
#   2. hide apostrophes behind the __APOSTROPHE__ placeholder
#   3. perl: decode UTF-8, transliterate the listed punctuation characters to
#      spaces, and keep only lines made up entirely of letters, apostrophes,
#      underscores and hyphens
#   4. restore the apostrophes and split on spaces once more
#   5. save the stream to WORD_LIST_FILE, then pass it through
#      grep_v_english_perl (defined elsewhere in this file; presumably it
#      removes already-known words -- verify against its definition)
#   6. count duplicates, drop rows whose word field is empty, sort by count
get_words() {
  local word_list_file="$1"

  tr ' ' '\n' |
    sed 's/--/ /g' |
    sed "s/'/__APOSTROPHE__/g" |
    perl -MEncode -Mutf8 -n -e '$_ = decode( "utf8", $_);y/*\r,.:#@()+=—<>$;"?!|·[]^%&/ /; binmode STDOUT, ":utf8"; print if /^[[:alpha:]'"'"'_-]*$/' |
    sed "s/__APOSTROPHE__/'/g" |
    tr ' ' '\n' |
    tee "$word_list_file" |
    grep_v_english_perl |
    sort |
    uniq -c |
    awk '{if ($2!="") print;}' |
    sort -rn
}

107

108 add_stat()

109 {

110 before="$1"

111 after=${before}2

112 cat > "$after"

113 total="`wc -w $1 | awk '{print $1}'`"

114 total_unknown="`cat $after|awk '{s=s+$1}END{print s}'`"

115 total_known="`echo $total-$total_unknown|bc`"

116 percentage="`echo '100*('$total-$total_unknown')'/$total | bc -l | sed 's/\\.$.$.*/.\1/'`"

117 #sentences="`cat $after | perl -e 'local $/; $_=<>; s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g; s@\n@@g; s@(Mr|Mrs)\.@\1POINT@g; @sentences=split /\\./;print $#sentences;'`"

118 sentences="`cat $ORIGINAL_TEXT | perl -e 'local $/; $_=<>; s/[^.]//msg; print length($_);'`"

119

120

121 if [ "$STAT_ONLY" = "YES" ]

122 then

123 echo "LANG KNOWN% UNKNOWN% KNOWN TOTAL WPS UWPS*10"

124 echo "$LANGUAGE $percentage `echo $100-$percentage$ | bc -l` $total_known $total `echo $total/$sentences|bc` `echo 10*$total_unknown/$sentences|bc` "

125 rm $after

126 return 0

127 else

128 echo "# $LANGUAGE, $percentage, <$total_known/$total>"

129 fi

130

131 PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`

132 cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME

133 my $total=shift(@ARGV);

134 my $total_known=shift(@ARGV);

135 my $s=0;

136 my $mark_line=int($total_known*100/$total/5)*5;