# HG changeset patch # User Igor Chubin # Date 1274460804 -10800 # Node ID 0a80b2fa3ed8b7e951d9f32320a377f8f432e772 # Parent 4a10c0f4510cb4c9387190b0f42abfbcdd197a26 initial tagging support diff -r 4a10c0f4510c -r 0a80b2fa3ed8 new-words.sh --- a/new-words.sh Fri May 21 01:02:21 2010 +0300 +++ b/new-words.sh Fri May 21 19:53:24 2010 +0300 @@ -14,9 +14,14 @@ -k put higher words that are similar to the known words (only for English) -l lang override language settings -n non-interactive mode (don't run vi) - -m don't add marks (and don't save marks added by user) + -a don't add marks (and don't save marks added by user) -p pages work with specified pages only (pages = start-stop/total ) -s show the text statistics (percentage of known words and so on) and exit + -t tag tag known words with tag + -T show list of active tags + -m tag merge the words tagged with "tag" into the main vocabulary + -M merge the words tagged with any tag into the main vocabulary + -r tag remove subvocabulary for the "tag" -2 -3 find 2 and 3 words' sequences The language of the text can be specified also @@ -64,15 +69,26 @@ PART_TO_PROCESS='' GROUP_WORDS_BY_THREE=NO GROUP_WORDS_BY_TWO=NO -while getopts l:skmnp:23 opt +TAG_NAME='' +MERGE_THIS_TAGS='' +TAGS_LIST_ONLY=NO +MERGE_TAGGED_WORDS=NO +MERGE_ALL_TAGGED=NO +DONT_ADD_MARKLINES=NO +while getopts l:skanp:t:Tm:Mr:23 opt do case "$opt" in s) STAT_ONLY=YES;; k) NEED_TO_USE_VOCABULARY_WHEN_SORT=YES;; l) LANGUAGE="$OPTARG";; - m) DONT_ADD_MARKS=YES;; + a) DONT_ADD_MARKS=YES;; n) NON_INTERACTIVE_MODE=YES;; p) PART_TO_PROCESS="$OPTARG";; + t) TAG_NAME="$OPTARG";; + T) TAGS_LIST_ONLY="YES";; + m) DONT_ADD_MARKLINES="YES"; MERGE_TAGGED_WORDS="YES"; MERGE_THIS_TAGS="$TAG_NAME $OPTARG";; + M) DONT_ADD_MARKLINES="YES"; MERGE_ALL_TAGGED="YES";; + r) REMOVE_TAG="YES"; TAG_NAME="$TAG_NAME $OPTARG";; 2) GROUP_WORDS_BY_TWO=YES;; 3) GROUP_WORDS_BY_THREE=YES;; \?) 
# unknown flag @@ -107,6 +123,11 @@ add_stat() { + if [ "$DONT_ADD_MARKLINES" = "YES" ] + then + cat + return + fi before="$1" after=${before}2 cat > "$after" @@ -196,12 +217,16 @@ { PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX` cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME -open(VOC, $ENV{VOCABULARY}) - or die "Can't open VOCABULARY"; -while (<VOC>){ - chomp; - #s/'//g; - $voc{$_}="1"; +$voc_files=$ENV{VOC_FILES}; +$voc_files=~s@^ @@; +for $voc_file (split /\s+/,$voc_files) { + if (open(VOC, $voc_file)) { + while (<VOC>){ + chomp; + #s/'//g; + $voc{$_}="1"; + } + } } while(<>) { chomp; @@ -209,7 +234,12 @@ } PERL_SCRIPT [ -e "$VOCABULARY" ] || touch "$VOCABULARY" - export VOCABULARY + export VOCABULARY VOC_FILES + VOC_FILES=$VOCABULARY + for i in $TAG_NAME + do + VOC_FILES="${VOC_FILES} `tag_file_name $i`" + done perl $PERL_SCRIPT_TEMP_NAME rm $PERL_SCRIPT_TEMP_NAME } @@ -506,10 +536,56 @@ rm $PERL_SCRIPT_TEMP_NAME } +if [ "$TAGS_LIST_ONLY" = "YES" ] +then + cd "${WORK_DIR}" + echo ${LANGUAGE}_*.txt | tr ' ' '\n' | grep -v '*' | sed 's/[^_]*_//;s/.txt$//' + exit 0 +fi + +tag_file_name() +{ + echo "${LANGUAGE}_${1}.txt" +} + +if [ "$REMOVE_TAG" = "YES" ] +then + cd "${WORK_DIR}" + for i in $TAG_NAME + do + echo "$TAGNAME" | grep -q '[/*?]' && continue + f="`tag_file_name $i`" + if [ -e "$f" ] + then + rm -f "$f" && echo Tag "'$i'" removed + else + echo Unknown tag "'$i'" + fi + done + exit 0 +fi + mkdir -p $WORK_DIR oldpwd="$PWD" cd $WORK_DIR -if echo "$1" | grep -q http: +if [ "$MERGE_TAGGED_WORDS" = "YES" ] +then + VOC_FILES='' + for i in $MERGE_THIS_TAGS + do + f=`tag_file_name $i` + [ -e "$f" ] && VOC_FILES="${VOC_FILES} $f" + done + if [ -z "$VOC_FILES" ] + then + echo Unknown tags: $MERGE_THIS_TAGS > /dev/stderr + else + cat $VOC_FILES + fi +elif [ "$MERGE_ALL_TAGGED" = "YES" ] +then + cat ${LANGUAGE}_*.txt +elif echo "$1" | grep -q http: then text_from_url "$1" elif [ "$#" != 0 ] @@ -538,16 +614,21 @@ then cat "$TEMP1" else - [ "$DONT_ADD_MARKS" = 
"YES" ] || add_marks "$TEMP2" - if [ "$editor" = vim ] + if [ `wc -l "$TEMP2" | awk '{print $1}'` != 0 ] then - vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty - else - echo 2 - $editor "$TEMP2" + [ "$DONT_ADD_MARKS" = "YES" ] || add_marks "$TEMP2" + if [ "$editor" = vim ] + then + vim -c 'set keywordprg='"$LANGUAGE" -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255' "$TEMP2" < /dev/tty > /dev/tty + else + $editor "$TEMP2" + fi + remove_marks "$TEMP2" + + vocabulary="$VOCABULARY" + [ -n "$TAG_NAME" ] && vocabulary="`tag_file_name $TAG_NAME`" + diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$vocabulary" fi - remove_marks "$TEMP2" fi -diff "$TEMP1" "$TEMP2" | awk '{print $3}' | sort -u >> "$VOCABULARY" rm -f "$TEMP1" "$TEMP2" "${TEMP1}-full" "$ORIGINAL_TEXT"