# HG changeset patch # User Igor Chubin # Date 1296247558 -3600 # Node ID 7eb1a8c3eadeb8b2796327056eeaa62396353727 # Parent d532e7b52ab22d45f691d22f344dd92cad9d6e96 -2 and -3 are now supported by new-words.py diff -r d532e7b52ab2 -r 7eb1a8c3eade new-words-py.sh --- a/new-words-py.sh Fri Jan 28 12:40:58 2011 +0200 +++ b/new-words-py.sh Fri Jan 28 21:45:58 2011 +0100 @@ -123,39 +123,6 @@ exit 0 fi -two_and_three_words() -{ - if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ] - then - cat - else - cat - - export GROUP_WORDS_BY_THREE - export GROUP_WORDS_BY_TWO - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX` - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME -#!/usr/bin/perl -local $/; -$words=<>; -$words=~ s@[!?;,:#1-9".]@ @g; -$words =~ s@\s+@ @g; -@words = split /\s+/, $words; -for ($i=0; $i<$#words-3;$i++) { - my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]); - if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) { - print "${a}_${b}_${c}\n"; - }; - if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) { - print "${a}_${b}\n"; - }; -} -PERL_SCRIPT - perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT" - rm $PERL_SCRIPT_TEMP_NAME - fi -} - text_from_url() { lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@' @@ -171,6 +138,9 @@ } get_words_group_words_add_stat() { + STAT_ONLY="$STAT_ONLY" \ + GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \ + GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \ $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1" } @@ -282,8 +252,8 @@ fi \ | part $PART_TO_PROCESS \ | tee $ORIGINAL_TEXT \ - | two_and_three_words \ - | STAT_ONLY="$STAT_ONLY" get_words_group_words_add_stat \ + | \ + get_words_group_words_add_stat \ | tee "$TEMP1" > "$TEMP2" if [ "$STAT_ONLY" = "YES" ] diff -r d532e7b52ab2 -r 7eb1a8c3eade new-words.py --- a/new-words.py Fri Jan 28 12:40:58 2011 +0200 +++ b/new-words.py Fri Jan 28 21:45:58 2011 +0100 @@ -159,14 +159,15 @@ line = line.rstrip('\n') #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line) #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line) - return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line) + return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line) -def get_words(lines): +def get_words(lines, group_by=[1]): """ Returns hash of words in a file word => number """ result = {} + (a, b, c) = ("", "", "") for line in lines: words = words_from_line(line) for word in words: @@ -174,6 +175,17 @@ continue result.setdefault(word, 0) result[word] += 1 + if 2 in group_by and a != "" and b != "": + w = "%s_%s" % (a,b) + result.setdefault(w, 0) + result[w] += 1 + if 3 in group_by and not "" in [a,b,c]: + w = "%s_%s_%s" % (a,b,c) + result.setdefault(w, 0) + result[w] += 1 + (a,b,c) = (b, c, word) + + logging.debug(result) return result def load_vocabulary(): @@ -203,12 +215,12 @@ if match_object: word = match_object.group(1) if word in notes: - logging.debug(word) - logging.debug(line) + #logging.debug(word) + #logging.debug(line) if notes_filename in notes[word]: line = line.rstrip('\n') line = "%-30s %s\n" % (line, notes[word][notes_filename]) - logging.debug(line) + #logging.debug(line) result += [line] else: result += [line] @@ -379,11 +391,17 @@ vocabulary = load_vocabulary() notes = load_notes(notes_filenames()) lines = readlines_from_stdin() - words = get_words(lines) + group_by = [1] + if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES': + group_by.append(2) + if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES': + group_by.append(3) + words = get_words(lines, group_by) stats_only = False if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES': stats_only = True + stats = {} stats['total'] = sum(words[x] for x in words.keys()) words = substract_dictionary(words, vocabulary) @@ -402,17 +420,17 @@ linked_words = find_linked_words(notes) normalizator = Normalizator(config['language'], linked_words) - word_pairs = [] + words_with_freq = [] for k in sorted(words.keys(), key=lambda k: words[k], reverse=True): - word_pairs.append((words[k], k)) + words_with_freq.append((words[k], k)) - wgw = find_wordgroups_weights(word_pairs, normalizator) - word_pairs = sorted( - word_pairs, + wgw = find_wordgroups_weights(words_with_freq, normalizator) + words_with_freq = sorted( + words_with_freq, cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words), reverse=True) - print_words_sorted(word_pairs, stats, stats_only=stats_only) + print_words_sorted(words_with_freq, stats, stats_only=stats_only) (options, args) = parser.parse_args() if options.language: