new-words
changeset 44:7eb1a8c3eade
Word grouping by two and by three (-2 and -3) is now supported by new-words.py
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri Jan 28 21:45:58 2011 +0100 (2011-01-28) |
parents | d532e7b52ab2 |
children | 5f90e44eecfc |
files | new-words-py.sh new-words.py |
line diff
1.1 --- a/new-words-py.sh Fri Jan 28 12:40:58 2011 +0200 1.2 +++ b/new-words-py.sh Fri Jan 28 21:45:58 2011 +0100 1.3 @@ -123,39 +123,6 @@ 1.4 exit 0 1.5 fi 1.6 1.7 -two_and_three_words() 1.8 -{ 1.9 - if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ] 1.10 - then 1.11 - cat 1.12 - else 1.13 - cat 1.14 - 1.15 - export GROUP_WORDS_BY_THREE 1.16 - export GROUP_WORDS_BY_TWO 1.17 - PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX` 1.18 - cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME 1.19 -#!/usr/bin/perl 1.20 -local $/; 1.21 -$words=<>; 1.22 -$words=~ s@[!?;,:#1-9".]@ @g; 1.23 -$words =~ s@\s+@ @g; 1.24 -@words = split /\s+/, $words; 1.25 -for ($i=0; $i<$#words-3;$i++) { 1.26 - my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]); 1.27 - if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) { 1.28 - print "${a}_${b}_${c}\n"; 1.29 - }; 1.30 - if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) { 1.31 - print "${a}_${b}\n"; 1.32 - }; 1.33 -} 1.34 -PERL_SCRIPT 1.35 - perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT" 1.36 - rm $PERL_SCRIPT_TEMP_NAME 1.37 - fi 1.38 -} 1.39 - 1.40 text_from_url() 1.41 { 1.42 lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@' 1.43 @@ -171,6 +138,9 @@ 1.44 } 1.45 get_words_group_words_add_stat() 1.46 { 1.47 + STAT_ONLY="$STAT_ONLY" \ 1.48 + GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \ 1.49 + GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \ 1.50 $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1" 1.51 } 1.52 1.53 @@ -282,8 +252,8 @@ 1.54 fi \ 1.55 | part $PART_TO_PROCESS \ 1.56 | tee $ORIGINAL_TEXT \ 1.57 - | two_and_three_words \ 1.58 - | STAT_ONLY="$STAT_ONLY" get_words_group_words_add_stat \ 1.59 + | \ 1.60 + get_words_group_words_add_stat \ 1.61 | tee "$TEMP1" > "$TEMP2" 1.62 1.63 if [ "$STAT_ONLY" = "YES" ]
2.1 --- a/new-words.py Fri Jan 28 12:40:58 2011 +0200 2.2 +++ b/new-words.py Fri Jan 28 21:45:58 2011 +0100 2.3 @@ -159,14 +159,15 @@ 2.4 line = line.rstrip('\n') 2.5 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line) 2.6 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line) 2.7 - return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line) 2.8 + return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line) 2.9 2.10 -def get_words(lines): 2.11 +def get_words(lines, group_by=[1]): 2.12 """ 2.13 Returns hash of words in a file 2.14 word => number 2.15 """ 2.16 result = {} 2.17 + (a, b, c) = ("", "", "") 2.18 for line in lines: 2.19 words = words_from_line(line) 2.20 for word in words: 2.21 @@ -174,6 +175,17 @@ 2.22 continue 2.23 result.setdefault(word, 0) 2.24 result[word] += 1 2.25 + if 2 in group_by and a != "" and b != "": 2.26 + w = "%s_%s" % (a,b) 2.27 + result.setdefault(w, 0) 2.28 + result[w] += 1 2.29 + if 3 in group_by and not "" in [a,b,c]: 2.30 + w = "%s_%s_%s" % (a,b,c) 2.31 + result.setdefault(w, 0) 2.32 + result[w] += 1 2.33 + (a,b,c) = (b, c, word) 2.34 + 2.35 + logging.debug(result) 2.36 return result 2.37 2.38 def load_vocabulary(): 2.39 @@ -203,12 +215,12 @@ 2.40 if match_object: 2.41 word = match_object.group(1) 2.42 if word in notes: 2.43 - logging.debug(word) 2.44 - logging.debug(line) 2.45 + #logging.debug(word) 2.46 + #logging.debug(line) 2.47 if notes_filename in notes[word]: 2.48 line = line.rstrip('\n') 2.49 line = "%-30s %s\n" % (line, notes[word][notes_filename]) 2.50 - logging.debug(line) 2.51 + #logging.debug(line) 2.52 result += [line] 2.53 else: 2.54 result += [line] 2.55 @@ -379,11 +391,17 @@ 2.56 vocabulary = load_vocabulary() 2.57 notes = load_notes(notes_filenames()) 2.58 lines = readlines_from_stdin() 2.59 - words = get_words(lines) 2.60 + group_by = [1] 2.61 + if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES': 2.62 + group_by.append(2) 2.63 + if 'GROUP_WORDS_BY_THREE' in 
os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES': 2.64 + group_by.append(3) 2.65 + words = get_words(lines, group_by) 2.66 stats_only = False 2.67 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES': 2.68 stats_only = True 2.69 2.70 + 2.71 stats = {} 2.72 stats['total'] = sum(words[x] for x in words.keys()) 2.73 words = substract_dictionary(words, vocabulary) 2.74 @@ -402,17 +420,17 @@ 2.75 linked_words = find_linked_words(notes) 2.76 normalizator = Normalizator(config['language'], linked_words) 2.77 2.78 - word_pairs = [] 2.79 + words_with_freq = [] 2.80 for k in sorted(words.keys(), key=lambda k: words[k], reverse=True): 2.81 - word_pairs.append((words[k], k)) 2.82 + words_with_freq.append((words[k], k)) 2.83 2.84 - wgw = find_wordgroups_weights(word_pairs, normalizator) 2.85 - word_pairs = sorted( 2.86 - word_pairs, 2.87 + wgw = find_wordgroups_weights(words_with_freq, normalizator) 2.88 + words_with_freq = sorted( 2.89 + words_with_freq, 2.90 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words), 2.91 reverse=True) 2.92 2.93 - print_words_sorted(word_pairs, stats, stats_only=stats_only) 2.94 + print_words_sorted(words_with_freq, stats, stats_only=stats_only) 2.95 2.96 (options, args) = parser.parse_args() 2.97 if options.language: