new-words
diff new-words.py @ 44:7eb1a8c3eade
The -2 and -3 options are now supported by new-words.py
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri Jan 28 21:45:58 2011 +0100 (2011-01-28) |
parents | d532e7b52ab2 |
children | 5f90e44eecfc |
line diff
1.1 --- a/new-words.py Fri Jan 28 12:40:58 2011 +0200 1.2 +++ b/new-words.py Fri Jan 28 21:45:58 2011 +0100 1.3 @@ -159,14 +159,15 @@ 1.4 line = line.rstrip('\n') 1.5 #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line) 1.6 #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line) 1.7 - return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line) 1.8 + return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line) 1.9 1.10 -def get_words(lines): 1.11 +def get_words(lines, group_by=[1]): 1.12 """ 1.13 Returns hash of words in a file 1.14 word => number 1.15 """ 1.16 result = {} 1.17 + (a, b, c) = ("", "", "") 1.18 for line in lines: 1.19 words = words_from_line(line) 1.20 for word in words: 1.21 @@ -174,6 +175,17 @@ 1.22 continue 1.23 result.setdefault(word, 0) 1.24 result[word] += 1 1.25 + if 2 in group_by and a != "" and b != "": 1.26 + w = "%s_%s" % (a,b) 1.27 + result.setdefault(w, 0) 1.28 + result[w] += 1 1.29 + if 3 in group_by and not "" in [a,b,c]: 1.30 + w = "%s_%s_%s" % (a,b,c) 1.31 + result.setdefault(w, 0) 1.32 + result[w] += 1 1.33 + (a,b,c) = (b, c, word) 1.34 + 1.35 + logging.debug(result) 1.36 return result 1.37 1.38 def load_vocabulary(): 1.39 @@ -203,12 +215,12 @@ 1.40 if match_object: 1.41 word = match_object.group(1) 1.42 if word in notes: 1.43 - logging.debug(word) 1.44 - logging.debug(line) 1.45 + #logging.debug(word) 1.46 + #logging.debug(line) 1.47 if notes_filename in notes[word]: 1.48 line = line.rstrip('\n') 1.49 line = "%-30s %s\n" % (line, notes[word][notes_filename]) 1.50 - logging.debug(line) 1.51 + #logging.debug(line) 1.52 result += [line] 1.53 else: 1.54 result += [line] 1.55 @@ -379,11 +391,17 @@ 1.56 vocabulary = load_vocabulary() 1.57 notes = load_notes(notes_filenames()) 1.58 lines = readlines_from_stdin() 1.59 - words = get_words(lines) 1.60 + group_by = [1] 1.61 + if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES': 1.62 + group_by.append(2) 1.63 + if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES': 1.64 + group_by.append(3) 1.65 + words = get_words(lines, group_by) 1.66 stats_only = False 1.67 if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES': 1.68 stats_only = True 1.69 1.70 + 1.71 stats = {} 1.72 stats['total'] = sum(words[x] for x in words.keys()) 1.73 words = substract_dictionary(words, vocabulary) 1.74 @@ -402,17 +420,17 @@ 1.75 linked_words = find_linked_words(notes) 1.76 normalizator = Normalizator(config['language'], linked_words) 1.77 1.78 - word_pairs = [] 1.79 for k in sorted(words.keys(), key=lambda k: words[k], reverse=True): 1.80 - word_pairs.append((words[k], k)) 1.81 + words_with_freq = [] 1.82 + words_with_freq.append((words[k], k)) 1.83 1.84 - wgw = find_wordgroups_weights(word_pairs, normalizator) 1.85 - word_pairs = sorted( 1.86 - word_pairs, 1.87 + wgw = find_wordgroups_weights(words_with_freq, normalizator) 1.88 + words_with_freq = sorted( 1.89 + words_with_freq, 1.90 cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words), 1.91 reverse=True) 1.92 1.93 - print_words_sorted(word_pairs, stats, stats_only=stats_only) 1.94 + print_words_sorted(words_with_freq, stats, stats_only=stats_only) 1.95 1.96 (options, args) = parser.parse_args() 1.97 if options.language: