new-words
diff new-words.py @ 40:c3a50c0d2400
Functions for adding/removing notes and for statistics are now implemented in Python.
Option -O (old-style) is not supported anymore. If you need old-style new-words, use new-words.sh.
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Jan 23 17:09:44 2011 +0100 (2011-01-23) |
parents | a598e0d25784 |
children | 4629e08b0d87 |
line diff
1.1 --- a/new-words.py Sun Jan 23 14:25:52 2011 +0100 1.2 +++ b/new-words.py Sun Jan 23 17:09:44 2011 +0100 1.3 @@ -1,6 +1,7 @@ 1.4 #!/usr/bin/env python 1.5 # -*- coding: utf-8 -*- 1.6 1.7 +from __future__ import with_statement 1.8 import codecs 1.9 import logging 1.10 import os 1.11 @@ -253,10 +254,6 @@ 1.12 f.write(line) 1.13 1.14 1.15 -def print_words_sorted(words_freq): 1.16 - for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True): 1.17 - codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k)) 1.18 - 1.19 def substract_dictionary(dict1, dict2): 1.20 """ 1.21 returns dict1 - dict2 1.22 @@ -275,14 +272,12 @@ 1.23 def error_message(text): 1.24 print text 1.25 1.26 -def find_wordgroups_weights(lines, normalizator): 1.27 +def find_wordgroups_weights(word_pairs, normalizator): 1.28 weight = {} 1.29 - for line in lines: 1.30 - line = re.sub('^\s*', '', line.rstrip('\n')) 1.31 - (num, word) = re.split('\s+', line, maxsplit=1) 1.32 + for (num, word) in word_pairs: 1.33 normalized = normalizator.normalize(word) 1.34 weight.setdefault(normalized, 0) 1.35 - weight[normalized] += int(num) 1.36 + weight[normalized] += num 1.37 return weight 1.38 1.39 def find_linked_words(notes): 1.40 @@ -297,12 +292,9 @@ 1.41 linked_words[word] = main_word 1.42 return linked_words 1.43 1.44 - 1.45 -def compare_word_lines(line1, line2, wgw, normalizator, linked_words): 1.46 - line1 = re.sub('^\s*', '', line1.rstrip('\n')) 1.47 - (num1, word1) = re.split('\s+', line1, 1) 1.48 - line2 = re.sub('^\s*', '', line2.rstrip('\n')) 1.49 - (num2, word2) = re.split('\s+', line2, 1) 1.50 +def compare_word_pairs(pair1, pair2, wgw, normalizator, linked_words): 1.51 + (num1, word1) = pair1 1.52 + (num2, word2) = pair2 1.53 1.54 normalized_word1 = normalizator.normalize(word1) 1.55 normalized_word2 = normalizator.normalize(word2) 1.56 @@ -317,25 +309,28 @@ 1.57 else: 1.58 return cmp(int(num1), int(num2)) 1.59 1.60 -def filter_get_words(args): 1.61 - vocabulary = 
load_vocabulary() 1.62 - words = get_words(readlines_from_stdin()) 1.63 - dump_words(words, args[0]) 1.64 - words = substract_dictionary(words, vocabulary) 1.65 - print_words_sorted(words) 1.66 +def print_words_sorted(word_pairs, stats, print_stats=True, stats_only=False): 1.67 + if stats_only: 1.68 + codecs.getwriter("utf-8")(sys.stdout).write("stat_only") 1.69 + return 1.70 1.71 -def filter_group_words(args): 1.72 - lines = readlines_from_stdin() 1.73 - notes = load_notes(notes_filenames()) 1.74 - linked_words = find_linked_words(notes) 1.75 - normalizator = Normalizator(config['language'], linked_words) 1.76 + if print_stats: 1.77 + codecs.getwriter("utf-8")(sys.stdout).write( 1.78 + "# %(language)s, %(percentage)s, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats) 1.79 1.80 - wgw = find_wordgroups_weights(lines, normalizator) 1.81 - for line in sorted( 1.82 - lines, 1.83 - cmp=lambda x,y:compare_word_lines(x,y, wgw, normalizator, linked_words), 1.84 - reverse=True): 1.85 - codecs.getwriter("utf-8")(sys.stdout).write(line) 1.86 + level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102) 1.87 + known = int(stats['total_known']) 1.88 + total = int(stats['total']) 1.89 + current_level = 0 1.90 + for word_pair in word_pairs: 1.91 + codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair) 1.92 + known += word_pair[0] 1.93 + if 100.0*known/total >= level_lines[0]: 1.94 + current_level = level_lines[0] 1.95 + while 100.0*known/total > level_lines[0]: 1.96 + current_level = level_lines[0] 1.97 + level_lines = level_lines[1:] 1.98 + codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level) 1.99 1.100 def filter_add_notes(args): 1.101 lines = readlines_from_file(args[0]) 1.102 @@ -353,16 +348,48 @@ 1.103 for line in lines: 1.104 f.write(line) 1.105 1.106 +def filter_get_words_group_words_add_stat(args): 1.107 + vocabulary = load_vocabulary() 1.108 + notes = load_notes(notes_filenames()) 1.109 + lines = 
readlines_from_stdin() 1.110 + words = get_words(lines) 1.111 + 1.112 + stats = {} 1.113 + stats['total'] = sum(words[x] for x in words.keys()) 1.114 + words = substract_dictionary(words, vocabulary) 1.115 + 1.116 + stats['total_unknown'] = sum(words[x] for x in words.keys()) 1.117 + stats['total_known'] = stats['total'] - stats['total_unknown'] 1.118 + stats['percentage'] = "%7.2f"%(100.0*stats['total_known']/stats['total']) 1.119 + stats['groups'] = 0 1.120 + stats['words'] = len(words) 1.121 + stats['sentences'] = 0 #FIXME 1.122 + stats['language'] = config['language'] 1.123 + 1.124 + linked_words = find_linked_words(notes) 1.125 + normalizator = Normalizator(config['language'], linked_words) 1.126 + 1.127 + word_pairs = [] 1.128 + for k in sorted(words.keys(), key=lambda k: words[k], reverse=True): 1.129 + word_pairs.append((words[k], k)) 1.130 + 1.131 + wgw = find_wordgroups_weights(word_pairs, normalizator) 1.132 + word_pairs = sorted( 1.133 + word_pairs, 1.134 + cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words), 1.135 + reverse=True) 1.136 + 1.137 + print_words_sorted(word_pairs, stats) 1.138 + 1.139 (options, args) = parser.parse_args() 1.140 if options.language: 1.141 config['language'] = options.language 1.142 1.143 if options.function: 1.144 function_names = { 1.145 - 'get_words' : filter_get_words, 1.146 - 'group_words' : filter_group_words, 1.147 'add_notes' : filter_add_notes, 1.148 'remove_notes': filter_remove_notes, 1.149 + 'get_words_group_words_add_stat': filter_get_words_group_words_add_stat, 1.150 } 1.151 if options.function in function_names: 1.152 function_names[options.function](args)