new-words
diff new-words.py @ 54:e25de9ea9184
new-words.py is almost ready
author      Igor Chubin <igor@chub.in>
date        Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents     f583256b7ab1
children    2a1a25e61872
line diff
--- a/new-words.py	Mon Oct 31 20:21:20 2011 +0200
+++ b/new-words.py	Tue Nov 01 20:19:18 2011 +0100
@@ -11,6 +11,7 @@
 import subprocess
 import sys
 import Stemmer
+import tempfile
 try:
     import psyco
     psyco.full()
@@ -141,7 +142,13 @@
     dest="language")

 parser.add_option(
-    "-f", "--function",
+    "-f", "--allowed-words",
+    help="file with list of allowed words (words that will be shown in the output)",
+    action="store",
+    dest="allowed_words")
+
+parser.add_option(
+    "-X", "--function",
     help="filter through subsystem [INTERNAL]",
     action="store",
     dest="function")
@@ -183,6 +190,12 @@
     dest="delete_tag")

 parser.add_option(
+    "-R", "--show-range-percentage",
+    help="show only words that cover specified percentage of the text, skip the rest",
+    action="store",
+    dest="show_range_percentage")
+
+parser.add_option(
     "-s", "--text-stats",
     help="show the text statistics (percentage of known words and so on) and exit",
     action="store_true",
@@ -225,6 +238,16 @@
         res += [line]
     return res

+def readlines_from_url(url):
+    return [x.decode('utf-8') for x in
+        subprocess.Popen(
+            "lynx -dump '{url}' | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'".format(url=url),
+            shell = True,
+            stdout = subprocess.PIPE,
+            stderr = subprocess.STDOUT
+            ).communicate()[0].split('\n')
+        ]
+
 def readlines_from_stdin():
     return codecs.getreader("utf-8")(sys.stdin).readlines()

@@ -261,8 +284,11 @@
     logging.debug(result)
     return result

+def voc_filename():
+    return "%s/%s.txt"%(config['config_directory'], config['language'])
+
 def load_vocabulary():
-    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
+    return get_words(readlines_from_file(voc_filename()))

 def notes_filenames():
     return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
@@ -409,8 +435,10 @@
         show_range=0,
         show_range_percentage=0,
         ):
+    result = []
     if stats_only:
-        codecs.getwriter("utf-8")(sys.stdout).write(
+        #codecs.getwriter("utf-8")(sys.stdout).write(
+        result.append(
             " ".join([
                 "%-10s" % x for x in [
                     "LANG",
@@ -421,7 +449,7 @@
                     "WPS",
                     "UWPS*10"
                 ]]) + "\n")
-        codecs.getwriter("utf-8")(sys.stdout).write(
+        result.append(
             " ".join([
                 "%(language)-10s",
                 "%(percentage)-10.2f",
@@ -431,10 +459,10 @@
                 "%(wps)-11d"
                 "%(uwps)-11d"
             ]) % stats + "\n")
-        return
+        return "".join(result)

     if print_stats:
-        codecs.getwriter("utf-8")(sys.stdout).write(
+        result.append(
             "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)

     level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
@@ -448,14 +476,12 @@

         normalized_word = normalizator.normalize(word_pair[1])
         if old_normalized_word and old_normalized_word != normalized_word:
-            #codecs.getwriter("utf-8")(sys.stdout).write(
-            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
             if compressed_wordlist:
                 compressed_word_pair = (
                     sum(x[0] for x in words_of_this_group),
                     normalizator.best_word_from_group(words_of_this_group)
                     )
-                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
+                result.append("%10s %s\n" % compressed_word_pair)
                 printed_words += 1
             words_of_this_group = []

@@ -463,7 +489,7 @@
         words_of_this_group.append(word_pair)

         if not compressed_wordlist:
-            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
+            result.append("%10s %s\n" % word_pair)
             printed_words += 1


@@ -473,28 +499,14 @@
         while 100.0*known/total > level_lines[0]:
             current_level = level_lines[0]
             level_lines = level_lines[1:]
-            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
+            result.append("# %s\n" % current_level)

         if show_range >0 and printed_words >= show_range:
             break
         if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
             break

-def filter_add_notes(args):
-    lines = readlines_from_file(args[0])
-    notes = load_notes(notes_filenames())
-    lines = add_notes(lines, notes)
-    with codecs.open(args[0], "w", "utf-8") as f:
-        for line in lines:
-            f.write(line)
-
-def filter_remove_notes(args):
-    lines = readlines_from_file(args[0])
-    notes = load_notes(notes_filenames())
-    lines = remove_notes(lines, notes)
-    with codecs.open(args[0], "w", "utf-8") as f:
-        for line in lines:
-            f.write(line)
+    return result

 def parse_parts_description(parts_description):
     """
@@ -503,8 +515,6 @@
         from-to/step
         from+delta/step
     """
-    def incorrect_parts_description(pd):
-        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)

     try:
         (a, step) = parts_description.split("/", 1)
@@ -525,7 +535,7 @@
         return (start, stop, step)

     except:
-        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
+        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)


 def take_part(lines, part_description = None):
@@ -536,27 +546,45 @@
     part_size = (1.0*n) / step
     result = []
     for i in range(n):
-        if part_size * i >= start and part_size * i <= stop:
-            result += lines[i]
+        if i >= start * part_size and i <= stop * part_size:
+            result += [lines[i]]
     return result

 def filter_get_words_group_words_add_stat(args):
     vocabulary = load_vocabulary()
     notes = load_notes(notes_filenames())
-    lines = take_part(readlines_from_stdin(), config.get('pages', ''))
+
+    if len(args) > 0:
+        if 'http://' in args[0]:
+            input_lines = readlines_from_url(args[0])
+        else:
+            input_lines = readlines_from_file(args[0])
+    else:
+        input_lines = readlines_from_stdin()
+
+    if len(input_lines) == 0:
+        print >> sys.stderr, "Nothing to do, standard input is empty, exiting."
+        sys.exit(1)
+
+    lines = take_part(input_lines, config.get('pages', ''))
+
+    (_, original_text_tempfile) = tempfile.mkstemp(prefix='new-word')
+    with codecs.open(original_text_tempfile, "w", "utf-8") as f:
+        f.write("".join(lines))
+
     group_by = [1]

-    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
+    if 'two_words' in config:
         group_by.append(2)
-    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
+    if 'three_words' in config:
         group_by.append(3)
     words = get_words(lines, group_by)
     stats_only = False
-    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
+    if 'text_stats' in config:
         stats_only = True

     compressed_wordlist = False
-    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
+    if 'compressed' in config:
         compressed_wordlist = True

     show_range = os.environ.get('SHOW_RANGE', '')
@@ -564,16 +592,16 @@
         show_range = int(show_range)
     else:
         show_range = 0
-    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
-    if show_range_percentage != '':
-        show_range_percentage = int(show_range_percentage)
+
+    if 'show_range_percentage' in config:
+        show_range_percentage = int(config['show_range_percentage'])
     else:
         show_range_percentage = 0


     stats = {}
     stats['total'] = sum(words[x] for x in words.keys())
-    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
+    if not 'no_filter' in config:
         words = substract_dictionary(words, vocabulary)

     stats['total_unknown'] = sum(words[x] for x in words.keys())
@@ -591,8 +619,8 @@
     normalizator = Normalizator(config['language'], linked_words)

     # filter words by allowed_words_filter
-    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
-        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
+    if 'allowed_words' in config:
+        allowed_words_filename = config['allowed_words']
         normalized_allowed_words = [
             normalizator.normalize(w.rstrip('\n'))
             for w in readlines_from_file(allowed_words_filename)
@@ -615,7 +643,7 @@
         cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
         reverse=True)

-    print_words_sorted(
+    output = print_words_sorted(
         words_with_freq,
         stats,
         normalizator,
@@ -625,14 +653,87 @@
         show_range_percentage=show_range_percentage,
         )

+
+    if ('non_interactive' in config or 'text_stats' in config):
+        codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
+    else:
+        (_, temp1) = tempfile.mkstemp(prefix='new-word')
+        (_, temp2) = tempfile.mkstemp(prefix='new-word')
+
+        with codecs.open(temp1, "w", "utf-8") as f:
+            f.write("".join(output))
+        with codecs.open(temp2, "w", "utf-8") as f:
+            f.write("".join(add_notes(output, notes)))
+
+        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
+        os.system((
+            "vim"
+                " -c 'setlocal spell spelllang={language}'"
+                " -c 'set keywordprg={language}'"
+                " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
+                " {filename}"
+                " < /dev/tty > /dev/tty"
+            ).format(language=config['language'], filename=temp2))
+
+        lines = remove_notes(readlines_from_file(temp2), notes)
+
+        # compare lines_before and lines_after and return deleted words
+        lines_before = output
+        lines_after = lines
+        deleted_words = []
+
+        for line in lines_before:
+            if line not in lines_after:
+                line = line.strip()
+                if ' ' in line:
+                    word = re.split('\s+', line, 1)[1]
+                    if ' ' in word:
+                        word = re.split('\s+', word, 1)[0]
+                    deleted_words.append(word)
+
+        with codecs.open(voc_filename(), "a", "utf-8") as f:
+            f.write("\n".join(deleted_words + ['']))
+
+        os.unlink(temp1)
+        os.unlink(temp2)
+
+    os.unlink(original_text_tempfile)
+
 (options, args) = parser.parse_args()
 if options.language:
     config['language'] = options.language

+if options.pages:
+    config['pages'] = options.pages
+else:
+    config['pages'] = ""
+
+if options.allowed_words:
+    config['allowed_words'] = options.allowed_words
+
+if options.show_range_percentage:
+    config['show_range_percentage'] = options.show_range_percentage
+
+if options.non_interactive:
+    config['non_interactive'] = True
+
+if options.text_stats:
+    config['text_stats'] = True
+
+if options.compressed:
+    config['compressed'] = True
+
+if options.no_filter:
+    config['no_filter'] = True
+
+if options.two_words:
+    config['two_words'] = True
+
+if options.three_words:
+    config['three_words'] = True
+
 if options.function:
     function_names = {
-        'add_notes' : filter_add_notes,
-        'remove_notes': filter_remove_notes,
         'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
     }
     if options.function in function_names:
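
Note on the corrected take_part() condition above: the old code compared part_size * i against start and stop and appended lines[i] to a list with +=, which spreads a string into individual characters. The standalone sketch below (not part of new-words.py; the exact tuple that parse_parts_description() returns for a description like "2/5" is an assumption here) shows the intended arithmetic: the part boundaries, not the line index, are scaled by part_size, and whole lines are collected.

# Hedged sketch, independent of new-words.py: the corrected take_part()
# arithmetic, assuming "the second of five parts" maps to
# (start, stop, step) = (1, 2, 5).
def take_part_sketch(lines, start, stop, step):
    n = len(lines)
    part_size = (1.0 * n) / step              # lines per part
    # scale the part boundaries, not the index, and collect whole lines
    return [lines[i] for i in range(n)
            if start * part_size <= i <= stop * part_size]

pages = ["line %d\n" % i for i in range(10)]
print(take_part_sketch(pages, 1, 2, 5))       # -> ['line 2\n', 'line 3\n', 'line 4\n']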
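
The largest addition is the interactive branch at the end of filter_get_words_group_words_add_stat(): the sorted word list is written to a temporary file, annotated with notes, opened in vim with spell checking set to the configured language, and read back once the editor exits; lines the user deleted are treated as known words and appended to the vocabulary file named by voc_filename(). Below is a simplified standalone sketch of that before/after comparison (function and variable names are hypothetical, not the script's own).

import re

# Hedged sketch: extract the word from every line that disappeared during
# the editing session.  Wordlist lines are assumed to look like
# "        12 koira" (frequency, then the word, optionally followed by notes).
def words_removed_in_editor(lines_before, lines_after):
    deleted = []
    for line in lines_before:
        if line not in lines_after:
            fields = re.split(r'\s+', line.strip(), 1)   # ["12", "koira ..."]
            if len(fields) == 2:
                deleted.append(fields[1].split()[0])     # keep only the word
    return deleted

before = ["        12 koira\n", "         7 kissa\n"]
after  = ["        12 koira\n"]
print(words_removed_in_editor(before, after))            # -> ['kissa']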