new-words
diff new-words.py @ 39:a598e0d25784
Implement add_notes (add_marks) and remove_notes (remove_marks) in Python
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Jan 23 14:25:52 2011 +0100 (2011-01-23) |
parents | adbc809d3924 |
children | c3a50c0d2400 |
line diff
1.1 --- a/new-words.py Sat Jan 22 23:42:31 2011 +0100 1.2 +++ b/new-words.py Sun Jan 23 14:25:52 2011 +0100 1.3 @@ -177,13 +177,82 @@ 1.4 def load_notes(files): 1.5 notes = {} 1.6 for filename in files: 1.7 - with open(filename) as f: 1.8 + with codecs.open(filename, "r", "utf-8") as f: 1.9 for line in f.readlines(): 1.10 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1) 1.11 notes.setdefault(word, {}) 1.12 notes[word][filename] = note 1.13 return notes 1.14 1.15 +def add_notes(lines, notes): 1.16 + notes_filename = notes_filenames()[0] 1.17 + result = [] 1.18 + for line in lines: 1.19 + if line.startswith('#'): 1.20 + result += [line] 1.21 + else: 1.22 + match_object = re.search('^\s*\S+\s*(\S+)', line) 1.23 + if match_object: 1.24 + word = match_object.group(1) 1.25 + if word in notes: 1.26 + logging.debug(word) 1.27 + logging.debug(line) 1.28 + if notes_filename in notes[word]: 1.29 + line = line.rstrip('\n') 1.30 + line = "%-30s %s\n" % (line, notes[word][notes_filename]) 1.31 + logging.debug(line) 1.32 + result += [line] 1.33 + else: 1.34 + result += [line] 1.35 + else: 1.36 + result += [line] 1.37 + return result 1.38 + 1.39 +def remove_notes(lines, notes_group): 1.40 + notes_filename = notes_filenames()[0] 1.41 + notes = {} 1.42 + for k in notes_group.keys(): 1.43 + if notes_filename in notes_group[k]: 1.44 + notes[k] = notes_group[k][notes_filename] 1.45 + 1.46 + result = [] 1.47 + for line in lines: 1.48 + line = line.rstrip('\n') 1.49 + match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line) 1.50 + if match_object: 1.51 + result.append("".join([ 1.52 + match_object.group(1), 1.53 + match_object.group(2), 1.54 + match_object.group(3), 1.55 + match_object.group(4), 1.56 + "\n" 1.57 + ])) 1.58 + notes[match_object.group(4)] = match_object.group(6) 1.59 + else: 1.60 + result.append(line+"\n") 1.61 + 1.62 + save_notes(notes_filename, notes) 1.63 + return result 1.64 + 1.65 +def save_notes(filename, notes): 1.66 + lines = [] 1.67 + 
saved_words = [] 1.68 + with codecs.open(filename, "r", "utf-8") as f: 1.69 + for line in f.readlines(): 1.70 + (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1) 1.71 + if word in notes: 1.72 + line = "%-29s %s\n" % (word, notes[word]) 1.73 + saved_words.append(word) 1.74 + lines.append(line) 1.75 + for word in [x for x in notes.keys() if not x in saved_words]: 1.76 + line = "%-29s %s\n" % (word, notes[word]) 1.77 + lines.append(line) 1.78 + 1.79 + with codecs.open(filename, "w", "utf-8") as f: 1.80 + for line in lines: 1.81 + f.write(line) 1.82 + 1.83 + 1.84 def print_words_sorted(words_freq): 1.85 for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True): 1.86 codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k)) 1.87 @@ -221,11 +290,9 @@ 1.88 for word in notes.keys(): 1.89 for note in notes[word].values(): 1.90 if "@" in note: 1.91 - logging.debug("%s %s" % (word, note)) 1.92 result = re.search(r'\@(\S*)', note) 1.93 if result: 1.94 main_word = result.group(1) 1.95 - logging.debug("%s %s" % (word, main_word)) 1.96 if main_word: 1.97 linked_words[word] = main_word 1.98 return linked_words 1.99 @@ -261,7 +328,6 @@ 1.100 lines = readlines_from_stdin() 1.101 notes = load_notes(notes_filenames()) 1.102 linked_words = find_linked_words(notes) 1.103 - logging.debug(linked_words) 1.104 normalizator = Normalizator(config['language'], linked_words) 1.105 1.106 wgw = find_wordgroups_weights(lines, normalizator) 1.107 @@ -271,14 +337,32 @@ 1.108 reverse=True): 1.109 codecs.getwriter("utf-8")(sys.stdout).write(line) 1.110 1.111 +def filter_add_notes(args): 1.112 + lines = readlines_from_file(args[0]) 1.113 + notes = load_notes(notes_filenames()) 1.114 + lines = add_notes(lines, notes) 1.115 + with codecs.open(args[0], "w", "utf-8") as f: 1.116 + for line in lines: 1.117 + f.write(line) 1.118 + 1.119 +def filter_remove_notes(args): 1.120 + lines = readlines_from_file(args[0]) 1.121 + notes = load_notes(notes_filenames()) 
1.122 + lines = remove_notes(lines, notes) 1.123 + with codecs.open(args[0], "w", "utf-8") as f: 1.124 + for line in lines: 1.125 + f.write(line) 1.126 + 1.127 (options, args) = parser.parse_args() 1.128 if options.language: 1.129 config['language'] = options.language 1.130 1.131 if options.function: 1.132 function_names = { 1.133 - 'get_words' : filter_get_words, 1.134 + 'get_words' : filter_get_words, 1.135 'group_words' : filter_group_words, 1.136 + 'add_notes' : filter_add_notes, 1.137 + 'remove_notes': filter_remove_notes, 1.138 } 1.139 if options.function in function_names: 1.140 function_names[options.function](args)