new-words
changeset 39:a598e0d25784
add_notes (add_marks) + remove_notes (remove_marks) implemented in python
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Jan 23 14:25:52 2011 +0100 (2011-01-23) |
parents | adbc809d3924 |
children | c3a50c0d2400 |
files | new-words-py.sh new-words.py |
line diff
```diff
--- a/new-words-py.sh Sat Jan 22 23:42:31 2011 +0100
+++ b/new-words-py.sh Sun Jan 23 14:25:52 2011 +0100
@@ -578,6 +578,16 @@
 
 add_marks()
 {
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1"
+    else
+        group_words_OLD "$@"
+    fi
+}
+
+add_marks_OLD()
+{
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
 use Encode;
@@ -632,6 +642,16 @@
 
 remove_marks()
 {
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
+    else
+        group_words_OLD "$@"
+    fi
+}
+
+remove_marks_OLD()
+{
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
 $file = $ARGV[0];
```
```diff
--- a/new-words.py Sat Jan 22 23:42:31 2011 +0100
+++ b/new-words.py Sun Jan 23 14:25:52 2011 +0100
@@ -177,13 +177,82 @@
 def load_notes(files):
     notes = {}
     for filename in files:
-        with open(filename) as f:
+        with codecs.open(filename, "r", "utf-8") as f:
             for line in f.readlines():
                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
                 notes.setdefault(word, {})
                 notes[word][filename] = note
     return notes
 
+def add_notes(lines, notes):
+    notes_filename = notes_filenames()[0]
+    result = []
+    for line in lines:
+        if line.startswith('#'):
+            result += [line]
+        else:
+            match_object = re.search('^\s*\S+\s*(\S+)', line)
+            if match_object:
+                word = match_object.group(1)
+                if word in notes:
+                    logging.debug(word)
+                    logging.debug(line)
+                    if notes_filename in notes[word]:
+                        line = line.rstrip('\n')
+                        line = "%-30s %s\n" % (line, notes[word][notes_filename])
+                        logging.debug(line)
+                        result += [line]
+                else:
+                    result += [line]
+            else:
+                result += [line]
+    return result
+
+def remove_notes(lines, notes_group):
+    notes_filename = notes_filenames()[0]
+    notes = {}
+    for k in notes_group.keys():
+        if notes_filename in notes_group[k]:
+            notes[k] = notes_group[k][notes_filename]
+
+    result = []
+    for line in lines:
+        line = line.rstrip('\n')
+        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
+        if match_object:
+            result.append("".join([
+                match_object.group(1),
+                match_object.group(2),
+                match_object.group(3),
+                match_object.group(4),
+                "\n"
+                ]))
+            notes[match_object.group(4)] = match_object.group(6)
+        else:
+            result.append(line+"\n")
+
+    save_notes(notes_filename, notes)
+    return result
+
+def save_notes(filename, notes):
+    lines = []
+    saved_words = []
+    with codecs.open(filename, "r", "utf-8") as f:
+        for line in f.readlines():
+            (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
+            if word in notes:
+                line = "%-29s %s\n" % (word, notes[word])
+                saved_words.append(word)
+            lines.append(line)
+    for word in [x for x in notes.keys() if not x in saved_words]:
+        line = "%-29s %s\n" % (word, notes[word])
+        lines.append(line)
+
+    with codecs.open(filename, "w", "utf-8") as f:
+        for line in lines:
+            f.write(line)
+
+
 def print_words_sorted(words_freq):
     for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
         codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
@@ -221,11 +290,9 @@
     for word in notes.keys():
         for note in notes[word].values():
             if "@" in note:
-                logging.debug("%s %s" % (word, note))
                 result = re.search(r'\@(\S*)', note)
                 if result:
                     main_word = result.group(1)
-                    logging.debug("%s %s" % (word, main_word))
                     if main_word:
                         linked_words[word] = main_word
     return linked_words
@@ -261,7 +328,6 @@
     lines = readlines_from_stdin()
     notes = load_notes(notes_filenames())
     linked_words = find_linked_words(notes)
-    logging.debug(linked_words)
     normalizator = Normalizator(config['language'], linked_words)
 
     wgw = find_wordgroups_weights(lines, normalizator)
@@ -271,14 +337,32 @@
                 reverse=True):
         codecs.getwriter("utf-8")(sys.stdout).write(line)
 
+def filter_add_notes(args):
+    lines = readlines_from_file(args[0])
+    notes = load_notes(notes_filenames())
+    lines = add_notes(lines, notes)
+    with codecs.open(args[0], "w", "utf-8") as f:
+        for line in lines:
+            f.write(line)
+
+def filter_remove_notes(args):
+    lines = readlines_from_file(args[0])
+    notes = load_notes(notes_filenames())
+    lines = remove_notes(lines, notes)
+    with codecs.open(args[0], "w", "utf-8") as f:
+        for line in lines:
+            f.write(line)
+
 (options, args) = parser.parse_args()
 if options.language:
     config['language'] = options.language
 
 if options.function:
     function_names = {
-        'get_words' : filter_get_words,
+        'get_words' :   filter_get_words,
         'group_words' : filter_group_words,
+        'add_notes' :   filter_add_notes,
+        'remove_notes': filter_remove_notes,
     }
     if options.function in function_names:
         function_names[options.function](args)
```