# HG changeset patch
# User Igor Chubin
# Date 1295789152 -3600
# Node ID a598e0d25784559ddc8d5b33532712a9bb9a6ed2
# Parent  adbc809d39242cc78d735fafc32f3adcbe1c2519
add_notes (add_marks) + remove_notes (remove_marks) implemented in python

new-words-py.sh: add_marks and remove_marks dispatch to new-words.py
(-f add_notes / -f remove_notes) when OLD_STYLE is NO; otherwise they fall
back to the previous Perl-based bodies, which are renamed to add_marks_OLD
and remove_marks_OLD.

new-words.py: add_notes appends, to every non-comment vocabulary line, the
note stored for its word in the first notes file. remove_notes cuts the
trailing note off every line, records it for the word and hands the result
to save_notes, which rewrites the first notes file: lines of known words are
replaced in place, words not yet present are appended. Notes files are now
read as UTF-8.

diff -r adbc809d3924 -r a598e0d25784 new-words-py.sh
--- a/new-words-py.sh	Sat Jan 22 23:42:31 2011 +0100
+++ b/new-words-py.sh	Sun Jan 23 14:25:52 2011 +0100
@@ -578,6 +578,16 @@
 
 add_marks()
 {
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1"
+    else
+        add_marks_OLD "$@"
+    fi
+}
+
+add_marks_OLD()
+{
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
 use Encode;
@@ -632,6 +642,16 @@
 
 remove_marks()
 {
+    if [ "$OLD_STYLE" = NO ]
+    then
+        $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
+    else
+        remove_marks_OLD "$@"
+    fi
+}
+
+remove_marks_OLD()
+{
     PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
     cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
 $file = $ARGV[0];
diff -r adbc809d3924 -r a598e0d25784 new-words.py
--- a/new-words.py	Sat Jan 22 23:42:31 2011 +0100
+++ b/new-words.py	Sun Jan 23 14:25:52 2011 +0100
@@ -177,13 +177,82 @@
 def load_notes(files):
     notes = {}
     for filename in files:
-        with open(filename) as f:
+        with codecs.open(filename, "r", "utf-8") as f:
             for line in f.readlines():
                 (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
                 notes.setdefault(word, {})
                 notes[word][filename] = note
     return notes
 
+def add_notes(lines, notes):
+    notes_filename = notes_filenames()[0]
+    result = []
+    for line in lines:
+        if line.startswith('#'):
+            result += [line]
+        else:
+            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
+            if match_object:
+                word = match_object.group(1)
+                if word in notes:
+                    logging.debug(word)
+                    logging.debug(line)
+                    if notes_filename in notes[word]:
+                        line = line.rstrip('\n')
+                        line = "%-30s %s\n" % (line, notes[word][notes_filename])
+                        logging.debug(line)
+                    result += [line]
+                else:
+                    result += [line]
+            else:
+                result += [line]
+    return result
+
+def remove_notes(lines, notes_group):
+    notes_filename = notes_filenames()[0]
+    notes = {}
+    for k in notes_group.keys():
+        if notes_filename in notes_group[k]:
+            notes[k] = notes_group[k][notes_filename]
+
+    result = []
+    for line in lines:
+        line = line.rstrip('\n')
+        match_object = re.match(r'(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
+        if match_object:
+            result.append("".join([
+                match_object.group(1),
+                match_object.group(2),
+                match_object.group(3),
+                match_object.group(4),
+                "\n"
+                ]))
+            notes[match_object.group(4)] = match_object.group(6)
+        else:
+            result.append(line+"\n")
+
+    save_notes(notes_filename, notes)
+    return result
+
+def save_notes(filename, notes):
+    lines = []
+    saved_words = set()
+    with codecs.open(filename, "r", "utf-8") as f:
+        for line in f.readlines():
+            word = re.split(r'\s+', line.rstrip('\n'), maxsplit=1)[0]
+            if word in notes:
+                line = "%-29s %s\n" % (word, notes[word])
+                saved_words.add(word)
+            lines.append(line)
+    for word in [x for x in notes.keys() if x not in saved_words]:
+        line = "%-29s %s\n" % (word, notes[word])
+        lines.append(line)
+
+    with codecs.open(filename, "w", "utf-8") as f:
+        for line in lines:
+            f.write(line)
+
+
 def print_words_sorted(words_freq):
     for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
         codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
@@ -221,11 +290,9 @@
     for word in notes.keys():
         for note in notes[word].values():
             if "@" in note:
-                logging.debug("%s %s" % (word, note))
                 result = re.search(r'\@(\S*)', note)
                 if result:
                     main_word = result.group(1)
-                    logging.debug("%s %s" % (word, main_word))
                     if main_word:
                         linked_words[word] = main_word
     return linked_words
@@ -261,7 +328,6 @@
     lines = readlines_from_stdin()
     notes = load_notes(notes_filenames())
     linked_words = find_linked_words(notes)
-    logging.debug(linked_words)
     normalizator = Normalizator(config['language'], linked_words)
 
     wgw = find_wordgroups_weights(lines, normalizator)
@@ -271,14 +337,32 @@
                 reverse=True):
         codecs.getwriter("utf-8")(sys.stdout).write(line)
 
+def filter_add_notes(args):
+    lines = readlines_from_file(args[0])
+    notes = load_notes(notes_filenames())
+    lines = add_notes(lines, notes)
+    with codecs.open(args[0], "w", "utf-8") as f:
+        for line in lines:
+            f.write(line)
+
+def filter_remove_notes(args):
+    lines = readlines_from_file(args[0])
+    notes = load_notes(notes_filenames())
+    lines = remove_notes(lines, notes)
+    with codecs.open(args[0], "w", "utf-8") as f:
+        for line in lines:
+            f.write(line)
+
 (options, args) = parser.parse_args()
 if options.language:
     config['language'] = options.language
 
 if options.function:
     function_names = {
-        'get_words' : filter_get_words,
+        'get_words'   : filter_get_words,
         'group_words' : filter_group_words,
+        'add_notes'   : filter_add_notes,
+        'remove_notes': filter_remove_notes,
     }
     if options.function in function_names:
         function_names[options.function](args)