new-words

diff new-words.py @ 39:a598e0d25784
add_notes (add_marks) and remove_notes (remove_marks) implemented in Python
author: Igor Chubin <igor@chub.in>
date: Sun Jan 23 14:25:52 2011 +0100 (2011-01-23)
parents: adbc809d3924
children: c3a50c0d2400
     1.1 --- a/new-words.py	Sat Jan 22 23:42:31 2011 +0100
     1.2 +++ b/new-words.py	Sun Jan 23 14:25:52 2011 +0100
     1.3 @@ -177,13 +177,82 @@
     1.4  def load_notes(files):
     1.5      notes = {}
     1.6      for filename in files:
     1.7 -        with open(filename) as f:
     1.8 +        with codecs.open(filename, "r", "utf-8") as f:
     1.9              for line in f.readlines():
    1.10                  (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
    1.11                  notes.setdefault(word, {})
    1.12                  notes[word][filename] = note
    1.13      return notes
    1.14  
    1.15 +def add_notes(lines, notes):
    1.16 +    notes_filename = notes_filenames()[0]
    1.17 +    result = []
    1.18 +    for line in lines:
    1.19 +        if line.startswith('#'):
    1.20 +            result += [line]
    1.21 +        else:
    1.22 +            match_object = re.search('^\s*\S+\s*(\S+)', line)
    1.23 +            if match_object:
    1.24 +                word = match_object.group(1)
    1.25 +                if word in notes:
    1.26 +                    logging.debug(word)
    1.27 +                    logging.debug(line)
    1.28 +                    if notes_filename in notes[word]:
    1.29 +                        line = line.rstrip('\n')
    1.30 +                        line = "%-30s %s\n" % (line, notes[word][notes_filename])
    1.31 +                        logging.debug(line)
    1.32 +                        result += [line]
    1.33 +                else:
    1.34 +                    result += [line]
    1.35 +            else:
    1.36 +                result += [line]
    1.37 +    return result
    1.38 +
    1.39 +def remove_notes(lines, notes_group):
    1.40 +    notes_filename = notes_filenames()[0]
    1.41 +    notes = {}
    1.42 +    for k in notes_group.keys():
    1.43 +        if notes_filename in notes_group[k]:
    1.44 +            notes[k] = notes_group[k][notes_filename]
    1.45 +
    1.46 +    result = []
    1.47 +    for line in lines:
    1.48 +        line = line.rstrip('\n')
    1.49 +        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)
    1.50 +        if match_object:
    1.51 +            result.append("".join([
    1.52 +                match_object.group(1),
    1.53 +                match_object.group(2),
    1.54 +                match_object.group(3),
    1.55 +                match_object.group(4),
    1.56 +                "\n"
    1.57 +                ]))
    1.58 +            notes[match_object.group(4)] = match_object.group(6)
    1.59 +        else:
    1.60 +            result.append(line+"\n")
    1.61 +
    1.62 +    save_notes(notes_filename, notes)
    1.63 +    return result
    1.64 +
    1.65 +def save_notes(filename, notes):
    1.66 +    lines = []
    1.67 +    saved_words = []
    1.68 +    with codecs.open(filename, "r", "utf-8") as f:
    1.69 +        for line in f.readlines():
    1.70 +            (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
    1.71 +            if word in notes:
    1.72 +                line = "%-29s %s\n" % (word, notes[word])
    1.73 +                saved_words.append(word)
    1.74 +            lines.append(line)
    1.75 +    for word in [x for x in notes.keys() if not x in saved_words]:
    1.76 +        line = "%-29s %s\n" % (word, notes[word])
    1.77 +        lines.append(line)
    1.78 +
    1.79 +    with codecs.open(filename, "w", "utf-8") as f:
    1.80 +        for line in lines:
    1.81 +            f.write(line)
    1.82 +
    1.83 +
    1.84  def print_words_sorted(words_freq):
    1.85      for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
    1.86          codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
    1.87 @@ -221,11 +290,9 @@
    1.88      for word in notes.keys():
    1.89          for note in notes[word].values():
    1.90              if "@" in note:
    1.91 -                logging.debug("%s %s" % (word, note))
    1.92                  result = re.search(r'\@(\S*)', note)
    1.93                  if result:
    1.94                      main_word = result.group(1)
    1.95 -                    logging.debug("%s %s" % (word, main_word))
    1.96                      if main_word:
    1.97                          linked_words[word] = main_word
    1.98      return linked_words
    1.99 @@ -261,7 +328,6 @@
   1.100      lines = readlines_from_stdin()
   1.101      notes = load_notes(notes_filenames())
   1.102      linked_words = find_linked_words(notes)
   1.103 -    logging.debug(linked_words)
   1.104      normalizator = Normalizator(config['language'], linked_words)
   1.105  
   1.106      wgw = find_wordgroups_weights(lines, normalizator)
   1.107 @@ -271,14 +337,32 @@
   1.108                  reverse=True):
   1.109          codecs.getwriter("utf-8")(sys.stdout).write(line)
   1.110  
   1.111 +def filter_add_notes(args):
   1.112 +    lines = readlines_from_file(args[0])
   1.113 +    notes = load_notes(notes_filenames())
   1.114 +    lines = add_notes(lines, notes)
   1.115 +    with codecs.open(args[0], "w", "utf-8") as f:
   1.116 +        for line in lines:
   1.117 +            f.write(line)
   1.118 +
   1.119 +def filter_remove_notes(args):
   1.120 +    lines = readlines_from_file(args[0])
   1.121 +    notes = load_notes(notes_filenames())
   1.122 +    lines = remove_notes(lines, notes)
   1.123 +    with codecs.open(args[0], "w", "utf-8") as f:
   1.124 +        for line in lines:
   1.125 +            f.write(line)
   1.126 +
   1.127  (options, args) = parser.parse_args()
   1.128  if options.language:
   1.129      config['language'] = options.language
   1.130  
   1.131  if options.function:
   1.132      function_names = {
   1.133 -        'get_words' : filter_get_words,
   1.134 +        'get_words' :   filter_get_words,
   1.135          'group_words' : filter_group_words,
   1.136 +        'add_notes' :   filter_add_notes,
   1.137 +        'remove_notes': filter_remove_notes,
   1.138      }
   1.139      if options.function in function_names:
   1.140          function_names[options.function](args)
author	Igor Chubin <igor@chub.in>
date	Sun Jan 23 14:25:52 2011 +0100 (2011-01-23)
parents	adbc809d3924
children	c3a50c0d2400