new-words

changeset 39:a598e0d25784

add_notes (add_marks) + remove_notes (remove_marks) implemented in python
author Igor Chubin <igor@chub.in>
date Sun Jan 23 14:25:52 2011 +0100 (2011-01-23)
parents adbc809d3924
children c3a50c0d2400
files new-words-py.sh new-words.py
line diff
     1.1 --- a/new-words-py.sh	Sat Jan 22 23:42:31 2011 +0100
     1.2 +++ b/new-words-py.sh	Sun Jan 23 14:25:52 2011 +0100
     1.3 @@ -578,6 +578,16 @@
     1.4  
     1.5  add_marks()
     1.6  {
     1.7 +   if [ "$OLD_STYLE" = NO ]
     1.8 +   then 
     1.9 +    $NEW_WORDS_PY -l "$LANGUAGE" -f add_notes "$1"
    1.10 +   else
    1.11 +    add_marks_OLD "$@"   # fall back to the previous implementation of add_marks
    1.12 +   fi
    1.13 +}
    1.14 +
    1.15 +add_marks_OLD()
    1.16 +{
    1.17      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
    1.18      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.19  use Encode;
    1.20 @@ -632,6 +642,16 @@
    1.21  
    1.22  remove_marks()
    1.23  {
    1.24 +   if [ "$OLD_STYLE" = NO ]
    1.25 +   then 
    1.26 +    $NEW_WORDS_PY -l "$LANGUAGE" -f remove_notes "$1"
    1.27 +   else
    1.28 +    remove_marks_OLD "$@"   # fall back to the previous implementation of remove_marks
    1.29 +   fi
    1.30 +}
    1.31 +
    1.32 +remove_marks_OLD()
    1.33 +{
    1.34      PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-grep-v-english-XXXXXXXX`
    1.35      cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.36  $file = $ARGV[0];
     2.1 --- a/new-words.py	Sat Jan 22 23:42:31 2011 +0100
     2.2 +++ b/new-words.py	Sun Jan 23 14:25:52 2011 +0100
     2.3 @@ -177,13 +177,82 @@
     2.4  def load_notes(files):
     2.5      notes = {}
     2.6      for filename in files:
     2.7 -        with open(filename) as f:
     2.8 +        with codecs.open(filename, "r", "utf-8") as f:  # decode the notes file as UTF-8 text
     2.9              for line in f.readlines():
    2.10                  (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)
    2.11                  notes.setdefault(word, {})
    2.12                  notes[word][filename] = note
    2.13      return notes
    2.14  
    2.15 +def add_notes(lines, notes):  # append the note from the first notes file to each matching line
    2.16 +    notes_filename = notes_filenames()[0]
    2.17 +    result = []
    2.18 +    for line in lines:
    2.19 +        if line.startswith('#'):
    2.20 +            result += [line]
    2.21 +        else:
    2.22 +            match_object = re.search(r'^\s*\S+\s*(\S+)', line)
    2.23 +            if match_object:
    2.24 +                word = match_object.group(1)
    2.25 +                if word in notes:
    2.26 +                    logging.debug(word)
    2.27 +                    logging.debug(line)
    2.28 +                    if notes_filename in notes[word]:
    2.29 +                        line = "%-30s %s\n" % (line.rstrip('\n'), notes[word][notes_filename])
    2.30 +                        logging.debug(line)
    2.31 +                    # Emit the line even when this notes file has no note for the word.
    2.32 +                    result += [line]
    2.33 +                else:
    2.34 +                    result += [line]
    2.35 +            else:
    2.36 +                result += [line]
    2.37 +    return result
    2.38 +
    2.39 +def remove_notes(lines, notes_group):  # strip notes from lines and save them to the first notes file
    2.40 +    notes_filename = notes_filenames()[0]
    2.41 +    notes = {}
    2.42 +    for k in notes_group.keys():  # keep only the notes that come from notes_filename
    2.43 +        if notes_filename in notes_group[k]:
    2.44 +            notes[k] = notes_group[k][notes_filename]
    2.45 +
    2.46 +    result = []
    2.47 +    for line in lines:
    2.48 +        line = line.rstrip('\n')
    2.49 +        match_object = re.match('(\s+)(\S+)(\s+)(\S+)(\s+)(.*)', line)  # groups: indent, field, gap, word, gap, note
    2.50 +        if match_object:
    2.51 +            result.append("".join([  # rebuild the line without the trailing gap and note
    2.52 +                match_object.group(1),
    2.53 +                match_object.group(2),
    2.54 +                match_object.group(3),
    2.55 +                match_object.group(4),
    2.56 +                "\n"
    2.57 +                ]))
    2.58 +            notes[match_object.group(4)] = match_object.group(6)  # note text from the line overrides the loaded one
    2.59 +        else:
    2.60 +            result.append(line+"\n")  # non-matching line: emitted with its newline restored
    2.61 +
    2.62 +    save_notes(notes_filename, notes)
    2.63 +    return result
    2.64 +
    2.65 +def save_notes(filename, notes):  # rewrite filename: update notes of listed words, append the rest
    2.66 +    lines = []
    2.67 +    saved_words = []
    2.68 +    with codecs.open(filename, "r", "utf-8") as f:
    2.69 +        for line in f.readlines():
    2.70 +            (word, note) = re.split('\s+', line.rstrip('\n'), maxsplit=1)  # ValueError if the line has no whitespace
    2.71 +            if word in notes:
    2.72 +                line = "%-29s %s\n" % (word, notes[word])  # replace the stored note with the new one
    2.73 +                saved_words.append(word)
    2.74 +            lines.append(line)
    2.75 +    for word in [x for x in notes.keys() if not x in saved_words]:  # words not yet present in the file
    2.76 +        line = "%-29s %s\n" % (word, notes[word])
    2.77 +        lines.append(line)
    2.78 +
    2.79 +    with codecs.open(filename, "w", "utf-8") as f:  # the file is overwritten only after it was read completely
    2.80 +        for line in lines:
    2.81 +            f.write(line)
    2.82 +
    2.83 +
    2.84  def print_words_sorted(words_freq):
    2.85      for k in sorted(words_freq.keys(), key=lambda k: words_freq[k], reverse=True):
    2.86          codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % (words_freq[k], k))
    2.87 @@ -221,11 +290,9 @@
    2.88      for word in notes.keys():
    2.89          for note in notes[word].values():
    2.90              if "@" in note:
    2.91 -                logging.debug("%s %s" % (word, note))
    2.92                  result = re.search(r'\@(\S*)', note)
    2.93                  if result:
    2.94                      main_word = result.group(1)
    2.95 -                    logging.debug("%s %s" % (word, main_word))
    2.96                      if main_word:
    2.97                          linked_words[word] = main_word
    2.98      return linked_words
    2.99 @@ -261,7 +328,6 @@
   2.100      lines = readlines_from_stdin()
   2.101      notes = load_notes(notes_filenames())
   2.102      linked_words = find_linked_words(notes)
   2.103 -    logging.debug(linked_words)
   2.104      normalizator = Normalizator(config['language'], linked_words)
   2.105  
   2.106      wgw = find_wordgroups_weights(lines, normalizator)
   2.107 @@ -271,14 +337,32 @@
   2.108                  reverse=True):
   2.109          codecs.getwriter("utf-8")(sys.stdout).write(line)
   2.110  
   2.111 +def filter_add_notes(args):  # run add_notes over the file args[0] and write the result back to it
   2.112 +    lines = readlines_from_file(args[0])
   2.113 +    notes = load_notes(notes_filenames())
   2.114 +    lines = add_notes(lines, notes)
   2.115 +    with codecs.open(args[0], "w", "utf-8") as f:  # overwrite the input file in place
   2.116 +        for line in lines:
   2.117 +            f.write(line)
   2.118 +
   2.119 +def filter_remove_notes(args):  # run remove_notes over the file args[0] and write the result back to it
   2.120 +    lines = readlines_from_file(args[0])
   2.121 +    notes = load_notes(notes_filenames())
   2.122 +    lines = remove_notes(lines, notes)  # also saves the extracted notes via save_notes
   2.123 +    with codecs.open(args[0], "w", "utf-8") as f:  # overwrite the input file in place
   2.124 +        for line in lines:
   2.125 +            f.write(line)
   2.126 +
   2.127  (options, args) = parser.parse_args()
   2.128  if options.language:
   2.129      config['language'] = options.language
   2.130  
   2.131  if options.function:
   2.132      function_names = {
   2.133 -        'get_words' : filter_get_words,
   2.134 +        'get_words' :   filter_get_words,
   2.135          'group_words' : filter_group_words,
   2.136 +        'add_notes' :   filter_add_notes,
   2.137 +        'remove_notes': filter_remove_notes,
   2.138      }
   2.139      if options.function in function_names:
   2.140          function_names[options.function](args)