new-words

diff new-words.py @ 54:e25de9ea9184

new-words.py is almost ready
author Igor Chubin <igor@chub.in>
date Tue Nov 01 20:19:18 2011 +0100 (2011-11-01)
parents f583256b7ab1
children 2a1a25e61872
line diff
     1.1 --- a/new-words.py	Mon Oct 31 20:21:20 2011 +0200
     1.2 +++ b/new-words.py	Tue Nov 01 20:19:18 2011 +0100
     1.3 @@ -11,6 +11,7 @@
     1.4  import subprocess
     1.5  import sys
     1.6  import Stemmer
     1.7 +import tempfile
     1.8  try:
     1.9      import psyco
    1.10      psyco.full()
    1.11 @@ -141,7 +142,13 @@
    1.12      dest="language")
    1.13  
    1.14  parser.add_option(
    1.15 -    "-f", "--function",
    1.16 +    "-f", "--allowed-words",
    1.17 +    help="file with list of allowed words (words that will be shown in the output)",
    1.18 +    action="store",
    1.19 +    dest="allowed_words")
    1.20 +
    1.21 +parser.add_option(
    1.22 +    "-X", "--function",
    1.23      help="filter through subsystem [INTERNAL]",
    1.24      action="store",
    1.25      dest="function")
    1.26 @@ -183,6 +190,12 @@
    1.27      dest="delete_tag")
    1.28  
    1.29  parser.add_option(
    1.30 +    "-R", "--show-range-percentage",
    1.31 +    help="show only words that cover specified percentage of the text, skip the rest",
    1.32 +    action="store",
    1.33 +    dest="show_range_percentage")
    1.34 +
    1.35 +parser.add_option(
    1.36      "-s", "--text-stats",
    1.37      help="show the text statistics (percentage of known words and so on) and exit",
    1.38      action="store_true",
    1.39 @@ -225,6 +238,16 @@
    1.40              res += [line]
    1.41      return res
    1.42  
    1.43 +def readlines_from_url(url):
    1.44 +    return [x.decode('utf-8') for x in
    1.45 +        subprocess.Popen(
    1.46 +            "lynx -dump '{url}' | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'".format(url=url),
    1.47 +            shell = True,
    1.48 +            stdout = subprocess.PIPE,
    1.49 +            stderr = subprocess.STDOUT
    1.50 +            ).communicate()[0].split('\n')
    1.51 +    ]
    1.52 +
    1.53  def readlines_from_stdin():
    1.54      return codecs.getreader("utf-8")(sys.stdin).readlines()
    1.55  
    1.56 @@ -261,8 +284,11 @@
    1.57      logging.debug(result)
    1.58      return result
    1.59  
    1.60 +def voc_filename():
    1.61 +    return "%s/%s.txt"%(config['config_directory'], config['language'])
    1.62 +
    1.63  def load_vocabulary():
    1.64 -    return get_words(readlines_from_file("%s/%s.txt"%(config['config_directory'], config['language'])))
    1.65 +    return get_words(readlines_from_file(voc_filename()))
    1.66  
    1.67  def notes_filenames():
    1.68      return ["%s/notes-%s.txt"%(config['config_directory'], config['language'])]
    1.69 @@ -409,8 +435,10 @@
    1.70          show_range=0,
    1.71          show_range_percentage=0,
    1.72          ):
    1.73 +    result = []
    1.74      if stats_only:
    1.75 -        codecs.getwriter("utf-8")(sys.stdout).write(
    1.76 +        #codecs.getwriter("utf-8")(sys.stdout).write(
    1.77 +        result.append(
    1.78              " ".join([
    1.79                  "%-10s" % x for x in [
    1.80                  "LANG",
    1.81 @@ -421,7 +449,7 @@
    1.82                  "WPS",
    1.83                  "UWPS*10"
    1.84                  ]]) + "\n")
    1.85 -        codecs.getwriter("utf-8")(sys.stdout).write(
    1.86 +        result.append(
    1.87              " ".join([
    1.88                  "%(language)-10s",
    1.89                  "%(percentage)-10.2f",
    1.90 @@ -431,10 +459,10 @@
    1.91                  "%(wps)-11d"
    1.92                  "%(uwps)-11d"
    1.93                  ]) % stats + "\n")
    1.94 -        return
    1.95 +        return "".join(result)
    1.96  
    1.97      if print_stats:
    1.98 -        codecs.getwriter("utf-8")(sys.stdout).write(
    1.99 +        result.append(
   1.100              "# %(language)s, %(percentage)-7.2f, <%(total_known)s/%(total)s>, <%(groups)s/%(words)s>\n" % stats)
   1.101  
   1.102      level_lines = range(int(float(stats['percentage']))/5*5+5,95,5)+range(90,102)
   1.103 @@ -448,14 +476,12 @@
   1.104  
   1.105          normalized_word = normalizator.normalize(word_pair[1])
   1.106          if old_normalized_word and old_normalized_word != normalized_word:
   1.107 -            #codecs.getwriter("utf-8")(sys.stdout).write(
   1.108 -            #    "### %s\n" % normalizator.best_word_from_group(words_of_this_group))
   1.109              if compressed_wordlist:
   1.110                  compressed_word_pair = (
   1.111                      sum(x[0] for x in words_of_this_group),
   1.112                      normalizator.best_word_from_group(words_of_this_group)
   1.113                      )
   1.114 -                codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % compressed_word_pair)
   1.115 +                result.append("%10s %s\n" % compressed_word_pair)
   1.116                  printed_words += 1
   1.117              words_of_this_group = []
   1.118  
   1.119 @@ -463,7 +489,7 @@
   1.120          words_of_this_group.append(word_pair)
   1.121  
   1.122          if not compressed_wordlist:
   1.123 -            codecs.getwriter("utf-8")(sys.stdout).write("%10s %s\n" % word_pair)
   1.124 +            result.append("%10s %s\n" % word_pair)
   1.125              printed_words += 1
   1.126  
   1.127  
   1.128 @@ -473,28 +499,14 @@
   1.129              while 100.0*known/total > level_lines[0]:
   1.130                  current_level = level_lines[0]
   1.131                  level_lines = level_lines[1:]
   1.132 -            codecs.getwriter("utf-8")(sys.stdout).write("# %s\n" % current_level)
   1.133 +            result.append("# %s\n" % current_level)
   1.134  
   1.135          if show_range >0 and printed_words >= show_range:
   1.136              break
   1.137          if show_range_percentage >0 and 100.0*known/total >= show_range_percentage:
   1.138              break
   1.139  
   1.140 -def filter_add_notes(args):
   1.141 -    lines = readlines_from_file(args[0])
   1.142 -    notes = load_notes(notes_filenames())
   1.143 -    lines = add_notes(lines, notes)
   1.144 -    with codecs.open(args[0], "w", "utf-8") as f:
   1.145 -        for line in lines:
   1.146 -            f.write(line)
   1.147 -
   1.148 -def filter_remove_notes(args):
   1.149 -    lines = readlines_from_file(args[0])
   1.150 -    notes = load_notes(notes_filenames())
   1.151 -    lines = remove_notes(lines, notes)
   1.152 -    with codecs.open(args[0], "w", "utf-8") as f:
   1.153 -        for line in lines:
   1.154 -            f.write(line)
   1.155 +    return result
   1.156  
   1.157  def parse_parts_description(parts_description):
   1.158      """
   1.159 @@ -503,8 +515,6 @@
   1.160       from-to/step
   1.161       from+delta/step
   1.162      """
   1.163 -    def incorrect_parts_description(pd):
   1.164 -        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
   1.165  
   1.166      try:
   1.167          (a, step) = parts_description.split("/", 1)
   1.168 @@ -525,7 +535,7 @@
   1.169          return (start, stop, step)
   1.170  
   1.171      except:
   1.172 -        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % pd)
   1.173 +        raise ValueError("Parts description must be in format: num[[+-]num]/num; this [%s] is incorrect" % parts_description)
   1.174  
   1.175  
   1.176  def take_part(lines, part_description = None):
   1.177 @@ -536,27 +546,45 @@
   1.178      part_size = (1.0*n) / step
   1.179      result = []
   1.180      for i in range(n):
   1.181 -        if part_size * i >= start and part_size * i <= stop:
   1.182 -            result += lines[i]
   1.183 +        if i >= start * part_size and i <= stop * part_size:
   1.184 +            result += [lines[i]]
   1.185      return result
   1.186  
   1.187  def filter_get_words_group_words_add_stat(args):
   1.188      vocabulary = load_vocabulary()
   1.189      notes = load_notes(notes_filenames())
   1.190 -    lines = take_part(readlines_from_stdin(), config.get('pages', ''))
   1.191 +
   1.192 +    if len(args) > 0:
   1.193 +        if 'http://' in args[0]:
   1.194 +            input_lines = readlines_from_url(args[0])
   1.195 +        else:
   1.196 +            input_lines = readlines_from_file(args[0])
   1.197 +    else:
   1.198 +        input_lines = readlines_from_stdin()
   1.199 +
   1.200 +    if len(input_lines) == 0:
   1.201 +        print >> sys.stderr, "Nothing to do, standard input is empty, exiting."
   1.202 +        sys.exit(1)
   1.203 +
   1.204 +    lines = take_part(input_lines, config.get('pages', ''))
   1.205 +
   1.206 +    (_, original_text_tempfile) = tempfile.mkstemp(prefix='new-word')
   1.207 +    with codecs.open(original_text_tempfile, "w", "utf-8") as f:
   1.208 +        f.write("".join(lines))
   1.209 +
   1.210      group_by = [1]
   1.211  
   1.212 -    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
   1.213 +    if 'two_words' in config:
   1.214          group_by.append(2)
   1.215 -    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
   1.216 +    if 'three_words' in config:
   1.217          group_by.append(3)
   1.218      words = get_words(lines, group_by)
   1.219      stats_only = False
   1.220 -    if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
   1.221 +    if 'text_stats' in config:
   1.222          stats_only = True
   1.223  
   1.224      compressed_wordlist = False
   1.225 -    if 'COMPRESSED_WORDLIST' in os.environ and os.environ['COMPRESSED_WORDLIST'] == 'YES':
   1.226 +    if 'compressed' in config:
   1.227          compressed_wordlist = True
   1.228  
   1.229      show_range = os.environ.get('SHOW_RANGE', '')
   1.230 @@ -564,16 +592,16 @@
   1.231          show_range = int(show_range)
   1.232      else:
   1.233          show_range = 0
   1.234 -    show_range_percentage = os.environ.get('SHOW_RANGE_PERCENTAGE', '')
   1.235 -    if show_range_percentage != '':
   1.236 -        show_range_percentage = int(show_range_percentage)
   1.237 +
   1.238 +    if 'show_range_percentage' in config:
   1.239 +        show_range_percentage = int(config['show_range_percentage'])
   1.240      else:
   1.241          show_range_percentage = 0
   1.242  
   1.243  
   1.244      stats = {}
   1.245      stats['total'] = sum(words[x] for x in words.keys())
   1.246 -    if 'FILTER_WORDS' in os.environ and os.environ['FILTER_WORDS'] == 'YES':
   1.247 +    if not 'no_filter' in config:
   1.248          words = substract_dictionary(words, vocabulary)
   1.249  
   1.250      stats['total_unknown'] = sum(words[x] for x in words.keys())
   1.251 @@ -591,8 +619,8 @@
   1.252      normalizator = Normalizator(config['language'], linked_words)
   1.253  
   1.254      # filter words by allowed_words_filter
   1.255 -    if os.environ.get('ALLOWED_WORDS_FILENAME', ''):
   1.256 -        allowed_words_filename = os.environ.get('ALLOWED_WORDS_FILENAME', '')
   1.257 +    if 'allowed_words' in config:
   1.258 +        allowed_words_filename = config['allowed_words']
   1.259          normalized_allowed_words = [
   1.260              normalizator.normalize(w.rstrip('\n')) 
   1.261              for w in readlines_from_file(allowed_words_filename)
   1.262 @@ -615,7 +643,7 @@
   1.263                  cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
   1.264                  reverse=True)
   1.265  
   1.266 -    print_words_sorted(
   1.267 +    output = print_words_sorted(
   1.268          words_with_freq,
   1.269          stats,
   1.270          normalizator,
   1.271 @@ -625,14 +653,87 @@
   1.272          show_range_percentage=show_range_percentage,
   1.273          )
   1.274  
   1.275 +
   1.276 +    if ('non_interactive' in config or 'text_stats' in config):
   1.277 +        codecs.getwriter("utf-8")(sys.stdout).write("".join(output))
   1.278 +    else:
   1.279 +        (_, temp1) = tempfile.mkstemp(prefix='new-word')
   1.280 +        (_, temp2) = tempfile.mkstemp(prefix='new-word')
   1.281 +
   1.282 +        with codecs.open(temp1, "w", "utf-8") as f:
   1.283 +            f.write("".join(output))
   1.284 +        with codecs.open(temp2, "w", "utf-8") as f:
   1.285 +            f.write("".join(add_notes(output, notes)))
   1.286 +
   1.287 +        os.putenv('ORIGINAL_TEXT', original_text_tempfile)
   1.288 +        os.system((
   1.289 +            "vim"
   1.290 +            " -c 'setlocal spell spelllang={language}'"
   1.291 +            " -c 'set keywordprg={language}'"
   1.292 +            " -c 'set iskeyword=@,48-57,/,.,-,_,+,,,#,$,%,~,=,48-255'"
   1.293 +            " {filename}"
   1.294 +            " < /dev/tty > /dev/tty"
   1.295 +            ).format(language=config['language'], filename=temp2))
   1.296 +
   1.297 +        lines = remove_notes(readlines_from_file(temp2), notes)
   1.298 +
   1.299 +        # compare lines_before and lines_after and return deleted words
   1.300 +        lines_before = output
   1.301 +        lines_after = lines
   1.302 +        deleted_words = []
   1.303 +
   1.304 +        for line in lines_before:
   1.305 +            if line not in lines_after:
   1.306 +                line = line.strip()
   1.307 +                if ' ' in line:
   1.308 +                    word = re.split('\s+', line, 1)[1]
   1.309 +                    if ' ' in word:
   1.310 +                        word = re.split('\s+', word, 1)[0]
   1.311 +                deleted_words.append(word)
   1.312 +
   1.313 +        with codecs.open(voc_filename(), "a", "utf-8") as f:
   1.314 +            f.write("\n".join(deleted_words + ['']))
   1.315 +
   1.316 +        os.unlink(temp1)
   1.317 +        os.unlink(temp2)
   1.318 +
   1.319 +    os.unlink(original_text_tempfile)
   1.320 +
   1.321  (options, args) = parser.parse_args()
   1.322  if options.language:
   1.323      config['language'] = options.language
   1.324  
   1.325 +if options.pages:
   1.326 +    config['pages'] = options.pages
   1.327 +else:
   1.328 +    config['pages'] = ""
   1.329 +
   1.330 +if options.allowed_words:
   1.331 +    config['allowed_words'] = options.allowed_words
   1.332 +
   1.333 +if options.show_range_percentage:
   1.334 +    config['show_range_percentage'] = options.show_range_percentage
   1.335 +
   1.336 +if options.non_interactive:
   1.337 +    config['non_interactive'] = True
   1.338 +
   1.339 +if options.text_stats:
   1.340 +    config['text_stats'] = True
   1.341 +
   1.342 +if options.compressed:
   1.343 +    config['compressed'] = True
   1.344 +
   1.345 +if options.no_filter:
   1.346 +    config['no_filter'] = True
   1.347 +
   1.348 +if options.two_words:
   1.349 +    config['two_words'] = True
   1.350 +
   1.351 +if options.three_words:
   1.352 +    config['three_words'] = True
   1.353 +
   1.354  if options.function:
   1.355      function_names = {
   1.356 -        'add_notes' :   filter_add_notes,
   1.357 -        'remove_notes': filter_remove_notes,
   1.358          'get_words_group_words_add_stat': filter_get_words_group_words_add_stat,
   1.359      }
   1.360      if options.function in function_names: