new-words

diff new-words.py @ 44:7eb1a8c3eade

-2 and -3 are now supported by new-words.py
author Igor Chubin <igor@chub.in>
date Fri Jan 28 21:45:58 2011 +0100 (2011-01-28)
parents d532e7b52ab2
children 5f90e44eecfc
line diff
     1.1 --- a/new-words.py	Fri Jan 28 12:40:58 2011 +0200
     1.2 +++ b/new-words.py	Fri Jan 28 21:45:58 2011 +0100
     1.3 @@ -159,14 +159,15 @@
     1.4      line = line.rstrip('\n')
     1.5      #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
     1.6      #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
     1.7 -    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
     1.8 +    return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
     1.9  
    1.10 -def get_words(lines):
    1.11 +def get_words(lines, group_by=[1]):
    1.12      """
    1.13      Returns hash of words in a file
    1.14      word => number
    1.15      """
    1.16      result = {}
    1.17 +    (a, b, c) = ("", "", "")
    1.18      for line in lines:
    1.19          words = words_from_line(line)
    1.20          for word in words:
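
The hunk above extends the tokenizer's negative lookahead from ' to ' and _, so tokens such as don't and foo_bar are kept intact by the split. A minimal sketch of the resulting behaviour (an illustration, not part of the changeset):

    import re

    def words_from_line(line):
        # Split on runs of non-word characters, but never start a split at
        # an apostrophe or an underscore, so "don't" and "foo_bar" stay whole.
        line = line.rstrip('\n')
        return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)

    print(words_from_line("don't split foo_bar, please!"))
    # -> ["don't", 'split', 'foo_bar', 'please', '']
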
    1.21 @@ -174,6 +175,17 @@
    1.22                  continue
    1.23              result.setdefault(word, 0)
    1.24              result[word] += 1
    1.25 +            if 2 in group_by and a != "" and b != "":
    1.26 +                w = "%s_%s" % (a,b)
    1.27 +                result.setdefault(w, 0)
    1.28 +                result[w] += 1
    1.29 +            if 3 in group_by and not "" in [a,b,c]:
    1.30 +                w = "%s_%s_%s" % (a,b,c)
    1.31 +                result.setdefault(w, 0)
    1.32 +                result[w] += 1
    1.33 +            (a,b,c) = (b, c, word)
    1.34 +
    1.35 +    logging.debug(result)
    1.36      return result
    1.37  
    1.38  def load_vocabulary():
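
The get_words() change above keeps a sliding window of the last three words and, when 2 or 3 is present in group_by, also counts the '_' joined pairs and triples alongside the individual words. A compact standalone sketch of the same counting idea (the function name and loop below are illustrative, not code from new-words.py):

    def count_ngrams(words, group_by=(1, 2, 3)):
        # Count single words plus "_"-joined pairs and triples, mirroring
        # the grouping that get_words(lines, group_by) performs per line.
        result = {}
        for i in range(len(words)):
            for n in group_by:
                if i + 1 >= n:
                    key = "_".join(words[i + 1 - n:i + 1])
                    result[key] = result.get(key, 0) + 1
        return result

    print(count_ngrams(["to", "be", "or", "not", "to", "be"]))
    # {'to': 2, 'be': 2, 'to_be': 2, 'be_or': 1, 'to_be_or': 1, ...}
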
    1.39 @@ -203,12 +215,12 @@
    1.40              if match_object:
    1.41                  word = match_object.group(1)
    1.42                  if word in notes:
    1.43 -                    logging.debug(word)
    1.44 -                    logging.debug(line)
    1.45 +                    #logging.debug(word)
    1.46 +                    #logging.debug(line)
    1.47                      if notes_filename in notes[word]:
    1.48                          line = line.rstrip('\n')
    1.49                          line = "%-30s %s\n" % (line, notes[word][notes_filename])
    1.50 -                        logging.debug(line)
    1.51 +                        #logging.debug(line)
    1.52                          result += [line]
    1.53                  else:
    1.54                      result += [line]
    1.55 @@ -379,11 +391,17 @@
    1.56      vocabulary = load_vocabulary()
    1.57      notes = load_notes(notes_filenames())
    1.58      lines = readlines_from_stdin()
    1.59 -    words = get_words(lines)
    1.60 +    group_by = [1]
    1.61 +    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
    1.62 +        group_by.append(2)
    1.63 +    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
    1.64 +        group_by.append(3)
    1.65 +    words = get_words(lines, group_by)
    1.66      stats_only = False
    1.67      if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
    1.68          stats_only = True
    1.69  
    1.70 +
    1.71      stats = {}
    1.72      stats['total'] = sum(words[x] for x in words.keys())
    1.73      words = substract_dictionary(words, vocabulary)
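
In the main body above, two environment variables switch the extra grouping on: GROUP_WORDS_BY_TWO=YES adds 2 to group_by and GROUP_WORDS_BY_THREE=YES adds 3, and the list is then handed to get_words(). Since the script reads its text from stdin, an invocation with grouping enabled would look roughly like GROUP_WORDS_BY_TWO=YES python new-words.py < input.txt (the exact wrapper flags may differ). The same check written with os.environ.get(), shown only as a compact equivalent of the committed lines:

    import os

    group_by = [1]
    # os.environ.get() folds "variable present and equal to YES" into one call;
    # the behaviour matches the two-part test used in the changeset.
    if os.environ.get('GROUP_WORDS_BY_TWO') == 'YES':
        group_by.append(2)
    if os.environ.get('GROUP_WORDS_BY_THREE') == 'YES':
        group_by.append(3)
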
    1.74 @@ -402,17 +420,17 @@
    1.75      linked_words = find_linked_words(notes)
    1.76      normalizator = Normalizator(config['language'], linked_words)
    1.77  
    1.78 -    word_pairs = []
    1.79 +    words_with_freq = []
    1.80      for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
    1.81 -        word_pairs.append((words[k], k))
    1.82 +        words_with_freq.append((words[k], k))
    1.83  
    1.84 -    wgw = find_wordgroups_weights(word_pairs, normalizator)
    1.85 -    word_pairs = sorted(
    1.86 -                word_pairs,
    1.87 +    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    1.88 +    words_with_freq = sorted(
    1.89 +                words_with_freq,
    1.90                  cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
    1.91                  reverse=True)
    1.92  
    1.93 -    print_words_sorted(word_pairs, stats, stats_only=stats_only)
    1.94 +    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
    1.95  
    1.96  (options, args) = parser.parse_args()
    1.97  if options.language:
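
The final hunk is a rename: word_pairs becomes words_with_freq, which better describes the list of (frequency, word) tuples that is built, re-sorted with the comparator (the cmp= keyword is Python 2 only) and passed to print_words_sorted(). A toy illustration of the structure, with invented sample counts:

    words = {'bar': 5, 'foo': 3, 'baz': 1}

    # (frequency, word) tuples, highest frequency first.
    words_with_freq = []
    for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
        words_with_freq.append((words[k], k))

    print(words_with_freq)
    # [(5, 'bar'), (3, 'foo'), (1, 'baz')]
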