new-words

changeset 44:7eb1a8c3eade

The -2 and -3 options (grouping words by two and by three) are now supported natively by new-words.py, replacing the temporary Perl script in new-words-py.sh
author Igor Chubin <igor@chub.in>
date Fri Jan 28 21:45:58 2011 +0100 (2011-01-28)
parents d532e7b52ab2
children 5f90e44eecfc
files new-words-py.sh new-words.py
line diff
     1.1 --- a/new-words-py.sh	Fri Jan 28 12:40:58 2011 +0200
     1.2 +++ b/new-words-py.sh	Fri Jan 28 21:45:58 2011 +0100
     1.3 @@ -123,39 +123,6 @@
     1.4    exit 0
     1.5  fi
     1.6  
     1.7 -two_and_three_words()
     1.8 -{
     1.9 -    if [ $GROUP_WORDS_BY_THREE = NO -a $GROUP_WORDS_BY_TWO = NO ]
    1.10 -    then 
    1.11 -        cat
    1.12 -    else
    1.13 -        cat 
    1.14 -
    1.15 -    export GROUP_WORDS_BY_THREE
    1.16 -    export GROUP_WORDS_BY_TWO
    1.17 -    PERL_SCRIPT_TEMP_NAME=`mktemp /tmp/perl-two-and-three-XXXXXXXX`
    1.18 -    cat <<'PERL_SCRIPT' > $PERL_SCRIPT_TEMP_NAME
    1.19 -#!/usr/bin/perl
    1.20 -local $/;
    1.21 -$words=<>;
    1.22 -$words=~ s@[!?;,:#1-9".]@ @g;
    1.23 -$words =~ s@\s+@ @g;
    1.24 -@words = split /\s+/, $words;
    1.25 -for ($i=0; $i<$#words-3;$i++) {
    1.26 -    my ($a,$b,$c)= ($words[$i],$words[$i+1],$words[$i+2]);
    1.27 -    if ($ENV{GROUP_WORDS_BY_THREE} eq "YES" and ($a && $b && $c)) {
    1.28 -        print "${a}_${b}_${c}\n";
    1.29 -    };  
    1.30 -    if ($ENV{GROUP_WORDS_BY_TWO} eq "YES" and ($a && $b)) {
    1.31 -        print "${a}_${b}\n";
    1.32 -    };
    1.33 -}
    1.34 -PERL_SCRIPT
    1.35 -    perl $PERL_SCRIPT_TEMP_NAME "$ORIGINAL_TEXT"
    1.36 -    rm $PERL_SCRIPT_TEMP_NAME
    1.37 -    fi
    1.38 -}
    1.39 -
    1.40  text_from_url()
    1.41  {
    1.42  lynx -dump "$1" | perl -p -e 's@http://[a-zA-Z&_.:/0-9%?=,#+()\[\]~-]*@@'
    1.43 @@ -171,6 +138,9 @@
    1.44  }
    1.45  get_words_group_words_add_stat()
    1.46  {
    1.47 +    STAT_ONLY="$STAT_ONLY" \
    1.48 +    GROUP_WORDS_BY_TWO="$GROUP_WORDS_BY_TWO" \
    1.49 +    GROUP_WORDS_BY_THREE="$GROUP_WORDS_BY_THREE" \
    1.50      $NEW_WORDS_PY -l "$LANGUAGE" -f get_words_group_words_add_stat "$1"
    1.51  }
    1.52  
    1.53 @@ -282,8 +252,8 @@
    1.54  fi \
    1.55     | part $PART_TO_PROCESS \
    1.56     | tee $ORIGINAL_TEXT \
    1.57 -   | two_and_three_words \
    1.58 -   | STAT_ONLY="$STAT_ONLY" get_words_group_words_add_stat \
    1.59 +   | \
    1.60 +    get_words_group_words_add_stat \
    1.61     | tee "$TEMP1" > "$TEMP2"
    1.62  
    1.63  if [ "$STAT_ONLY" = "YES" ]
     2.1 --- a/new-words.py	Fri Jan 28 12:40:58 2011 +0200
     2.2 +++ b/new-words.py	Fri Jan 28 21:45:58 2011 +0100
     2.3 @@ -159,14 +159,15 @@
     2.4      line = line.rstrip('\n')
     2.5      #return re.split('(?:\s|[*\r,.:#@()+=<>$;"?!|\[\]^%&~{}«»–])+', line)
     2.6      #return re.split('[^a-zA-ZäöëüßÄËÖÜß]+', line)
     2.7 -    return re.compile("(?!')(?:\W)+", flags=re.UNICODE).split(line)
     2.8 +    return re.compile("(?!['_])(?:\W)+", flags=re.UNICODE).split(line)
     2.9  
    2.10 -def get_words(lines):
    2.11 +def get_words(lines, group_by=[1]):
    2.12      """
    2.13      Returns hash of words in a file
    2.14      word => number
    2.15      """
    2.16      result = {}
    2.17 +    (a, b, c) = ("", "", "")
    2.18      for line in lines:
    2.19          words = words_from_line(line)
    2.20          for word in words:
    2.21 @@ -174,6 +175,17 @@
    2.22                  continue
    2.23              result.setdefault(word, 0)
    2.24              result[word] += 1
    2.25 +            if 2 in group_by and a != "" and b != "":
    2.26 +                w = "%s_%s" % (a,b)
    2.27 +                result.setdefault(w, 0)
    2.28 +                result[w] += 1
    2.29 +            if 3 in group_by and not "" in [a,b,c]:
    2.30 +                w = "%s_%s_%s" % (a,b,c)
    2.31 +                result.setdefault(w, 0)
    2.32 +                result[w] += 1
    2.33 +            (a,b,c) = (b, c, word)
    2.34 +
    2.35 +    logging.debug(result)
    2.36      return result
    2.37  
    2.38  def load_vocabulary():
    2.39 @@ -203,12 +215,12 @@
    2.40              if match_object:
    2.41                  word = match_object.group(1)
    2.42                  if word in notes:
    2.43 -                    logging.debug(word)
    2.44 -                    logging.debug(line)
    2.45 +                    #logging.debug(word)
    2.46 +                    #logging.debug(line)
    2.47                      if notes_filename in notes[word]:
    2.48                          line = line.rstrip('\n')
    2.49                          line = "%-30s %s\n" % (line, notes[word][notes_filename])
    2.50 -                        logging.debug(line)
    2.51 +                        #logging.debug(line)
    2.52                          result += [line]
    2.53                  else:
    2.54                      result += [line]
    2.55 @@ -379,11 +391,17 @@
    2.56      vocabulary = load_vocabulary()
    2.57      notes = load_notes(notes_filenames())
    2.58      lines = readlines_from_stdin()
    2.59 -    words = get_words(lines)
    2.60 +    group_by = [1]
    2.61 +    if 'GROUP_WORDS_BY_TWO' in os.environ and os.environ['GROUP_WORDS_BY_TWO'] == 'YES':
    2.62 +        group_by.append(2)
    2.63 +    if 'GROUP_WORDS_BY_THREE' in os.environ and os.environ['GROUP_WORDS_BY_THREE'] == 'YES':
    2.64 +        group_by.append(3)
    2.65 +    words = get_words(lines, group_by)
    2.66      stats_only = False
    2.67      if 'STAT_ONLY' in os.environ and os.environ['STAT_ONLY'] == 'YES':
    2.68          stats_only = True
    2.69  
    2.70 +
    2.71      stats = {}
    2.72      stats['total'] = sum(words[x] for x in words.keys())
    2.73      words = substract_dictionary(words, vocabulary)
    2.74 @@ -402,17 +420,17 @@
    2.75      linked_words = find_linked_words(notes)
    2.76      normalizator = Normalizator(config['language'], linked_words)
    2.77  
    2.78 -    word_pairs = []
    2.79 +    words_with_freq = []
    2.80      for k in sorted(words.keys(), key=lambda k: words[k], reverse=True):
    2.81 -        word_pairs.append((words[k], k))
    2.82 +        words_with_freq.append((words[k], k))
    2.83  
    2.84 -    wgw = find_wordgroups_weights(word_pairs, normalizator)
    2.85 -    word_pairs = sorted(
    2.86 -                word_pairs,
    2.87 +    wgw = find_wordgroups_weights(words_with_freq, normalizator)
    2.88 +    words_with_freq = sorted(
    2.89 +                words_with_freq,
    2.90                  cmp=lambda x,y:compare_word_pairs(x,y, wgw, normalizator, linked_words),
    2.91                  reverse=True)
    2.92  
    2.93 -    print_words_sorted(word_pairs, stats, stats_only=stats_only)
    2.94 +    print_words_sorted(words_with_freq, stats, stats_only=stats_only)
    2.95  
    2.96  (options, args) = parser.parse_args()
    2.97  if options.language: