new-words

annotate grep-sentences.pl @ 43:d532e7b52ab2

-s key support in new-words.py

Now new-words-py.sh -s works in the same way as new-words.sh.
(WPS and UWPS fields are not calculated correctly yet).
author Igor Chubin <igor@chub.in>
date Fri Jan 28 12:40:58 2011 +0200 (2011-01-28)
parents c631833fa2be
children 3a61988109a8
rev   line source
#!/usr/bin/perl
#
# grep-sentences.pl REGEXP FILE
#
# Print the sentences of FILE that contain REGEXP as a whole word.
#
#   REGEXP  a Perl regular expression given as UTF-8 bytes on the command
#           line; it is matched between word boundaries.
#   FILE    path of a UTF-8 text file, or "-" to read standard input.
#
# At most MAX_PRINTED matching sentences are written to standard output,
# each followed by "." and a blank line.  The original terminator of the
# sentence (".", "!" or "?") is not preserved.
#
# Exit status: 0 on success, 1 if FILE cannot be opened, 2 on bad usage.

use strict;
use warnings;
use utf8;
use Encode;
use constant MAX_PRINTED => 10;

binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";

# extract_sentences($text, $regexp, $limit)
#
# Split $text into sentences and return, in order of appearance, those in
# which $regexp matches as a whole word.  Returns at most $limit sentences
# ($limit defaults to MAX_PRINTED).  The returned sentences are trimmed,
# whitespace-normalized, stripped of URLs and of "[123]"-style reference
# markers, and carry no terminating punctuation.
#
# Dies if $regexp is not a valid regular expression.
sub extract_sentences {
    my ($text, $regexp, $limit) = @_;
    $limit = MAX_PRINTED unless defined $limit;
    return if $limit <= 0;

    # Group the pattern so that an alternation such as "cat|dog" is
    # anchored by \b on both sides of every alternative.
    my $wanted = qr/\b(?:$regexp)\b/;

    # URLs contain dots that would otherwise be taken for sentence ends.
    $text =~ s{https?://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*}{}g;

    # A line break separates words: turn it into a space instead of
    # deleting it, otherwise the words around it get fused together.
    $text =~ s/\n/ /g;

    # Hide the dots of common abbreviations behind a placeholder so that
    # the split below does not cut sentences in the middle.
    $text =~ s/(Mr|Mrs|viz)\./${1}POINT/g;
    $text =~ s/e\.g\./ePOINTgPOINT/g;
    $text =~ s/i\.e\./iPOINTePOINT/g;

    my @found;
    for my $sentence (split /[.!?]/, $text) {
        # Put the abbreviation dots back.
        $sentence =~ s/iPOINTePOINT/i.e./g;
        $sentence =~ s/ePOINTgPOINT/e.g./g;
        $sentence =~ s/(Mr|Mrs|viz)POINT/$1./g;

        # Drop reference markers first, then normalize and trim, so that
        # a marker at the edge of a sentence leaves no stray space behind.
        $sentence =~ s/\[[0-9]+\]//g;
        $sentence =~ s/\s+/ /g;
        $sentence =~ s/^\s+//;
        $sentence =~ s/\s+$//;

        next unless $sentence =~ $wanted;
        push @found, $sentence;

        # Nothing more will be reported; stop scanning.
        last if @found >= $limit;
    }
    return @found;
}

# main($regexp, $page)
#
# Command-line entry point: read $page, print the matching sentences and
# return the process exit status.
sub main {
    my ($regexp, $page) = @_;

    unless (defined $regexp && defined $page) {
        warn "Usage: $0 REGEXP FILE\n";
        return 2;
    }
    $regexp = decode("utf8", $regexp);

    my $text;
    if ($page eq '-') {
        # Keep "-" meaning standard input, as two-argument open did.
        local $/;
        $text = <STDIN>;
    }
    else {
        # Three-argument open: $page is always treated as a plain file
        # name, never as a mode prefix or a command to run.
        my $fh;
        unless (open $fh, '<:utf8', $page) {
            warn "$0: cannot open '$page': $!\n";
            return 1;
        }
        local $/;    # slurp the whole file at once
        $text = <$fh>;
        close $fh;
    }
    return 0 unless defined $text;

    print "$_.\n\n" for extract_sentences($text, $regexp, MAX_PRINTED);
    return 0;
}

# Run only when executed as a script, not when loaded with require/do.
exit main(@ARGV) unless caller;