new-words

view grep-sentences.pl @ 36:f95804355b0f

compressed wordlist support
author Igor Chubin <igor@chub.in>
date Sat Jan 01 19:47:39 2011 +0100 (2011-01-01)
parents c631833fa2be
children 3a61988109a8
line source
#!/usr/bin/perl
#
# Usage: grep-sentences.pl REGEXP PAGE
#
# Reads the UTF-8 text file PAGE, splits it into sentences and prints the
# first $MAX_SENTENCES sentences in which REGEXP matches between word
# boundaries. Every printed sentence is followed by "." and an empty line.
#
# REGEXP is a UTF-8 encoded Perl regular expression; it is interpolated
# into the search pattern as-is (it is NOT quoted), so metacharacters keep
# their meaning.

use strict;
use warnings;
use utf8;
use Encode;

binmode STDIN,  ':utf8';
binmode STDOUT, ':utf8';

# Upper bound on the number of sentences printed per run.
my $MAX_SENTENCES = 10;

# split_sentences($text)
#
# Returns the list of sentences found in $text. URLs and bracketed numeric
# references such as "[12]" are removed, runs of whitespace are collapsed
# to a single space, and leading/trailing whitespace is trimmed. The
# sentence terminator (".", "!" or "?") is not part of the returned strings.
sub split_sentences {
    my ($text) = @_;

    # Drop URLs first: their dots and question marks are not sentence ends.
    $text =~ s{https?://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*}{}g;

    # A line break separates words; turn it into a space instead of
    # deleting it, otherwise "foo\nbar" would become the single word "foobar".
    $text =~ tr/\n/ /;

    # Protect dots that belong to abbreviations so that the split below
    # does not treat them as sentence terminators.
    $text =~ s/(Mr|Mrs|viz)\./${1}POINT/g;
    $text =~ s/e\.g\./ePOINTgPOINT/g;
    $text =~ s/i\.e\./iPOINTePOINT/g;

    my @sentences;
    for my $sentence (split /[.!?]/, $text) {
        # Restore the protected abbreviation dots.
        $sentence =~ s/iPOINTePOINT/i.e./g;
        $sentence =~ s/ePOINTgPOINT/e.g./g;
        $sentence =~ s/(Mr|Mrs|viz)POINT/$1./g;

        # Remove "[12]"-style references, then normalise whitespace.
        # Trimming is done last so that a removed reference cannot leave
        # a stray space at either end of the sentence.
        $sentence =~ s/\[[0-9]+\]//g;
        $sentence =~ s/\s+/ /g;
        $sentence =~ s/^ //;
        $sentence =~ s/ $//;

        push @sentences, $sentence;
    }
    return @sentences;
}

# matching_sentences($text, $regexp, $limit)
#
# Returns at most $limit sentences of $text (in order of appearance) in
# which $regexp matches between word boundaries.
sub matching_sentences {
    my ($text, $regexp, $limit) = @_;

    # Group the caller's pattern so that the word boundaries apply to the
    # whole expression even when it contains a top-level alternation.
    my $word_re = qr/\b(?:$regexp)\b/;

    my @found;
    for my $sentence (split_sentences($text)) {
        last if @found >= $limit;    # nothing more will be printed
        push @found, $sentence if $sentence =~ $word_re;
    }
    return @found;
}

# main(REGEXP, PAGE)
#
# Command-line entry point. Dies with a message on STDERR when an argument
# is missing or PAGE cannot be opened.
sub main {
    my @args = @_;
    die "Usage: $0 REGEXP PAGE\n" if @args < 2;

    my $regexp = decode('UTF-8', $args[0]);
    my $page   = $args[1];

    # Three-argument open: PAGE is always treated as a plain file name,
    # never as a mode prefix or a command to run.
    open my $fh, '<:encoding(UTF-8)', $page
        or die "$0: cannot open '$page': $!\n";
    my $text = do { local $/; <$fh> };
    close $fh;

    print "$_.\n\n" for matching_sentences($text, $regexp, $MAX_SENTENCES);
    return;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. with require) without side effects.
main(@ARGV) unless caller;

1;