new-words

view grep-sentences.pl @ 66:53ba2847501f

added misc scripts
author Igor Chubin <igor@chub.in>
date Tue Mar 27 14:16:46 2012 +0200 (2012-03-27)
parents 48ca8248e9cc
children
line source
#!/usr/bin/perl
#
# grep-sentences.pl - print the sentences of text files that match a regexp.
#
# Usage: grep-sentences.pl REGEXP FILE...
#
# Each FILE is read as UTF-8 text, split into sentences on '.', '!' and '?',
# and every sentence containing REGEXP as a whole word is printed with the
# first match highlighted.  Output stops after MAX_PRINTED sentences, counted
# across all files.
#
# An earlier revision fetched pages through `lynx -dump`; files are now read
# directly, so any such conversion has to happen before this script is run.

use strict;
use warnings;
use utf8;
use Encode qw(decode);

binmode STDIN,  ':encoding(UTF-8)';
binmode STDOUT, ':encoding(UTF-8)';

# Maximum number of sentences printed in one run (across all files).
use constant MAX_PRINTED => 10;

# ANSI escape sequences wrapped around the highlighted match.
use constant MARK_START => "\e[35;1m";
use constant MARK_STOP  => "\e[0m";

# split_sentences($text) -> list of cleaned-up sentences.
#
# Strips URLs, joins lines, and splits on sentence terminators while keeping
# the abbreviations "Mr.", "Mrs.", "viz.", "e.g." and "i.e." intact.  Each
# returned sentence has "[N]" reference markers removed, runs of whitespace
# collapsed to a single space, and no leading or trailing whitespace.  The
# terminating punctuation itself is not part of the returned sentences.
sub split_sentences {
    my ($text) = @_;

    # URLs contain dots that would otherwise be taken for sentence ends.
    $text =~ s{https?://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*}{}g;

    # Join lines with a space so the words around a line break stay separate.
    $text =~ s{\n}{ }g;

    # Hide the dots of known abbreviations behind a placeholder so the split
    # below does not break a sentence in the middle of them.
    $text =~ s{(Mr|Mrs|viz)\.}{${1}POINT}g;
    $text =~ s{e\.g\.}{ePOINTgPOINT}g;
    $text =~ s{i\.e\.}{iPOINTePOINT}g;

    my @sentences;
    for my $sentence (split /[.!?]/, $text) {
        # Put the abbreviation dots back.
        $sentence =~ s{iPOINTePOINT}{i.e.}g;
        $sentence =~ s{ePOINTgPOINT}{e.g.}g;
        $sentence =~ s{(Mr|Mrs|viz)POINT}{$1.}g;

        # Drop "[12]"-style reference markers first, then normalise the
        # whitespace, so that removing a marker cannot leave stray blanks.
        $sentence =~ s{\[[0-9]+\]}{}g;
        $sentence =~ s{\s+}{ }g;
        $sentence =~ s{\A\s+}{};
        $sentence =~ s{\s+\z}{};

        push @sentences, $sentence;
    }
    return @sentences;
}

# main($regexp, @files)
#
# $regexp is the raw (UTF-8 encoded) pattern from the command line; it is
# deliberately used as a regular expression, not as a literal string.
# Prints up to MAX_PRINTED matching sentences to the currently selected
# output handle, each followed by "." and a blank line.  Files that cannot
# be opened are reported on STDERR and skipped.
sub main {
    my ($pattern_arg, @files) = @_;

    if (!defined $pattern_arg) {
        warn "Usage: $0 REGEXP FILE...\n";
        return;
    }

    my $pattern = decode('UTF-8', $pattern_arg);

    # One compiled pattern serves both the filter and the highlight; the
    # outer group keeps an alternation such as "a|b" inside the \b anchors.
    my $match_re = qr/\b($pattern)\b/;

    my $printed = 0;

    FILE:
    for my $file (@files) {
        # Three-argument open: the file name is never interpreted as a mode
        # or a pipe command.
        my $fh;
        if (!open $fh, '<:encoding(UTF-8)', $file) {
            warn "$0: cannot open '$file': $!\n";
            next FILE;
        }
        my $text = do { local $/; <$fh> };
        close $fh;
        next FILE if !defined $text;

        for my $sentence (split_sentences($text)) {
            # Highlight the first match only; sentences with no match are
            # skipped.
            next
                if $sentence !~ s{$match_re}{MARK_START . $1 . MARK_STOP}e;

            print "$sentence.\n\n";
            return if ++$printed >= MAX_PRINTED;
        }
    }
    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# tested without side effects.
main(@ARGV) unless caller;

1;