new-words

annotate grep-sentences.pl @ 67:87bb1c5e6616

added de script to misc/
author Igor Chubin <igor@chub.in>
date Wed Mar 28 15:54:30 2012 +0200 (2012-03-28)
parents 48ca8248e9cc
children
rev   line source
#!/usr/bin/perl

# grep-sentences.pl -- print the sentences of text files that contain a word.
#
# Usage: grep-sentences.pl REGEXP FILE...
#
# REGEXP is a Perl regular expression (decoded from UTF-8) that must match
# as a whole word.  Each FILE is read as UTF-8 text; the name "-" reads
# standard input.  URLs are removed, the text is split into sentences on
# '.', '!' and '?', and every matching sentence is written to STDOUT with
# its first match highlighted by ANSI escape sequences, followed by a blank
# line.  At most $MAX_PRINTED sentences are printed in total, counted across
# all files; the script exits with status 0 as soon as that limit is reached.

use strict;
use warnings;
use utf8;
use Encode;

binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";

# Upper bound on the number of sentences printed over the whole run.
my $MAX_PRINTED = 10;

# ANSI escape sequences wrapped around the highlighted match.
my $MARK_START = "\e[35;1m";
my $MARK_STOP  = "\e[0m";

die "Usage: $0 REGEXP FILE...\n" unless @ARGV;

my $regexp = decode("utf8", shift @ARGV);

# One compiled pattern is used both to select and to highlight a sentence.
# The group keeps an alternation such as "cat|dog" inside the word
# boundaries, so selection and highlighting can never disagree.
my $word_re = qr/\b($regexp)\b/;

my $number_of_printed = 0;

#if (open(PAGE, "lynx -dump '$page'|")) {
FILE:
for my $page (@ARGV) {
    my $fh;
    if ($page eq '-') {
        # Three-argument open does not give "-" a special meaning, so
        # standard input is selected explicitly.
        $fh = \*STDIN;
    }
    elsif (!open $fh, '<:encoding(UTF-8)', $page) {
        # Report the problem, but keep going with the remaining files.
        warn "$0: cannot open '$page': $!\n";
        next FILE;
    }

    # Slurp the whole file; $/ is only undefined inside the do block.
    my $text = do { local $/; <$fh> };
    close $fh unless $page eq '-';
    next FILE unless defined $text;

    for my $sentence (split_sentences($text)) {
        # The substitution doubles as the match test: it only succeeds,
        # and only highlights the first occurrence, when the word is present.
        next unless $sentence =~ s/$word_re/$MARK_START$1$MARK_STOP/;

        # The terminator was consumed by the split, so a '.' is appended.
        print "$sentence.\n\n";

        exit 0 if ++$number_of_printed >= $MAX_PRINTED;
    }
}

# split_sentences($text)
#
# Returns the list of sentences found in $text.  URLs and footnote markers
# of the form "[12]" are removed, runs of whitespace are collapsed to one
# space, and the result is trimmed.  The dots of a few abbreviations
# (Mr., Mrs., viz., e.g., i.e.) do not end a sentence.  Sentence
# terminators are not part of the returned strings.
sub split_sentences {
    my ($text) = @_;

    # Drop URLs first: their dots would otherwise be taken as sentence ends.
    $text =~ s{https?://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*}{}g;

    # A line break separates words, so it becomes a space rather than
    # being deleted (deleting it would glue two words together).
    $text =~ s/\n/ /g;

    # Hide the dots of known abbreviations behind a placeholder.
    $text =~ s/(Mr|Mrs|viz)\./${1}POINT/g;
    $text =~ s/e\.g\./ePOINTgPOINT/g;
    $text =~ s/i\.e\./iPOINTePOINT/g;

    my @sentences;
    for my $sentence (split /[.!?]/, $text) {
        # Put the abbreviation dots back.
        $sentence =~ s/iPOINTePOINT/i.e./g;
        $sentence =~ s/ePOINTgPOINT/e.g./g;
        $sentence =~ s/(Mr|Mrs|viz)POINT/$1./g;

        # Clean up before trimming, so that removing a marker at either
        # end cannot leave stray whitespace behind.
        $sentence =~ s/\[[0-9]+\]//g;
        $sentence =~ s/\s+/ /g;
        $sentence =~ s/^\s+//;
        $sentence =~ s/\s+$//;

        push @sentences, $sentence;
    }
    return @sentences;
}