new-words
annotate grep-sentences.pl @ 68:846240941452
added -C key: compress to lines; fixed bug with #90-line
author | Igor Chubin <igor@chub.in> |
---|---|
date | Sun Sep 23 16:07:29 2012 +0300 (2012-09-23) |
parents | 48ca8248e9cc |
children |
rev | line source |
---|---|
#!/usr/bin/perl
#
# Usage: grep-sentences.pl REGEXP FILE...
#
# Prints every sentence of the given UTF-8 text files that contains a match
# of REGEXP delimited by word boundaries.  The first match in each sentence
# is highlighted with an ANSI colour escape, and each printed sentence is
# terminated with "." and followed by an empty line.  At most MAX_PRINTED
# sentences are printed in total, counted across all files.
#
# A FILE of "-" reads standard input.  Files that cannot be opened are
# reported on STDERR and skipped.

use strict;
use warnings;
use utf8;
use Encode;

binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";

# Upper bound on the number of sentences printed per run.
use constant MAX_PRINTED => 10;

# Split raw page text into a list of cleaned-up sentences.  The returned
# sentences carry no terminating punctuation.
sub _split_sentences {
    my ($text) = @_;

    # Drop URLs first: their dots would otherwise be taken for sentence ends.
    $text =~ s@https?://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g;

    # Join lines with a space (not with nothing), so that the last word of a
    # line does not fuse with the first word of the next one.
    $text =~ s@\n@ @g;

    # Hide the dots of common abbreviations from the sentence splitter.
    $text =~ s@(Mr|Mrs|viz)\.@$1POINT@g;
    $text =~ s@e\.g\.@ePOINTgPOINT@g;
    $text =~ s@i\.e\.@iPOINTePOINT@g;

    my @sentences = split /[.!?]/, $text;
    for (@sentences) {
        # Restore the abbreviation dots hidden above.
        s@iPOINTePOINT@i.e.@g;
        s@ePOINTgPOINT@e.g.@g;
        s@(Mr|Mrs|viz)POINT@$1.@g;

        # Remove reference markers such as "[12]" before trimming, so that a
        # marker at the end of a sentence does not leave a trailing space.
        s/\[[0-9]+\]//g;
        s/\s+/ /g;
        s/^\s*//;
        s/\s*$//;
    }
    return @sentences;
}

# Search the files in @pages for sentences matching $regexp and print them.
# $regexp is the pattern as received on the command line (undecoded bytes).
# Returns nothing; returns early once MAX_PRINTED sentences were printed.
sub main {
    my ($regexp, @pages) = @_;

    # Command-line arguments arrive as bytes; decode them so that non-ASCII
    # patterns can match the decoded page text.
    $regexp = decode("utf8", $regexp);

    # One pattern serves both for selecting and for highlighting.  The group
    # keeps an alternation such as "foo|bar" inside the word boundaries.
    my $word = qr/\b($regexp)\b/;

    my $mark_start = "\e[35;1m";
    my $mark_stop  = "\e[0m";
    my $printed    = 0;

    for my $page (@pages) {
        my $fh;
        if ($page eq '-') {
            $fh = \*STDIN;
        }
        elsif (!open($fh, '<:utf8', $page)) {
            warn "$0: cannot open '$page': $!\n";
            next;
        }

        # Slurp the whole file; sentences may span many lines.
        my $text = do { local $/; <$fh> };
        close $fh unless $page eq '-';
        next unless defined $text;

        for my $sentence (_split_sentences($text)) {
            # Highlight the first match only; skip sentences without one.
            next unless $sentence =~ s/$word/$mark_start$1$mark_stop/;
            print "$sentence.\n\n";
            return if ++$printed >= MAX_PRINTED;
        }
    }
    return;
}

# Without arguments there is nothing to search for and nothing to read.
main(@ARGV) if @ARGV;