new-words
diff grep-sentences.pl @ 67:87bb1c5e6616
added de script to misc/
author | Igor Chubin <igor@chub.in> |
---|---|
date | Wed Mar 28 15:54:30 2012 +0200 (2012-03-28) |
parents | 48ca8248e9cc |
children |
line diff
1.1 --- a/grep-sentences.pl Tue Aug 17 21:35:57 2010 +0200
1.2 +++ b/grep-sentences.pl Wed Mar 28 15:54:30 2012 +0200
1.3 @@ -9,30 +9,39 @@
1.4  $regexp=decode("utf8",$regexp);
1.5  
1.6  $page=$ARGV[1];
1.7 +shift @ARGV;
1.8  $number_of_printed = 0;
1.9  $number_of_printed_max = 10;
1.10  #if (open(PAGE, "lynx -dump '$page'|")) {
1.11 -if (open(PAGE, "$page")) {
1.12 -    binmode PAGE,":utf8";
1.13 -    local $/;
1.14 -    $text=<PAGE>;
1.15 -    $text =~ s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g;
1.16 -    $text =~ s@\n@@g;
1.17 -    $text =~ s@(Mr|Mrs|viz)\.@\1POINT@g;
1.18 -    $text =~ s@e\.g\.@ePOINTgPOINT@g;
1.19 -    $text =~ s@i\.e\.@iPOINTePOINT@g;
1.20 -    @sentences=split /[.!?]/, $text;
1.21 -    for (@sentences) {
1.22 -        s@iPOINTePOINT@i\.e\.@g;
1.23 -        s@ePOINTgPOINT@e\.g\.@g;
1.24 -        s@(Mr|Mrs|viz)POINT@\1.@g;
1.25 -        s/^\s*//;
1.26 -        s/\s*$//;
1.27 -        s/\[[0-9]+\]//g;
1.28 -        s/\s+/ /g;
1.29 -        if (/\b$regexp\b/ and $number_of_printed < $number_of_printed_max ) {
1.30 -            print "$_.\n\n";
1.31 -            $number_of_printed++;
1.32 +for $page (@ARGV) {
1.33 +    if (open(PAGE, "$page")) {
1.34 +        binmode PAGE,":utf8";
1.35 +        local $/;
1.36 +        $text=<PAGE>;
1.37 +        $text =~ s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g;
1.38 +        $text =~ s@\n@@g;
1.39 +        $text =~ s@(Mr|Mrs|viz)\.@\1POINT@g;
1.40 +        $text =~ s@e\.g\.@ePOINTgPOINT@g;
1.41 +        $text =~ s@i\.e\.@iPOINTePOINT@g;
1.42 +        @sentences=split /[.!?]/, $text;
1.43 +        for (@sentences) {
1.44 +            s@iPOINTePOINT@i\.e\.@g;
1.45 +            s@ePOINTgPOINT@e\.g\.@g;
1.46 +            s@(Mr|Mrs|viz)POINT@\1.@g;
1.47 +            s/^\s*//;
1.48 +            s/\s*$//;
1.49 +            s/\[[0-9]+\]//g;
1.50 +            s/\s+/ /g;
1.51 +            if (/\b$regexp\b/ and $number_of_printed < $number_of_printed_max ) {
1.52 +                $mark_start = "\e[35;1m";
1.53 +                $mark_stop = "\e[0m";
1.54 +                s/\b($regexp)\b/$mark_start$1$mark_stop/;
1.55 +                print "$_.\n\n";
1.56 +                $number_of_printed++;
1.57 +            }
1.58 +            if ($number_of_printed >= $number_of_printed_max) {
1.59 +                exit(0);
1.60 +            };
1.61          }
1.62      }
1.63  }