new-words

diff grep-sentences.pl @ 67:87bb1c5e6616

added de script to misc/
author Igor Chubin <igor@chub.in>
date Wed Mar 28 15:54:30 2012 +0200 (2012-03-28)
parents 48ca8248e9cc
children
line diff
     1.1 --- a/grep-sentences.pl	Tue Aug 17 21:35:57 2010 +0200
     1.2 +++ b/grep-sentences.pl	Wed Mar 28 15:54:30 2012 +0200
     1.3 @@ -9,30 +9,39 @@
     1.4  $regexp=decode("utf8",$regexp);
     1.5  
     1.6  $page=$ARGV[1];
     1.7 +shift @ARGV;
     1.8  $number_of_printed = 0;
     1.9  $number_of_printed_max = 10;
    1.10  #if (open(PAGE, "lynx -dump '$page'|")) {
    1.11 -if (open(PAGE, "$page")) {
    1.12 -    binmode PAGE,":utf8";
    1.13 -    local $/;
    1.14 -    $text=<PAGE>;
    1.15 -    $text =~ s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g;
    1.16 -    $text =~ s@\n@@g;
    1.17 -    $text =~ s@(Mr|Mrs|viz)\.@\1POINT@g;
    1.18 -    $text =~ s@e\.g\.@ePOINTgPOINT@g;
    1.19 -    $text =~ s@i\.e\.@iPOINTePOINT@g;
    1.20 -    @sentences=split /[.!?]/, $text;
    1.21 -    for (@sentences) {
    1.22 -        s@iPOINTePOINT@i\.e\.@g;
    1.23 -        s@ePOINTgPOINT@e\.g\.@g;
    1.24 -        s@(Mr|Mrs|viz)POINT@\1.@g;
    1.25 -        s/^\s*//;
    1.26 -        s/\s*$//;
    1.27 -        s/\[[0-9]+\]//g;
    1.28 -        s/\s+/ /g;
    1.29 -        if (/\b$regexp\b/ and $number_of_printed < $number_of_printed_max ) {
    1.30 -            print "$_.\n\n";
    1.31 -            $number_of_printed++;
    1.32 +for $page (@ARGV) {
    1.33 +    if (open(PAGE, "$page")) {
    1.34 +        binmode PAGE,":utf8";
    1.35 +        local $/;
    1.36 +        $text=<PAGE>;
    1.37 +        $text =~ s@http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*@@g;
    1.38 +        $text =~ s@\n@@g;
    1.39 +        $text =~ s@(Mr|Mrs|viz)\.@\1POINT@g;
    1.40 +        $text =~ s@e\.g\.@ePOINTgPOINT@g;
    1.41 +        $text =~ s@i\.e\.@iPOINTePOINT@g;
    1.42 +        @sentences=split /[.!?]/, $text;
    1.43 +        for (@sentences) {
    1.44 +            s@iPOINTePOINT@i\.e\.@g;
    1.45 +            s@ePOINTgPOINT@e\.g\.@g;
    1.46 +            s@(Mr|Mrs|viz)POINT@\1.@g;
    1.47 +            s/^\s*//;
    1.48 +            s/\s*$//;
    1.49 +            s/\[[0-9]+\]//g;
    1.50 +            s/\s+/ /g;
    1.51 +            if (/\b$regexp\b/ and $number_of_printed < $number_of_printed_max ) {
    1.52 +                $mark_start = "\e[35;1m";
    1.53 +                $mark_stop = "\e[0m";
    1.54 +                s/\b($regexp)\b/$mark_start$1$mark_stop/;
    1.55 +                print "$_.\n\n";
    1.56 +                $number_of_printed++;
    1.57 +            }
    1.58 +            if ($number_of_printed >= $number_of_printed_max) {
    1.59 +                exit(0);
    1.60 +            };
    1.61          }
    1.62      }
    1.63  }