new-words

view grep-sentences.pl @ 30:07d89c2505e7

lingvo en->ru by Andrii Grytsenko
author Igor Chubin <igor@chub.in>
date Sun Jul 04 12:24:01 2010 +0200 (2010-07-04)
parents 7db7bbf96fad
children 48ca8248e9cc
line source
#!/usr/bin/perl
#
# grep-sentences.pl -- print the sentences of a text that contain a word.
#
# Usage: grep-sentences.pl REGEXP FILE
#
#   REGEXP  Perl regular expression, UTF-8 encoded on the command line.
#           A sentence is printed when REGEXP matches inside it, delimited
#           by word boundaries on both sides.
#   FILE    Path of the UTF-8 text to search, or "-" to read STDIN.
#
# The text is split into sentences on ".", "!" and "?".  Every matching
# sentence is written to STDOUT followed by "." and an empty line.

use strict;
use warnings;
use utf8;
use Encode;

binmode STDIN,  ':utf8';
binmode STDOUT, ':utf8';

die "Usage: $0 REGEXP FILE\n" if @ARGV < 2;

my ($regexp_arg, $page) = @ARGV;

# Command-line arguments arrive as raw bytes; turn them into characters so
# that non-ASCII words match the decoded text.
my $regexp = decode('utf8', $regexp_arg);

# Compile once.  The explicit group keeps both \b anchors applied to the
# whole expression even when REGEXP contains a top-level alternation.
my $word_re = qr/\b(?:$regexp)\b/;

# Three-argument open: FILE is always treated as a plain path and can never
# be interpreted as a mode or a pipe.  "-" is accepted explicitly for STDIN.
my $fh;
if ($page eq '-') {
    $fh = \*STDIN;
}
else {
    open($fh, '<:encoding(UTF-8)', $page)
        or die "$0: cannot open '$page': $!\n";
}

# Slurp the whole input in one read.
my $text = do { local $/; <$fh> };
$text = '' unless defined $text;
close($fh) unless $page eq '-';

# Drop URLs: their dots and question marks would be taken for sentence ends.
$text =~ s{https?://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*}{}g;

# Join lines with a space so that the words on either side of a line break
# are not fused into one.
$text =~ s{\n}{ }g;

# Hide the dots of known abbreviations behind a placeholder so that the
# split below does not cut a sentence at them.
$text =~ s{(Mr|Mrs|viz)\.}{${1}POINT}g;
$text =~ s{e\.g\.}{ePOINTgPOINT}g;
$text =~ s{i\.e\.}{iPOINTePOINT}g;

for my $sentence (split /[.!?]/, $text) {
    # Put the abbreviation dots back.
    $sentence =~ s{iPOINTePOINT}{i.e.}g;
    $sentence =~ s{ePOINTgPOINT}{e.g.}g;
    $sentence =~ s{(Mr|Mrs|viz)POINT}{${1}.}g;

    # Remove bracketed numeric markers such as "[12]".
    $sentence =~ s{\[[0-9]+\]}{}g;

    # Normalise whitespace; trim last so that removing a marker at either
    # end cannot leave a stray blank behind.
    $sentence =~ s{\s+}{ }g;
    $sentence =~ s{^\s+}{};
    $sentence =~ s{\s+$}{};

    print "$sentence.\n\n" if $sentence =~ $word_re;
}