new-words
annotate grep-sentences.pl @ 30:07d89c2505e7
lingvo en->ru by Andrii Grytsenko
| author | Igor Chubin <igor@chub.in> |
|---|---|
| date | Sun Jul 04 12:24:01 2010 +0200 (2010-07-04) |
| parents | 7db7bbf96fad |
| children | 48ca8248e9cc |
| rev | line source |
|---|---|
#!/usr/bin/perl
#
# grep-sentences.pl -- print every sentence of a text file that contains a
# given word (or pattern).
#
# Usage: grep-sentences.pl REGEXP FILE
#
#   REGEXP  Perl regular expression, UTF-8 encoded on the command line.
#           It is matched as a whole word, i.e. as \b(?:REGEXP)\b.
#   FILE    Path of the text file to search; it is read through the
#           :utf8 layer.
#
# Output: each matching sentence on STDOUT, followed by "." and an empty
# line.  The original sentence terminator (".", "!" or "?") is not kept.
#
# Exit status: non-zero (with a message on STDERR) when the arguments are
# missing or FILE cannot be opened.

use strict;
use warnings;
use utf8;
use Encode;

binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";

if (@ARGV < 2) {
    print STDERR "Usage: $0 REGEXP FILE\n";
    exit 1;
}

my $regexp = decode("utf8", $ARGV[0]);
my $page   = $ARGV[1];

# Compile the pattern once.  The non-capturing group keeps a top-level "|"
# in the user's pattern inside the word boundaries (\bfoo|bar\b would
# otherwise mean "\bfoo" OR "bar\b").
my $word_re = qr/\b(?:$regexp)\b/;

# Three-argument open: the file name is taken literally, so a value such
# as "cmd |" or ">file" cannot be interpreted as a pipe or an output mode.
open(my $page_fh, '<:utf8', $page)
    or die "$0: cannot open '$page': $!\n";
my $text = do { local $/; <$page_fh> };    # slurp the whole file
close($page_fh);
$text = '' unless defined $text;

for my $sentence (split_sentences($text)) {
    print "$sentence.\n\n" if $sentence =~ $word_re;
}

# split_sentences($text)
#
# Splits $text into sentences at ".", "!" and "?" and returns them as a
# list of cleaned-up strings: URLs and "[n]" reference markers removed,
# whitespace runs collapsed to one space, no leading/trailing whitespace.
# The dots of a few known abbreviations (Mr., Mrs., viz., e.g., i.e.) are
# not treated as sentence ends.
sub split_sentences {
    my ($text) = @_;

    # Drop URLs first: their dots would otherwise be taken as sentence ends.
    $text =~ s{https?://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*}{}g;

    # Join lines with a space (not with nothing), so that the last word of
    # one line and the first word of the next stay separate words.
    $text =~ s/\n/ /g;

    # Hide the dots of known abbreviations behind a placeholder while the
    # text is being split.
    $text =~ s/(Mr|Mrs|viz)\./$1POINT/g;
    $text =~ s/e\.g\./ePOINTgPOINT/g;
    $text =~ s/i\.e\./iPOINTePOINT/g;

    my @sentences;
    for my $sentence (split /[.!?]/, $text) {
        # Put the abbreviation dots back.
        $sentence =~ s/iPOINTePOINT/i.e./g;
        $sentence =~ s/ePOINTgPOINT/e.g./g;
        $sentence =~ s/(Mr|Mrs|viz)POINT/$1./g;

        # Remove "[12]"-style reference markers before normalising
        # whitespace, so that a marker at either end leaves no stray space.
        $sentence =~ s/\[[0-9]+\]//g;
        $sentence =~ s/\s+/ /g;
        $sentence =~ s/^\s+//;
        $sentence =~ s/\s+$//;

        push @sentences, $sentence;
    }
    return @sentences;
}
