new-words
annotate grep-sentences.pl @ 45:5f90e44eecfc
new-words.py: turn words filtering and grouping on and off
author | Igor Chubin <igor@chub.in> |
---|---|
date | Fri Feb 04 06:18:50 2011 +0100 (2011-02-04) |
parents | c631833fa2be |
children | 3a61988109a8 |
rev | line source |
---|---|
#!/usr/bin/perl
#
# grep-sentences.pl REGEXP FILE
#
# Print the first few sentences of FILE that contain a whole-word match
# for REGEXP.  Each matching sentence is written to STDOUT followed by a
# full stop and a blank line.  At most MAX_PRINTED sentences are printed.
#
# REGEXP is taken from the command line as UTF-8 bytes and is used as a
# Perl regular expression (it is NOT quoted), wrapped in \b(?:...)\b.
# FILE is read as UTF-8 text; "-" means standard input.

use strict;
use warnings;
use utf8;
use Encode;

binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";

# Upper bound on the number of matching sentences printed per run.
use constant MAX_PRINTED => 10;

# Split a block of text into cleaned-up sentences.
#
# URLs are removed, line breaks become spaces, and the text is cut at
# every '.', '!' or '?'.  The dots of a few abbreviations (Mr., Mrs.,
# viz., e.g., i.e.) are temporarily replaced by the placeholder "POINT"
# so that they do not end a sentence.  Numeric footnote markers such as
# "[12]" are dropped and runs of whitespace are collapsed.
#
# Returns the list of sentences, without their terminating punctuation.
sub split_sentences {
    my ($text) = @_;

    # Drop URLs first: their dots would otherwise be taken for sentence ends.
    $text =~ s{http://[a-zA-Z&_.:/0-9%?=,\#+()\[\]~-]*}{}g;

    # A line break separates words, so turn it into a space instead of
    # deleting it (deleting it would glue "foo\nbar" into "foobar").
    $text =~ s/\n/ /g;

    # Hide abbreviation dots from the splitter.
    $text =~ s/(Mr|Mrs|viz)\./${1}POINT/g;
    $text =~ s/e\.g\./ePOINTgPOINT/g;
    $text =~ s/i\.e\./iPOINTePOINT/g;

    my @sentences;
    for my $sentence (split /[.!?]/, $text) {
        # Put the abbreviation dots back.
        $sentence =~ s/iPOINTePOINT/i.e./g;
        $sentence =~ s/ePOINTgPOINT/e.g./g;
        $sentence =~ s/(Mr|Mrs|viz)POINT/$1./g;

        $sentence =~ s/\[[0-9]+\]//g;    # footnote markers like [3]
        $sentence =~ s/\s+/ /g;

        # Trim last, so whitespace exposed by the removals above goes too.
        $sentence =~ s/^\s+//;
        $sentence =~ s/\s+$//;

        push @sentences, $sentence;
    }
    return @sentences;
}

# Return the sentences of $text that contain a whole-word match for
# $regexp, in document order, stopping after $max hits (default
# MAX_PRINTED).
sub matching_sentences {
    my ($text, $regexp, $max) = @_;
    $max = MAX_PRINTED unless defined $max;

    # Group the user's pattern so that the word boundaries apply to the
    # whole expression, even when it contains a top-level alternation.
    my $matcher = qr/\b(?:$regexp)\b/;

    my @found;
    for my $sentence (split_sentences($text)) {
        last if @found >= $max;
        push @found, $sentence if $sentence =~ $matcher;
    }
    return @found;
}

# Script entry point: read the page named on the command line and print
# the matching sentences.  Problems are reported on STDERR; the exit
# status is left untouched so existing callers see no difference.
sub main {
    my ($regexp, $page) = @_;

    if (!defined $regexp || !defined $page) {
        warn "usage: $0 REGEXP FILE\n";
        return;
    }
    $regexp = decode("utf8", $regexp);

    # Three-argument open: the file name can no longer smuggle in a mode
    # or a pipe.  "-" keeps its traditional meaning of standard input.
    my $fh;
    if ($page eq '-') {
        $fh = \*STDIN;
    }
    elsif (!open $fh, '<:utf8', $page) {
        warn "$0: cannot open '$page': $!\n";
        return;
    }

    my $text = do { local $/; <$fh> };
    close $fh unless $page eq '-';
    $text = '' unless defined $text;

    print "$_.\n\n" for matching_sentences($text, $regexp);
    return;
}

main(@ARGV);