本帖最后由 523066680 于 2018-10-4 16:32 编辑
[Perl]抓取句子大全的网页
分两步,第一步先提取网页,然后在本地提取文本。
可以中途终止脚本,重新打开后会略过已经完成的条目。
# 抓取的 HTML 保存在 D:/temp/句子大全 文件夹
# Scrapes category pages from www.1juzi.com and saves every article's raw
# HTML under D:/temp/句子大全/<section>/<category>/. A category directory is
# created with a trailing "_" while downloading and renamed to its final name
# only when complete, so an interrupted run skips finished categories on
# restart.
use strict;
use warnings;
use utf8;
use Encode;
use File::Path;
use File::Slurp;
use LWP::UserAgent;
use File::Basename qw/basename/;
use Mojo::DOM;
STDOUT->autoflush(1);

# Folder name is GBK-encoded for the Windows (ANSI) filesystem API.
our $wdir = encode('gbk', "D:/temp/句子大全");
mkpath $wdir unless -e $wdir;
our $main = "http://www.1juzi.com";
# FIX: was misspelled "timemout", so the 8s timeout was silently ignored.
our $ua = LWP::UserAgent->new( keep_alive => 1, timeout => 8 );

my $res = $ua->get($main);
# FIX: abort early with a clear message instead of crashing later on an
# undef ->at(".header-menu") when the front page could not be fetched.
die "Failed to fetch $main: " . $res->status_line . "\n"
    unless $res->is_success();
my $html = $res->content();
my $dom  = Mojo::DOM->new($html);

my (@urls, @dirs);
get_item($dom, \@urls, \@dirs);

my $tdir;
for my $id ( 0 .. $#urls )
{
    printf "%s\n", $dirs[$id];
    next if -e $dirs[$id];    # finished in a previous run — skip
    $tdir = $dirs[$id] . "_";
    mkpath $tdir unless -e $tdir;
    get_alist( $main . $urls[$id], $tdir );
    # Mark the category as complete by dropping the "_" suffix.
    rename( $tdir, $dirs[$id] )
        or warn "rename $tdir -> $dirs[$id] failed: $!\n";
}

# get_item($dom, \@urls, \@dirs)
# Walks the site's ".header-menu" block and collects, in parallel arrays,
# each category link's href and the local target directory
# "<wdir>/<section title>/<category name>".
sub get_item
{
    our $wdir;
    my ($dom, $urls, $dirs) = @_;
    my $menu = $dom->at(".header-menu");

    for my $e ( $menu->find("ul li a")->each )
    {
        push @$urls, $e->attr("href");
        # Section title comes from the element preceding the <ul> that
        # contains this link (menu markup: heading, then link list).
        push @$dirs, sprintf "%s/%s/%s",
            $wdir, $e->parent->parent->previous->text, $e->text;
    }
}

# get_alist($url, $dir)
# Fetches one category page, extracts every article link from its ".alist"
# block and saves each article's HTML into $dir, named by the link's
# basename. Each article is retried on failure, up to 5 attempts.
sub get_alist
{
    our ($main, $ua);
    my ($url, $dir) = @_;
    my $res = $ua->get( $url );
    # FIX: a failed list-page fetch used to crash on ->at(".alist") being
    # undef; skip the category (it keeps its "_" dir and is retried later).
    unless ( $res->is_success() ) {
        warn "Failed to fetch list page $url: " . $res->status_line . "\n";
        return;
    }
    my $dom = Mojo::DOM->new( $res->content );
    my @links;
    @links = @{ $dom->at(".alist")->find("a")->map(attr => "href") };

    my $retry;
    for my $link ( @links )
    {
        printf "    %s\n", $link;
        $retry = 0;
        do
        {
            $res = $ua->get( $main . $link );
            $retry++;
            print "retry times: $retry\n" if ($retry > 1);
        }
        # FIX: cap retries — the original looped forever on a dead link.
        until ( $res->is_success() or $retry >= 5 );

        if ( $res->is_success() ) {
            write_file( $dir . "/" . basename($link), $res->content );
        }
        else {
            warn "giving up on $link after $retry attempts\n";
        }
    }
}