#!/usr/bin/perl
# Scrapes the href of every <a> and the src of every <img> from an inline
# HTML fixture with Web::Scraper and dumps the resulting structure to STDOUT.
use strict;
use warnings;
use Web::Scraper;
use URI;
use Data::Dumper;

# URL passed to scrape() as the location of the document being scraped.
my $page_url = 'http://127.0.0.1/sandbox/test.html';

# Fixture markup. The heredoc is single-quoted so that a '$' or '@' in the
# HTML can never be interpolated by Perl.
my $html = <<'END_OF_HTML';
<html>
<head>
<base href="file:///diggin.musicrider.com/" />
<base href="http://diggin.musicrider.com/" />
<base href="file://musicrider.com/" />
</head>
<body> <a href="?param=get">get</a>
<hr /><a href="test12.html">aaa</a>
<hr /><a href="http://www.exaample.net/test12.html">aaa</a>
<hr /><a>aaa</a>
<br /><a href="file://test">test</a>
<img src="image/760.jpg" />
</body>
</html>
END_OF_HTML

# 'link' collects the @href of each <a>; 'images' collects the @src of each <img>.
my $scraper = scraper {
    process 'a',   'link[]'   => '@href';
    process 'img', 'images[]' => '@src';
};

my $res = $scraper->scrape($html, $page_url);
print Dumper($res);
# The <base> elements in <head> are not taken into account.
# <a> elements that have no href attribute are captured as well.
$VAR1 = {
'link' => [
bless( do{\(my $o = 'http://127.0.0.1/sandbox/test.html?param=get')}, 'URI::http' ),
bless( do{\(my $o = 'http://127.0.0.1/sandbox/test12.html')}, 'URI::http' ),
bless( do{\(my $o = 'http://www.exaample.net/test12.html')}, 'URI::http' ),
bless( do{\(my $o = 'http://127.0.0.1/sandbox/test.html')}, 'URI::http' ),
bless( do{\(my $o = 'file://test')}, 'URI::file' )
],
'images' => [
bless( do{\(my $o = 'http://127.0.0.1/sandbox/image/760.jpg')}, 'URI::http' )
]
};
<?php
require_once 'Diggin/Scraper.php';
require_once 'Zend/Http/Client.php';
require_once 'Zend/Http/Client/Adapter/Test.php';

// URL given both to the HTTP client and to scrape().
$url = 'http://127.0.0.1/sandbox/test.html';

// Fixture markup used as the body of the prepared response.
$html = <<<HTML
<html>
<head>
<base href="file:///diggin.musicrider.com/" />
<base href="http://diggin.musicrider.com/" />
<base href="file://musicrider.com/" />
</head>
<body> <a href="?param=get">get</a>
<hr /><a href="test12.html">aaa</a>
<hr /><a href="http://www.exaample.net/test12.html">aaa</a>
<hr /><a>aaa</a>
<br /><a href="file://test">test</a>
<img src="image/760.jpg" />
</body>
</html>
HTML;

// Raw HTTP response text: status line, one header, blank line, then the body.
$rawResponse = "HTTP/1.1 200 OK\r\n"
             . "Content-type: text/xml\r\n"
             . "\r\n"
             . $html;

// Hand the prepared response to the test adapter and wire it into the client.
$adapter = new Zend_Http_Client_Adapter_Test();
$adapter->setResponse($rawResponse);
$client = new Zend_Http_Client($url, array('adapter' => $adapter));

$scraper = new Diggin_Scraper();
$scraper->setHttpClient($client);

// 'link' collects the @href of each <a>; 'images' collects the @src of each <img>.
$res = $scraper
    ->process('a', 'link[] => @href')
    ->process('img', 'images[] => @src')
    ->scrape($url);

var_dump($res);
array(2) {
["link"]=>
array(4) {
[0]=>
string(39) "http://diggin.musicrider.com/?param=get"
[1]=>
string(40) "http://diggin.musicrider.com/test12.html"
[2]=>
string(35) "http://www.exaample.net/test12.html"
[4]=>
string(11) "file://test"
}
["images"]=>
array(1) {
[0]=>
string(42) "http://diggin.musicrider.com/image/760.jpg"
}
}