XPath vs SimpleTag vs 正規表現? 全部出来ればいいんじゃないのかな?
作成中
http://diggin.googlecode.com/svn/trunk/library/Diggin/Scraper/
http://code.google.com/p/diggin/source/browse/trunk/library/Diggin/Scraper/
<?php
/**
 * Demo script: scrape one canned HTML document with several
 * Diggin_Scraper strategies and print each result with print_r().
 *
 * The HTTP response is set directly on a Zend_Http_Client_Adapter_Test
 * instance, which is handed to the Zend_Http_Client as its adapter.
 */
require_once 'Diggin/Scraper/Client.php';
require_once 'Zend/Http/Client.php';
require_once 'Zend/Http/Client/Adapter/Test.php';
require_once 'Diggin/Scraper/Strategy/Xpath/Adapter/Tidy.php';
require_once 'Diggin/Scraper/Strategy/Xpath/Adapter/Loadhtml.php';
require_once 'Diggin/Scraper/Strategy/Xpath/Adapter/Htmlparser.php';

// Markup under test. The stray "/rss" after </head>, the "tes</img>" after a
// self-closed <img>, and the repeated id are kept exactly as written --
// presumably deliberate, to exercise the parsers on sloppy input (TODO confirm).
$markup = <<<EOF
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>titleTest</title>
<link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="../rss" />
<link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="rss.xml" />
</head>/rss
<body>
<a href="./hogehoge">test</a>
<img id="id" src="/images/button.gif" alt="gazou" />tes</img>
<img id="id" src="../images/button.gif" alt="gazou" />
<img id="id" src="button.gif" alt="gazou" />
</body>
</html>
EOF;

// Raw HTTP response: status line, one header, a blank line, then the body.
$rawResponse = implode("\r\n", array(
    "HTTP/1.1 200 OK",
    "Content-type: text/xml",
    "",
    $markup,
));

$testAdapter = new Zend_Http_Client_Adapter_Test();
$testAdapter->setResponse($rawResponse);

$targetUrl = "http://www.example.org/hoge/do.php/test?test#test";
$http = new Zend_Http_Client($targetUrl, array('adapter' => $testAdapter));

$client = new Diggin_Scraper_Client();
$client->setHttpClient($http);

// Fetch every <a> with the XPath strategy, once per adapter (Tidy, then Loadhtml).
$xpathAdapters = array(
    new Diggin_Scraper_Strategy_Xpath_Adapter_Tidy(),
    new Diggin_Scraper_Strategy_Xpath_Adapter_Loadhtml(),
);
foreach ($xpathAdapters as $xpathAdapter) {
    $client->setStrategy("Diggin_Scraper_Strategy_Xpath", $xpathAdapter);
    print_r($client->scrape("//a"));
}

// Fetch with SimpleTag.
$client->setStrategy("Diggin_Scraper_Strategy_SimpleTag");
print_r($client->scrape("a"));

// Fetch with preg_match_all (for now).
$client->setStrategy("Diggin_Scraper_Strategy_Regex");
print_r($client->scrape("/<\/\w+>/"));
結果
Array ( [0] => SimpleXMLElement Object ( [@attributes] => Array ( [href] => ./hogehoge ) [0] => test ) ) Array ( [0] => SimpleXMLElement Object ( [@attributes] => Array ( [href] => ./hogehoge ) [0] => test ) ) Array ( [0] => SimpleTag Object ( [name] => a [value] => test [plain] => <a href="./hogehoge">test</a> [parameterList] => Array ( [href] => SimpleTagParameter Object ( [id] => href [value] => ./hogehoge [name] => href ) ) [attributeList] => Array ( ) [pos] => 240 [start] => <a href="./hogehoge"> [end] => </a> [normalization] => ) ) Array ( [0] => Array ( [0] => </title> [1] => </head> [2] => </a> [3] => </img> [4] => </body> [5] => </html> ) )