$html = curl($url); libxml_use_internal_errors(true); $dom = new DOMDocument(); $dom->loadHTML($html);
<?php
// Load a remote HTML page and print the text of its <title> element.

// Real-world HTML is rarely well-formed: buffer libxml's parse errors
// internally instead of having each one emitted as a PHP warning.
libxml_use_internal_errors(true);

$doc = new DOMDocument();
// loadHTMLFile() returns false when the page cannot be loaded, so stop here
// rather than running the XPath query against an empty document.
if ($doc->loadHTMLFile('http://netkiller.github.io/') === false) {
    throw new RuntimeException('Unable to load http://netkiller.github.io/');
}
// Discard the buffered parse errors; they are not inspected.
libxml_clear_errors();

$xpath = new DOMXPath($doc);
// Wrapping the path in string() makes evaluate() return the title text itself.
$title = $xpath->evaluate('string(/html/head/title)');
echo "Document title is: " . $title . "\n";
?>
<?php
/**
 * Fetch a URL with cURL and return the response body.
 *
 * Issues a GET request by default. When $fields is non-empty the request
 * becomes a POST whose body is $fields encoded with http_build_query().
 * The requested URL is also sent as the Referer, together with a fixed
 * Firefox User-Agent string. No custom request headers, cookies or HTTP
 * authentication are configured.
 *
 * @param string $url    URL to request.
 * @param array  $fields Form fields to POST; an empty array means GET.
 * @param mixed  $auth   Kept for backward compatibility; currently ignored.
 *
 * @return string|false The response body, or false when the cURL handle
 *                      cannot be created or the transfer fails.
 */
function curl($url, $fields = array(), $auth = false)
{
    $curl = curl_init($url);
    if ($curl === false) {
        // Without a handle every later curl_* call would fail.
        return false;
    }

    curl_setopt_array($curl, array(
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_VERBOSE        => false,
        // Keep the response headers out of the returned string.
        CURLOPT_HEADER         => false,
        CURLOPT_REFERER        => $url,
        // NOTE(review): this turns off TLS certificate verification, which
        // exposes HTTPS requests to man-in-the-middle attacks. It is kept
        // only to preserve existing behaviour; enable verification if possible.
        CURLOPT_SSL_VERIFYPEER => false,
    ));

    if ($fields) {
        curl_setopt($curl, CURLOPT_POST, true);
        curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($fields));
    }

    $response = curl_exec($curl);

    // Since PHP 8.0 curl_close() has no effect (the handle is an object that
    // is released automatically); it is only needed on older versions.
    if (PHP_VERSION_ID < 80000) {
        curl_close($curl);
    }

    return $response;
}
// Fetch the journal index page and dump every <div class="section"> element.
$url = "http://netkiller.github.io/journal/index.html";
$html = curl($url);
// curl() hands back whatever curl_exec() returned, which is false on a failed
// transfer; an empty body cannot be parsed either, so stop before loadHTML().
if (!is_string($html) || $html === '') {
    throw new RuntimeException('Unable to fetch ' . $url);
}

// Buffer libxml's complaints about malformed markup instead of emitting warnings.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
if ($dom->loadHTML($html) === false) {
    throw new RuntimeException('Unable to parse the HTML returned by ' . $url);
}
// The buffered parse errors are not inspected; release them.
libxml_clear_errors();

$xpath = new DOMXPath($dom);
$xml = $xpath->query('//div[@class="section"]');
foreach ($xml as $result_object) {
    // Alternative: print only the text of the section's first child node.
    //echo $result_object->childNodes->item(0)->nodeValue;
    print_r($result_object);
}
下面的例子演示如何从某个网站抓取一个 HTML 块:
// Fetch the journal index page and print the HTML of its first
// <div class="section"> element.
$url = "http://netkiller.github.io/journal/index.html";
$html = curl($url);
// curl() hands back whatever curl_exec() returned, which is false on a failed
// transfer; an empty body cannot be parsed either, so stop before loadHTML().
if (!is_string($html) || $html === '') {
    throw new RuntimeException('Unable to fetch ' . $url);
}

// Buffer libxml's complaints about malformed markup instead of emitting warnings.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
if ($dom->loadHTML($html) === false) {
    throw new RuntimeException('Unable to parse the HTML returned by ' . $url);
}
// The buffered parse errors are not inspected; release them.
libxml_clear_errors();

$xpath = new DOMXPath($dom);
$xml = $xpath->query('//div[@class="section"]');
// item(0) is null when nothing matched, and saveHTML(null) would serialize
// the entire document instead of a single block, so fail explicitly.
$section = $xml->item(0);
if ($section === null) {
    throw new RuntimeException('No <div class="section"> element found in ' . $url);
}
$xhtml = $dom->saveHTML($section);
print_r($xhtml);