版权声明:[email protected] https://blog.csdn.net/qq_36324113/article/details/88953422
php抓取网页信息的N种方法
一.symfony/dom-crawler+guzzlehttp/guzzle
1.安装两个组件
composer require guzzlehttp/guzzle
composer require symfony/dom-crawler
2.仿照代码编写
有一定 PHP 基础的读者都能看懂;看不懂的可以联系博主,详细用法稍后更新。
q:4612006
index.php
<?php
// Load the Composer autoloader relative to this file instead of the current
// working directory, so the script also works when it is started from
// another directory (a leading "./" is resolved against the CWD).
require_once __DIR__ . '/vendor/autoload.php';
// Declare the output as UTF-8 before anything is printed.
header('Content-type:text/html;charset=UTF-8');
/**
 * Crawls bulletin pages from www.ccgp-liaoning.gov.cn.
 *
 * The constructor reads the bulletin list page and takes the `id` attribute of
 * its first table row as the newest bulletin id. execute() then walks backwards
 * from that id and downloads one detail page per id.
 */
class Command
{
    // Detail page URL prefix; the bulletin id is appended to it.
    const URL = 'http://www.ccgp-liaoning.gov.cn/bulletin.do?method=showbulletin&bulletin_id=';

    // List page whose first table row carries the newest bulletin id.
    const LIST_URL = 'http://www.ccgp-liaoning.gov.cn/bulletininfo.do?method=bdetail&treenum=05&treenumfalse=';

    // Number of detail pages fetched by one execute() run.
    const BATCH_SIZE = 30;

    // Newest bulletin id found on the list page.
    private $max_page = 0;

    // HTTP client shared by all requests (10 second timeout).
    private $client;

    /**
     * Creates the HTTP client and looks up the newest bulletin id.
     *
     * Prints '暂无数据' and terminates the script when the list page has no
     * table row or the first row has no usable id.
     *
     * @throws \GuzzleHttp\Exception\GuzzleException when the list page request fails
     */
    public function __construct()
    {
        $this->client = new \GuzzleHttp\Client([
            'timeout' => 10,
        ]);

        $rows = $this->fetchPage(self::LIST_URL)->filterXPath('//tbody/tr');
        // attr() throws on an empty node list, so the "no data" branch is only
        // reachable if the row count is checked first.
        $newestId = $rows->count() > 0 ? $rows->first()->attr('id') : null;
        if (empty($newestId)) {
            echo '暂无数据';
            exit;
        }

        // Assumes the row id is the plain numeric bulletin id, since it is
        // used in arithmetic below. TODO confirm against the live page.
        $this->max_page = (int)$newestId;
    }

    /**
     * Runs the crawl: fetches the newest BATCH_SIZE bulletin pages, prints the
     * collected data with print_r() and terminates the script.
     *
     * @throws \GuzzleHttp\Exception\GuzzleException when a page request fails
     */
    public function execute()
    {
        header('Content-type:text/html;charset=UTF-8');

        $data = [];
        // Exactly BATCH_SIZE ids, newest first; never request non-positive ids.
        $oldestId = max(1, $this->max_page - self::BATCH_SIZE + 1);
        for ($id = $this->max_page; $id >= $oldestId; $id--) {
            $nodes = $this->fetchPage(self::URL . $id)->filterXPath('//*[@class="newinfotr1"]');
            if ($nodes->count() === 0) {
                // Page without the expected element: skip it instead of
                // relying on text() throwing and swallowing the exception.
                continue;
            }

            $text = $nodes->text();
            if (!empty($text)) {
                //TODO your business logic: store what you need from $text in $data
                // (clearHtml() is available to strip whitespace from it).
            }
        }

        print_r($data);
        exit;
    }

    /**
     * Downloads a page and returns it as a crawler over UTF-8 HTML.
     *
     * The body is converted from the charset announced in the Content-Type
     * header because the site does not serve UTF-8.
     *
     * @param string $url absolute URL to fetch
     * @return \Symfony\Component\DomCrawler\Crawler
     * @throws \GuzzleHttp\Exception\GuzzleException when the request fails
     */
    private function fetchPage($url)
    {
        $response = $this->client->request('GET', $url);
        // Casting the stream reads it from the start, unlike getContents().
        $body = (string)$response->getBody();
        $charset = self::detectCharset($response->getHeaderLine('Content-Type'));

        $crawler = new \Symfony\Component\DomCrawler\Crawler();
        $crawler->addHtmlContent(mb_convert_encoding($body, 'UTF-8', $charset));

        return $crawler;
    }

    /**
     * Extracts the charset parameter from a Content-Type header value.
     *
     * Uses only the header line, so it does not depend on the
     * \GuzzleHttp\Psr7\parse_header() function, which newer guzzlehttp/psr7
     * releases no longer provide.
     *
     * @param string $contentType e.g. "text/html; charset=GBK"
     * @return string the announced charset, or 'UTF-8' when none is present
     */
    private static function detectCharset($contentType)
    {
        if (preg_match('/;\s*charset\s*=\s*["\']?([A-Za-z0-9._:-]+)/i', (string)$contentType, $match)) {
            return $match[1];
        }

        return 'UTF-8';
    }

    /**
     * Removes all whitespace from a string: tabs, line breaks, spaces and the
     * HTML non-breaking space (as the U+00A0 character or a literal "&nbsp;").
     *
     * @param string $str
     * @return string
     */
    private function clearHtml($str)
    {
        $str = (string)$str;

        $cleaned = preg_replace('/&nbsp;|[\t\r\n \x{00A0}]/u', '', $str);
        if ($cleaned === null) {
            // The /u modifier rejects invalid UTF-8; fall back to a byte-wise
            // replacement so the input is not lost.
            $cleaned = str_replace(["\t", "\r", "\n", ' ', '&nbsp;'], '', $str);
        }

        return trim($cleaned);
    }
}
// Entry point: construct the command and run the crawl.
$command = new Command();
$command->execute();