php抓取网页信息(一)

版权声明:[email protected] https://blog.csdn.net/qq_36324113/article/details/88953422

php抓取网页信息的N种方法

一.symfony/dom-crawler+guzzlehttp/guzzle

1.安装两个组件

 composer require guzzlehttp/guzzle
 composer require symfony/dom-crawler

2.仿照代码编写
有点基础的都能看懂,看不懂的可以联系博主,详细用法稍后更新。
q:4612006
index.php

<?php

// Load the Composer autoloader relative to this file rather than the current
// working directory, so the script also works when started from elsewhere.
require_once __DIR__ . '/vendor/autoload.php';
// Declare the output as UTF-8 so the scraped Chinese text renders correctly.
header('Content-type:text/html;charset=UTF-8');

/**
 * Scrapes bulletin detail pages from ccgp-liaoning.gov.cn.
 *
 * The constructor reads the listing page to find the newest bulletin id;
 * execute() then walks backwards from that id and fetches each detail page.
 */
class Command
{
    // Base URL of a bulletin detail page; the bulletin id is appended to it.
    const URL = 'http://www.ccgp-liaoning.gov.cn/bulletin.do?method=showbulletin&bulletin_id=';

    // Newest bulletin id found on the listing page (0 until the constructor sets it).
    private $max_page = 0;

    // Number of bulletin pages fetched by one execute() run.
    private $batch_size = 30;

    /**
     * Reads the listing page and stores the id of its first table row as the
     * newest bulletin id. Prints a "no data" message and exits when the page
     * has no rows or the first row carries no usable id.
     *
     * @throws \GuzzleHttp\Exception\GuzzleException when the listing page cannot be fetched
     */
    public function __construct ()
    {
        $url    = 'http://www.ccgp-liaoning.gov.cn/bulletininfo.do?method=bdetail&treenum=05&treenumfalse=';
        $client = new \GuzzleHttp\Client([
            'timeout' => 10 ,
        ]);
        $rows   = $this->fetchCrawler($client ,$url)->filterXPath('//tbody/tr');

        // first()->attr() throws on an empty node list, so test the count first;
        // a missing id attribute yields null, which casts to 0.
        $newest_id = $rows->count() > 0 ? (int)$rows->first()->attr('id') : 0;
        if ($newest_id > 0) {
            $this->max_page = $newest_id;
        } else {
            echo '暂无数据';
            exit;
        }
    }

    /**
     * Fetches the most recent bulletin pages, newest first, then prints the
     * collected data and terminates the script.
     *
     * @throws \GuzzleHttp\Exception\GuzzleException when a detail page cannot be fetched
     */
    public function execute ()
    {
        header('Content-type:text/html;charset=UTF-8');
        $url    = self::URL;
        $client = new \GuzzleHttp\Client([
            'timeout' => 10 ,
        ]);
        $data   = [];

        // Exclusive lower bound: exactly $batch_size ids are visited, and the
        // loop never walks into ids of zero or below.
        $stop_before = max(0 ,$this->max_page - $this->batch_size);
        for ($i = $this->max_page; $i > $stop_before; $i--) {
            $crawler = $this->fetchCrawler($client ,$url . $i);
            try {
                $err = $crawler->filterXPath('//*[@class="newinfotr1"]')->text();
                if (!empty($err)) {
                    //TODO your business logic (e.g. collect fields into $data)
                }
            } catch (\Exception $e) {
                // Deliberate best-effort: text() throws when the page has no
                // matching node, so such pages are simply skipped.
                continue;
            }
        }
        print_r($data);
        exit;
    }

    /**
     * Downloads a page, converts its body to UTF-8 using the charset announced
     * in the Content-Type header, and loads it into a DomCrawler instance.
     *
     * @param \GuzzleHttp\Client $client HTTP client used for the request
     * @param string             $url    absolute URL to fetch
     * @return \Symfony\Component\DomCrawler\Crawler
     * @throws \GuzzleHttp\Exception\GuzzleException when the request fails
     */
    private function fetchCrawler (\GuzzleHttp\Client $client ,$url)
    {
        $response = $client->request('GET' ,$url);
        $body     = (string)$response->getBody();

        // Read the charset straight from the header line. This tolerates a
        // missing header or missing charset parameter (falls back to UTF-8)
        // and avoids the function-style parse_header() helper, which
        // guzzlehttp/psr7 2.x no longer ships.
        $charset = 'UTF-8';
        $header  = $response->getHeaderLine('Content-Type');
        if (preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i' ,$header ,$match)) {
            $charset = $match[1];
        }

        $crawler = new \Symfony\Component\DomCrawler\Crawler();
        $crawler->addHtmlContent(mb_convert_encoding($body ,'UTF-8' ,$charset));
        return $crawler;
    }

    /**
     * Removes all spacing characters from a string: tabs, line breaks,
     * ordinary spaces, non-breaking spaces (what &nbsp; decodes to) and
     * ideographic full-width spaces.
     *
     * @param string $str text to clean
     * @return string the text without any of the characters listed above
     */
    private function clearHtml ($str)
    {
        $str = trim($str);

        // One Unicode-aware pass replaces the chain of single-character
        // replacements and also covers U+00A0 / U+3000, which the byte-wise
        // patterns could not match.
        $cleaned = preg_replace('/[\t\r\n \x{00A0}\x{3000}]/u' ,'' ,$str);
        if ($cleaned === null) {
            // The input is not valid UTF-8: fall back to stripping ASCII
            // whitespace byte-wise instead of returning null.
            $cleaned = preg_replace('/[\t\r\n ]/' ,'' ,$str);
        }
        return trim($cleaned);
    }
}

// Entry point: build the crawler (which looks up the newest bulletin id)
// and start scraping.
$command = new Command();
$command->execute();

猜你喜欢

转载自blog.csdn.net/qq_36324113/article/details/88953422
今日推荐