使用phpquerylist爬取csdn文章

1、使用 Composer 安装 QueryList 4
在项目目录下创建composer.json文件：

{
    "require": {
        "jaeger/querylist": "^4.0"
    }
}

命令行运行命令：

composer install

就可以得到vendor文件夹，文件结构如下图：
在这里插入图片描述
2、创建方法，采集、存储数据

<?php
/**
 * @Filename: index.php
 * @desc: Data collection — crawls CSDN search results and stores the articles.
 */
// NOTE(review): Composer normally generates vendor/autoload.php; this relative
// path only resolves if the script sits next to autoload.php — confirm the layout.
require('autoload.php');
use QL\QueryList;
// Remove the execution time limit: a crawl may fetch many remote pages in one request.
set_time_limit(0);
// Declare the response as UTF-8 HTML so the Chinese text renders correctly.
header("Content-type:text/html;charset=utf-8");


/**
 * Crawls CSDN search results for a keyword and stores newly found blog
 * articles in the `ay_content` table.
 *
 * All SQL goes through prepared statements: the title, author, body and URL
 * come from remote pages and the page number comes from the request, so none
 * of them may be interpolated into a query string.
 */
class catchInfo
{
    private $conn;
    private $host = '127.0.0.1';
    private $password = 'root';
    private $username = 'root';
    private $dbname = 'test';
    private $url = 'http://so.csdn.net/so/search/s.do?';
    // Currently unused; kept for the planned local image download.
    private $path = '../images/';
    // Maximum number of articles stored by one searchdata() call.
    private $batchSize = 10;
    // Hard cap on search pages visited per call, so a run always terminates.
    private $maxPages = 50;

    /**
     * Opens the database connection.
     *
     * @throws RuntimeException when the connection cannot be established.
     */
    public function __construct()
    {
        $conn = mysqli_connect($this->host, $this->username, $this->password, $this->dbname);
        if (!$conn) {
            // Fail fast: without a connection every later query would fail and
            // the crawl would keep fetching pages without ever storing anything.
            throw new RuntimeException('Database connection failed: ' . mysqli_connect_error());
        }
        $this->conn = $conn;
    }

    /**
     * Collects up to $batchSize new articles, starting at the given search page
     * and moving on to following pages while the batch is not full.
     *
     * @param mixed $keywords Search keywords; when empty, falls back to the
     *                        request parameter "keywords", then to 'php'.
     * @param mixed $page     First search page; when empty, falls back to the
     *                        request parameter "page", then to 1.
     * @return void
     * @throws RuntimeException when the duplicate lookup query cannot be run.
     */
    public function searchdata($keywords, $page)
    {
        if (empty($keywords)) {
            $keywords = !empty($_REQUEST['keywords']) ? $_REQUEST['keywords'] : 'php';
        }
        if (empty($page)) {
            $page = !empty($_REQUEST['page']) ? $_REQUEST['page'] : 1;
        }
        $page = max(1, (int) $page);

        phpQuery::$defaultCharset = "utf-8";
        $rule = array(
            'url' => array('.search-link a', 'href'),
        );

        $collected = 0;
        $lastPage = $page + $this->maxPages - 1;
        for (; $page <= $lastPage; $page++) {
            // http_build_query() URL-encodes the keywords (spaces, Chinese text, '&').
            $searchUrl = $this->url . http_build_query(array('p' => $page, 'q' => $keywords));
            $links = QueryList::get($searchUrl)->rules($rule)->queryData();
            if (empty($links)) {
                // No result links at all: past the last result page, stop here.
                return;
            }

            foreach ($links as $link) {
                $articleUrl = isset($link['url']) ? (string) $link['url'] : '';
                if (!$this->isArticleUrl($articleUrl) || $this->isStored($articleUrl)) {
                    continue;
                }
                if ($this->storeArticle($articleUrl, $page)) {
                    $collected++;
                }
                if ($collected >= $this->batchSize) {
                    // Return instead of exit so the rest of the script still runs.
                    return;
                }
            }
        }
    }

    /**
     * Tells whether a search result link points to a CSDN blog article.
     *
     * @param string $url
     * @return bool
     */
    private function isArticleUrl($url)
    {
        return strpos($url, 'blog.csdn') !== false
            && strpos($url, 'article/details') !== false;
    }

    /**
     * Tells whether an article with this URL is already in ay_content.
     *
     * @param string $articleUrl
     * @return bool
     * @throws RuntimeException when the lookup cannot be prepared or executed.
     */
    private function isStored($articleUrl)
    {
        $stmt = mysqli_prepare($this->conn, 'select id from ay_content where outlink = ?');
        if (!$stmt) {
            throw new RuntimeException('Failed to prepare lookup query: ' . mysqli_error($this->conn));
        }
        mysqli_stmt_bind_param($stmt, 's', $articleUrl);
        if (!mysqli_stmt_execute($stmt)) {
            $error = mysqli_stmt_error($stmt);
            mysqli_stmt_close($stmt);
            throw new RuntimeException('Failed to run lookup query: ' . $error);
        }
        // Buffer the result so the row count is available.
        mysqli_stmt_store_result($stmt);
        $found = mysqli_stmt_num_rows($stmt) > 0;
        mysqli_stmt_close($stmt);

        return $found;
    }

    /**
     * Fetches one article page and inserts it into ay_content.
     *
     * @param string $articleUrl Article address, stored as `outlink`.
     * @param int    $page       Search page the link was found on, stored as `sorting`.
     * @return bool True when the row was inserted.
     */
    private function storeArticle($articleUrl, $page)
    {
        $source = QueryList::get($articleUrl);
        $title = (string) $source->find('.article-title-box h1')->text();
        $author = (string) $source->find('.article-bar-top a')->text();
        // Stored HTML-escaped; show_aritcle() decodes it again. No addslashes()
        // here: bound parameters need no manual escaping.
        $content = htmlspecialchars((string) $source->find('.blog-content-box article')->html());

        $stmt = mysqli_prepare(
            $this->conn,
            'insert into ay_content(title,author,content,outlink,sorting) values(?,?,?,?,?)'
        );
        if (!$stmt) {
            return false;
        }
        mysqli_stmt_bind_param($stmt, 'ssssi', $title, $author, $content, $articleUrl, $page);
        $inserted = mysqli_stmt_execute($stmt);
        mysqli_stmt_close($stmt);

        return (bool) $inserted;
    }

    /**
     * Dumps the stored content of the article with id 1, HTML-decoded.
     * Dumps an empty string when the query fails or the row does not exist.
     *
     * @return void
     */
    public function show_aritcle()
    {
        $sql = "select content from ay_content where id = 1";
        $res = mysqli_query($this->conn, $sql);
        $row = $res ? mysqli_fetch_row($res) : null;
        $content = isset($row[0]) ? (string) $row[0] : '';
        var_dump(htmlspecialchars_decode($content));
    }
}
// Read the submitted form fields; a missing field becomes an empty string.
// (`??` needs PHP 7.0+, which the QueryList 4 dependency already requires.)
$type = $_POST['type'] ?? '';
$page = $_POST['page'] ?? '';
$keywords = $_POST['keywords'] ?? '';

$obj = new catchInfo();

// The action name comes from the request, so only call methods that are
// explicitly allowed — never an arbitrary method chosen by the client.
$allowedActions = array('searchdata', 'show_aritcle');
if (is_string($type) && in_array($type, $allowedActions, true)) {
    $obj->$type($keywords, $page);
}

 ?>

<!DOCTYPE html>
<!-- The page text is Chinese, so declare that language for browsers and assistive tools. -->
<html lang="zh-CN">
<head>
	<meta charset="UTF-8">
	<title>数据采集</title>
</head>
<body>
<!-- Posts back to this same script; the hidden "type" field names the collector action to run. -->
<form action="" method="post">
    <input type="hidden" name="type" value="searchdata">
    关键词：<input type="text" name="keywords">
    <!-- May be left empty (the collector then starts at page 1); negative page numbers are rejected. -->
    页码：<input type="number" name="page" min="1">
    <input type="submit" value="提交">
</form>
</body>
</html>

3、页面效果
在这里插入图片描述
4、采集结果

声明：本人所采集CSDN文章仅为学习用途，并未用于任何盈利性商业目的。
说明：采集过程中还有一些不尽如人意的地方。我尝试过多种办法把文章中的图片下载到本地，再用本地图片地址替换原链接，但都失败了：先是用 str_replace() 函数，把文章中的图片链接地址替换为本地图片存储的相对路径，没有成功；后来改用正则匹配图片链接地址，却只能匹配到部分链接，因为文章中的图片也可能来自第三方网址，匹配失败。若有朋友能解释 str_replace() 无法替换的原因，或者提供一个合适的正则表达式，在下就先行谢过了！

使用phpquerylist爬取csdn文章

猜你喜欢