版权声明:CopyRight @CSDN 码农Robin https://blog.csdn.net/weixin_41423450/article/details/84068343
1、使用composer安装QueryList 4
在项目目录下创建composer.json文件:
{
"require": {
"jaeger/querylist": "^4.0"
}
}
命令行运行命令:
composer install
就可以得到vendor文件夹,文件结构如下图:
2、创建方法,采集、存储数据
<?php
/**
 * @Filename: index.php
 * @desc: Data collection
 */
// Pull in the autoloader before any class from the QL namespace is used.
// NOTE(review): presumably this is Composer's autoloader; the path is relative,
// so confirm where this script is expected to live relative to vendor/.
require('autoload.php');
use QL\QueryList;
// Disable PHP's execution time limit for this request.
set_time_limit(0);
// Announce the response as UTF-8 encoded HTML.
header("Content-type:text/html;charset=utf-8");
/**
 * Scrapes CSDN blog articles found through the CSDN search page and stores
 * them in the `ay_content` table.
 *
 * All SQL goes through prepared statements because every stored value (title,
 * author, content, URL) comes from remote pages or from request parameters.
 */
class catchInfo
{
    /** @var mysqli Open database connection shared by every query of this object. */
    private $conn;
    /** @var string Database host. */
    private $host = '127.0.0.1';
    /** @var string Database password. */
    private $password = 'root';
    /** @var string Database user. */
    private $username = 'root';
    /** @var string Database (schema) name. */
    private $dbname = 'test';
    /** @var string Search endpoint; the query string is appended to it. */
    private $url = 'http://so.csdn.net/so/search/s.do?';
    /** @var string Relative image directory; not read by any method of this class. */
    private $path = '../images/';
    /** @var int Maximum number of articles stored by one searchdata() call. */
    private $batchSize = 10;

    /**
     * Opens the database connection.
     *
     * @throws RuntimeException When the connection cannot be established.
     */
    public function __construct()
    {
        $this->conn = mysqli_connect($this->host, $this->username, $this->password, $this->dbname);
        if (!$this->conn) {
            throw new RuntimeException('Database connection failed: ' . mysqli_connect_error());
        }
    }

    /**
     * Walks the search result pages for a keyword and stores articles that are
     * not in the database yet.
     *
     * Starting at $page, each result page is fetched in turn. The method
     * returns as soon as $batchSize articles have been stored in total, or
     * when a result page contains no links at all (end of the results).
     * It returns normally rather than terminating the script, so the caller
     * can finish rendering the page.
     *
     * @param string     $keywords Search keyword; when empty, the `keywords`
     *                             request parameter is used, then 'php'.
     * @param int|string $page     First result page; when empty, the `page`
     *                             request parameter is used, then 1.
     * @return void
     * @throws RuntimeException When a statement cannot be prepared or the
     *                          duplicate lookup cannot be executed.
     */
    public function searchdata($keywords, $page)
    {
        if (empty($keywords)) {
            $keywords = !empty($_REQUEST['keywords']) ? $_REQUEST['keywords'] : 'php';
        }
        if (empty($page)) {
            $page = !empty($_REQUEST['page']) ? $_REQUEST['page'] : 1;
        }
        $keywords = (string) $keywords;
        // The page number ends up in a URL and in an integer column.
        $page = max(1, (int) $page);

        phpQuery::$defaultCharset = "utf-8";

        $lookup = $this->prepare('select id from ay_content where outlink = ?');
        $insert = $this->prepare(
            'insert into ay_content(title,author,content,outlink,sorting) values(?,?,?,?,?)'
        );

        try {
            $saved = 0;
            while ($saved < $this->batchSize) {
                // Encode the keyword so spaces, '&' or non-ASCII text cannot break the query string.
                $searchUrl = $this->url . 'p=' . $page . '&q=' . urlencode($keywords);
                $links = QueryList::get($searchUrl)
                    ->rules(['url' => ['.search-link a', 'href']])
                    ->queryData();

                // An empty result page means the search is exhausted; without
                // this stop the crawl would request further pages forever.
                if (empty($links)) {
                    break;
                }

                foreach ($links as $link) {
                    $articleUrl = isset($link['url']) ? (string) $link['url'] : '';
                    // Only CSDN blog article pages are collected. Compare with
                    // false explicitly: strpos() returns 0 for a match at offset 0.
                    if (strpos($articleUrl, 'blog.csdn') === false
                        || strpos($articleUrl, 'article/details') === false) {
                        continue;
                    }
                    if ($this->isStored($lookup, $articleUrl)) {
                        continue;
                    }
                    if ($this->storeArticle($insert, $articleUrl, $page)) {
                        $saved++;
                    }
                    if ($saved >= $this->batchSize) {
                        break;
                    }
                }
                $page++;
            }
        } finally {
            mysqli_stmt_close($lookup);
            mysqli_stmt_close($insert);
        }
    }

    /**
     * @Function show_aritcle Echoes the stored content of the row with id 1.
     * @Return: void
     */
    public function show_aritcle()
    {
        $sql = "select content from ay_content where id = 1";
        $res = mysqli_query($this->conn, $sql);
        $row = $res ? mysqli_fetch_row($res) : null;
        // A failed query, a missing row or a NULL column is dumped as an empty string.
        $content = isset($row[0]) ? $row[0] : '';
        // Content is stored HTML-escaped, so decode it before dumping.
        var_dump(htmlspecialchars_decode($content));
    }

    /**
     * Prepares a statement on the shared connection.
     *
     * @param string $sql SQL text with `?` placeholders.
     * @return mysqli_stmt
     * @throws RuntimeException When the statement cannot be prepared.
     */
    private function prepare($sql)
    {
        $stmt = mysqli_prepare($this->conn, $sql);
        if (!$stmt) {
            throw new RuntimeException('Failed to prepare statement: ' . mysqli_error($this->conn));
        }
        return $stmt;
    }

    /**
     * Tells whether an article URL is already present in `ay_content.outlink`.
     *
     * @param mysqli_stmt $lookup     Prepared duplicate-lookup statement.
     * @param string      $articleUrl Article URL to look up.
     * @return bool
     * @throws RuntimeException When the lookup cannot be executed.
     */
    private function isStored($lookup, $articleUrl)
    {
        mysqli_stmt_bind_param($lookup, 's', $articleUrl);
        if (!mysqli_stmt_execute($lookup)) {
            throw new RuntimeException('Duplicate lookup failed: ' . mysqli_stmt_error($lookup));
        }
        // Buffer the result so the row count is available, then release it
        // so the statement can be executed again for the next URL.
        mysqli_stmt_store_result($lookup);
        $found = mysqli_stmt_num_rows($lookup) > 0;
        mysqli_stmt_free_result($lookup);
        return $found;
    }

    /**
     * Fetches one article page and inserts its title, author and content.
     *
     * @param mysqli_stmt $insert     Prepared insert statement.
     * @param string      $articleUrl Article URL to fetch and to store as outlink.
     * @param int         $page       Search page the link came from (stored as sorting).
     * @return bool True when the row was inserted.
     */
    private function storeArticle($insert, $articleUrl, $page)
    {
        $source = QueryList::get($articleUrl);
        $title = (string) $source->find('.article-title-box h1')->text();
        $author = (string) $source->find('.article-bar-top a')->text();
        // Stored HTML-escaped; show_aritcle() decodes it again. No addslashes():
        // the value is bound as a parameter, not spliced into the SQL text.
        $content = htmlspecialchars((string) $source->find('.blog-content-box article')->html());

        mysqli_stmt_bind_param($insert, 'ssssi', $title, $author, $content, $articleUrl, $page);
        return mysqli_stmt_execute($insert);
    }
}
// Read the submitted form fields; a field that was not posted becomes ''.
$type = isset($_POST['type']) ? $_POST['type'] : '';
$page = isset($_POST['page']) ? $_POST['page'] : '';
$keywords = isset($_POST['keywords']) ? $_POST['keywords'] : '';

$obj = new catchInfo();

// The method name comes straight from the request, so only dispatch to the
// actions this page is meant to expose; anything else is ignored.
$allowedActions = ['searchdata', 'show_aritcle'];
if (is_string($type) && in_array($type, $allowedActions, true)) {
    $obj->$type($keywords, $page);
}
?>
<!DOCTYPE html>
<!-- The page text is Chinese, so declare that language for browsers and assistive technology. -->
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>数据采集</title>
</head>
<body>
<!-- Posts back to this same script; the hidden "type" field names the action to run. -->
<form action="" method="post">
    <input type="hidden" name="type" value="searchdata">
    <!-- Wrapping each input in its label ties the caption to the field. -->
    <label>关键词:<input type="text" name="keywords"></label>
    <!-- Result pages start at 1. -->
    <label>页码:<input type="number" name="page" min="1"></label>
    <input type="submit" value="提交">
</form>
</body>
</html>
3、页面效果
4、采集结果
- 声明:本人所采集CSDN文章仅为学习用途,并未用于任何盈利性商业目的。
- 说明:在采集过程中存在一些不尽如人意的地方。文章中的图片我想过多种办法下载下来,然后采用本地的图片地址进行替换,但都失败了。先是用str_replace()函数,将文章中的图片链接地址替换为本地图片存储的相对路径,失败了。后来我改用正则匹配图片链接地址,但只能匹配部分链接,因为文章中的图片来源也有可能是第三方网址,匹配失败。若是有朋友能解释下str_replace无法替换的原因,或者提供一个合适的正则表达式,在下就先行谢过了!