利用QT编写一个简单爬虫程序

从高中到大学,我一直在同一个小说网站上下载小说。这个网站停机过好几次,但最后都起死回生。最近萌生了一个想法:把这个网站上的小说都爬下来。

既然要爬网站,肯定要对网站结构十分了解。幸好小说网没有设置登录、防爬之类的措施,结构还是很简单的。


几个分类,各个分类下按页以时间排序


获取小说详细页地址其实很简单,利用正则表达式即可获得。

<div class="s">作者:<a href="/Writer/22265.html" target="_blank">星殒落</a><br />大小:5.3MB<br>等级:<em class="lstar3"></em><br>更新:2017-07-12</div>
    <a href="/35942.html"><img src="/d/file/soft/sort01/sort016/2017-07-12/9e56a6da0e765ead099e80a0017698d2.jpg">《超级造化炉》全集</a>

这就是网页源码,看到规律直接正则即可获取


点击进小说详情页



详细页也一样 

<a class="downButton" href='http://dzs.qisuu.com/txt/超级造化炉.txt' title="《超级造化炉》全集打包下载">Txt格式下载</a>
                        <div class="tabBox">兼容性最好的txt格式,支持所有设备(右键另存为下载)

思路清晰了:首先获取当前页中各小说详细页的地址和下一页的地址,然后进入每个小说详细页获取下载地址并保存下来,再对下一页重复这一过程,如此循环。

爬虫程序有三个类:MainWindow(Qt 界面)、http(封装 get/post 请求,获取网页源码)和 regular(用正则表达式解析网页)。

/// Small HTTP helper used by the crawler; each call returns the response body.
class http : public QObject
{
    Q_OBJECT

public:
    /// Sends a GET request to strUrl and returns the body that was received.
    QByteArray get(const QString &strUrl);

    /// Sends a POST request with the given payload to strUrl and returns the body that was received.
    QByteArray post(const QString &strUrl, QByteArray data);
};
#include "http.h"
#include <QEventLoop>
#pragma execution_character_set("utf-8")
 
 
/**
 * Performs an HTTP GET and waits (in a local event loop) for it to finish.
 *
 * @param strUrl Address to fetch; interpreted with QUrl::fromUserInput.
 * @return The response body, or an empty array when the request failed,
 *         returned HTTP 300, or asked for a redirect.
 */
QByteArray http::get(const QString &strUrl)
{
    const QUrl target = QUrl::fromUserInput(strUrl);

    QNetworkAccessManager manager;
    QNetworkRequest request(target);
    QNetworkReply *reply = manager.get(request);

    // Block here until the reply is finished; user input events are not
    // processed while waiting.
    QEventLoop waiter;
    connect(reply, &QNetworkReply::finished, &waiter, &QEventLoop::quit);
    waiter.exec(QEventLoop::ExcludeUserInputEvents);

    QByteArray body = reply->readAll();
    const int httpStatus = reply->attribute(QNetworkRequest::HttpStatusCodeAttribute).toInt();
    const bool redirected = !reply->attribute(QNetworkRequest::RedirectionTargetAttribute).isNull();
    const bool failed = reply->error() != QNetworkReply::NoError;

    // HTTP 300 (Multiple Choices) is reported neither as an error nor as a
    // redirect (the original author suspected a Qt bug), so it is checked
    // explicitly.
    if (failed || httpStatus == 300 || redirected)
    {
        QString reason;
        if (failed)
            reason = reply->errorString();
        else
            reason = QString("发生重定向(%1),不允许此情况").arg(httpStatus);

        qDebug()<<QString("发送get请求时出现错误:\n网址:%1\n错误信息:%2").arg(reply->request().url().toDisplayString(), reason);
        body.clear();
    }

    reply->deleteLater();
    return body;
}
 
 
 
 
/**
 * Performs a form-encoded HTTP POST and waits (in a local event loop) for it
 * to finish.
 *
 * Besides returning the body, the function writes diagnostics to qDebug():
 * the decoded text (first 5000 characters), the raw body, and the result of
 * looking up a string field named "info" in the body parsed as JSON.
 *
 * @param strUrl Address to post to; interpreted with QUrl::fromUserInput.
 * @param data   Request payload, sent as application/x-www-form-urlencoded.
 * @return The response body, or an empty array when the request failed.
 */
QByteArray http::post(const QString &strUrl, QByteArray data)
{
    const QUrl url = QUrl::fromUserInput(strUrl);

    QNetworkAccessManager manager;
    QNetworkRequest request(url);
    request.setRawHeader("content-type","application/x-www-form-urlencoded");
    QNetworkReply *reply = manager.post(request, data);

    // Block here until the reply is finished; user input events are not
    // processed while waiting.
    QEventLoop waiter;
    connect(reply, &QNetworkReply::finished, &waiter, &QEventLoop::quit);
    waiter.exec(QEventLoop::ExcludeUserInputEvents);

    QByteArray replyData;
    if (reply->error() == QNetworkReply::NoError)
    {
        replyData = reply->readAll();
        const QString text = QString::fromUtf8(replyData);
        qDebug()<<text.size();
        qDebug()<<text.mid(0,5000);
    }
    else
    {
        qDebug()<<reply->errorString();
    }
    reply->deleteLater();

    qDebug()<<replyData.size();
    qDebug()<<replyData;

    // Diagnostic only: report the "info" string of a JSON object body, or a
    // short marker telling which step of the lookup failed.
    QJsonParseError parseError;
    const QJsonDocument document = QJsonDocument::fromJson(replyData, &parseError);
    if (parseError.error != QJsonParseError::NoError)
    {
        qDebug()<<"00";
    }
    else if (!document.isObject())
    {
        qDebug()<<"333";
    }
    else
    {
        const QJsonObject root = document.object();
        if (!root.contains("info"))
        {
            qDebug()<<"444";
        }
        else
        {
            // value() reads the field without modifying the object.
            const QJsonValue info = root.value("info");
            if (info.isString())
                qDebug()<<info.toString();
            else
                qDebug()<<"777";
        }
    }

    return replyData;
}
 regular (正则表达式) 
 

#include "regular.h"
 
 
/// Default constructor; there is nothing to initialise.
regular::regular() = default;
 
 
 
 
/**
 * Collects every match of a regular expression in a text, left to right.
 *
 * @param regulardata Pattern, in QRegularExpression syntax.
 * @param data        Text to scan.
 * @return The full text (capture group 0) of each match, in order of
 *         appearance; empty when nothing matches.
 */
QStringList regular::search(QString regulardata,QString data)
{
    QStringList matches;
    const QRegularExpression pattern(regulardata);

    // The match iterator always makes progress, even after a zero-length
    // match; restarting match() at capturedEnd() by hand would loop forever
    // in that case.
    QRegularExpressionMatchIterator it = pattern.globalMatch(data);
    while (it.hasNext())
    {
        const QRegularExpressionMatch match = it.next();
        matches.append(match.captured(0));

        // Scanning stops as soon as a match reaches the end of the text, so
        // no trailing empty match is reported after it.
        if (match.capturedEnd() >= data.length())
            break;
    }
    return matches;
}
 
 
/**
 * Extracts the address of the "next page" (下一页) link from a listing page.
 *
 * @param data HTML source of a listing page.
 * @return The href of the link labelled 下一页, or an empty string (after
 *         logging a message) when the pager or the link cannot be found.
 */
QString regular::searchnextpage(QString data)
{
    // First narrow the page down to the pager fragment between the
    // "previous page" and "next page" labels.
    const QStringList pagerSpans = search(QString("上一页(.*)下一页"), data);
    if (!pagerSpans.isEmpty())
    {
        // Capture the href itself. [^']* cannot run past the closing quote,
        // so only the anchor directly in front of the label is taken, even
        // when the fragment contains several links.
        const QRegularExpression nextLink(QString("a href='([^']*)'>下一页"));
        const QRegularExpressionMatch linkMatch = nextLink.match(pagerSpans.first());
        if (linkMatch.hasMatch())
            return linkMatch.captured(1);
    }

    qDebug()<<"没有找到上下页";
    return "";
}
 
 
 
 
/**
 * Extracts the detail-page addresses of the novels shown on a listing page.
 *
 * @param data HTML source of a listing page.
 * @return One entry per cover-image anchor whose text contains "html", with
 *         the surrounding `<a href="` and `"><img src=` markup removed.
 */
QStringList regular::searchnovelurl(QString data)
{
    const QStringList anchors = search(QString("<a href=(.*)<img src="), data);

    // Build a filtered copy instead of erasing while iterating: erasing inside
    // a forward index loop skips the element that follows each removed one.
    QStringList urls;
    urls.reserve(anchors.size());
    for (QString anchor : anchors)
    {
        if (!anchor.contains("html"))
            continue;

        anchor.replace("<a href=\"","");
        anchor.replace("\"><img src=","");
        urls.append(anchor);
    }
    return urls;
}
 
 
/**
 * Parses a novel detail page.
 *
 * @param data HTML source of the detail page.
 * @return Four strings in this order: name, author, publication date and
 *         download address. A field that cannot be found stays empty, except
 *         that the literal "出错" is stored in the download address when the
 *         download anchor has no txt link, and in the name when no download
 *         anchor exists at all.
 */
QStringList regular::searchnoveldata(QString data)
{
    // Returns text with every occurrence of both markers removed.
    const auto stripped = [](QString text, const QString &first, const QString &second) {
        text.replace(first, "");
        text.replace(second, "");
        return text;
    };

    QString title;
    QString author;
    QString published;
    QString downloadUrl;

    // Name: the <h1> heading inside the detail_right block.
    QStringList hits = search("<div class=\"detail_right\">\r\n(.*)</h1>\r\n", data);
    if (hits.isEmpty())
    {
        qDebug()<<"查找小说名称时发生错误";
    }
    else
    {
        hits = search("<h1>(.*)</h1>", hits[0]);
        if (!hits.isEmpty())
            title = stripped(hits[0], "<h1>", "</h1>");
    }

    // Author: either plain text (no other works by this author) or a link to
    // the author's page, recognised by its "target" attribute.
    hits = search("书籍作者(.*)<", data);
    if (hits.isEmpty())
    {
        qDebug()<<"查找小说作者失败";
    }
    else if (hits[0].indexOf("target") == -1)
    {
        author = stripped(hits[0], "书籍作者:", "<");
    }
    else
    {
        hits = search("blank(.*)</a>", hits[0]);
        if (!hits.isEmpty())
            author = stripped(hits[0], "blank\">", "</a>");
    }

    // Publication date.
    hits = search("发布日期(.*)<", data);
    if (hits.isEmpty())
        qDebug()<<"查找小说时间失败";
    else
        published = stripped(hits[0], "发布日期:", "<");

    // Download address: locate the "Txt格式下载" anchor, then the txt link in it.
    const QStringList anchorSpans = search("href=(.*)Txt格式下载", data);
    if (anchorSpans.isEmpty())
    {
        qDebug()<<"查找小说下载地址失败1";
        qDebug()<<data.mid(10000,30000);
        title = "出错";
    }
    else
    {
        hits = search("href=(.*)txt", anchorSpans[0]);
        if (hits.isEmpty())
        {
            qDebug()<<"查找小说下载地址失败2";
            qDebug()<<data;
            qDebug()<<anchorSpans;
            downloadUrl = "出错";
        }
        else
        {
            downloadUrl = stripped(hits[0], "href='", "<");
        }
    }

    return QStringList() << title << author << published << downloadUrl;
}
 
 
 
 
 
 
/**
 * Builds the window and then runs the whole crawl.
 *
 * Starting at starturl, each listing page is fetched, the detail page of
 * every novel on it is fetched and parsed, and one line per novel is appended
 * to the output text file. The crawl ends when a listing page has no
 * "next page" link. If the output file cannot be opened, nothing is crawled.
 *
 * @param parent Parent widget, forwarded to QMainWindow.
 */
MainWindow::MainWindow(QWidget *parent) :
    QMainWindow(parent),
    ui(new Ui::MainWindow)
{
    ui->setupUi(this);

    int yeshu = 1;        // running page counter, used for log output only
    int xiaoshuoshu = 1;  // running novel counter, used for log output only

    QFile file("C:/Users/10515/Desktop/novel.txt");
    if (!file.open(QIODevice::ReadWrite | QIODevice::Text))
    {
        // Without the output file the results could not be stored, so do not
        // start crawling. qDebug() already ends the line itself.
        qDebug()<<"Can't open the file!";
        return;
    }
    QTextStream stream(&file);

    // Continue after the existing content so earlier results are kept.
    stream.seek(file.size());

    QList<QStringList> novelinfo;
    QString weburl="";
    QString starturl="";
    for (;;)
    {
        const QString data(web.get(starturl));
        const QStringList novelurl = zhengze.searchnovelurl(data);
        novel.append(novelurl);

        const QString nextpage = zhengze.searchnextpage(data);
        if (nextpage.size() > 0)
            page.append(nextpage);

        for (int b = 0; b < novel.size(); b++)
        {
            const QString data1(web.get(weburl+novel[b]));
            const QStringList result = zhengze.searchnoveldata(data1);
            novelinfo.append(result);
            stream <<result.join("    ")<<"\n";
            qDebug()<<xiaoshuoshu++<<"本"<<result.join("    ");
        }
        novel.clear();

        // Push the finished page to disk so an interrupted crawl keeps what
        // it has collected so far.
        stream.flush();

        if (nextpage.size() == 0)
            break;
        starturl = weburl+nextpage;
        qDebug()<<yeshu++<<"页";
    }

    qDebug()<<novel.size();
    qDebug()<<page<<page.size();
    qDebug()<<novelinfo;
    file.close();
}
 
 
 
 

将小说地址都弄到一个txt文本内,然后写了一个专门的下载器下载





猜你喜欢

转载自blog.csdn.net/d7185540/article/details/75041289