通过python爬取笔趣阁小说,获取图片保存本地,数据保存mysql

通过python爬取小说图片、小说名称、作者、章节、内容。直接上代码

import requests
import time
from bs4 import BeautifulSoup
from queue import Queue
import threading
import pymysql
from DBUtils.PooledDB import PooledDB

# Single proxy endpoint; prepend "user:password@" if the proxy needs auth,
# e.g. proxy = 'username:[email protected]:8888'
proxy = '124.243.226.18:8888'

# Candidate HTTP proxies.
# BUG FIX: the original wrote these as a dict that repeated the key 'http'
# four times, so Python silently kept only the LAST entry.  Keep all of
# them in a list and build the requests-compatible mapping from one entry
# (the last, matching the original's effective runtime value).
proxy_candidates = [
    'http://119.140.186.17:894',
    'http://27.24.20.22:766',
    'http://114.98.189.158:766',
    'http://114.101.246.185:23564',
]
proxies = {
    'http': proxy_candidates[-1],
}

# MySQL connection pool shared by all worker threads.
pool = PooledDB(creator=pymysql,
                maxconnections=0,  # max connections; 0/None = unlimited
                mincached=10,      # idle connections created at startup
                maxcached=0,       # max idle connections kept; 0/None = unlimited
                blocking=True,     # block and wait when the pool is exhausted
                host='47.106.122.177',  # database host
                port=3306,
                user='root',
                passwd='123456',
                db='zskxt',
                use_unicode=True,
                charset='utf8')

# Browser-like request headers so the scraper is not rejected as a bot.
send_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8"
}

#获取小说URL列表
def huoquxiaoshuo():
    """Fetch the site's full novel index and return the list of novel URLs.

    Returns:
        list[str]: hrefs of every novel on the index page; an EMPTY list on
        any network/parse failure.  (The original returned None on error,
        which made the caller crash on ``len(url_list)``.)
    """
    search_real_url = 'http://www.xbiquge.la/xiaoshuodaquan/'
    try:
        # Send the browser-like headers; the original bare get() was more
        # likely to be blocked or throttled by the site.
        html = requests.get(search_real_url, headers=send_headers).text
        tree = BeautifulSoup(html, 'html.parser')
        anchors = tree.find(id='main').find_all('a')
        return [a.attrs['href'] for a in anchors]
    except Exception as e:
        print(e)
        return []

class Producer(threading.Thread):
    """Worker thread that drains a queue of novel index-page URLs and, for
    each novel, saves the cover image to d:\\img\\ and writes the book
    metadata, chapter catalogue and chapter bodies to MySQL via ``pool``.
    """

    def __init__(self, url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.url_queue = url_queue  # Queue of novel index-page URLs

    def _fetch_page(self, page_url):
        """GET *page_url* until a non-503 page is returned; return the
        parsed BeautifulSoup tree.  Retries forever (as the original did),
        but sleeps between attempts instead of busy-looping."""
        while True:
            try:
                # BUG FIX: the original called requests.get(url, send_headers),
                # which passes the headers dict POSITIONALLY as `params`
                # (query string), not as HTTP headers.
                resp = requests.get(page_url, headers=send_headers)
            except Exception as e:
                print(e)
                time.sleep(1)  # back off on network errors instead of hot-looping
                continue
            resp.encoding = 'utf-8'
            tree = BeautifulSoup(resp.text, 'html.parser')
            print(tree.title)
            if str(tree.title).find('503') == -1:
                return tree
            time.sleep(1)  # server answered 503 (throttled) -- wait and retry

    def _download_cover(self, img_src):
        """Download the cover image at *img_src* and return its bytes.
        Responses under 300 bytes are treated as error pages and retried."""
        while True:
            try:
                resp = requests.get(img_src, headers=send_headers)
            except Exception as e:
                print(e)
                time.sleep(1)
                continue
            data = resp.content
            if len(data) >= 300:
                return data
            time.sleep(1)

    def run(self):
        try:
            while True:
                if self.url_queue.empty():
                    break

                book_url = self.url_queue.get()
                tree = self._fetch_page(book_url)

                # Cover image: download and save under its original filename.
                img = tree.find(id='fmimg').find_all('img')
                img_src = img[0].get('src')
                img_bytes = self._download_cover(img_src)
                cover_name = img_src.split("/")[-1]
                with open('d:\\img\\' + cover_name, 'wb') as f:
                    f.write(img_bytes)

                # Book metadata scraped from the index page.
                bookid = book_url.split("/")[-2]
                bookname = tree.find(id='info').find('h1').string
                author = tree.find(id='info').find_all('p')[0].string.replace('作    者:', '')
                summary = tree.find(id='intro').find_all('p')[1].string

                db = pool.connection()   # borrow a connection from the pool
                cursor = db.cursor()
                try:
                    # SECURITY FIX: parameterized query instead of string
                    # concatenation -- the original was SQL-injectable and
                    # broke on quotes in scraped text (its replace("'","\'")
                    # was a no-op, since "\'" == "'").
                    cursor.execute(
                        "insert into zsk_book_sm (bookid,bookname,zz,zhgxsj,msnr,tp) "
                        "values (%s,%s,%s,now(),%s,%s)",
                        ('xbqg' + bookid, bookname, author, summary, cover_name))
                    db.commit()
                except Exception as e:
                    print("SQL ERROR!", e)
                    db.rollback()

                # Chapter catalogue: one row in zsk_book_ml plus the body in
                # zsk_book_content per chapter link.
                chapter_links = tree.find(id='list').find_all('a')
                for link in chapter_links:  # renamed: the original shadowed `url`
                    href = str(link.get('href'))
                    zjid = href.split("/")[3].replace(".html", "")
                    time.sleep(1)  # be polite between chapter requests
                    chapter_tree = self._fetch_page('http://www.xbiquge.la' + href)
                    content = chapter_tree.find(id='content')
                    # Strip the wrapper div, keep the inner HTML as-is.
                    body = str(content).replace("<div id=\"content\">", "").replace("</div>", "")
                    try:
                        cursor.execute(
                            "INSERT INTO zsk_book_ml "
                            "(mlid, zjmc, bookid, content_id, is_delete, words) "
                            "VALUES (%s, %s, %s, %s, 0, 1)",
                            (zjid, link.text, bookid, zjid))
                        cursor.execute(
                            "INSERT INTO `zsk_book_content` (`id`, `content`) VALUES (%s, %s)",
                            (zjid, body))
                        db.commit()
                    except Exception as e:
                        print("SQL ERROR!", e)
                        db.rollback()

                # Return the connection to the pool (the original leaked it).
                cursor.close()
                db.close()

        except Exception as e:
            print(e)

if __name__ == '__main__':
    url_list = huoquxiaoshuo()
    # Guard against a failed fetch: the original crashed on len()/[0]
    # when huoquxiaoshuo() returned nothing.
    if not url_list:
        print('no novel urls fetched, exiting')
    else:
        print(url_list[0])
        url_queue = Queue(len(url_list))
        for novel_url in url_list:
            url_queue.put(novel_url)

        # Three scraper threads share the queue; join so the main thread
        # waits for them to finish.
        workers = [Producer(url_queue) for _ in range(3)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()

数据库结构如下

/*
Target Server Type    : MYSQL
Target Server Version : 50562
File Encoding         : 65001

Date: 2020-03-28 12:32:08
*/

-- Disable foreign-key checks so tables can be dropped and recreated
-- in any order while importing this dump.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for `zsk_book_content`
-- ----------------------------
-- Chapter body text, one row per chapter.  `id` is the chapter id the
-- scraper derives from the chapter URL; `content` holds the chapter HTML.
DROP TABLE IF EXISTS `zsk_book_content`;
CREATE TABLE `zsk_book_content` (
`id`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '内容id' ,
`content`  text CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '章节内容' ,
PRIMARY KEY (`id`)
)
ENGINE=InnoDB
DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci

;

-- ----------------------------
-- Table structure for `zsk_book_ml`
-- ----------------------------
-- Chapter catalogue (table of contents).  As written by the scraper:
-- mlid = chapter id, zjmc = chapter title, bookid = owning book,
-- content_id = matching row in zsk_book_content.
DROP TABLE IF EXISTS `zsk_book_ml`;
CREATE TABLE `zsk_book_ml` (
`mlid`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '目录id' ,
`zjmc`  varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '目录章节名称' ,
`bookid`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '书id' ,
`content_id`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '文章内容id' ,
`is_delete`  int(11) NULL DEFAULT NULL COMMENT '删除标识' ,
`words`  int(11) NULL DEFAULT NULL COMMENT '字数' ,
`ctime`  timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '创建时间' ,
`mtime`  timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '修改时间' ,
PRIMARY KEY (`mlid`, `bookid`)
)
ENGINE=InnoDB
DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci

;

-- ----------------------------
-- Table structure for `zsk_book_sm`
-- ----------------------------
-- Book summary/metadata.  The scraper fills bookid, bookname, zz (author),
-- zhgxsj (last-update time), msnr (description) and tp (cover filename);
-- zxzj/zxzjid/lb appear unused by the script above -- presumably filled
-- elsewhere (verify against other writers of this table).
DROP TABLE IF EXISTS `zsk_book_sm`;
CREATE TABLE `zsk_book_sm` (
`bookid`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '书id' ,
`bookname`  varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '书名' ,
`zz`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '作者' ,
`zhgxsj`  timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间' ,
`tp`  varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '封面' ,
`zxzj`  varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '最新章节' ,
`zxzjid`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '最新章节id' ,
`msnr`  varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '描述内容' ,
`lb`  varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '类别' ,
PRIMARY KEY (`bookid`)
)
ENGINE=InnoDB
DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci

;

猜你喜欢

转载自blog.csdn.net/terry711/article/details/105159021