通过 Python 爬取小说封面图片、小说名称、作者、章节和正文内容。直接上代码:
import requests
import time
from bs4 import BeautifulSoup
from queue import Queue
import threading
import pymysql
from DBUtils.PooledDB import PooledDB
proxy = '124.243.226.18:8888'
# If the proxy needs authentication, prepend user:pass, e.g.
# proxy = 'username:[email protected]:8888'

# BUG FIX: the original dict repeated the key 'http' four times, so Python
# silently kept only the last entry. Keep all candidate proxies in a list
# and expose the one that was effectively used as the requests proxy mapping.
proxy_candidates = [
    'http://119.140.186.17:894',
    'http://27.24.20.22:766',
    'http://114.98.189.158:766',
    'http://114.101.246.185:23564',
]
proxies = {
    # Same effective value as the original duplicated-key dict.
    'http': proxy_candidates[-1],
}
# Shared MySQL connection pool used by every Producer thread.
# NOTE(review): host/user/password are hard-coded here — move them to
# configuration or environment variables before publishing this code.
pool = PooledDB(creator=pymysql,
maxconnections=0, # max connections the pool allows; 0/None = unlimited
mincached=10, # idle connections created at startup; 0 = create none
maxcached=0, # max idle connections kept in the pool; 0/None = unlimited
blocking=True, # True: block and wait when the pool is exhausted; False: raise
host='47.106.122.177', # database server IP address
port=3306,
user='root',
passwd='123456',
db='zskxt',
use_unicode=True,
charset='utf8')
# Browser-like request headers so the target site treats us as a normal
# Chrome client instead of a script.
send_headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/61.0.3163.100 Safari/537.36"),
    "Connection": "keep-alive",
    "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/webp,image/apng,*/*;q=0.8"),
    "Accept-Language": "zh-CN,zh;q=0.8",
}
# Fetch the list of novel URLs from the site's full catalogue page.
def huoquxiaoshuo():
    """Return the list of novel index-page URLs from the catalogue page.

    Fixes over the original: the request now sends the browser headers, and
    on failure an empty list is returned instead of an implicit ``None``
    (which made the caller's ``len(url_list)`` crash with a TypeError).
    """
    search_real_url = 'http://www.xbiquge.la/xiaoshuodaquan/'
    try:
        html = requests.get(search_real_url, headers=send_headers).text
        html_tree = BeautifulSoup(html, 'html.parser')
        anchors = html_tree.find(id='main').find_all('a')
        return [a.attrs['href'] for a in anchors]
    except Exception as e:
        print(e)
        return []  # best-effort: callers can safely iterate / len() this
class Producer(threading.Thread):
    """Worker thread that drains a queue of novel index-page URLs and stores
    each novel's cover image, metadata and full chapter text into MySQL.

    Fixes over the original version:
    * ``requests.get(url, send_headers)`` passed the headers dict as the
      query-string ``params`` argument; all requests now use ``headers=``.
    * Every SQL statement was built by string concatenation — injection-prone
      and broken by quotes in titles/descriptions (the old escape attempt
      ``replace("'", "\\'")`` was a no-op because ``"\\'" == "'"``). All
      inserts now use parameterized queries, so the quote-stripping of the
      chapter text is no longer needed and apostrophes are preserved.
    * The chapter loop no longer shadows the outer ``url`` variable.
    * Retry loops sleep before retrying instead of busy-spinning.
    """

    def __init__(self, url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.url_queue = url_queue  # queue.Queue of novel index-page URLs

    @staticmethod
    def _fetch_tree(page_url):
        """GET *page_url*, retrying until the response is not a 503 page.

        Returns the parsed BeautifulSoup tree of the page.
        """
        while True:
            try:
                resp = requests.get(page_url, headers=send_headers)
            except Exception as e:
                print(e)
                time.sleep(1)  # back off instead of hammering the site
                continue
            resp.encoding = 'utf-8'
            tree = BeautifulSoup(resp.text, 'html.parser')
            print(tree.title)
            if str(tree.title).find('503') == -1:
                return tree
            time.sleep(1)  # site replied 503 (rate limited) — wait and retry

    @staticmethod
    def _download_cover(img_url):
        """Download the cover image bytes, retrying failed/tiny responses."""
        while True:
            try:
                resp = requests.get(img_url, headers=send_headers)
            except Exception as e:
                print(e)
                time.sleep(1)
                continue
            data = resp.content
            # Anything under 300 bytes is an error page, not a real image.
            if len(data) >= 300:
                return data
            time.sleep(1)

    def run(self):
        try:
            while not self.url_queue.empty():
                book_url = self.url_queue.get()
                tree = self._fetch_tree(book_url)

                # --- cover image ------------------------------------------
                img = tree.find(id='fmimg').find_all('img')
                img_src = img[0].get('src')
                cover_name = img_src.split("/")[-1]
                # 'wb': the payload is binary image data.
                with open('d:\\img\\' + cover_name, 'wb') as f:
                    f.write(self._download_cover(img_src))

                # --- book metadata ----------------------------------------
                # URL looks like .../<bookid>/ so the id is the second-to-last
                # path segment.
                bookid = book_url.split("/")[-2]
                bookname = tree.find(id='info').find('h1').string
                bookzz = tree.find(id='info').find_all('p')[0].string.replace('作 者:', '')
                bookms = tree.find(id='intro').find_all('p')[1].string

                db = pool.connection()  # borrow a pooled connection
                cursor = db.cursor()
                try:
                    cursor.execute(
                        "insert into zsk_book_sm (bookid,bookname,zz,zhgxsj,msnr,tp)"
                        " values (%s,%s,%s,now(),%s,%s)",
                        ('xbqg' + bookid, bookname, bookzz, bookms, cover_name))
                    db.commit()
                except Exception as e:
                    print("SQL ERROR!", e)
                    db.rollback()

                # --- chapters ---------------------------------------------
                for anchor in tree.find(id='list').find_all('a'):
                    urlzj = str(anchor.get('href'))
                    # href looks like /<a>/<b>/<chapterid>.html
                    zjid = urlzj.split("/")[3].replace(".html", "")
                    time.sleep(1)  # throttle per-chapter requests
                    nrtree = self._fetch_tree('http://www.xbiquge.la' + urlzj)
                    content = nrtree.find(id='content')
                    # Strip only the wrapping <div id="content"> ... </div>.
                    text = str(content).replace("<div id=\"content\">", "").replace("</div>", "")
                    try:
                        cursor.execute(
                            "INSERT INTO zsk_book_ml (mlid, zjmc, bookid,"
                            " content_id, is_delete, words)"
                            " VALUES (%s, %s, %s, %s, 0, 1)",
                            (zjid, anchor.text, bookid, zjid))
                        cursor.execute(
                            "INSERT INTO `zsk_book_content` (`id`, `content`)"
                            " VALUES (%s, %s)",
                            (zjid, text))
                        db.commit()
                    except Exception as e:
                        print("SQL ERROR!", e)
                        db.rollback()
        except Exception as e:
            print(e)
if __name__ == '__main__':
    # Fetch the catalogue, then fan the URLs out to three worker threads.
    url_list = huoquxiaoshuo()
    if not url_list:
        # Guard: the original indexed url_list[0] unconditionally and crashed
        # when the catalogue fetch failed.
        print('no novel urls fetched, abort')
    else:
        print(url_list[0])
        url_queue = Queue(len(url_list))
        for url in url_list:
            url_queue.put(url)
        for _ in range(3):  # three concurrent crawler threads
            Producer(url_queue).start()
数据库结构如下:
/*
Target Server Type    : MYSQL
Target Server Version : 50562
File Encoding         : 65001
Date: 2020-03-28 12:32:08
*/
-- Disable FK checks so tables can be dropped/recreated in any order.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for `zsk_book_content`
-- Chapter body text, one row per chapter. The crawler inserts the chapter
-- id here and references it from zsk_book_ml.content_id.
-- ----------------------------
DROP TABLE IF EXISTS `zsk_book_content`;
CREATE TABLE `zsk_book_content` (
`id` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '内容id' ,
`content` text CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '章节内容' ,
PRIMARY KEY (`id`)
)
ENGINE=InnoDB
DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci
;
-- ----------------------------
-- Table structure for `zsk_book_ml`
-- Chapter catalogue (table of contents): one row per chapter, linking a
-- book (bookid) to its chapter text (content_id -> zsk_book_content.id).
-- ----------------------------
DROP TABLE IF EXISTS `zsk_book_ml`;
CREATE TABLE `zsk_book_ml` (
`mlid` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '目录id' ,
`zjmc` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '目录章节名称' ,
`bookid` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL DEFAULT '' COMMENT '书id' ,
`content_id` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '文章内容id' ,
`is_delete` int(11) NULL DEFAULT NULL COMMENT '删除标识' ,
`words` int(11) NULL DEFAULT NULL COMMENT '字数' ,
`ctime` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '创建时间' ,
`mtime` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '修改时间' ,
PRIMARY KEY (`mlid`, `bookid`)
)
ENGINE=InnoDB
DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci
;
-- ----------------------------
-- Table structure for `zsk_book_sm`
-- Book summary/metadata: one row per novel (name, author, cover, blurb).
-- The crawler writes bookid, bookname, zz, zhgxsj, msnr and tp.
-- ----------------------------
DROP TABLE IF EXISTS `zsk_book_sm`;
CREATE TABLE `zsk_book_sm` (
`bookid` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '书id' ,
`bookname` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '书名' ,
`zz` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '作者' ,
`zhgxsj` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间' ,
`tp` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '封面' ,
`zxzj` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '最新章节' ,
`zxzjid` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '最新章节id' ,
`msnr` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '描述内容' ,
`lb` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '类别' ,
PRIMARY KEY (`bookid`)
)
ENGINE=InnoDB
DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci
;