import re, requests, xlwt, os
import urllib.request
from multiprocessing import Pool
from fake_useragent import UserAgent
from lxml import etree
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Suppress urllib3's InsecureRequestWarning: every request below uses verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Shared request headers with a randomized browser User-Agent.
# BUG FIX: the header name must be 'User-Agent' (with a dash); the original
# key 'UserAgent' is not a real HTTP header, so servers saw no user agent.
headers = {
    'User-Agent': UserAgent().random,
}
def get_proxy():
    """Fetch one proxy address (e.g. 'ip:port') from the local proxy pool.

    Returns:
        str: raw response body of the proxy-pool service.
    """
    # BUG FIX: without a timeout the whole crawler hangs forever if the
    # local proxy-pool service is down.
    return requests.get('http://localhost:5010/get/', timeout=10).text
class QiShu(object):
    """Crawler for qisuu.la novel listings, fanning work out to a process pool."""

    def __init__(self, pool):
        # Worksheet rows start at 1; row 0 holds the header written by open_file().
        self.row = 1
        # Process pool used for asynchronous page fetches.
        self.pool = pool
def get_list(self):
    """Fetch the site home page and hand its HTML to parse_list()."""
    url = 'https://www.qisuu.la/'
    proxy = get_proxy()
    # BUG FIX: requests expects `proxies` to be a dict mapping scheme to
    # proxy URL; the original passed a plain string ('http:' + proxy),
    # which requests cannot use.
    proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
    print(proxies)
    try:
        response = requests.get(url, headers=headers, proxies=proxies, verify=False)
    except requests.exceptions.RequestException as e:
        # Restored from the commented-out handler: report and give up.
        print('请求首页失败, 错误{}, '.format(e))
        return
    if response.status_code == 200:
        self.parse_list(response.text)
    else:
        print(response.status_code)
def parse_list(self, html):
    """Extract the 11 category links from the home page's nav bar and
    queue a crawl of each category.

    Args:
        html (str): home-page HTML.
    """
    print('*')
    # The nav bar holds 11 <a href="...">text</a> entries.  Building the
    # repeated (href, text) sub-pattern once replaces 11 copy-pasted lines
    # and yields the exact same regex.
    link = r'.*?<a.*?href="(.*?)".*?>(.*?)</a>'
    pattern = re.compile(r'<div class="nav"' + link * 11, re.S)
    result_list = re.findall(pattern, html)
    print(result_list)
    for groups in result_list:
        # `groups` alternates href, text, href, text, ...; even indices are hrefs.
        for href in groups[::2]:
            print(href)
            if href != "/":
                href = "https://www.qisuu.la/" + href
                # BUG FIX: the original fetched each category twice (once
                # synchronously, once via the pool) and passed
                # callback=self.parse_list, which crashed because
                # get_detail() returns None.  Submit once, no callback.
                self.pool.apply_async(self.get_detail, args=(href,))
def get_detail(self, url):
    """Fetch pages 1-10 of a category listing and parse each one.

    Args:
        url (str): category base URL, e.g. 'https://www.qisuu.la/soft/sort01/'.
    """
    print('**')
    print(url)
    # Listing pages are paginated as index_1.html ... index_10.html.
    page_url = url + 'index_{}.html'
    print(page_url)
    for page in range(1, 11):
        print(page_url.format(page))
        proxy = get_proxy()
        # BUG FIX: `proxies` must be a scheme->URL dict, not a string.
        proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
        try:
            response = requests.get(page_url.format(page), headers=headers,
                                    proxies=proxies, verify=False)
        except requests.exceptions.RequestException as e:
            print('请求首页失败, 错误{}, '.format(e))
            continue
        if response.status_code == 200:
            self.parse_detail(response.text)
        else:
            print(response.status_code)
def parse_detail(self, ht):
    """Extract every book link from a category listing page and queue a
    crawl of its detail page.

    Args:
        ht (str): listing-page HTML.
    """
    print('****')
    h = etree.HTML(ht)
    # Each <li><a href="/du/..."> under the listing container is one book.
    anchors = h.xpath('/html/body/div[4]/div[2]/div/ul/li/a')
    for anchor in anchors:
        href = 'https://www.qisuu.la/' + anchor.get('href')
        # BUG FIX: the original crawled each book twice (sync + async) and
        # used callback=self.parse_detail, which crashed because
        # detail_detail() returns None.  Submit once, no callback.
        self.pool.apply_async(self.detail_detail, args=(href,))
def detail_detail(self, url):
    """Fetch one book's detail page and extract/save its data.

    NOTE(review): passes the module-global `sheet` (created in __main__)
    through to parse_detail_detail(); a refactor should inject it instead.
    """
    print('*****')
    print(url)
    proxy = get_proxy()
    # BUG FIX: the original built a proxy but never passed it to
    # requests.get() (dead code), and it must be a scheme->URL dict
    # rather than a string.
    proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
    try:
        response = requests.get(url, headers=headers, proxies=proxies, verify=False)
    except requests.exceptions.RequestException as e:
        print('请求首页失败, 错误{}, '.format(e))
        return
    if response.status_code == 200:
        # Force utf-8 decoding, as the original did.
        response.encoding = 'utf-8'
        self.parse_detail_detail(sheet, response.text)
    else:
        print(response.status_code)
def parse_detail_detail(self, sheet, html):
    """Parse one book detail page, download its cover image into 'tupian',
    and append a row to the worksheet.

    Args:
        sheet: xlwt worksheet created by open_file().
        html (str): detail-page HTML.
    """
    print('******')
    ht = etree.HTML(html)
    # Fields: name, clicks, file size, update date, serial status, author,
    # latest chapter, introduction, download address.
    book_name = ht.xpath('//div[@class="detail_right"]/h1/text()')[0]
    click_times = ht.xpath('//div[2]/div/ul/li[1]/text()')[0]
    file_size = ht.xpath('//div[2]/div/ul/li[2]/text()')[0]
    update_date = ht.xpath('//div[2]/div/ul/li[4]/text()')[0]
    status = ht.xpath('//div[2]/div/ul/li[5]/text()')[0]
    author = ht.xpath('//div[2]/div/ul/li[6]/text()')[0]
    zxzj = ht.xpath('//div[2]/div/ul/li[8]/a/text()')
    # Latest chapter may be absent for some books.
    zxzj = zxzj[0] if zxzj else None
    xsjs = ht.xpath('//div[2]/div[2]/div[2]/p/text()')[0]
    download_address = ht.xpath('/html/body/div[4]/div[2]/div[3]/div[2]/ul/li[3]/script/text()')[0]
    download = download_address.split(",'")[1]
    picture = ht.xpath('/html/body/div[4]/div[2]/div[1]/div/div[1]/img')[0]
    src = 'https://www.qisuu.la/' + picture.get('src')
    if requests.get(src).status_code == 404:
        # Primary cover is missing: fall back to the image named in the
        # <img> tag's onerror attribute (this.src='...').
        fallback = picture.get('onerror').split("=")[1].split("'")[1]
        print(fallback)
        src = 'https://www.qisuu.la/' + fallback
    # BUG FIX: write into 'tupian' via a joined path instead of the
    # original os.chdir()/chdir-back dance, which left the process in the
    # wrong directory whenever urlretrieve raised mid-way.  The duplicated
    # filename logic (repeated verbatim in both branches) now lives in one
    # helper.
    filename = self._cover_filename(download_address)
    urllib.request.urlretrieve(src, os.path.join('tupian', filename))
    novel = [book_name, click_times, file_size, update_date, status,
             author, zxzj, xsjs, download]
    self.write_data(sheet, novel)

def _cover_filename(self, download_address):
    """Derive the cover's .jpg filename from the download-script text.

    Mirrors the original's splitting rules exactly, including the odd
    "text after \"')\" up to the first '/'" case.
    """
    name_part = download_address.split(",'")[2].split("')")[0]
    if '/' in name_part:
        return download_address.split(",'")[2].split("')")[1].split('/')[0] + '.jpg'
    if '*' in name_part:
        # '*' is not a legal filename character on Windows; replace with '1'.
        return name_part.replace('*', '1') + '.jpg'
    return name_part + '.jpg'
def open_file(self):
    """Create the workbook and write the header row.

    Returns:
        tuple: (xlwt Workbook, worksheet); rows are appended by write_data().
    """
    print('*************************')
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('小说')
    # Column headers, in the order of the `novel` row assembled in
    # parse_detail_detail().
    # BUG FIX: column 7 was mislabeled '下载地址' (a duplicate of column 8);
    # it actually holds the novel introduction (小说介绍).
    titles = ['小说名称', '点击次数', '文件大小', '更新日期', '连载状态',
              '书记作者', '最新章节', '小说介绍', '下载地址']
    for col, title in enumerate(titles):
        sheet.write(0, col, title)
    return book, sheet
def write_data(self, sheet, data):
    """Append one novel row to the worksheet and persist the workbook.

    Args:
        sheet: xlwt worksheet created by open_file().
        data (list): 9 field values in header order.
    """
    print('^^^^^^^^^^^^^^^^^^^^^^^^')
    # Replaces nine copy-pasted sheet.write(...) lines with one loop.
    for col, value in enumerate(data):
        sheet.write(self.row, col, value)
    self.row += 1
    # NOTE(review): `book` is the module-level workbook created in
    # __main__ — saving after every row is slow but keeps partial results
    # if the crawl dies mid-run; a refactor should inject it.
    self.close_file(book)
def close_file(self, book):
    """Persist the workbook to disk as 小说信息表.xls."""
    print('----------------------------')
    output_name = '小说信息表.xls'
    book.save(output_name)
if __name__ == '__main__':
    # Worker pool for asynchronous page fetches.
    pool = Pool(8)
    # Cover images are saved under ./tupian.
    os.makedirs('tupian', exist_ok=True)
    q = QiShu(pool)
    # `book`/`sheet` stay module-global: write_data()/detail_detail() read them.
    book, sheet = q.open_file()
    q.get_list()
    # BUG FIX: without close()/join() the main process exits immediately
    # and kills every task still queued in the pool.
    pool.close()
    pool.join()
    # Final save in case the last incremental save was interrupted.
    q.close_file(book)