# 版权声明: 本文为博主原创文章, 未经博主允许不得转载。
# Source: https://blog.csdn.net/antch620/article/details/79897871
#!/usr/bin/python3
import urllib.request
import re
import urllib.parse
import urllib.error
def downloadfile(url):
    """Download finished novels listed on one catalog page of www.jjxsw.com.

    Scrapes the catalog page at *url* and, for every novel marked as
    completed ('已完结') that is at least 800 KB (or any size given in MB),
    follows the detail page and the download page to fetch the .txt file
    into ``D:\\novel\\tiexue\\``.

    Returns True when an expected page element is missing (early abort),
    otherwise None.  Unexpected errors are swallowed after printing the
    page URL so a batch run over many pages can continue.
    """
    try:
        # Fetch and decode the catalog page.
        response_list = urllib.request.urlopen(urllib.request.Request(url))
        data_list = response_list.read().decode('utf-8')

        # The novel list lives between the "catalog" div and the pager.
        content_obj = re.search(
            r'<div id="catalog">(.*?)\s*</div>\s*<div id="pages">',
            data_list, re.M | re.S)
        if not content_obj:
            return True
        content = content_obj.group(1)

        # BUG FIX: re.split's third positional argument is *maxsplit*, not
        # flags -- the original passed re.M | re.S (== 24), silently capping
        # the number of entries parsed per page.  The pattern is a plain
        # literal, so no flags are needed at all.
        novel_list = re.split('<div class="listbg">', content)
        if not novel_list:
            return True
        del novel_list[0]  # text before the first entry is not a novel

        for novel in novel_list:
            # Each entry carries either a "newDate" or an "oldDate" span.
            name_obj = re.search(
                r'<a href="(.*?)".*>(.*?)</a></span><span class="newDate">',
                novel, re.M | re.S)
            if not name_obj:
                name_obj = re.search(
                    r'<a href="(.*?)".*>(.*?)</a></span><span class="oldDate">',
                    novel, re.M | re.S)
            if not name_obj:
                # Malformed entry: skip it instead of raising AttributeError
                # (which previously aborted the whole page).
                continue

            size_obj = re.search(
                r'文件大小:</small>(\d*\.*\d*)(.*?)\s*<small>',
                novel, re.M | re.S)
            if not size_obj:
                return True

            process_obj = re.search(
                r'写作进度:</small>(.*?)\s*<small>', novel, re.M | re.S)
            if not process_obj:
                return True

            unit = size_obj.group(2).strip()
            # Only completed novels of >= 800 KB (any size expressed in MB
            # trivially qualifies).
            if not (process_obj.group(1) == '已完结'
                    and (unit == 'MB'
                         or (unit == 'KB'
                             and float(size_obj.group(1)) >= 800))):
                continue

            # Detail page of the novel.
            down_load_url = 'http://www.jjxsw.com' + name_obj.group(1)
            data_down = urllib.request.urlopen(
                urllib.request.Request(down_load_url)).read().decode('utf-8')

            down_load_obj = re.search(
                r'<li class="downAddress_li"><a href="(.*?)" title',
                data_down, re.M | re.S)
            if not down_load_obj:
                return True

            # Download page that holds the final .txt link.
            txt_url = 'http://www.jjxsw.com' + down_load_obj.group(1)
            data_txt = urllib.request.urlopen(
                urllib.request.Request(txt_url)).read().decode('utf-8')

            # BUG FIX: the link may live on down1 or down2.txt99.com; the
            # original always rebuilt it with the down2 prefix even when the
            # match came from the down1 pattern.  Capture the real prefix.
            txt_obj = re.search(
                r'<a href="(http://down[12]\.txt99\.com/d/file/p/txt/)(.*?)"'
                r' class="strong green">',
                data_txt, re.M | re.S)
            if not txt_obj:
                continue  # no recognizable download link: skip this novel

            link = txt_obj.group(1) + urllib.parse.quote(txt_obj.group(2))
            load_path = 'D:\\novel\\tiexue\\' + name_obj.group(2) + ".txt"
            urllib.request.urlretrieve(link, load_path)
    except Exception:
        # Best effort over a batch of pages: record which page failed and
        # move on.  (Was a bare `except:`, which also swallowed SystemExit
        # and KeyboardInterrupt.)
        print(url)
# 网址: first catalog page -- fetched only to discover the total page count.
url = "http://www.jjxsw.com/txt/tiexue/"
response = urllib.request.urlopen(urllib.request.Request(url))
data = response.read().decode('utf-8')
# 总页数: total number of index pages, scraped from the pager widget.
totalNumObj = re.search(r'<a title="总数"> <b>(.*?)</b> </a>', data, re.M | re.S)
if totalNumObj is None:
    # Fail loudly with a clear message instead of the AttributeError the
    # original raised when the pager markup changed.
    raise RuntimeError('could not find the total page count on ' + url)
totalNum = int(totalNumObj.group(1))
# Pages below 14 were apparently handled in an earlier run; resume at 14.
for x in range(14, totalNum):
    page_url = "http://www.jjxsw.com/txt/tiexue/index_" + str(x) + ".html"
    downloadfile(page_url)
import urllib.request
import re
import urllib.parse
import urllib.error
# NOTE(review): an accidental second paste of the entire `downloadfile`
# function (byte-identical to the definition above) was removed here.  The
# duplicate redefinition silently shadowed the first one at import time.
# NOTE(review): an accidental second paste of the driver code above was
# removed here.  It re-fetched the page count and re-crawled every index
# page a second time, doubling the runtime and re-downloading every file.