1、阅读限制
2、实际返回的 HTML 页面中有内容。
3、代码如下:
# coding: utf-8
import urllib2
from bs4 import BeautifulSoup
import sys
# Python 2 idiom: force the interpreter-wide default string encoding to UTF-8
# so the Chinese text handled below can be mixed between str and unicode.
# NOTE(review): reload(sys) is presumably required to make
# sys.setdefaultencoding reachable again after startup -- confirm.
reload(sys)
sys.setdefaultencoding('utf-8')
# HTTP request headers sent with every chapter request.
headers = {
    "Host": "www.ihuaben.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

# URL of the first chapter.
url = "http://www.ihuaben.com/book/316254/5495438.html"

# Number of chapters.
page = 89

# Address of the next chapter to fetch; starts out at the first chapter.
nextHref = url

# Output file that the downloaded chapters are appended to.
f = open("out.txt", "a+")
def conn_try_again(function):
    """Decorator that retries ``function`` when it raises an exception.

    Each invocation of the decorated callable gets its own budget: one
    initial attempt plus up to two retries. The budget is tracked per call,
    so failures during one call never use up the retries of a later call.

    Args:
        function: The callable to wrap.

    Returns:
        A wrapper with the same call signature that returns whatever
        ``function`` returns on its first successful attempt.

    Raises:
        Exception: The exception from the final attempt is re-raised
            unchanged (original type and traceback) once every attempt of
            the current call has failed.
    """
    # Imported locally because the file's import header does not provide it.
    import functools

    max_retries = 2  # retries allowed after the first failed attempt

    @functools.wraps(function)
    def wrapped(*args, **kwargs):
        for attempt in range(max_retries + 1):
            try:
                return function(*args, **kwargs)
            except Exception:
                # Out of retries: surface the real error to the caller.
                if attempt == max_retries:
                    raise

    return wrapped
# Parsed document of the most recently fetched chapter page.
bsObj = None


@conn_try_again
def getContent(url):
    """Fetch one chapter page, record it, and advance to the next chapter.

    The page at ``url`` is downloaded with the module-level ``headers`` and
    parsed into the global ``bsObj``. The chapter title and text are printed
    and appended to the module-level output file ``f``. When the navigation
    bar contains a "next chapter" link, the global ``nextHref`` is updated to
    that link's ``href``.

    Args:
        url: Address of the chapter page to download.

    Returns:
        ``True`` when the page has no "next chapter" link (the caller should
        stop), otherwise ``None``.

    Raises:
        Exception: With a fixed message when the download or parsing of the
            response fails.
        AttributeError: When an expected element (content, title or
            navigation bar) is missing from the page.
    """
    global nextHref, bsObj
    try:
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            bsObj = BeautifulSoup(response, 'lxml')
        finally:
            # Always release the connection, even if parsing fails.
            response.close()
    except Exception:
        raise Exception(u'接口间通信异常')

    content = bsObj.find('div', id='contentsource').get_text()
    preAndNextBar = bsObj.find('div', id='preAndNextBar')
    title = bsObj.find('div', id='chaptertitle').h1.get_text()

    has_next = "下一章" in preAndNextBar.get_text()
    if has_next:
        links = preAndNextBar.findAll('a')
        # With more than two anchors the third one is used as the "next"
        # link; otherwise the second one is.
        next_link = links[2] if len(links) > 2 else links[1]
        nextHref = next_link.get('href')

    # Record the chapter on every page, including the final one that has no
    # "next chapter" link, so the last chapter is not dropped from the output.
    print(title)
    print(content)
    if has_next:
        print(nextHref)
    f.write("#####" + '\n')
    f.write(title + '\n')
    f.write(content + '\n')

    if has_next:
        return None
    return True
def main():
    """Download the chapters one after another, starting at ``nextHref``.

    ``getContent`` is called at most ``page`` times (once per chapter) and
    the loop stops early as soon as it reports that there is no further
    chapter. Any error is printed rather than propagated, and the output
    file ``f`` is closed in all cases.
    """
    try:
        # ``page`` is the number of chapters, so fetch exactly that many
        # pages; range(1, page) would stop one chapter short.
        for _ in range(page):
            if getContent(nextHref):
                break
        print("--- end ---")
    except Exception as e:
        print(str(e))
    finally:
        f.close()


# Run the download only when executed as a script, not on import.
if __name__ == "__main__":
    main()
下载链接:https://download.csdn.net/download/u012795120/10508304