话本小说网-文章内容爬取

1、阅读限制
(原文此处为一张图片,图片说明缺失)
2、实际返回html页面中有内容。
3、代码如下:

# coding: utf-8
import urllib2
from bs4 import BeautifulSoup
import sys

# Python 2 idiom: force the process-wide default string encoding to UTF-8.
# NOTE(review): the implicit str/unicode conversions later in this script
# (e.g. writing scraped text to `f`) presumably rely on this -- confirm before
# removing it.
reload(sys)
sys.setdefaultencoding('utf-8')

# Output file, opened in append mode. getContent() writes each chapter to it
# and main() closes it.
f = open("out.txt", "a+")
# HTTP headers sent with every chapter request (see getContent()).
headers = {
    "Host": "www.ihuaben.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

url = "http://www.ihuaben.com/book/316254/5495438.html"  # URL of the first chapter
page = 89  # number of chapters; main() uses it to bound its fetch loop
nextHref = url  # URL of the next page to fetch; updated by getContent()


def conn_try_again(function):
    """Decorator that retries ``function`` when it raises an exception.

    Every call of the decorated function gets its own retry budget: the
    initial attempt plus up to two retries. If all attempts fail, the
    exception raised by the final attempt propagates unchanged, keeping its
    original type and traceback.

    :param function: the callable to wrap.
    :return: a wrapper with the same arguments and return value as
        ``function``.
    """
    import functools  # local import keeps this block self-contained

    max_retries = 2  # retries allowed after the first attempt

    @functools.wraps(function)
    def wrapped(*args, **kwargs):
        # The attempt counter is local to this call, so a previous call that
        # exhausted its retries cannot leave later calls with none.
        for attempt in range(max_retries + 1):
            try:
                return function(*args, **kwargs)
            except Exception:
                if attempt == max_retries:
                    raise

    return wrapped


# Parsed document of the most recently fetched page; assigned by getContent().
bsObj = None


@conn_try_again
def getContent(url):
    """Fetch one chapter page, save its text, and record the next chapter URL.

    The page is requested with the module-level ``headers`` and parsed into
    the global ``bsObj``. The chapter title and body are printed and appended
    to the module-level file ``f``. When the navigation bar offers a
    "下一章" (next chapter) link, its href is stored in the global
    ``nextHref``.

    :param url: address of the chapter page to fetch.
    :return: ``True`` when the page has no next-chapter link (the caller
        should stop), ``False`` otherwise.
    :raises Exception: with the message u'接口间通信异常' when the page cannot
        be fetched or parsed into a document; lookups of missing page
        elements raise their own errors.
    """
    global nextHref, bsObj
    try:
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            bsObj = BeautifulSoup(response, 'lxml')
        finally:
            # Release the connection whether or not parsing succeeded.
            response.close()
    except Exception:
        raise Exception(u'接口间通信异常')

    # Extract everything before writing anything: if a lookup fails, the
    # retry decorator re-runs this function, and a partial write would then
    # be duplicated in the output file.
    content = bsObj.find('div', id='contentsource').get_text()
    title = bsObj.find('div', id='chaptertitle').h1.get_text()
    nav_bar = bsObj.find('div', id='preAndNextBar')

    next_url = None
    # Unicode literal: the page text is unicode, so the membership test does
    # not depend on the process default encoding.
    if u"下一章" in nav_bar.get_text():
        links = nav_bar.findAll('a')
        # With more than two links the third one is used as the next-chapter
        # link, otherwise the second.
        next_link = links[2] if len(links) > 2 else links[1]
        next_url = next_link.get('href')

    print(title)
    print(content)
    if next_url is not None:
        print(next_url)

    # The chapter is saved even when it is the last one, so the final chapter
    # is not dropped from the output.
    f.write("#####" + '\n')
    f.write(title + '\n')
    f.write(content + '\n')

    if next_url is None:
        return True
    nextHref = next_url
    return False


def main():
    """Download chapters one after another, starting from ``nextHref``.

    Makes at most ``page`` requests (one per chapter) and stops early as soon
    as getContent() reports that there is no next chapter. Any error is
    printed rather than propagated, and the output file ``f`` is always
    closed.
    """
    try:
        # `page` is the chapter count, so the loop must run `page` times;
        # range(1, page) would stop one chapter short.
        for _ in range(page):
            if getContent(nextHref):
                break
        print("--- end ---")
    except Exception as e:
        # Top-level boundary: report the failure and fall through to cleanup.
        print(str(e))
    finally:
        f.close()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

下载链接:https://download.csdn.net/download/u012795120/10508304

猜你喜欢

转载自blog.csdn.net/u012795120/article/details/80758197
今日推荐