使用requests爬取小说

实现:

import re
import time
from urllib.parse import urljoin

import requests


def get_chapter(aim_url):
    """Fetch a novel's index page and map chapter titles to chapter URLs.

    The page is downloaded with ``requests``, decoded as GBK, and scanned for
    ``<li><a ...>title</a></li>`` entries whose markup contains a file name of
    the form ``NNNNNNN.html`` (seven digits).

    :param aim_url: URL of the novel's table-of-contents page.
    :return: dict mapping chapter title to absolute chapter URL, in page
        order. Entries that share a title overwrite each other (last wins).
    """
    ret = requests.get(aim_url)
    content = ret.content.decode('gbk')

    url_dic = {}
    # Non-greedy so several <li> entries on one physical line stay separate.
    for item in re.findall(r'<li><a.*?</li>', content):
        title = re.search(r'">(.*?)</a', item)
        page = re.search(r'[0-9]{7}\.html', item)
        if title is None or page is None:
            # Not a chapter entry (no title or no seven-digit page link);
            # skip it instead of crashing on a missing match.
            continue
        # Resolve the file name against the index page's directory; plain
        # string concatenation would append it after "index.html".
        url_dic[title.group(1)] = urljoin(aim_url, page.group())

    return url_dic

def _fetch_gbk(session, url, attempts=3, wait=5):
    """Return the GBK-decoded body of *url*, retrying on connection failures.

    :param session: ``requests.Session`` used to issue the request.
    :param url: address of the page to download.
    :param attempts: total number of tries before the error is re-raised.
    :param wait: seconds to sleep between tries.
    :raises requests.exceptions.ConnectionError: when every attempt fails.
    :raises requests.exceptions.Timeout: when every attempt times out.
    """
    for attempt in range(1, attempts + 1):
        try:
            return session.get(url, timeout=30).content.decode('gbk')
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            if attempt == attempts:
                raise
            print('连接太快了..等等')
            time.sleep(wait)


def _extract_text(page):
    """Return the chapter text found in *page*, or None if it is not there.

    The text is whatever lies between the opening content ``<div>`` and the
    last occurrence of the ``chapterpage`` marker.
    """
    page = page.replace('\n', '')
    found = re.search(r'<div id="content" class="content">(.*)chapterpage',
                      page)
    if found is None:
        return None

    # A capture group replaces the old fixed-offset slice; lstrip removes
    # leading whitespace (such as a stray "\r") left after the opening tag.
    text = found.group(1).lstrip()
    # Drop the unterminated tag that carried the "chapterpage" marker.
    text = re.sub(r'<[^<>]*$', '', text)

    text = text.replace('&nbsp;', ' ')
    text = text.replace('<br />', ' ')
    # NOTE(review): removes "&#039" but would leave a trailing ";" if the
    # site emits the full "&#039;" entity - confirm against real pages.
    text = text.replace('&#039', '')
    return text


def get_content(chapter_url):
    """Download every chapter and append its text to ``novel.txt``.

    For each chapter the title, the extracted text and a blank line are
    appended to ``novel.txt`` (UTF-8), and the title is printed as progress.
    Chapters whose page has no recognisable content block are reported and
    skipped.

    :param chapter_url: dict mapping chapter title to chapter URL.
    :return: None
    """
    if not chapter_url:
        # Nothing to download: do not open a session or touch the file.
        return

    # Assigning requests.adapters.DEFAULT_RETRIES after import does not raise
    # the retry count of the adapters requests creates, so mount an adapter
    # that is configured explicitly.
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=5)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    with session, open('novel.txt', mode='a', encoding='utf-8') as f:
        for chapter, url in chapter_url.items():
            content = _extract_text(_fetch_gbk(session, url))
            if content is None:
                print('未找到正文,跳过: ' + chapter)
                continue

            f.write(chapter + '\n')
            f.write(content)
            f.write('\n\n')

            print(chapter)


if __name__ == '__main__':
    # Collect the chapter index first, then download every chapter it lists.
    index_url = 'https://www.9dxs.com/2/2348/index.html'
    get_content(get_chapter(index_url))

遇到的问题:

爬取一半时,抛出了requests.exceptions.ConnectionError异常

分析:

  1. 请求连接数太多
  2. 请求速度过快

解决:

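  1. 设置默认重连数:
    requests.adapters.DEFAULT_RETRIES = 5
    (注意:在导入 requests 之后再修改该常量,并不会改变 requests.get 所用适配器的重试次数;要真正启用重试,应创建 Session 并挂载 HTTPAdapter(max_retries=5)。)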
  2. 捕捉异常,等待5秒后再重试
    try:
        ret = requests.get(url).content.decode('gbk')
    except requests.exceptions.ConnectionError:
        print('连接太快了..等等')
        time.sleep(5)
        ret = requests.get(url).content.decode('gbk')

猜你喜欢

转载自www.cnblogs.com/walthwang/p/10452643.html