Python Crawler Series: Scraping a Novel Site

Copyright notice: Author: WangLei https://blog.csdn.net/qq_41287993/article/details/83930053

Today's crawler target: a novel site

To state it once more: every crawler posted here is for technical exchange only and carries no malicious intent. If anything here infringes your rights, please contact me by private message.

This crawl starts from the site's home page to collect the address of each novel, uses those addresses to fetch each novel's table of contents, and then fetches every chapter's content through that table of contents. Each novel gets a folder named after it, and every chapter is saved locally as a separate txt file.

  • Without further ado, here is the code
  • Discussion is welcome
import urllib.request
import re
import os
import time
'''
	@Author:王磊
	@Time  :2018/11/10 15:39:02
'''

def get_html(url):
    '''Fetch the page at url and return its decoded HTML'''
    page = urllib.request.Request(url)
    html = urllib.request.urlopen(page).read().decode("utf-8")
    return html


def get_all_index():
    '''Get the catalog URL and title of every novel listed on the site'''
    html = get_html("http://book.zongheng.com/store/c0/c0/b0/u0/p1/v9/s1/t0/u0/i1/ALL.html")
    reg = re.compile(r'<a href="(http://book.zongheng.com/book/.*?\.html)" target="_blank">(.*?)</a>')
    urls_names = re.findall(reg, html)
    res_list = []
    for url_name in urls_names:
        # open the book page and pull out the "all-catalog" link to its table of contents
        html_in = get_html(url_name[0])
        reg_in = re.compile(r'<a class="all-catalog".*?href="(.*?)">')
        url_mete = re.findall(reg_in, html_in)
        url_mete.append(url_name[1])
        res_list.append(url_mete)  # each entry is [catalog_url, book_title]
    return res_list


def get_urls_titles_list(html):
    '''Extract the chapter URLs and chapter titles from a catalog page'''
    reg = re.compile(r'<a  href="(.*?)" target="_blank".*?>(.*?)</a>')
    urls_titles = re.findall(reg, html)
    return urls_titles


def get_content(url):
    '''Fetch a chapter page by its URL and return the chapter text'''
    html = get_html(url)
    reg1 = re.compile(r'<p>(.*?)</p>')
    content = re.findall(reg1, html)[0:-1]  # keep all but the last <p> match
    str1 = ""
    for str0 in content:
        str1 += str0 + "\r\n"
    return str1


def run():
    # Fetch every book's catalog URL and title from the site
    books = get_all_index()
    for book in books:
        # Create a folder for this book if it does not exist yet
        if not os.path.exists("C:\\Users\\asus\\Desktop\\pc\\story\\" + book[1]):
            os.mkdir("C:\\Users\\asus\\Desktop\\pc\\story\\" + book[1])
        # Fetch the catalog page
        html = get_html(book[0])
        # Extract the chapter URLs and titles
        urls_titles_list = get_urls_titles_list(html)
        print("*" * 10 + "Start downloading book 《%s》" % book[1] + "*" * 10)
        if len(urls_titles_list) != 0:
            list_err = []
            print("Start downloading chapters!")
            for url_title in urls_titles_list:
                print("Downloading chapter:", url_title[1])
                try:
                    content = get_content(url_title[0])
                    with open("C:\\Users\\asus\\Desktop\\pc\\story\\" + book[1] + "\\" + url_title[1] + ".txt", "a", encoding="utf-8") as f:
                        f.write(content)
                        print("Chapter %s downloaded successfully!" % url_title[1])
                except Exception:
                    list_err.append(url_title[1])
                    print("Chapter %s failed to download!" % url_title[1])
                    continue
                # Slow down so the IP does not look overly active
                time.sleep(1)
            if len(list_err) == 0:
                print("All chapters of 《" + book[1] + "》 have been downloaded, with no failures!")
            else:
                print("The following chapters of 《%s》 failed to download:" % book[1])
                for errdownload in list_err:
                    print(errdownload)
        else:
            print("No chapters found: frequent requests may have gotten this IP blacklisted, or the network failed. Try again later or change your IP address!")
        time.sleep(3)


if __name__ == "__main__":
    run()
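A note on the failure paths above: the try/except in run() most often triggers when a chapter title contains characters that Windows does not allow in file names, and the final else branch fires when the site stops answering frequent requests. Below is a minimal, untested sketch of two mitigations under those assumptions; get_html_safe and sanitize_filename are illustrative names of mine, not part of the original script.

# Relies on the urllib.request, re and time imports at the top of the script.
def get_html_safe(url, retries=3, delay=2):
    '''Fetch a page with a browser-like User-Agent and a simple retry loop'''
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=headers)
            return urllib.request.urlopen(req, timeout=10).read().decode("utf-8")
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay)  # back off briefly before trying again


def sanitize_filename(name):
    '''Replace characters that Windows forbids in file names'''
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()

With these in place, run() could call sanitize_filename(url_title[1]) when building the txt path and get_html_safe() instead of get_html(), which should cut down on the chapters reported as failed.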
    


More small Python crawler examples will follow in this series; feel free to keep an eye out.
