爬取笔趣阁网站小说

主程序

from urllib import request
import gzip
from lxml import etree
import download_novelist

# Index page that lists every novel available on the site.
url = "http://www.xbiquge.la/xiaoshuodaquan/"

# Read the whole body inside a context manager so the HTTP response is
# closed as soon as the bytes are in memory.
with request.urlopen(url) as response:
    res = response.read()

try:
    data = gzip.decompress(res).decode()
except (OSError, EOFError):
    # The payload is not a (complete) gzip stream, so treat it as plain text.
    # Only decompression failures are caught; anything else should surface.
    data = res.decode()

ele = etree.HTML(data)

# Novel titles (collected for reference; not used further below).
book_names = ele.xpath("//div[@class='novellist']//ul/li/a/text()")
# URL of each novel's index page.
book_urls = ele.xpath("//div[@class='novellist']//ul/li/a/@href")

# Download the novels one after another.
for book_url in book_urls:
    download_novelist.download_ficition(book_url)

# To fetch a single novel instead, call for example:
# download_novelist.download_ficition("http://www.xbiquge.la/7/7931/")

download_novelist.py

# coding=utf-8
from urllib import request
import gzip
from lxml import etree
import time

def download_ficition(url, delay=2):
    """Download every chapter of one novel and append it to ``<book name>.txt``.

    The novel's index page is fetched first to obtain the book title and the
    list of chapter links. Each chapter page is then fetched in turn and its
    heading and text are appended to the output file in the current directory.

    Args:
        url: Address of the novel's index page (the page listing its chapters).
        delay: Seconds to wait before each chapter request, so the server is
            not asked faster than it can answer. Defaults to 2.

    Raises:
        IndexError: If the index page has no book title, or a chapter page has
            no chapter heading, at the expected location.
        urllib.error.URLError: If a page cannot be retrieved.
    """
    # Function-scope import: resolves chapter links against the index URL
    # instead of gluing them onto a hard-coded host (which produced "//" in
    # the path for root-relative links and wrong paths for relative ones).
    from urllib.parse import urljoin

    index_page = _fetch_html(url)

    # Title of the novel; also used as the output file name.
    book_name = index_page.xpath("//div[@id='info']//h1/text()")[0]
    # Links to the individual chapter pages.
    charpter_urls = index_page.xpath("//div[@id='list']//dl/dd/a/@href")
    total = len(charpter_urls)

    # `number` is the 1-based position of the chapter, used for progress output.
    for number, charpter_url in enumerate(charpter_urls, start=1):
        # Throttle the requests.
        time.sleep(delay)

        chapter_page = _fetch_html(urljoin(url, charpter_url))
        charpter_name = chapter_page.xpath("//div[@class='bookname']/h1/text()")[0]
        charpter_content = chapter_page.xpath("//div[@id='content']/text()")
        # Strip the "全部章节 " prefix from the chapter heading.
        end_charpter_name = charpter_name.replace("全部章节 ", "")

        # Progress line. The fraction is the share of chapters finished before
        # this one; the parentheses matter, since `number - 1 / total` would
        # divide first and print a meaningless value.
        print("正在保存%s的%s；现存储了%d次；已经完成%f！"
              % (book_name, charpter_name, number, (number - 1) / total))

        try:
            # `with` guarantees the file is closed even if a write fails.
            with open("%s.txt" % (book_name), "a", encoding="utf-8") as out:
                out.write(end_charpter_name + '\n')
                # Remove all whitespace inside each text node, one node per line.
                out.writelines("".join(s.split()) + '\n' for s in charpter_content)
        except OSError as err:
            # Best effort: report the failure (with its cause) and carry on
            # with the next chapter.
            print("%s小说保存失败！" % (book_name), err)


def _fetch_html(url):
    """Download *url* and return its body parsed by ``etree.HTML``.

    The body is gunzipped when it is a gzip stream and used as-is otherwise;
    in both cases it is decoded with the default codec of ``bytes.decode``.
    """
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen(url) as response:
        raw = response.read()
    try:
        text = gzip.decompress(raw).decode()
    except (OSError, EOFError):
        # Not a (complete) gzip stream: fall back to the raw bytes.
        text = raw.decode()
    return etree.HTML(text)

爬取笔趣阁网站小说

猜你喜欢