Crawling every novel in the "Lily Love" category of the Eighty eBook site with XPath and multiprocessing.

 

Code

# Required libraries 
import os  # output-directory handling
from multiprocessing import Pool  # process pool that fans out the downloads

import requests  # HTTP client used for every page fetch
from lxml import etree  # HTML parsing and XPath queries
 # request header 
# Browser-like headers passed to every requests.get() call in this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}
# create a storage path 
# Directory that receives one .txt file per book; get_content() builds file
# names by plain string concatenation, so the trailing slash is required.
pathname = './eighty eBook/'
# exist_ok=True makes this safe to run repeatedly (and concurrently, should
# several processes execute the module top level at the same time).
os.makedirs(pathname, exist_ok=True)
# Walk every listing page and crawl each book found on it
def _get_page_books(url):
    """Return the book links found on a single listing page.

    This runs inside a pool worker, so it only fetches and parses; it never
    touches the pool itself (a worker cannot drive the parent's pool).

    Args:
        url: address of one listing page.

    Returns:
        A list of plain ``str`` hrefs, or an empty list when the page could
        not be fetched or parsed.
    """
    try:
        response = requests.get(url=url, headers=headers)
        page = etree.HTML(response.text)
        # Convert lxml's string results to plain str so they pickle cleanly
        # on the way back to the parent process.
        return [str(href) for href in page.xpath('//div[@class="book_bg"]/a/@href')]
    except Exception as exc:
        print('get_booklist failed', url, repr(exc))
        return []


def get_booklist(url):
    """Crawl all books reachable from the first listing page.

    The first page is fetched here to read the number of the last page from
    the pager link. Its books are handed to ``get_book`` through the
    module-level ``pool``; the remaining listing pages are then fetched in
    the workers and their books dispatched from this (parent) process.

    Must be called from the process that owns ``pool``.

    Args:
        url: address of the first listing page.

    Returns:
        None. Failures are reported on stdout and otherwise swallowed so a
        broken page does not abort the crawl.
    """
    try:
        response = requests.get(url=url, headers=headers)
        page = etree.HTML(response.text)
        last_page = int(page.xpath('//a[@class="last"]/text()')[0])
        booklist = [str(href) for href in page.xpath('//div[@class="book_bg"]/a/@href')]
        pool.map(get_book, booklist)
        # NOTE(review): this pattern points at quanshuwang.com, while the
        # entry URL used by the script is on 80txt.la -- confirm which site
        # and paging scheme is intended.
        urls = ['http://www.quanshuwang.com/list/3_{}.html'.format(i) for i in range(2, last_page + 1)]
        # Only the page fetch runs in the workers; dispatching the books
        # stays in the parent so the function never maps itself recursively.
        for page_books in pool.imap(_get_page_books, urls):
            pool.map(get_book, page_books)
    except Exception as exc:
        print('get_booklist failed', url, repr(exc))
# Fetch one book's detail page
def get_book(url):
    """Open a book page and continue with its table of contents.

    The second ``href`` of the ``a#read_book`` matches is passed on to
    ``get_mulu``. Any failure is reported on stdout and swallowed.
    """
    try:
        html = requests.get(url=url, headers=headers).text
        read_links = etree.HTML(html).xpath('//a[@id="read_book"]/@href')
        get_mulu(read_links[1])
    except Exception:
        print('get_book failed')
# Fetch a book's table of contents
def get_mulu(url):
    """Download every chapter linked from a table-of-contents page.

    Chapter links are read from ``div#yulan > li > a`` and passed one by one,
    in document order, to ``get_content``. Any failure is reported on stdout
    and swallowed.
    """
    try:
        toc_page = etree.HTML(requests.get(url=url, headers=headers).text)
        chapter_links = toc_page.xpath('//div[@id="yulan"]/li/a/@href')
        for chapter_url in chapter_links:
            get_content(chapter_url)
    except Exception:
        print('get_mulu failed')
# Download one chapter and append it to the book's text file
def get_content(url):
    """Fetch a chapter page and append its text to ``<pathname><book>.txt``.

    The book name is the second ``p.text > a`` text node, the chapter title
    the first ``div.date > h1`` text node, and the body the text nodes of
    ``div#content``. The title is written first, followed by a blank line and
    then one body fragment per line.

    Args:
        url: address of the chapter page.

    Returns:
        None. Failures are reported on stdout (with the URL and the
        exception) and swallowed so one bad chapter does not stop the book.
    """
    try:
        response = requests.get(url=url, headers=headers)
        # Decode the body with the charset detected from its content. The
        # former encode(response.encoding) round-trip raised when the server
        # declared no charset (encoding is None) and could corrupt text when
        # the declared charset was wrong.
        response.encoding = response.apparent_encoding
        etrees = etree.HTML(response.text)
        book_name = etrees.xpath('//p[@class="text"]/a/text()')[1]
        zhangjie = etrees.xpath('//div[@class="date"]/h1/text()')[0]
        contents = etrees.xpath('//div[@id="content"]/text()')
        print(zhangjie+'..正在下载')
        # The context manager closes the file even if a write fails.
        with open(pathname+book_name+'.txt', 'a', encoding='utf-8') as f:
            f.write(zhangjie+'\n\n')
            f.writelines(con+'\n' for con in contents)
    except Exception as exc:
        print('get_content failed', url, repr(exc))



# Program entry point
if __name__ == '__main__':
    # First listing page of the category (no trailing whitespace in the URL).
    url = 'https://www.80txt.la/sort5/1.html'
    # Create the process pool as the module-level name `pool`, which
    # get_booklist() uses to dispatch work; the context manager shuts the
    # workers down once the (blocking) crawl has returned.
    with Pool() as pool:
        # Start the crawl
        get_booklist(url)

Console output

E: \ anaconda \ python.exe E: / practice / final phase of / 0809 / eighty eBook .py 
1 Chapter I picked up a small female .. Downloading
 01 will .. Downloading 
Chapter I picked up things that are not will be able to change money .. Loading 
2 Chapter thrown out of the earth .. downloading
 02 mutation are downloading .. 
3 Chapter this is a high-tech world .. downloading 
Chapter Grandpa! You are my pro-grandfather .. Downloading
 03 bracelets .. Downloading 
novice Chapter Xinshoucun not mix .. Downloading 
Chapter 4 called Rongji force .. downloading 
the first chapter I will fight .. Downloading 
04 longevity .. downloading

Open the folder to see if the download was successful

Done.

 

Guess you like

Origin www.cnblogs.com/nmsghgnv/p/11334613.html