Use an XPath-based crawler (requests + lxml) to download every article in the classic-books section of the Shicimingju poetry website (www.shicimingju.com).

# Required libraries 
from lxml import etree
import requests
 # request header 
# HTTP request headers: present a desktop browser User-Agent to the site.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    ),
}
# Directory the downloaded .txt files are written to.  A raw string cannot
# end in a single backslash, hence the doubled trailing separator.
# NOTE(review): the directory names come from a translated listing -- adjust
# to a folder that exists on your machine.
pathname = r'E:\crawler\famous poems net\\'
# Fetch the list of books and download each one
def get_book(url):
    """Download every book listed on the book index page at *url*.

    Each list item is expected to hold an ``<h2><a href=...>title</a></h2>``
    link; the link is resolved against the site root and handed to
    ``get_index``.

    Args:
        url: Address of the page that lists all books.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params`` (the query string), not the headers.
    response = requests.get(url, headers=headers)
    page = etree.HTML(response.text)
    # NOTE(review): class name restored to lower case ("bookmark-list") on the
    # assumption that the capitalisation was a translation artefact -- confirm
    # against the live page.
    book_items = page.xpath('//div[@class="bookmark-list"]/ul/li')

    for item in book_items:
        hrefs = item.xpath('./h2/a/@href')
        names = item.xpath('./h2/a/text()')
        if not hrefs or not names:
            # Skip list items that carry no book link or title.
            continue
        book_name = names[0]
        print('start downloading.' + book_name)
        get_index('http://www.shicimingju.com' + hrefs[0])
# Fetch a book's table of contents and download each chapter
def get_index(url):
    """Download every chapter linked from the table of contents at *url*.

    Args:
        url: Address of a single book's table-of-contents page.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, so a positional dict would be sent
    # as query-string arguments instead of HTTP headers.
    response = requests.get(url, headers=headers)
    page = etree.HTML(response.text)
    chapter_items = page.xpath('//div[@class="book-mulu"]/ul/li')
    for item in chapter_items:
        hrefs = item.xpath('./a/@href')
        if not hrefs:
            # A list item without a link has nothing to download.
            continue
        get_content('http://www.shicimingju.com' + hrefs[0])
# Fetch a chapter's content and write it to a .txt file
def get_content(url):
    """Fetch one chapter page and append its text to the book's .txt file.

    The chapter title, the joined paragraph text and the book name are read
    from the page; the output file is ``pathname + book_name + '.txt'``,
    opened in append mode so that successive chapters accumulate in one file.

    Args:
        url: Address of a single chapter page.

    Raises:
        IndexError: If the page has no title heading or no book-name link.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, not the HTTP headers.
    response = requests.get(url, headers=headers)
    page = etree.HTML(response.text)
    # The trailing space inside the class value is deliberate: XPath's
    # ``@class="..."`` is an exact string comparison.
    title = page.xpath('//div[@class="www-main-container www-shadow-card "]/h1/text()')[0]
    paragraphs = page.xpath('//div[@class="chapter_content"]/p/text()')
    content = ''.join(paragraphs)
    book_name = page.xpath('//div[@class="nav-top"]/a[3]/text()')[0]
    with open(pathname + book_name + '.txt', 'a+', encoding='utf-8') as f:
        f.write(title + '\n\n' + content + '\n\n\n')
        print(title + '..download completes')

# program entry 
if __name__ == '__main__':
    # Start from the page that lists every book on the site.
    url = 'http://www.shicimingju.com/book/'
    get_book(url)

Watch the console to follow the download progress;

Open the output folder to check that the download succeeded;

done.

Guess you like

Origin www.cnblogs.com/nmsghgnv/p/11314609.html