# -*- coding: utf-8 -*-
"""Crawler for www.shicimingju.com.

Walks the site's /book/ index page, follows each book's table of
contents, and appends every chapter's text to ``<book name>.txt`` under
``pathname``.
"""
# Required libraries
from lxml import etree
import requests

# Request header: identify as a desktop Chrome browser so the site
# serves the normal HTML pages.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/65.0.3325.181 Safari/537.36')
}

# Directory where the downloaded .txt files are saved.
# NOTE(review): the directory must already exist, or open() below raises
# FileNotFoundError — consider os.makedirs(pathname, exist_ok=True).
pathname = 'E:\\crawler\\famous poems net\\'


def get_book(url):
    """Fetch the book index page and download every listed book.

    url: the site's /book/ listing page.
    """
    # BUG FIX: headers must be passed by keyword; the original passed it
    # positionally, which requests treats as the `params` argument.
    response = requests.get(url, headers=headers)
    etrees = etree.HTML(response.text)
    url_infos = etrees.xpath('//div[@class="bookmark-list"]/ul/li')
    for i in url_infos:
        url_info = i.xpath('./h2/a/@href')
        book_name = i.xpath('./h2/a/text()')[0]
        print('start downloading...' + book_name)
        get_index('http://www.shicimingju.com' + url_info[0])


def get_index(url):
    """Fetch a book's table-of-contents page and download each chapter."""
    response = requests.get(url, headers=headers)
    etrees = etree.HTML(response.text)
    url_infos = etrees.xpath('//div[@class="book-mulu"]/ul/li')
    for i in url_infos:
        url_info = i.xpath('./a/@href')
        get_content('http://www.shicimingju.com' + url_info[0])


def get_content(url):
    """Fetch one chapter page and append its text to the book's .txt file."""
    response = requests.get(url, headers=headers)
    etrees = etree.HTML(response.text)
    # Trailing space inside the class attribute is present in the site's
    # markup, so the xpath must match it exactly.
    title = etrees.xpath(
        '//div[@class="www-main-container www-shadow-card "]/h1/text()')[0]
    content = ''.join(
        etrees.xpath('//div[@class="chapter_content"]/p/text()'))
    # Third breadcrumb link in the nav bar is the book's name.
    book_name = etrees.xpath('//div[@class="nav-top"]/a[3]/text()')[0]
    # Append mode so successive chapters accumulate in one file.
    with open(pathname + book_name + '.txt', 'a+', encoding='utf-8') as f:
        f.write(title + '\n\n' + content + '\n\n\n')
    print(title + '...download complete')


# Program entry point.
if __name__ == '__main__':
    url = 'http://www.shicimingju.com/book/'
    get_book(url)
Watch the console to follow the download progress;
then open the output folder to confirm the download succeeded;
done.