爬虫 实践 小例子 爬取书籍保存到本地

爬虫 实践 小例子

import os
from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree

# Directory that receives one UTF-8 text file per book.
dirName = './books'
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(dirName, exist_ok=True)

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Index page that links to every book.
url = 'http://www.shicimingju.com/book'

# requests applies no timeout by default, so a stalled connection would
# otherwise block the script indefinitely.
_REQUEST_TIMEOUT = 10  # seconds


def _fetch_tree(page_url):
    """Download *page_url* and return it parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = requests.get(page_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    # When the Content-Type header carries no charset, requests falls back to
    # ISO-8859-1 for text responses, which garbles Chinese pages. In that case
    # use the encoding detected from the body instead.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    return etree.HTML(response.text)


def _first(node, expression):
    """Return the first result of *expression* on *node*, or None if empty."""
    matches = node.xpath(expression)
    return matches[0] if matches else None


# A failure on the index page is fatal: without it there is nothing to crawl.
index_tree = _fetch_tree(url)

for book_link in index_tree.xpath('//div[@class="bookmark-list"]//a'):
    bookname = _first(book_link, './text()')
    book_href = _first(book_link, './@href')
    # Skip anchors that lack a title or a target instead of raising IndexError.
    if not bookname or not bookname.strip() or not book_href:
        continue
    bookname = bookname.strip()
    # urljoin handles both site-relative ("/book/x.html") and absolute hrefs.
    book_path = urljoin(url, book_href)

    try:
        book_tree = _fetch_tree(book_path)
    except requests.RequestException as exc:
        # One unreachable book should not abort the whole crawl.
        print(bookname, "下载失败", exc)
        continue

    # Path separators in a title would otherwise point outside dirName or at
    # a non-existent sub-directory.
    safe_name = bookname.replace('/', '_').replace(os.sep, '_')
    path = os.path.join(dirName, safe_name)

    with open(path, 'w', encoding='utf-8') as f:
        for chapter_link in book_tree.xpath('//div[@class="book-mulu"]//a'):
            title = _first(chapter_link, './text()')
            chapter_href = _first(chapter_link, './@href')
            if not title or not chapter_href:
                continue
            detail_path = urljoin(book_path, chapter_href)

            try:
                detail_tree = _fetch_tree(detail_path)
            except requests.RequestException as exc:
                # Keep the chapters already written; report and move on.
                print(title, "下载失败", exc)
                continue

            # Concatenate every text node inside the chapter container.
            content = ''.join(
                detail_tree.xpath('//div[@class="chapter_content"]//text()')
            )
            f.write(title + ':' + content + '\n')

            print(title, "下载成功")
           

猜你喜欢

转载自www.cnblogs.com/he-qing-qing/p/11502543.html