Python 爬取诗词名句网《三国演义》

import requests
import re

# Browser-like request headers passed to the HTTP requests made below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}
# Scrape the content of a single chapter
def textPaqu(url):
    """Download one chapter page and append it to the book's text file.

    The first ``<h1>`` is written as the chapter title, followed by every
    single-line ``<p>`` paragraph with ``&nbsp;`` replaced by a space.

    Args:
        url: Site-relative chapter path, such as the
            ``/book/sanguoyanyi/...`` links collected by ``paqu``. It is
            appended to the site root to form the full address.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    content = _fetch_html('http://www.shicimingju.com' + url)
    titles = re.findall('<h1>(.*?)</h1>', content)
    paragraphs = re.findall('<p>(.*?)</p>', content)
    # A page without a heading still gets its paragraphs saved instead of
    # aborting the whole run with an IndexError.
    chunks = [(titles[0] if titles else '') + '\n\n']
    chunks.extend(
        paragraph.strip().replace('&nbsp;', ' ') + '\n'
        for paragraph in paragraphs
    )
    chunks.append('\n\n')
    _append_to_book(chunks)


def _fetch_html(url, timeout=30):
    """Return the UTF-8 decoded body of *url*, raising on HTTP errors."""
    # Without a timeout a stalled connection would hang the scrape forever.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than saving an error page as book text.
    response.raise_for_status()
    return response.content.decode('utf-8')


def _append_to_book(chunks, directory='text', filename='三国演义.txt'):
    """Append *chunks* to the output file, creating its directory if needed."""
    import os  # local import: the file's import header does not include os

    os.makedirs(directory, exist_ok=True)
    # os.path.join yields the same location as the old hard-coded Windows
    # path and also works where a backslash is not a path separator.
    # An explicit encoding keeps the output independent of the platform
    # locale; ``with`` closes the file even if a write fails.
    with open(os.path.join(directory, filename), 'a', encoding='utf-8') as fr:
        fr.writelines(chunks)


# Scrape the synopsis and the chapter list
def paqu():
    """Save the book title and synopsis, then scrape every chapter in order.

    Fetches the book's index page, appends its first ``<h1>`` and all
    single-line ``<p>`` paragraphs to the output file, then calls
    ``textPaqu`` for each chapter link found on the page, printing progress
    before and after each chapter.

    Raises:
        requests.RequestException: If any request fails, times out, or the
            server answers with an error status.
    """
    content = _fetch_html('http://www.shicimingju.com/book/sanguoyanyi.html')
    titles = re.findall('<h1>(.*?)</h1>', content)
    synopsis = re.findall('<p>(.*?)</p>', content)
    chapter_links = re.findall(
        '<li><a href="(/book/sanguoyanyi/.*?)">.*?</a></li>', content
    )

    # Tolerate a missing heading the same way textPaqu does.
    chunks = [(titles[0] if titles else '') + '\n']
    chunks.extend(line + '\n' for line in synopsis)
    chunks.append('\n\n')
    _append_to_book(chunks)

    for number, link in enumerate(chapter_links, 1):
        print('正在爬取第%d章' % number)
        textPaqu(link)
        print('第%d章爬去完成' % number)

# Start the scrape at module level; there is no __main__ guard, so this also
# runs when the file is imported.
paqu()

猜你喜欢

转载自blog.csdn.net/qq_42591058/article/details/88379029