import os
import re

import requests
# HTTP request headers shared by the page fetches in this file. The
# User-Agent value is a desktop Chrome 61 (Windows 10, x64) browser string;
# presumably used so the site answers the script like a normal browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}
# Site root; the chapter hrefs scraped from the index page are site-relative.
_BASE_URL = 'http://www.shicimingju.com'
# Landing page holding the book title, the introduction and the chapter list.
_INDEX_URL = 'http://www.shicimingju.com/book/sanguoyanyi.html'

# Everything is appended to one text file. The path is built with os.path.join
# so it resolves to "text\..." on Windows and "text/..." elsewhere.
_OUTPUT_DIR = 'text'
_OUTPUT_PATH = os.path.join(_OUTPUT_DIR, '三国演义.txt')

# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block forever. The value 10 is an arbitrary default.
_REQUEST_TIMEOUT = 10

# Compiled once at import time; both page types are parsed with these.
_TITLE_RE = re.compile('<h1>(.*?)</h1>')
_PARAGRAPH_RE = re.compile('<p>(.*?)</p>')
_CHAPTER_LINK_RE = re.compile('<li><a href="(/book/sanguoyanyi/.*?)">.*?</a></li>')


def _fetch_html(url):
    """Download *url* with the shared request headers and return it as text.

    Raises requests.HTTPError for a 4xx/5xx answer so that an error page is
    never parsed as book content, and UnicodeDecodeError if the body is not
    valid UTF-8.
    """
    response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content.decode('utf-8')


def _clean_paragraph(raw):
    """Strip surrounding whitespace from *raw* and normalise non-breaking spaces."""
    # NOTE(review): the original code called .replace(' ', ' '), which is a
    # no-op as written; presumably the first argument was an "&nbsp;" entity
    # (or a U+00A0 character) lost when the script was published. Both forms
    # are handled here -- confirm against the live page markup.
    return raw.strip().replace('&nbsp;', ' ').replace('\xa0', ' ')


def _append_text(text):
    """Append *text* to the output file, creating the output directory if needed."""
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    # Explicit UTF-8 keeps the output independent of the platform's locale
    # codec; the context manager closes the file even if the write fails.
    with open(_OUTPUT_PATH, 'a', encoding='utf-8') as out:
        out.write(text)


# Scrape one chapter's content.
def textPaqu(url):
    """Fetch one chapter page and append its title and paragraphs to the output file.

    Args:
        url: Site-relative path of the chapter page (as captured from the
            chapter list); it is appended to the site root.

    Raises:
        IndexError: if the page contains no ``<h1>`` title.
        requests.RequestException: if the download fails or times out.
    """
    html = _fetch_html(_BASE_URL + url)
    # Parse everything before touching the file so a malformed page does not
    # leave a half-written chapter behind.
    title = _TITLE_RE.findall(html)[0]
    paragraphs = _PARAGRAPH_RE.findall(html)

    pieces = [title + '\n\n']
    pieces.extend(_clean_paragraph(paragraph) + '\n' for paragraph in paragraphs)
    pieces.append('\n\n')
    _append_text(''.join(pieces))


# Scrape the introduction and the chapter names.
def paqu():
    """Scrape the whole book: write the title and introduction, then every chapter.

    The index page supplies the book title, the introduction paragraphs and
    the chapter links; each chapter is then downloaded in list order through
    textPaqu(). Progress is printed to stdout. Output is appended, so running
    the script twice duplicates the text in the output file.

    Raises:
        IndexError: if the index page contains no ``<h1>`` title.
        requests.RequestException: if any download fails or times out.
    """
    html = _fetch_html(_INDEX_URL)
    title = _TITLE_RE.findall(html)[0]
    intro_paragraphs = _PARAGRAPH_RE.findall(html)
    chapter_paths = _CHAPTER_LINK_RE.findall(html)

    pieces = [title + '\n']
    pieces.extend(paragraph + '\n' for paragraph in intro_paragraphs)
    pieces.append('\n\n')
    _append_text(''.join(pieces))

    for number, chapter_path in enumerate(chapter_paths, start=1):
        print('正在爬取第%d章' % number)
        textPaqu(chapter_path)
        print('第%d章爬去完成' % number)
# Entry point: the scrape starts as soon as this module is executed (the call
# sits at module level, so it also runs when the file is imported).
paqu()
Python 诗词名句网爬取《三国演义》
猜你喜欢
转载自blog.csdn.net/qq_42591058/article/details/88379029
今日推荐
周排行