import threading
import requests
import re
def parse_page(url):
    """Fetch one listing page of gushiwen.org and print every poem on it.

    Parameters
    ----------
    url : str
        Full URL of one listing page of the poetry site.

    Side effects: performs an HTTP GET and prints one dict per poem
    (title, dynasty, author, body) to stdout. Returns None.
    """
    headers = {
        # Browser-like User-Agent so the site does not reject the request.
        # (Original key/value were whitespace-mangled: 'user - agent',
        # 'Mozilla / 5.0(...' — reconstructed to the standard form.)
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/81.0.4044.129 Safari/537.36')
    }
    # BUG FIX: the original called requests.get(url, headers), which binds
    # the dict to the positional `params` argument (query string), so the
    # header was never sent.  It must be passed as keyword `headers=`.
    response = requests.get(url, headers=headers)
    text = response.text
    # re.DOTALL lets '.' cross newlines — the markup spans multiple lines.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    dynasties = re.findall(r'<p\sclass="source">.*?<a\s.*?>(.*?)</a>', text)
    authors = re.findall(
        r'<p\sclass="source">.*?<a\s.*?>.*?</a>.*?<a\s.*?>(.*?)</a>', text)
    contents = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    # Strip residual inline HTML tags from each poem body.
    cleaned = [re.sub(r'<.*?>', "", c).strip() for c in contents]
    poems = []
    for title, dynasty, author, content in zip(titles, dynasties, authors, cleaned):
        poems.append({
            '名字': title,    # title
            '朝代': dynasty,  # dynasty
            '作者': author,   # author
            '诗句': content,  # poem text
        })
    for poem in poems:
        print(poem)
def splider():
    """Scrape listing pages 1-10 of gushiwen.org concurrently.

    Starts one thread per page running ``parse_page`` and waits for all
    of them to finish before returning.

    BUG FIX: the original re-created the ``thread`` list on every loop
    iteration, so the final join-loop only ever waited on the LAST thread
    started; the other nine could still be running when splider() returned.
    The list is now created once, before the loop, so every worker is
    tracked and joined.
    """
    threads = []
    for page in range(1, 11):
        url = 'https://www.gushiwen.org/default_%s.aspx' % page
        worker = threading.Thread(target=parse_page, args=(url,))
        threads.append(worker)
        worker.start()
    # Wait for every page-worker, not just the last one.
    for worker in threads:
        worker.join()
if __name__ == '__main__':
    # Entry point: run the multi-threaded scraper, then print a separator
    # line so the end of the output is easy to spot.
    splider()
    separator = '+' * 20
    print(separator)
# python scraper for the classical-poetry site gushiwen.org
# ("You may also like" / "Today's picks" / "Weekly ranking" — CSDN page chrome)
# Reposted from blog.csdn.net/weixin_45949073/article/details/106102323