Scraping the classical poetry site gushiwen.org with Python

import threading
import requests
import re

def parse_page(url):
    # Send a browser-like User-Agent so the site does not reject the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    # headers must be passed as a keyword argument; the second positional
    # argument of requests.get() is params, not headers
    response = requests.get(url, headers=headers)
    text = response.text
    # Each poem sits in a <div class="cont"> block; the title is inside <b> tags
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Inside <p class="source">, the first <a> is the dynasty and the second is the author
    dynasties = re.findall(r'<p\sclass="source">.*?<a\s.*?>(.*?)</a>', text)
    authors = re.findall(r'<p\sclass="source">.*?<a\s.*?>.*?</a>.*?<a\s.*?>(.*?)</a>', text)
    # The poem body spans multiple lines inside <div class="contson">
    contents = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    # Strip any leftover tags (e.g. <br />) and surrounding whitespace from the body
    contents_1 = []
    for i in contents:
        x = re.sub(r'<.*?>', "", i)
        contents_1.append(x.strip())
    poems = []
    for title, dynasty, author, content in zip(titles, dynasties, authors, contents_1):
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)
def spider():
    # Keep a reference to every thread so all of them can be joined after starting
    threads = []
    for i in range(1, 11):
        url = 'https://www.gushiwen.org/default_%s.aspx' % i
        # One thread per listing page
        t = threading.Thread(target=parse_page, args=(url,))
        t.start()
        threads.append(t)
    # Wait for every page to finish before returning
    for t in threads:
        t.join()


if __name__ == '__main__':
    spider()
    print('+'*20)
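
The same fan-out-then-join pattern can also be written with the standard library's concurrent.futures, which manages thread creation and joining for you. The sketch below is an illustrative alternative (spider_pool is a hypothetical name, not part of the original post) that reuses parse_page from above:

from concurrent.futures import ThreadPoolExecutor

def spider_pool():
    urls = ['https://www.gushiwen.org/default_%s.aspx' % i for i in range(1, 11)]
    # map() submits one task per URL and blocks until all of them finish,
    # replacing the explicit start()/join() loop in spider()
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(parse_page, urls)

Note that parse_page still prints from worker threads, so output from different pages can interleave either way; collecting the results and printing them from the main thread would avoid that.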

Reposted from blog.csdn.net/weixin_45949073/article/details/106102323