python 爬虫(五)爬取多页内容

import urllib.request
import ssl
import re

def ajaxCrawler(url):
    """Fetch *url* with a browser-like User-Agent and return the body as a UTF-8 string.

    Despite the name, this simply performs a plain HTTP GET; certificate
    verification is disabled via an unverified SSL context.

    :param url: the page URL to download
    :return: the decoded response body (str)
    :raises urllib.error.URLError: on network/HTTP failure
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    req = urllib.request.Request(url,headers=headers)

    # Use ssl to create an unverified context (skips certificate validation).
    context = ssl._create_unverified_context()

    # BUG FIX: the original never closed the response, leaking the socket.
    # urlopen's result is a context manager in Python 3 — use `with`.
    with urllib.request.urlopen(req, context=context) as response:
        jsonStr = response.read().decode("utf-8")

    return jsonStr

url = "https://www.qiushibaike.com/text/page/1/" # then loop over page/2/ ... to crawl more pages
#filePath = "qiushi.html"

# Each post sits between these two HTML class markers; re.S makes '.' match newlines.
par1 = r'''article block untagged mb15(.*?)class="stats-comments'''
re_ob = re.compile(par1,re.S)
listStr = re_ob.findall(ajaxCrawler(url))

# PERF FIX: compile both patterns once, outside the loop — the original
# recompiled them on every iteration.
# Keep the patterns loose at first so nothing silently fails to match.
re_Content = re.compile(r'''class="content".*?<span>(.*?)</span>''',re.S)
re_name = re.compile(r'''<h2>(.*?)</h2>''',re.S)

# Maps user name -> post content. Renamed from the misleading `jsonStr`
# (it is a plain dict, not JSON text).
results = {}

for ss in listStr:
    # findall returns a list; take the first hit for each field.
    contents = re_Content.findall(ss)
    names = re_name.findall(ss)
    # BUG FIX: the original indexed [0] unconditionally and raised
    # IndexError on any post where a pattern found nothing — skip instead.
    if not contents or not names:
        continue
    results[names[0]] = contents[0]

for k,v in results.items():
    print(k+":说"+v)

猜你喜欢

转载自blog.csdn.net/weixin_40938748/article/details/85310881