# Source code:
import random
import urllib
import urllib.request
def JokeSet(Url, UserAgent, SavePath=r"D:\pythonItem\爬虫Practice\糗事百科.html"):
    """Fetch the page at Url and save its HTML to SavePath as UTF-8.

    Url : URL of the page to fetch
    UserAgent : User-Agent header value sent with the request
    SavePath : file the decoded HTML is written to (defaults to the
               original hard-coded location, so existing callers are
               unaffected)

    Raises urllib.error.URLError (a subclass of OSError) on network
    failure and OSError on file errors.
    """
    # Build the request with the caller-supplied User-Agent header.
    headers = {"User-Agent": UserAgent}
    req = urllib.request.Request(Url, headers=headers)
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(req) as response:
        data = response.read().decode("utf-8")
    # Persist the scraped page; 'encoding' (not 'encode') is the correct
    # keyword for open().
    with open(SavePath, "w", encoding="utf-8") as f:
        f.write(data)
# Pool of User-Agent strings; one is picked at random per request to
# reduce the chance of the server blocking our IP.
# NOTE: a flat list (the original nested the strings in an inner list,
# so random.choice would have returned a list, not a string).
HeadersList = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.",
]

# Scrape pages 1..10; adjust the range to change how many pages are fetched.
for i in range(1, 11):
    # Build the URL for page i (the original hard-coded str(2), so every
    # iteration re-fetched page 2).
    url = "https://www.qiushibaike.com/text/page/" + str(i) + "/"
    UserAgent = random.choice(HeadersList)
    try:
        JokeSet(url, UserAgent)
        print("第{}次爬取成功".format(i))
    except OSError:
        # URLError subclasses OSError, so this covers both network and
        # file-write failures without hiding unrelated bugs the way the
        # original bare 'except:' did.
        print("爬取失败")
# HTML data processing to be added in a later update