python爬取知乎话题:日常穿JK制服是一种怎样的体验?

from urllib import request
import math
import requests
import json
import re
def saveUrl():
    """Download every image from the answers to Zhihu question 29814297.

    The question has 569 answers. Zhihu lazy-loads answers through a JSON
    API where ``limit`` is the page size and ``offset`` is the index of the
    first answer on the page, so we walk ``ceil(569/20)`` pages of 20.
    Image URLs are extracted from each answer's HTML ``content`` field via
    the ``data-original="..."`` attribute and saved as numbered .jpg files.

    Side effects: network requests to zhihu.com and file writes under a
    hard-coded Windows path; progress is printed per image.
    """
    jpgcount = 0
    # Request headers; hoisted out of the loop since they never change.
    # (UA string kept exactly as the original, typo and all.)
    kv = {'user-agent': 'Mozillar/5.0'}
    for i in range(math.ceil(569 / 20)):
        # Each lazy-load step produces a new URL that returns one JSON page;
        # offset = 20*i selects which answer the page starts at.
        url = "https://www.zhihu.com/api/v4/questions/29814297/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=20&offset=" + str(20 * i) + "&platform=desktop&sort_by=default"
        try:
            r = requests.get(url, headers=kv)
            j = json.loads(r.text)  # parse the returned JSON page into a dict
            # Iterate the answers actually returned: the last page holds only
            # 569 % 20 = 9 answers, so the original fixed range(20) raised
            # IndexError there and the bare except silently dropped the page.
            for answer in j["data"]:
                # Only the "content" field needs parsing: every image URL
                # sits in a data-original="..." attribute of the answer HTML.
                jpgurl = re.findall(r'data-original="(.*?)"', answer["content"])
                # Each image URL appears twice in the HTML, so stepping by 2
                # skips the odd-indexed duplicates.
                for m in range(0, len(jpgurl), 2):
                    # Fetch the image bytes with urllib's request module.
                    picture = request.urlopen(jpgurl[m]).read()
                    with open("C:/Users/23504/Desktop/JK制服/" + str(jpgcount) + ".jpg", 'wb') as file:
                        file.write(picture)
                    print("第" + str(jpgcount) + "张图片下载完成")
                    jpgcount = jpgcount + 1
        except Exception:
            # Best-effort per page, as in the original — but narrowed from a
            # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
            print("url链接无效")
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    # (The guard body was unindented in the pasted source — a syntax error.)
    saveUrl()


猜你喜欢

转载自www.cnblogs.com/TryFirst/p/10322783.html