import re
import requests
import json
import os
from urllib import request
# Search endpoint queried for the article list (the keyword is URL-encoded in the query string).
SEARCH_URL = 'https://www.toutiao.com/search_content/?offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'

# Browser-like User-Agent sent with every article page request.
ARTICLE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Captures the argument of the ``gallery: JSON.parse(...)`` call embedded in
# an article page. Compiled once instead of on every loop iteration.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Directory (relative to the working directory) that receives the images.
DOWNLOAD_DIR = 'download'

# Seconds to wait on each HTTP request; requests never times out by default.
REQUEST_TIMEOUT = 10


def extract_image_urls(html_str):
    """Return the gallery image URLs embedded in an article page.

    Args:
        html_str: Text of the article page.

    Returns:
        A list with the ``url`` value of every ``sub_images`` entry, or
        ``None`` when the page has no ``gallery: JSON.parse(...)`` marker.

    Raises:
        ValueError: If the captured payload is not valid JSON.
        KeyError: If the decoded gallery lacks ``sub_images`` or an entry
            lacks ``url``.
    """
    match_res = GALLERY_PATTERN.search(html_str)
    if not match_res:
        return None
    # The captured text is a JSON string literal whose content is itself
    # JSON, so it is decoded twice: first to a str, then to a dict.
    gallery_json = json.loads(match_res.group(1))
    gallery = json.loads(gallery_json)
    return [image['url'] for image in gallery['sub_images']]


def image_filename(image_url):
    """Return the local path for *image_url*: last URL segment plus ``.jpg``."""
    return DOWNLOAD_DIR + '/' + image_url.split('/')[-1] + '.jpg'


def main():
    """Download the gallery images of every article in the search results.

    Fetches the search JSON, visits each entry that has an ``article_url``,
    prints that URL, and saves every gallery image into ``download/``.
    """
    response = requests.get(SEARCH_URL, timeout=REQUEST_TIMEOUT)
    # response.json() returns the decoded object (a dict) directly.
    # A missing or null 'data' key is treated as "no results".
    data_list = response.json().get('data') or []

    for item in data_list:
        # Only entries that carry an article URL are processed.
        if 'article_url' not in item:
            continue
        article_url = item['article_url']
        print(article_url)

        article_response = requests.get(
            article_url, headers=ARTICLE_HEADERS, timeout=REQUEST_TIMEOUT
        )

        # Created lazily, as before; exist_ok avoids the exists()/mkdir() race.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)

        image_urls = extract_image_urls(article_response.text)
        if image_urls is None:
            print('你写错了')
            continue

        for image_url in image_urls:
            # Download the image.
            request.urlretrieve(image_url, image_filename(image_url))


if __name__ == '__main__':
    main()
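# Tags: Toutiao (今日头条), json, street snap (街拍), baocun (save)
# Web page residue: "You may also like"
# Reposted from blog.csdn.net/qq_41996633/article/details/81750233
# Web page residue: "Today's recommendations"
# Web page residue: "Weekly ranking"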