# -*- coding: utf-8 -*-
# 今日头条街拍数据保存在js里 爬取时建议使用代理服务器
# (Toutiao street-photo data is embedded in inline JS; a proxy server is recommended when crawling.)
import requests
from urllib.parse import urlencode
import json
import re
from requests.exceptions import RequestException
# Browser User-Agent header so Toutiao does not reject the crawler as a bot.
headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'}
def get_one_page(offset, keyword):
    """Fetch one page of Toutiao gallery search results as raw JSON text.

    Args:
        offset: pagination offset (the API pages in steps of 20).
        keyword: search keyword, e.g. '街拍'.

    Returns:
        The response body text on HTTP 200, otherwise None.
    """
    # Build the query string with urlencode so the keyword is URL-escaped.
    params = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': '3',
        'format': 'json',
        'from': 'gallery',
        'keyword': keyword,
        'offset': offset,
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # A timeout keeps the crawler from hanging forever on a dead connection.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except RequestException:
        print("请求出错")
    # Non-200 responses and request errors both report "no data".
    return None
# def get_image(html):
# result = json.loads(html)
# for i in range(20):
# url =result['data'][i]['article_url']
# response = requests.get(url,headers=headers)
# if response.status_code ==200:
# html = response.text
# print('***')
# get_image_url(html)
# else:
# return None
def parse_page_index(html):
    """Yield every article URL found in a search-result JSON payload.

    Args:
        html: raw JSON text returned by get_one_page(); may be None when
            the upstream request failed.

    Yields:
        The 'article_url' value of each entry under the 'data' key.
    """
    if not html:
        # get_one_page() returns None on any request error; without this
        # guard json.loads(None) raises TypeError.
        return
    result = json.loads(html)
    if result and 'data' in result:
        for item in result['data']:
            yield item.get('article_url')
def get_page_url(url):
    """Fetch the HTML of a single article page.

    Args:
        url: article URL extracted from the search results.

    Returns:
        The page HTML on HTTP 200, otherwise None.
    """
    try:
        # A timeout keeps the crawler from hanging forever on a dead connection.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except RequestException:
        print("请求出错", url)
    # Non-200 responses and request errors both report "no data".
    return None
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs embedded in an article page.

    The page stores its metadata in inline JavaScript rather than the DOM:
    the title appears as ``title: '...'`` and the gallery as an escaped JSON
    string passed to ``JSON.parse("...")``.

    Args:
        html: the article page HTML; may be None if the fetch failed.
        url: the article URL (echoed back in the result).

    Returns:
        A dict with keys 'title', 'image' (list of image URLs) and 'url',
        or None when the expected JS data is not present.
    """
    if not html:
        return None
    # Missing title no longer raises IndexError; we simply record None.
    title_match = re.search(r"title: '([\s\S]*?)'", html)
    title = title_match.group(1) if title_match else None
    # Escape the dot in JSON.parse; guard against pages with no gallery so
    # .group(1) is never called on a failed (None) search.
    gallery_match = re.search(r'gallery: JSON\.parse\("(.*?)"\)', html)
    if not gallery_match:
        return None
    # Strip the JS string-escaping backslashes so the payload is valid JSON.
    data = json.loads(re.sub(r'\\', '', gallery_match.group(1)))
    if data and 'sub_images' in data:
        images = [item['url'] for item in data['sub_images']]
        return {'title': title, 'image': images, 'url': url}
    return None
# def get_image_url(html):
# pattern = re.compile(r'"url\\":\\"([\s\S]*?)"')
# items = re.findall(pattern,html)
# for index,item in enumerate(items):
# result = re.sub(r'\\','',item)
# print(index,item)
# response = requests.get(result,headers=headers)
# with open('image/%s.jpg'%index,'ab') as f:
# f.write(response.content)
def main():
    """Crawl the first page of '街拍' results and print each article's parsed detail."""
    index_html = get_one_page(0, '街拍')
    for article_url in parse_page_index(index_html):
        detail_html = get_page_url(article_url)
        print(parse_page_detail(detail_html, article_url))


if __name__=="__main__":
    main()