通过分析AJAX抓取今日头条街拍数据

今日头条街拍数据保存在js里 爬取时建议使用代理服务器

# -*- coding: utf-8 -*-

import requests
from urllib.parse import urlencode
import json
import re
from requests.exceptions import RequestException

# Desktop-browser User-Agent so Toutiao serves the normal page instead of
# rejecting the request as a bot. (original comment: 设置UA = "set UA")
headers ={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'}

def get_one_page(offset, keyword):
    """Fetch one page of Toutiao gallery search results.

    Args:
        offset: pagination offset (the API pages in steps of 20).
        keyword: search keyword, e.g. '街拍'.

    Returns:
        The response body (JSON text) on HTTP 200, otherwise None.
    """
    # Build the query string with urlencode so the (Chinese) keyword is
    # percent-escaped correctly.
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': '3',
        'format': 'json',
        'from': 'gallery',
        'keyword': keyword,
        'offset': offset}
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # timeout keeps the crawler from hanging forever on a stalled server;
        # without it requests waits indefinitely.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None  # explicit: non-200 responses yield no page
    except RequestException:
        print("请求出错")
        return None

# def get_image(html):

#     result = json.loads(html)
#     for i in range(20):

#         url =result['data'][i]['article_url']
#         response = requests.get(url,headers=headers)
#         if response.status_code ==200:
#             html = response.text
#             print('***')
#             get_image_url(html)
#         else:
#             return None
def parse_page_index(html):
    """Yield each article URL from one page of search-result JSON.

    Args:
        html: JSON text returned by the search API, or None when the
            fetch failed upstream.

    Yields:
        The 'article_url' value of each entry under the top-level 'data'
        key (may yield None for entries lacking that field, as before).
    """
    if not html:
        return  # upstream fetch failed: nothing to parse
    try:
        result = json.loads(html)
    except ValueError:
        return  # malformed payload: skip this page instead of crashing
    # isinstance guard: the API sometimes returns non-dict error payloads.
    if isinstance(result, dict) and 'data' in result:
        for item in result['data']:
            yield item.get('article_url')

def get_page_url(url):
    """Fetch an article detail page.

    Args:
        url: article URL taken from the search-result listing.

    Returns:
        The page HTML on HTTP 200, otherwise None.
    """
    try:
        # timeout keeps the crawler from hanging forever on a stalled server
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None  # explicit: non-200 responses yield no page
    except RequestException:
        print("请求出错", url)
        return None

def parse_page_detail(html, url):
    """Extract the title and image URLs from an article detail page.

    The gallery data is embedded in an inline script as
    ``gallery: JSON.parse("...")`` holding an escaped JSON string.

    Args:
        html: detail-page HTML, or None when the fetch failed upstream.
        url: the page URL, echoed back in the result.

    Returns:
        {'title': ..., 'image': [...], 'url': ...} on success, else None.
    """
    if not html:
        return None  # upstream fetch failed
    # Title appears as title: '...' in the inline script. re.search avoids
    # the IndexError the old findall()[0] raised when no title matched.
    title_match = re.search(r"title: '([\s\S]*?)'", html)
    if not title_match:
        return None  # not a gallery page, or the layout changed
    title = title_match.group(1)
    # The gallery JSON is a JS string literal; dot escaped in JSON\.parse
    # so it can't accidentally match e.g. "JSONXparse".
    gallery_match = re.search(r'gallery: JSON\.parse\("(.*?)"\)', html)
    if not gallery_match:
        return None
    # Strip the JS string escaping (\" -> ", \/ -> /) so json.loads accepts
    # it. NOTE: this drops ALL backslashes, which would corrupt \uXXXX
    # escapes — acceptable for these URL-only payloads.
    raw = re.sub(r'\\', '', gallery_match.group(1))
    try:
        result = json.loads(raw)
    except ValueError:
        return None  # unescaping produced invalid JSON
    if isinstance(result, dict) and 'sub_images' in result:
        sub_images = result['sub_images']
        image = [i['url'] for i in sub_images]
        return {'title': title,
                'image': image,
                'url': url}
    return None


# def get_image_url(html):
   
#     pattern = re.compile(r'"url\\":\\"([\s\S]*?)"')
#     items = re.findall(pattern,html)

#     for index,item in enumerate(items):
#         result = re.sub(r'\\','',item)
#         print(index,item)
        # response = requests.get(result,headers=headers)
        # with open('image/%s.jpg'%index,'ab') as f:
        #     f.write(response.content)

def main():
    """Crawl the first page of '街拍' gallery search results and print each
    article's title and image URLs.
    """
    html = get_one_page(0, '街拍')
    if not html:
        return  # search request failed: nothing to crawl
    for url in parse_page_index(html):
        if not url:
            continue  # some result entries carry no article_url
        detail = get_page_url(url)
        if not detail:
            continue  # detail fetch failed; don't feed None to the parser
        result = parse_page_detail(detail, url)
        print(result)

if __name__ == "__main__":
    main()

猜你喜欢

转载自blog.csdn.net/qq_42196922/article/details/81281297