python爬虫爬取今日头条APP数据(无需破解as ,cp,_cp_signature参数)

版权声明:版权声明:本文为博主原创文章,转载注明出处! https://blog.csdn.net/weixin_39416561/article/details/84672104
#!coding=utf-8
import requests
import re
import json
import math
import random
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  ###禁止提醒SSL警告


def ttapi(url):  ####APP模式
    channel = re.search('ch/(.*?)/', url).group(1)
    s = requests.session()
    headers = {
            'Accept':'image/webp,image/*;q=0.8',
            'User-Agent':'News/6.9.8.36 CFNetwork/975.0.3 Darwin/18.2.0',
            'Accept-Language':'zh-cn'
               }
    s.headers.update(headers)
    df=pd.DataFrame(columns=(
        'abstract 简报','title 标题','keywords 关键词','read_count 阅读量','share_count 分享数量',
        'ban_comment 可评论','publish_time 推送时间','share_url url 链接','user_info_name 用户名',
         'user_id 用户 id','description 用户描述','user_verified 官方账号','time 抓取时间','category 频道'
    ))
    t2 = int(time.time())-500
    x=0
    for i in range(10):  ###爬取页数
        time.sleep(3)
        t=int(time.time())
        params={
        'category':channel,   ###频道名
        'refer':'1',   ###???,固定值1
        'count':'20',   ####返回数量,默认为20
        'min_behot_time':t2,          ####上次请求时间的时间戳,例:1491981025
        'last_refresh_sub_entrance_interval':t-10,#####本次请求时间的时间戳,例:1491981165
        'loc_time':int(t/1000)*1000,###本地时间
        'latitude':'',###经度
        'longitude':'',###纬度
        'city':'',###当前城市
        'iid':'1234876543',###某个唯一 id,长度为10
        'device_id':'42433242851',###设备id,长度为11
        'abflag':'3',
        'ssmix':'a',
        'language':'zh',
        'openudid':'1b8d5bf69dc4a561',####某个唯一id,长度为16

        }
        url='http://is.snssdk.com/api/news/feed/v51/'
        app=s.get(url=url,params=params,verify=False).json()
        print(app)
        t2=t-10
        total_number=app['total_number']
        #print(total_number)
        for j in range(0,total_number):
            content=json.loads(app['data'][j]['content'])
            try:
                abstract=content['abstract']  ##简报
            except:
                abstract = ''
            try:
                title = content['title']   ##标题
            except:
                title =''
            try:
                keywords = content['keywords']   ##关键词
            except:
                keywords =''
            try:
                read_count=content['read_count']   ##阅读量
            except:
                read_count=''
            try:
                share_count = content['share_count']   ##分享数量
            except:
                share_count =''
            try:
                ban_comment = content['ban_comment']   ###是否可以评论,0为可评论,1不可评论
            except:
                ban_comment =''
            try:
                publish_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(content['publish_time']))   ##推送时间
            except:
                publish_time =''
            try:
                share_url = content['share_url']   ###分享 url 链接
            except:
                share_url =''
            try:
                user_info_name = content['user_info']['name']   ##用户名
            except:
                user_info_name =''
            try:
                user_id = content['user_info']['user_id']   ##用户 id
            except:
                user_id =''
            try:
                description = content['user_info']['description']  ##用户描述
            except:
                description =''
            try:
                 user_verified = content['user_info']['user_verified']   ###是否官方账号
            except:
                user_verified =''


            nowtime=time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
            df.loc[x] =[abstract, title, keywords, read_count, share_count, ban_comment,
                        publish_time, share_url, user_info_name, user_id, description,
                        user_verified,nowtime,channel]
            x=x+1

         
    df.to_csv('tt.csv',index=False, encoding="GB18030")
    s.close()

if __name__=='__main__':
    url='https://www.toutiao.com/ch/news_tech/'
    ttapi(url)

网页版as ,cp,_cp_signature参数破解:

https://blog.csdn.net/weixin_39416561/article/details/82111455

猜你喜欢

转载自blog.csdn.net/weixin_39416561/article/details/84672104
cp
今日推荐