python 爬取微信公众号历史文章

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
import time


# 2
# Values captured with Fiddler that rarely change.
# req_id = "0614ymV0y86FlTVXB02AXd8p"

# uin -- set once for your own account.
this_uin = "Mjg5NTk4NTU0"

# Id of the official account to crawl.
__biz = "MzI2OTAzOTk2OA=="

# The three values below need to be refreshed frequently.
this_pass_ticket = "GeFL9a717Y9q9rT8RPY3LaCkKU19wWCcgOU2Eda62AM9sHPEhgZGHv6IYjRf9ody"
this_appmsg_token = "1056_U4iR8EA%2FFsqZfnn7BRs6uU5coEix3vDYNHCILA~~"
this_key = "1392818bdbc0aa183e6006e86b04ef5962a5d76f13d96eaf8aedde03d29da7c08cdb88a50d27e7168c5fe532d859fb9bce5505bebdaedb5736ecbda77c134f54cc222dabec713e2ed4468920660a3577"


# Fetch one page of the account's history-message list.
def get_list_data(offset, count=10, timeout=10):
    """Request one page of the official account's message history.

    Sends a GET to the ``profile_ext`` endpoint with ``action=getmsg`` using
    the module-level credentials (``__biz``, ``this_uin``, ``this_key``,
    ``this_pass_ticket``, ``this_appmsg_token``).

    Args:
        offset: Paging offset; converted to ``str`` before being sent.
        count: Number of messages requested per page (default 10, the value
            that used to be hard-coded).
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the crawl forever.

    Returns:
        The decoded JSON response body.
    """
    url = "https://mp.weixin.qq.com/mp/profile_ext"
    headers = {
        "Cookie": "rewardsn=; wxtokenkey=777; wxuin=1691556501; devicetype=iPhoneiOS13.3; version=17000b24; lang=zh_CN; pass_ticket=52mszxPTOFRlaXh1gIDZwittx558clhLYxzWuECmj0i6ztBYTI/ntw2WySOtVyq; wap_sid2=CJW1zKYGElxVUGVQbHlqbVZBazM0M2diNXhZUTBJcllRdXZ1ZU9PUWpZenFSOEJTZW5LaVp4ODhlaktCV09zMnVmX1hkSEd5ZGdVaW9BLWdPa1B6Tm1XbWlSWnV4aUFFQUFBfjD05NH0BTgNQJVO",
    }
    params = {
        "action": "getmsg",
        "__biz": __biz,
        "f": "json",
        "offset": str(offset),
        "count": str(count),
        "is_ok": "1",
        "scene": "124",
        "uin": this_uin,
        "key": this_key,
        "pass_ticket": this_pass_ticket,
        "wxtoken": "",
        "appmsg_token": this_appmsg_token,
        "x5": "0",
    }
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Decode the body once and reuse it for both the debug print and the return.
    payload = response.json()
    print(payload)
    return payload

# Fetch the read count of a single article.
def _extract_query_params(link):
    """Return the raw (still-encoded) query parameters of ``link`` as a dict."""
    from urllib.parse import urlsplit

    # Tolerate links whose separators are HTML-escaped ("&amp;"), and let
    # urlsplit drop any "#fragment" so it cannot leak into the last value.
    query = urlsplit(link.replace("&amp;", "&")).query
    params = {}
    for pair in query.split("&"):
        name, sep, value = pair.partition("=")
        if sep:
            # Split on the first "=" only: values may themselves end in "=".
            params.setdefault(name, value)
    return params


def getMoreInfo(link, delay=3, timeout=10):
    """Return the read count of the article at ``link``.

    The article identifiers (``__biz``, ``mid``, ``idx``, ``sn``) are read from
    the link's query string by name, so the result does not depend on the
    order in which they appear. They are combined with the module-level
    credentials and POSTed to the ``getappmsgext`` endpoint.

    Args:
        link: Article URL containing ``__biz``, ``mid``, ``idx`` and ``sn``.
        delay: Seconds to sleep after the request, to reduce the risk of
            being blocked (default 3, the value that used to be hard-coded).
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        ``content["appmsgstat"]["read_num"]`` from the response, or ``0`` when
        the response does not contain that field.

    Raises:
        ValueError: If ``link`` lacks one of the required query parameters.
    """
    query = _extract_query_params(link)
    try:
        biz = query["__biz"]
        mid = query["mid"]
        idx = query["idx"]
        sn = query["sn"]
    except KeyError as err:
        raise ValueError(
            "article link is missing query parameter %s: %r" % (err, link)
        ) from err

    # HTTPS, like get_list_data, so the cookie and key are not sent in clear text.
    url = "https://mp.weixin.qq.com/mp/getappmsgext"
    # The cookie avoids a login step; the User-Agent should identify a WeChat client.
    phoneCookie = "wxtokenkey=777; rewardsn=; wxuin=2529518319; devicetype=Windows10; version=62060619; lang=zh_CN; pass_ticket=4KzFV+kaUHM+atRt91i/shNERUQyQ0EOwFbc9/Oe4gv6RiV6/J293IIDnggg1QzC; wap_sid2=CO/FlbYJElxJc2NLcUFINkI4Y1hmbllPWWszdXRjMVl6Z3hrd2FKcTFFOERyWkJZUjVFd3cyS3VmZHBkWGRZVG50d0F3aFZ4NEFEVktZeDEwVHQyN1NrNG80NFZRdWNEQUFBfjC5uYLkBTgNQAE="
    headers = {
        "Cookie": phoneCookie,
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.901.400 QQBrowser/9.0.2524.400"
    }
    # Form body of the POST.
    data = {
        "is_only_read": "1",
        "is_temp_url": "0",
        "appmsg_type": "9",
        'reward_uin_count': '0'
    }
    # Query parameters:
    #   __biz identifies the official account;
    #   mid, sn and idx identify the article and come from its URL;
    #   key, appmsg_token, pass_ticket and uin are copied from Fiddler.
    params = {
        "__biz": biz,
        "mid": mid,
        "sn": sn,
        "idx": idx,
        "key": this_key,
        "pass_ticket": this_pass_ticket,
        "appmsg_token": this_appmsg_token,
        "uin": this_uin,
        "wxtoken": "777",
    }

    content = requests.post(
        url, headers=headers, data=data, params=params, timeout=timeout
    ).json()

    # Best effort: a response without the statistics counts as zero reads.
    try:
        readNum = content["appmsgstat"]["read_num"]
    except (KeyError, TypeError):
        readNum = 0

    # Pause between requests to avoid being blocked.
    time.sleep(delay)
    return readNum

# Convert a Unix timestamp into a formatted date-time string.
def getDate(times):
    """Format ``times`` (seconds since the epoch) as ``YYYY-MM-DD HH:MM:SS``.

    The timestamp is interpreted in the machine's local time zone.
    """
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(times))

# Append one record to the output file.
def write(data):
    """Print ``data`` and append it as one row to ``data.csv``.

    The row is written with the ``csv`` module, so fields containing commas,
    quotes or newlines (article titles, for example) are quoted instead of
    spilling into extra columns. The file is opened in a ``with`` block so
    the handle is closed even if writing fails.

    Args:
        data: Iterable of field values for a single row.
    """
    import csv

    print(data)
    # newline="" lets the csv writer control line endings itself.
    with open("data.csv", "a", encoding='utf-8', newline="") as f:
        csv.writer(f, lineterminator="\n").writerow(data)


def _iter_articles(message):
    """Yield ``(title, content_url)`` for the headline and each sub-article."""
    ext_info = message.get('app_msg_ext_info')
    if not ext_info:
        # Entry without an article payload (presumably a non-article push):
        # nothing to yield.
        return
    yield ext_info.get('title', ''), ext_info.get('content_url', '')
    for item in ext_info.get('multi_app_msg_item_list') or []:
        yield item.get("title", ''), item.get('content_url', '')


def run(start_offset=260):
    """Crawl the history list page by page and record each article.

    For every article with a non-empty URL, a ``[title, date, read count]``
    row is passed to ``write``. Paging follows the ``next_offset`` returned
    by each response and stops when a response reports no messages (or has
    no ``msg_count`` at all, as an error response would).

    Args:
        start_offset: Offset of the first page to request. Defaults to 260,
            the value that used to be hard-coded.
    """
    offset = start_offset
    while True:
        data = get_list_data(offset)
        if not data.get('msg_count'):
            break
        offset = data['next_offset']
        print(offset)
        for message in json.loads(data["general_msg_list"])["list"]:
            # Every article in one message shares the message's publish time.
            published = getDate(message['comm_msg_info']['datetime'])
            for title, content_url in _iter_articles(message):
                if not content_url:
                    continue
                write([title, published, getMoreInfo(content_url)])


if __name__ == "__main__":
    # Manual checks kept for reference:
    #   get_list_data("0")
    #   getMoreInfo("http://mp.weixin.qq.com/s?__biz=MzI2OTAzOTk2OA==&mid=2650921879&idx=1&sn=918dc4e3ff5b8232697111e6de5cd445&chksm=f1131b98c664928e14e3117bb1d736e3476530bc01e5617b1fb29ea37c94d0790033c4195041&scene=27#wechat_redirect")
    run()


猜你喜欢

转载自www.cnblogs.com/yzre/p/12695867.html