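# Each comments URL carries a max_id parameter, and the max_id required by the
# next URL is the value returned in the previous URL's response. Pages therefore
# have to be fetched as a chain in which every request is built from the last
# result (a naturally recursive process). Getting this right is tricky; the key
# is the termination condition. Here it is max_id == 0, which means there is no
# next page.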
import requests
import json
from lxml import etree
# Initial pagination cursor value. Nothing visible in this file rebinds this
# module-level name (down() contains no `global` statement).
max_id = 0
# Collects the parsed JSON of every fetched comment page; down() appends to it.
html_contents = list()
def down(url):
    """Fetch a Weibo hot-comment page and every page that follows it.

    Starting from ``url``, each response body is printed, parsed as JSON once
    and appended (as the parsed dict) to the module-level ``html_contents``
    list. The ``data.max_id`` value of a response is the cursor used to build
    the next request; a value of ``0`` means there is no next page.

    Pages are followed with a loop rather than one recursive call per page, so
    long comment threads cannot exhaust the interpreter's recursion limit.

    Args:
        url: URL of the first comment page to request.

    Returns:
        0 if the first page was already the last one, otherwise 1.

    Raises:
        KeyError: if a response has no ``data`` / ``max_id`` entry.
        ValueError: if a response body is not valid JSON.
        RuntimeError: if the server returns the same non-zero cursor twice in
            a row (the crawl would otherwise never terminate).
        requests.RequestException: on network failure or timeout.
    """
    # NOTE(security): a session cookie is hard-coded below. It is kept so the
    # request stays unchanged, but credentials should not live in source
    # control; consider loading it from the environment instead.
    headers = {
        "accept": "application/json, text/plain, */*",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
        "cookie": "_T_WM=74b5406b79cd18adabbcaac40f997914; WEIBOCN_FROM=1110006030; MLOGIN=1; SSOLoginState=1546235890; ALF=1548827890; SCF=Arj6zmmKiOmQAk_IgSYwafWcdI6LlAtTIuAWJCXnxyWffuZOwcMEjITykhpkEIjdpvk1Tl-MAFRtjZPwLBkKg7w.; SUB=_2A25xLd-iDeRhGeBG41IS9yzJzD2IHXVS0eHqrDV6PUNbktAKLRHTkW1NQeU4KyxGbCrkBPK46ssmM7owlLLmzyNw; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF6hmlpjTzkNkQzFAuzj21D5JpX5KMhUgL.FoqR1h50S0zfS022dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMXShBfehzRe0eX; SUHB=03oFS1TMqpmO_Q; M_WEIBOCN_PARAMS=oid%3D4323281584327025%26luicode%3D20000174%26lfid%3D4323281584327025%26uicode%3D20000061%26fid%3D4323281584327025",
    }
    # NOTE: follow-up pages always target this hard-coded post id, whatever
    # URL the caller passed in for the first page.
    next_page_template = (
        "https://m.weibo.cn/comments/hotflow?id=4323281584327025"
        "&mid=4323281584327025&max_id={}&max_id_type=0"
    )

    pages_fetched = 0
    previous_cursor = None
    while True:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        html = requests.get(url, headers=headers, timeout=30).text
        print(html)

        # Parse once and reuse the result for both the cursor and the store.
        payload = json.loads(html)
        max_id = payload['data']['max_id']
        html_contents.append(payload)
        pages_fetched += 1

        if max_id == 0:
            break
        if max_id == previous_cursor:
            # Same cursor again means the same URL again: stop instead of
            # requesting it endlessly.
            raise RuntimeError(
                "max_id did not advance (stuck at {})".format(max_id)
            )
        previous_cursor = max_id

        print(max_id)
        url = next_page_template.format(max_id)

    return 0 if pages_fetched == 1 else 1
# Start the crawl at the first hot-comment page (no max_id yet), then print the
# value returned by down() followed by everything collected in html_contents.
first_page = "https://m.weibo.cn/comments/hotflow?id=4323281584327025&mid=4323281584327025&max_id_type=0"
crawl_result = down(first_page)
print(crawl_result)
print(html_contents)