因公司需求,需要爬取微博相关大v的账号以及相关信息,一开始是直接爬取,发现只可以爬取10条数据(没有登陆的情况下),所以就涉及到模拟登陆。
import requests
import json
import base64
from lxml import etree
from bs4 import BeautifulSoup
import time
def login(username, password):
username = base64.b64encode(username.encode('utf-8')).decode('utf-8')
postData = {
"entry": "sso",
"gateway": "1",
"from": "null",
"savestate": "30",
"useticket": "0",
"pagerefer": "",
"vsnf": "1",
"su": username,
"service": "sso",
"sp": password,
"sr": "1440*900",
"encoding": "UTF-8",
"cdult": "3",
"domain": "sina.com.cn",
"prelt": "0",
"returntype": "TEXT",
}
loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
session = requests.Session()
res = session.post(loginURL, data = postData)
jsonStr = res.content.decode('gbk')
info = json.loads(jsonStr)
if info["retcode"] == "0":
print("登录成功")
# 把cookies添加到headers中,必须写这一步,否则后面调用API失败
cookies = session.cookies.get_dict()
cookies = [key + "=" + value for key, value in cookies.items()]
cookies = "; ".join(cookies)
session.headers["cookie"] = cookies
else:
print("登录失败,原因: %s" % info["reason"])
return session
if __name__ == '__main__':
session = login('your account', 'your password')