import requests
class TiebaSpider:
    """Simple crawler that downloads the first 10 pages of a Baidu Tieba
    forum and saves each page as a local HTML file."""

    def __init__(self, tieba_name):
        """Store the forum name and build the page-URL template.

        Args:
            tieba_name: name of the Tieba forum to crawl.
        """
        self.tieba_name = tieba_name
        # Bug fix: the original referenced an undefined global
        # `tieba_name_crawl`; use the constructor argument instead.
        self.url_base = (
            "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        )
        self.headers = {"User-Agent": "WSF"}

    def make_url_lists(self):
        """Return the list of page URLs to download (pn = 1..10)."""
        return [self.url_base.format(i) for i in range(1, 11)]

    def download_url(self, url_str):
        """Download one page with requests.get and return its raw bytes."""
        result = requests.get(url_str, headers=self.headers)
        return result.content

    def save(self, result, page_num):
        """Write one downloaded page to '<forum>-第<n>页.html'.

        Args:
            result: raw page content (bytes).
            page_num: 1-based page number used in the file name.
        """
        file_path = "{}-第{}页.html".format(self.tieba_name, page_num)
        with open(file_path, "wb") as f:
            f.write(result)

    def run(self):
        """Main download loop: fetch every page URL and save it."""
        # enumerate avoids the O(n) list.index() lookup per iteration
        # used by the original.
        for p_num, url_str in enumerate(self.make_url_lists(), start=1):
            result_str = self.download_url(url_str)
            # Bug fix: original called the nonexistent self.save_result().
            self.save(result_str, p_num)
if __name__ == '__main__':
    # Crawl the "薛之谦" forum when executed as a script.
    TiebaSpider("薛之谦").run()
理解 session 和 cookie
session：用户首次访问 HTTP 服务器时，服务器端会生成一个唯一标识 sessionID，在一定的会话周期内有效；浏览器会把该标识保存在 cookie 中，下次访问时随请求一并带回，服务器据此识别出同一个用户。
session 保存在服务器端，是服务器为某个用户生成的唯一标识字符串，用来唯一标识客户端的访问（类似健身中心的会员卡）。
cookie 保存在客户端（浏览器）的数据，其中含有 sessionID，随请求发送给服务器后用于表明用户身份。
import lxml.html
import requests
import re
def parse_form(html):
    """Collect every named <input> field inside <form> elements.

    Parses *html* with lxml and returns a dict mapping each input's
    'name' attribute to its 'value' attribute (later duplicates win),
    skipping inputs that have no name. Used to replay hidden fields
    (e.g. CSRF tokens) in a login POST.
    """
    document = lxml.html.fromstring(html)
    return {
        field.get('name'): field.get('value')
        for field in document.cssselect('form input')
        if field.get('name')
    }
def get_cookie():
    """Log in to the example.webscraping.com demo site using a session.

    Fetches the login form first (so the session receives the server's
    cookie and the hidden form fields such as the CSRF token can be
    replayed), submits the credentials, and saves the post-login page
    to 'login1.html'.
    """
    login_url = ('http://example.webscraping.com/places/default/user/login'
                 '?_next=/places/default/index')
    s = requests.session()
    # GET the form so the session cookie is set and the hidden inputs
    # can be copied into the POST payload.
    result = s.get(login_url)
    post_data = parse_form(result.text)
    print(s.cookies.get_dict())
    post_data['email'] = '[email protected]'
    post_data['password'] = '2336517498'
    # Bug fix: the original POSTed the login URL twice, and the second
    # POST carried no form data -- so the page saved to disk was the
    # login form again, not the logged-in page. Submit once and keep
    # that response.
    rs = s.post(login_url, post_data)
    with open('login1.html', 'w+') as f:
        f.write(rs.text)
if __name__ == '__main__':
    # Run the login demo when executed as a script.
    get_cookie()