title: python reptile 01
DATE: 2020-03-08 22:56:12
Tags:
1, requests module
Underlying requests module is urllib, but more powerful than urllib and more concise.
basic method:
- requests.get()
- requests.post()
- requests.request()
- requests.head()
At this stage only learn to get and post method.
Wherein the request is to obtain the get method, post request is sent.
2, pages crawled Baidu Post Bar Example
# coding=utf-8
import requests
class TiebaSpider:
def __init__(self,tieba_name):
self.tieba_name = tieba_name
self.tieba_url = "https://tieba.baidu.com/f?kw="+ tieba_name +"&ie=utf-8&pn={}"
self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36"}
def getUrlList(self):
# url_list = []
# for i in range(3):
# url_list.append(self.tieba_url.format(i*50))
# return url_list
# 下方的是简洁写法,是主流的写法
return [self.tieba_url.format(i*50) for i in range(3)]
def get_content(self,url):
print(url)
response = requests.get(url,headers = self.header)
return response.content.decode()
def save_html(self,response,index):
file_name = "{}吧-第{}页.html".format(self.tieba_name,index)
# with open as 变量名 方式打开文件会在最后关闭文件
with open(file_name,"w",encoding="utf-8") as f:
f.write(response)
def run(self):
#1.获取要爬的url列表
url_list = self.getUrlList()
#2.开始爬取
for url in url_list:
response = self.get_content(url)
# 3.保存文件
self.save_html(response,url_list.index(url)+1)
if __name__ == '__main__':
tieba_name = input("请输入要爬取的贴吧名称:")
spider = TiebaSpider(tieba_name)
spider.run()