By analyzing the URL structure of the Tieba (Baidu post-bar) listing pages, two variables are identified: kw (the bar name) and pages (the number of pages). After the user enters values for these two variables, the corresponding pages are crawled with the requests library. The code is as follows:
import requests

# Browser-like User-Agent so Tieba serves the normal desktop page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}

kw = input("请输入您想要爬取的贴吧名称:《示例:宝马》 ")
pages = input("请输入您想要爬取的页面数:《示例:5》")

# Each Tieba result page holds 50 posts, so page i starts at offset i * 50.
for i in range(int(pages)):
    url = f'https://tieba.baidu.com/f?kw={kw}&ie=utf-8&pn={i*50}'
    # timeout prevents the crawl from hanging forever on a dead connection.
    response = requests.get(url, headers=headers, timeout=10).content.decode()
    # Save each page as "<kw>+<i>.html" in the current working directory.
    with open(f'{kw}+{i}.html', 'w', encoding='utf8') as f:
        f.write(response)
    print(f'{kw}+{i}页面采集完成!')
Next, the crawler is encapsulated in a class, with each of its main steps implemented as a class member method. The refactored code is as follows:
class TBSpider(object):
    """Tieba (Baidu post-bar) crawler.

    Asks the user for a bar name (kw) and a page count (pages), then
    downloads each result page and saves it as an HTML file named
    '<kw><i>.html' in the current working directory.
    """

    # 第一步,准备基础数据 — step 1: gather base data (user input, URL template, headers).
    def __init__(self):
        self.kw = input('请输入您想要爬取的贴吧名称:《示例:宝马》')
        self.pages = input('请输入您想要爬取的页面数:《示例:5》')
        # Placeholders: URL-quoted bar name, then the result offset (pn).
        self.start_url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
        }

    # 第二步,发送请求,获取响应 — step 2: send requests, fetch responses.
    def start(self):
        # Local imports make this listing self-contained: the original
        # code referenced `quote` and `requests` without importing them.
        import requests
        from urllib.parse import quote

        for i in range(int(self.pages)):
            # Each result page holds 50 posts: page i starts at offset i * 50.
            user_url = self.start_url.format(quote(self.kw), i * 50)
            # timeout prevents the crawl from hanging on a dead connection.
            response = requests.get(user_url, headers=self.headers, timeout=10)
            self.deal_data(response, i)

    # 第三步,解析响应,处理数据 — step 3: decode the response body to text.
    def deal_data(self, response, i):
        data = response.content.decode()
        self.save_data(data, i)

    # 第四步,存储数据 — step 4: persist the page to disk.
    def save_data(self, data, i):
        with open(f'{self.kw}{i}.html', 'w', encoding='utf8') as f:
            f.write(data)
        print(f'{self.kw}+第{i}页采集完成!!!')
# Script entry point: build the spider and run the crawl end to end.
if __name__ == '__main__':
    spider = TBSpider()
    spider.start()