封装函数抓取某贴吧
分析贴吧的url规律。
第1页:https://tieba.baidu.com/f?kw=%E7%8E%8B%E8%8F%8A&ie=utf-8&pn=0
第2页:https://tieba.baidu.com/f?kw=%E7%8E%8B%E8%8F%8A&ie=utf-8&pn=50
第3页:https://tieba.baidu.com/f?kw=%E7%8E%8B%E8%8F%8A&ie=utf-8&pn=100
pn是控制页码的。着重关注代码封装的思路。
"""Scrape pages of a Baidu Tieba forum and save each one as a local HTML file.

URL pattern (the ``pn`` query parameter controls pagination, 50 posts per page):
    page 1: ...&pn=0
    page 2: ...&pn=50
    page 3: ...&pn=100
"""
from urllib import request, parse
import os
import ssl

# Work around SSL certificate verification failures on some systems.
ssl._create_default_https_context = ssl._create_unverified_context


def handle_url(url, page, name):
    """Build and return a Request object for one page of the given tieba.

    Args:
        url:  base URL ending with '&' so the encoded query can be appended.
        page: 1-based page number; converted to the 'pn' offset (50 per page).
        name: tieba (forum) name, URL-encoded into the 'kw' parameter.

    Returns:
        urllib.request.Request for the requested page.
    """
    pn = (page - 1) * 50  # each tieba listing page holds 50 entries
    query = parse.urlencode({'kw': name, 'pn': pn})
    return request.Request(url=url + query)


def download(req, page):
    """Fetch the page described by *req* and save it as tieba/第<page>页.html."""
    response = request.urlopen(req)
    dirname = 'tieba'
    # Bug fix: create the output directory first — the original code assumed
    # it existed and open() raised FileNotFoundError on a fresh run.
    os.makedirs(dirname, exist_ok=True)
    filename = '第' + str(page) + '页.html'
    filepath = os.path.join(dirname, filename)
    with open(filepath, 'wb') as fp:
        fp.write(response.read())


def main():
    """Prompt for a tieba name and a page range, then download every page."""
    name = input('请输入要爬取的贴吧名:')
    start_page = int(input('请输入起始页码:'))
    # Bug fix: the original prompt asked for the *start* page twice;
    # this input is the end page.
    end_page = int(input('请输入结束页码:'))
    url = 'https://tieba.baidu.com/f?ie=utf-8&'
    # Download every page in the inclusive range [start_page, end_page].
    for page in range(start_page, end_page + 1):
        req = handle_url(url, page, name)
        print('开始下载第%d页' % page)
        download(req, page)
        print('结束下载第%d页' % page)


if __name__ == '__main__':
    main()