Wrapping a Crawler in Functions

Scraping a Baidu Tieba forum with encapsulated functions.

First, analyze the pattern in the Tieba URLs:

        Page 1: https://tieba.baidu.com/f?kw=%E7%8E%8B%E8%8F%8A&ie=utf-8&pn=0

        Page 2: https://tieba.baidu.com/f?kw=%E7%8E%8B%E8%8F%8A&ie=utf-8&pn=50

        Page 3: https://tieba.baidu.com/f?kw=%E7%8E%8B%E8%8F%8A&ie=utf-8&pn=100



The pn parameter controls the page number: page n corresponds to pn = (n - 1) * 50. Pay particular attention to how the code is organized into functions; a quick check of the URL pattern is sketched below.
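As a sanity check of this pattern, the following minimal sketch prints the URLs for the first three pages ('王菊' is the decoded forum name from the sample URLs above):

from urllib import parse

base = 'https://tieba.baidu.com/f?ie=utf-8&'
for page in range(1, 4):
    # pn advances by 50 per page: page 1 -> 0, page 2 -> 50, page 3 -> 100
    query = parse.urlencode({'kw': '王菊', 'pn': (page - 1) * 50})
    print(base + query)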


from urllib import request, parse
import os
import ssl

# Work around SSL certificate verification failures (disables verification; use with caution)
ssl._create_default_https_context = ssl._create_unverified_context
# Build the full request URL and return a Request object



def handle_url(url, page, name):
    # Map the page number to its pn offset and assemble the query string
    pn = (page - 1) * 50
    data = {
        'kw': name,
        'pn': pn
    }
    data = parse.urlencode(data)
    url = url + data
    req = request.Request(url=url)
    return req


# Perform the actual download and save the page to disk
def download(req, page):
    response = request.urlopen(req)
    dirname = 'tieba'
    # Create the output directory if it does not exist yet
    os.makedirs(dirname, exist_ok=True)
    filename = 'page' + str(page) + '.html'
    filepath = os.path.join(dirname, filename)
    with open(filepath, 'wb') as fp:
        fp.write(response.read())


def main():
    name = input('Enter the Tieba forum name to scrape: ')
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    url = 'https://tieba.baidu.com/f?ie=utf-8&'

    # Download each page in the requested range
    for page in range(start_page, end_page + 1):
        # Build the Request object with the helper function
        req = handle_url(url, page, name)
        print('Downloading page %d...' % page)
        # Perform the download with the helper function
        download(req, page)
        print('Finished page %d' % page)



if __name__ == '__main__':
    main()
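Run as a script, an illustrative session looks like this (prompts and file names reflect the translated strings above):

Enter the Tieba forum name to scrape: 王菊
Enter the start page: 1
Enter the end page: 2
Downloading page 1...
Finished page 1
Downloading page 2...
Finished page 2

Each page is saved as tieba/page1.html, tieba/page2.html, and so on.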



Reprinted from blog.csdn.net/g_sangsk/article/details/80867872