Crawl Tieba web pages and save them in a local folder

This script fetches the first ten listing pages of a Baidu Tieba forum and saves each page's HTML into a local folder.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: Administrator
@file: tieba_test.py
@time: 2020/09/15
@desc:
"""
import requests
import os

class TiebaSpider:
    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        # Send a browser User-Agent so Tieba returns the normal desktop page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
        }
        # pn is the post offset: page 1 -> pn=0, page 2 -> pn=50, and so on.
        self.base_url = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"

    def get_url_list(self):
        # Build the URLs of the first ten listing pages (50 posts per page).
        url_list = []
        for i in range(10):
            url_list.append(self.base_url.format(i * 50))
        return url_list

    def parse_url(self, url):
        print(url)
        # A timeout keeps the crawl from hanging on a stalled connection.
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def save_url(self, url_html, page_num):
        # Save into a '贴吧目录' ("Tieba directory") folder, creating it if needed.
        name = '贴吧目录'
        os.makedirs(name, exist_ok=True)
        # The file name reads "<forum>贴吧-第<n>页", i.e. "page n of the <forum> forum".
        path = './贴吧目录/{}贴吧-第{}页'.format(self.tieba_name, page_num)
        with open(path, 'w', encoding='utf8') as f:
            f.write(url_html)

    def run(self):
        url_list = self.get_url_list()
        # The page number is the URL's position in the list, counting from 1.
        for page_num, url in enumerate(url_list, start=1):
            url_html = self.parse_url(url)
            self.save_url(url_html, page_num)


if __name__ == '__main__':
    TiebaSpider('lol').run()
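
As written, the spider aborts on the first network error and saves files without an extension. Below is a minimal sketch of a more defensive variant; the subclass name RobustTiebaSpider, the skip-on-failure logic, and the .html extension are assumptions for illustration, not part of the original post.

import os
import requests

class RobustTiebaSpider(TiebaSpider):
    """Hedged sketch: skip failing pages and save browsable .html files."""

    def parse_url(self, url):
        print(url)
        response = requests.get(url, headers=self.headers, timeout=10)
        # Raise on 4xx/5xx so error pages are not saved as if they were content.
        response.raise_for_status()
        return response.content.decode()

    def save_url(self, url_html, page_num):
        os.makedirs('贴吧目录', exist_ok=True)
        # The .html extension lets the saved page open directly in a browser.
        path = './贴吧目录/{}贴吧-第{}页.html'.format(self.tieba_name, page_num)
        with open(path, 'w', encoding='utf8') as f:
            f.write(url_html)

    def run(self):
        for page_num, url in enumerate(self.get_url_list(), start=1):
            try:
                url_html = self.parse_url(url)
            except requests.RequestException as e:
                # A timeout or bad status on one page should not stop the crawl.
                print('skipping page {}: {}'.format(page_num, e))
                continue
            self.save_url(url_html, page_num)

Running RobustTiebaSpider('lol').run() behaves like the original but keeps going when an individual page times out or returns an error status.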

Source: blog.csdn.net/weixin_44429965/article/details/108609852