Python: scraping Baidu Tieba content with the requests module

The script below crawls a Baidu Tieba forum (the mobile/WAP version) page by page, extracting each thread's title and URL and appending the records to a .jsonlines file.

import requests
from lxml import etree
import json

class TiebaSpider(object):
    def __init__(self, name):
        self.name = name
        self.url = 'http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw=' + name + '&lp=5011&lm=&pinf=1&pn=0'
        # Request headers: a mobile User-Agent so Baidu serves the lightweight WAP page
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
        }
        self.prefix_url = 'http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/'

    def get_page_from_url(self, url):
        """Send the request and return the raw page content (bytes)."""
        response = requests.get(url, headers=self.headers)
        return response.content

    def get_data_from_page(self, page):
        """Extract thread titles/URLs and the next-page link from one page."""
        element = etree.HTML(page)
        # Each thread link is an <a> inside a <div> whose class contains "i"
        a_s = element.xpath('//div[contains(@class,"i")]/a')
        data_list = []
        for a in a_s:
            data = {}
            data['title'] = a.xpath('./text()')[0]
            data['url'] = self.prefix_url + a.xpath('./@href')[0]
            data_list.append(data)
        # The pagination link is the <a> whose text is "下一页" ("next page")
        next_url = element.xpath('//a[text()="下一页"]/@href')
        if len(next_url) != 0:
            next_url = self.prefix_url + next_url[0]
        else:
            next_url = None
        return data_list, next_url

    def save_data(self, data_list):
        """Append the records to <name>.jsonlines, one JSON object per line."""
        file_name = "{}.jsonlines".format(self.name)
        with open(file_name, 'a', encoding='utf8') as f:
            for data in data_list:
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')

    def run(self):
        url = self.url
        # Loop over the pages until there is no next-page link
        while url:
            # Send the request and get the page data
            page = self.get_page_from_url(url)
            # Extract the data with XPath; url becomes the next page's URL (or None)
            data_list, url = self.get_data_from_page(page)
            # Save the data
            self.save_data(data_list)

if __name__ == '__main__':
    tbs = TiebaSpider('刘亦菲')  # 刘亦菲 can be replaced with any other keyword
    tbs.run()
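
To check the XPath logic without sending requests to Baidu, here is a minimal offline sketch. The HTML snippet is made up for illustration, but it follows the structure the spider's XPath expressions assume: thread links inside <div> elements whose class contains "i", plus a "下一页" link for pagination.

from lxml import etree

# Hypothetical sample page mimicking the WAP tieba layout (for testing only)
SAMPLE_HTML = '''
<html><body>
  <div class="i"><a href="m?kz=111">Thread title A</a></div>
  <div class="i"><a href="m?kz=222">Thread title B</a></div>
  <a href="m?kw=xx&amp;pn=20">下一页</a>
</body></html>
'''

element = etree.HTML(SAMPLE_HTML)
# Same expression as get_data_from_page: one <a> per thread
for a in element.xpath('//div[contains(@class,"i")]/a'):
    print(a.xpath('./text()')[0], a.xpath('./@href')[0])
# Same expression used for pagination; prints ['m?kw=xx&pn=20']
print(element.xpath('//a[text()="下一页"]/@href'))

If the two thread titles and the next-page href print as expected, get_data_from_page will behave the same way on the real pages (assuming Baidu still serves this markup; the WAP layout may have changed since this post was written).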


Leave a comment if you have any questions.

Reposted from blog.csdn.net/weixin_43407092/article/details/88429297