Scraping the Douyu Platform

Key points:

1. Using the selenium module to drive the browser automatically

2. The difference between find_elements_by_xpath() and find_element_by_xpath(), plus locating elements and extracting their content (see the sketch after this list)

3. How to request the next page; note the time.sleep() call after clicking
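A quick illustration of point 2, as a minimal sketch: find_elements_by_xpath() always returns a list (empty when nothing matches), while find_element_by_xpath() returns a single WebElement and raises NoSuchElementException when there is no match. The XPath below is the same one used in the main code; reusing the Douyu listing page here is only for demonstration.

#encoding=utf-8
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
driver.get("https://www.douyu.com/directory/all")

# find_elements_*: returns a list; an empty list means "not found", no exception
rooms = driver.find_elements_by_xpath("//ul[@id='live-list-contentbox']/li")
print(len(rooms))

# find_element_*: returns one WebElement, or raises if there is no match
try:
    first = driver.find_element_by_xpath("//ul[@id='live-list-contentbox']/li")
    print(first.find_element_by_xpath(".//a").get_attribute("title"))
except NoSuchElementException:
    print("no matching element")

driver.quit()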

Code:

#encoding=utf-8
from selenium import webdriver
import time

class DouYu():
    def __init__(self):
        self.url = "https://www.douyu.com/directory/all"
        self.driver = webdriver.Chrome()

    def get_content_list(self):
        """
        Get the info of every live room on the current page.
        :return: (list of room dicts, next-page element or None)
        """
        # Get the list of live-room <li> elements on this page
        content_list = self.driver.find_elements_by_xpath(".//ul[@id='live-list-contentbox']/li")

        # Extract the info of each live room on this page
        get_contents_list = []
        for content in content_list:
            item = {}
            item["room_img"] = content.find_element_by_xpath(".//span[@class='imgbox']/img").get_attribute("src")
            item["room_name"] = content.find_element_by_xpath(".//a").get_attribute("title")
            item["room_info"] = content.find_element_by_xpath(".//div[@class='mes-tit']/span").text
            get_contents_list.append(item)
        # Locate the next-page button; find_elements returns a list, so a missing button yields []
        next_url = self.driver.find_elements_by_xpath("//a[@class='shark-pager-next']")
        # Take the element object next_url[0] if the button exists, else None (last page)
        next_url = next_url[0] if len(next_url) > 0 else None
        return get_contents_list, next_url

    def run(self):
        # Open the listing page
        self.driver.get(self.url)
        # Scrape the first page
        get_contents_list, next_url = self.get_content_list()
        # Save (printed here for demonstration)
        print(get_contents_list)
        # Keep clicking the next-page button until it no longer exists
        while next_url is not None:
            next_url.click()
            time.sleep(3)  # give the next page time to render
            get_contents_list, next_url = self.get_content_list()
            # Save (printed here for demonstration)
            print(get_contents_list)
            # with open("D:\\save.txt", "a", encoding="utf-8") as f:
            #     f.write(str(get_contents_list) + "\n")
        self.driver.quit()

if __name__ == "__main__":
    spider = DouYu()
    spider.run()
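The commented-out save block in run() hints at persisting results. Here is a minimal sketch of a more structured alternative that writes one JSON object per room, one per line; the helper name save_contents and the file name douyu_rooms.jsonl are placeholders, not part of the original code.

#encoding=utf-8
import json

def save_contents(contents, path="douyu_rooms.jsonl"):
    # Append one JSON object per room, so each scraped page adds to the file
    with open(path, "a", encoding="utf-8") as f:
        for room in contents:
            f.write(json.dumps(room, ensure_ascii=False) + "\n")

Calling save_contents(get_contents_list) at each point where run() currently prints would persist every page as it is scraped.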


Reposted from blog.csdn.net/poyue8754/article/details/84670360