爬虫-某直播平台图片批量爬取url并下载

虎牙直播图片爬取
在这里插入图片描述

import json
import os
import time

import requests
from selenium import webdriver

class HuYa(object):
    """Crawl cover images from a paginated live-stream listing page.

    A Chrome browser driven by Selenium opens ``start_url``, the image URL of
    every list entry is read from the rendered page, and each image is
    downloaded with ``requests`` into ``image_dir``. The crawler keeps clicking
    the "next page" link until the page no longer contains one.
    """

    def __init__(self):
        # URL of the live-streaming platform's listing page. The "x" characters
        # are placeholders: substitute the real address before running.
        self.start_url = "https://www.hxyx.com/l"
        self.driver = webdriver.Chrome()
        # Scheme prepended to protocol-relative image URLs ("//host/path").
        self.part_url = "https:"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        }
        # Directory the downloaded images are written to.
        self.image_dir = "./image/"
        # Running file index shared by all pages, so that images from a later
        # page never overwrite the files saved for an earlier page.
        self._image_index = 0

    def run(self):
        """Crawl every page of the listing and download its images.

        The browser is always closed when the crawl ends, whether it finished
        normally or was interrupted by an exception.
        """
        try:
            self.driver.get(self.start_url)
            while True:
                content_list, next_link = self.get_content_url()
                self.save_coutent(content_list)
                if next_link is None:
                    break
                next_link.click()
                # Give the next page time to render before reading it.
                time.sleep(3)
        finally:
            self.driver.quit()

    def get_content_url(self):
        """Collect the image URLs of the current page.

        Returns:
            A tuple ``(content_list, next_url)``: the list of image URLs found
            on the page, and the "next page" link element, or ``None`` when the
            page has no such link.
        """
        # ``find_elements("xpath", ...)`` is used instead of the
        # ``find_elements_by_xpath`` shortcut, which newer Selenium releases
        # no longer provide.
        li_list = self.driver.find_elements("xpath", "//ul[@id='js-live-list']/li")
        content_list = []
        for li in li_list:
            pictures = li.find_elements("xpath", ".//a/img[@class='pic']")
            if not pictures:
                # Entry without a cover image: nothing to download.
                continue
            images = self._normalize_image_url(
                pictures[0].get_attribute("data-original")
            )
            if images is None:
                continue
            print(images)
            content_list.append(images)

        next_url = self.driver.find_elements("xpath", "//a[@class='laypage_next']")
        next_url = next_url[0] if next_url else None
        return content_list, next_url

    def save_coutent(self, conutent_list):
        """Download every URL in ``conutent_list`` into ``image_dir``.

        Files are named ``huya<N>.png`` where ``N`` keeps counting across
        calls. A URL that cannot be fetched is reported and skipped so one
        broken image does not abort the whole crawl.
        """
        os.makedirs(self.image_dir, exist_ok=True)
        for img_url in conutent_list:
            try:
                # The timeout keeps a stalled connection from blocking forever.
                response = requests.get(img_url, headers=self.headers, timeout=10)
                response.raise_for_status()
            except requests.RequestException as exc:
                print("skip {}: {}".format(img_url, exc))
                continue
            with open(self._next_image_path(), "wb") as f:
                f.write(response.content)

    def _normalize_image_url(self, raw_url):
        """Return ``raw_url`` without its query string and with a scheme.

        Returns ``None`` when ``raw_url`` is missing or empty.
        """
        if not raw_url:
            return None
        url = raw_url.split("?")[0]
        # Only protocol-relative URLs need the scheme; anything that already
        # carries one (http or https) is left untouched.
        if url.startswith("//"):
            url = self.part_url + url
        return url or None

    def _next_image_path(self):
        """Return the path for the next image file and advance the index."""
        path = "{}huya{}.png".format(self.image_dir, self._image_index)
        self._image_index += 1
        return path



if __name__ == '__main__':
    # Script entry point: build the crawler and start crawling right away.
    HuYa().run()

猜你喜欢

转载自blog.csdn.net/weixin_44090435/article/details/86499413