使用selenium爬取斗鱼直播数据

分析斗鱼直播首页https://www.douyu.com/directory/all
使用 selenium 定位到页码跳转输入框，输入页码并点击跳转按钮，实现翻页。
拿到页面源码后，通过 xpath 提取想要的数据进行分析。
将提取到的数据保存到 CSV 文件。

# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from lxml import etree
import csv


# Run Chrome in headless mode (no visible window). The driver is created
# once, at import time, and is used by the rest of this module through the
# module-level name `browser`.
chrome_option = Options()
# `--headless` is the documented double-dash spelling of the Chrome switch.
chrome_option.add_argument('--headless')
# `options=` replaces the deprecated `chrome_options=` keyword argument.
browser = webdriver.Chrome(options=chrome_option)


def get_next_page(n):
    '''Jump to listing page ``n`` and return the resulting page source.

    Types ``n`` into the pager's jump box (element with class ``jumptxt``),
    clicks the submit button (class ``shark-pager-submit``), pauses for one
    second and then reads ``browser.page_source``.

    :param n: page number to type into the jump box.
    :return: the page source reported by the browser after the jump.
    '''
    # 1. Fill in the jump box. Clear it first so that any text left over
    #    from a previous jump cannot be concatenated with the new number.
    jump_box = browser.find_element_by_class_name('jumptxt')
    jump_box.clear()
    jump_box.send_keys(n)
    # 2. Submit the jump.
    browser.find_element_by_class_name('shark-pager-submit').click()
    # 3. Fixed one-second pause before reading the page; this is a plain
    #    sleep, not a check that the new page has actually finished loading.
    time.sleep(1)
    # 4. Hand back whatever the browser currently holds as page source.
    return browser.page_source


def get_data(html):
    '''Parse one listing page and save every streamer entry found on it.

    Each ``<li>`` under ``<ul id="live-list-contentbox">`` is turned into a
    dict with the keys ``houst_name``, ``group``, ``zb_name`` and ``count``
    (the stripped first text node matched by each field's XPath) and passed
    to ``sava_data``.

    A field whose XPath matches nothing is stored as an empty string instead
    of raising ``IndexError``, so one incomplete entry does not abort the
    crawl. Entries in which every field is empty are skipped.

    :param html: HTML source of a listing page.
    '''
    tree = etree.HTML(html)
    if tree is None:
        # NOTE(review): lxml appears to return None for some empty or
        # unparseable documents; there is nothing to extract in that case.
        return

    # (field name, XPath relative to one <li> of the live list)
    field_paths = (
        ('houst_name', './/div[@class="mes-tit"]/h3/text()'),
        ('group', './/div[@class="mes-tit"]/span/text()'),
        ('zb_name', './/div[@class="mes"]/p/span[1]/text()'),
        ('count', './/div[@class="mes"]/p/span[2]/text()'),
    )

    for zb in tree.xpath('//ul[@id="live-list-contentbox"]/li'):
        item = {}
        for field, path in field_paths:
            texts = zb.xpath(path)
            # Take the first matching text node; fall back to '' if absent.
            item[field] = texts[0].strip() if texts else ''
        if not any(item.values()):
            # Nothing was extracted from this <li>; do not write a blank row.
            continue
        sava_data(item)


def sava_data(data):
    '''Append one record to ``douyu1.csv`` as a single CSV row.

    The columns are written in the order ``houst_name``, ``group``,
    ``zb_name``, ``count``. No header row is written.

    :param data: dict holding the values for the columns listed above.
    '''
    columns = ['houst_name', 'group', 'zb_name', 'count']
    # Append mode keeps rows written by earlier calls; newline='' leaves
    # line-ending handling to the csv module.
    out_file = open('douyu1.csv', mode='a', newline='', encoding='utf8')
    with out_file:
        csv.DictWriter(out_file, fieldnames=columns).writerow(data)

def main(last_page=138):
    '''Drive the crawl: open the directory page and scrape pages 1..last_page.

    For every page number the page is fetched with ``get_next_page`` and its
    entries are parsed and saved with ``get_data``.

    The shared ``browser`` is shut down with ``browser.quit()`` when the
    crawl ends, whether it finished normally or raised, so the headless
    Chrome process is not left running. The driver therefore cannot be
    reused after this function returns.

    :param last_page: number of the last page to crawl (inclusive). The
        default of 138 matches the previously hard-coded limit.
    '''
    try:
        browser.get('https://www.douyu.com/directory/all')
        for page_num in range(1, last_page + 1):
            print('当前正在爬取第{}页'.format(page_num))
            html = get_next_page(page_num)
            get_data(html)
    finally:
        # Always release the browser process, even if a page failed.
        browser.quit()


# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

使用selenium爬取斗鱼直播数据

猜你喜欢