知识点:
1.运用selenium自动化驱动模块
2.find_elements_by_xpath()与find_element_by_xpath()的区别,以及对元素的定位,内容的提取
3.获取请求下一页方法,注:time.sleep()
代码:
# encoding=utf-8
"""Scrape the Douyu live-room directory with Selenium, paging via the "next" button."""
from selenium import webdriver
import time


class DouYu():
    """Crawler for the Douyu "all rooms" directory page."""

    def __init__(self):
        # Landing page that lists all live rooms.
        self.url = "https://www.douyu.com/directory/all"
        # Requires a matching chromedriver on PATH.
        self.driver = webdriver.Chrome()

    def get_content_list(self):
        """Extract every live room's info from the currently loaded page.

        Returns:
            tuple: (list of dicts with keys room_img / room_name / room_info,
                    the "next page" WebElement, or None when on the last page).
        """
        # One <li> per live room on the current page.
        content_list = self.driver.find_elements_by_xpath(".//ul[@id='live-list-contentbox']/li")
        get_contents_list = []
        for content in content_list:
            # Renamed from `dict` — the original shadowed the builtin.
            room = {}
            room["room_img"] = content.find_element_by_xpath(".//span[@class='imgbox']/img").get_attribute("src")
            room["room_name"] = content.find_element_by_xpath(".//a").get_attribute("title")
            room["room_info"] = content.find_element_by_xpath(".//div[@class='mes-tit']/span").text
            get_contents_list.append(room)
        # find_elements (plural) returns [] on the last page instead of raising,
        # so the absence of the button can be detected safely.
        next_candidates = self.driver.find_elements_by_xpath("//a[@class='shark-pager-next']")
        next_url = next_candidates[0] if len(next_candidates) > 0 else None
        return get_contents_list, next_url

    def run(self):
        """Crawl every directory page, printing each page's room list."""
        try:
            self.driver.get(self.url)
            get_contents_list, next_url = self.get_content_list()
            # Placeholder for persistence: currently just prints each page.
            print(get_contents_list)
            # Follow the "next page" button until it no longer exists.
            while next_url is not None:
                next_url.click()
                # Give the page time to render before scraping again.
                time.sleep(3)
                get_contents_list, next_url = self.get_content_list()
                print(get_contents_list)
        finally:
            # Always release the browser, even if scraping fails mid-run
            # (the original leaked the Chrome process on any exception).
            self.driver.quit()


if __name__ == "__main__":
    spider = DouYu()
    spider.run()