#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/1/23 15:42
# @Author : Aries
# @Site :
# @File : yy.py
# @Software: PyCharm
import requests
import time
from lxml import html

# Headers sent with every request made by the spider below.
# NOTE(review): the cookie value embeds a session id (PHPSESSID) that looks
# like a one-off browser capture; it is presumably stale by now - confirm
# before relying on it, and avoid committing live session cookies.
headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    'x-devtools-emulate-network-conditions-client-id': "(6DC99B5E32009D9E60CDB0C3B620074)",
    'upgrade-insecure-requests': "1",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
    'cookie': "udb_passdata=1; PHPSESSID=91siudst3ptb6egbjgh8j3rn42; SoundValue=0.50; guid=0e74abb6d4e5665a0c52c0e3c1e35727; __yasmid=0.2352245147818357; __yamid_tt1=0.2352245147818357; __yamid_new=C7D8A9F6CD3000013AA46C206EC0135D; _yasids=__rootsid%3DC7D8A9F6CDA00001226020701A00E5B0; Hm_lvt_51700b6c722f5bb4cf39906a596ea41f=1516692949; isInLiveRoom=; Hm_lpvt_51700b6c722f5bb4cf39906a596ea41f=1516696335",
    'cache-control': "no-cache",
}


class huyaFcukYouSpider:
    """Crawl the category index at ``urlStyle`` and print each category's rooms.

    The flow is: fetch the index page (``huyaRootHtml``), extract the category
    links (``itmStyle``), then fetch every category page and print one line per
    live room found on it (``itemContent``). ``go`` chains the three steps.
    """

    # Index page that lists every category.
    urlStyle = "http://www.huya.com/g"
    # Seconds to wait on a request before giving up; without a timeout a
    # stalled connection would block the crawl indefinitely.
    requestTimeout = 10

    def huyaRootHtml(self):
        """Download the category index page and return its body as text.

        The HTTP status code is printed but not checked, so the body of an
        error response is returned as-is.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        print("开始分析虎牙分类======》》》")
        roothtml = requests.get(
            huyaFcukYouSpider.urlStyle,
            headers=headers,
            timeout=huyaFcukYouSpider.requestTimeout,
        )
        print("爬分类返回状态码======》》》"+str(roothtml.status_code))
        return roothtml.text

    def itmStyle(self, roothtml):
        """Extract the category entries from the index page markup.

        Args:
            roothtml: HTML text of the category index page.

        Returns:
            A list of ``{"text": <image title>, "href": <link>}`` dicts, one
            per ``#js-game-list`` link that has both an ``href`` and a titled
            image, in document order.
        """
        tree = html.fromstring(roothtml)
        styleObjects = []
        # Read both fields from the same <a> element. Zipping two separately
        # queried lists would pair every later link with the wrong title as
        # soon as a single entry lacked an image title.
        for anchor in tree.xpath('//*[@id="js-game-list"]/li/a'):
            href = anchor.get("href")
            titles = anchor.xpath('img/@title')
            if href is None or not titles:
                continue
            styleObjects.append({"text": titles[0], 'href': href})
        print("分类已经获取成功,大哥开始对下面的频道吧")
        print("分类数量:"+ str(len(styleObjects)))
        return styleObjects

    def itemContent(self, styleObjects, rowDelay=1, itemDelay=10):
        """Fetch every category page and print one line per live room.

        Args:
            styleObjects: category dicts as produced by ``itmStyle``; each
                must carry ``"text"`` and ``"href"`` keys.
            rowDelay: seconds to sleep after each printed room (default 1,
                the original pacing). Pass 0 to disable.
            itemDelay: seconds to sleep after each category (default 10, the
                original pacing). Pass 0 to disable.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        for item in styleObjects:
            print("开始分析 "+item["text"]+" ======》》》")
            itemhtml = requests.get(
                item["href"],
                headers=headers,
                timeout=huyaFcukYouSpider.requestTimeout,
            )
            print("爬"+item["text"]+"返回状态码======》》》" + str(itemhtml.status_code))
            tree = html.fromstring(itemhtml.text)
            # Resolve title / nickname / viewer count relative to each room's
            # own <li>. Three page-wide lists zipped together drift out of
            # step whenever one room is missing a field, attributing titles
            # and counts to the wrong streamer.
            rooms = tree.xpath('//*[@id="js-live-list"]/li[@class="game-live-item"]')
            for room in rooms:
                titles = room.xpath('a[contains(@class,"title") and contains(@class,"new-clickstat")]/text()')
                nicknames = room.xpath('span/span[contains(@class,"avatar") and contains(@class,"fl")]/i/text()')
                numbers = room.xpath('span/span[@class="num"]/i[@class="js-num"]/text()')
                if not (titles and nicknames and numbers):
                    # Incomplete entry: skip it rather than print a mixed-up row.
                    continue
                title, nickname, number = titles[0], nicknames[0], numbers[0]
                print("\t\t主播:"+nickname+";正在频道名称为: "+title +"直播,观看人数:"+str(number))
                # Pacing between rows; lower rowDelay if this feels slow.
                time.sleep(rowDelay)
            # Pacing between categories; lower itemDelay if this feels slow.
            time.sleep(itemDelay)

    def go(self):
        """Run the whole crawl: index page -> category list -> room listings."""
        rootHtml = self.huyaRootHtml()
        styleObjects = self.itmStyle(rootHtml)
        self.itemContent(styleObjects)


# The instance stays at module level so ``huya`` remains importable; only the
# crawl itself is guarded, so importing this module no longer hits the network.
huya = huyaFcukYouSpider()

if __name__ == "__main__":
    huya.go()
# If the crawl feels too slow, remove the time.sleep calls.