Using requests to crawl Huya ("Tiger Teeth") channel and anchor information

 

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/1/23 15:42
# @Author  : Aries
# @Site    :
# @File    : yy.py
# @Software: PyCharm
import requests
import time
from lxml import html

# HTTP request headers sent with every request made by this script.
# Every key must be a string literal: the original bare `Cookie` name was an
# undefined identifier and raised NameError as soon as the module was loaded.
# NOTE(review): the cookie looks like a captured browser session value and has
# probably expired; the spaces around "=" may be extraction artifacts -
# confirm against a fresh capture before relying on it.
headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    'x-devtools-emulate-network-conditions-client-id': "(6DC99B5E32009D9E60CDB0C3B620074)",
    'upgrade-insecure-requests': "1",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
    'cookie': "udb_passdata = 1; PHPSESSID = 91siudst3ptb6egbjgh8j3rn42; Sound Value = 0.50; GUID = 0e74abb6d4e5665a0c52c0e3c1e35727; __yasmid = 0.2352245147818357; __yamid_tt1 = 0.2352245147818357; __yamid_new = C7D8A9F6CD3000013AA46C206EC0135D; _yasids = __ rootsid% 3DC7D8A9F6CDA00001226020701A00E5B0; Hm_lvt_51700b6c722f5bb4cf39906a596ea41f = 1516692949; isInLiveRoom =; Hm_lpvt_51700b6c722f5bb4cf39906a596ea41f = 1516696335",
    'cache-control': "no-cache",
}

class huyaFcukYouSpider:
    """Crawl the Huya category index and print the live rooms of each category.

    The methods use the module-level ``requests``, ``html`` (lxml), ``time``
    and ``headers`` names; nothing is stored on the instance.
    """

    # Category index page that is fetched first.
    urlStyle = "http://www.huya.com/g"
    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the crawl forever.
    requestTimeout = 10

    def huyaRootHtml(self):
        """Fetch the category index page and return its HTML text.

        The HTTP status code is printed but not checked, so the body of an
        error response is returned as-is.
        """
        print("Start analyzing Huya classification======")
        roothtml = requests.get(
            huyaFcukYouSpider.urlStyle,
            headers=headers,
            timeout=huyaFcukYouSpider.requestTimeout,
        )
        print("Crawling classification returns status code======" + str(roothtml.status_code))
        return roothtml.text

    def itmStyle(self, roothtml):
        """Extract the category links from the index page HTML.

        :param roothtml: HTML text of the category index page.
        :return: list of ``{"text": <img title>, "href": <link href>}`` dicts,
            one per link found under the ``js-game-list`` element.
        """
        tree = html.fromstring(roothtml)
        hrefs = tree.xpath('//*[@id="js-game-list"]/li/a/@href')
        texts = tree.xpath('//*[@id="js-game-list"]/li/a/img/@title')
        # zip() pairs titles and links by position and stops at the shorter
        # list, so an entry missing either piece shifts the pairing.
        styleObjects = [
            {"text": text, 'href': href} for (text, href) in zip(texts, hrefs)
        ]
        print("The classification has been successfully obtained, brother, let's start the channel below")
        print("Number of categories: " + str(len(styleObjects)))
        return styleObjects

    def itemContent(self, styleObjects, anchorDelay=1, channelDelay=10):
        """Fetch every category page and print its live rooms.

        :param styleObjects: iterable of dicts with ``"text"`` and ``"href"``
            keys, as returned by :meth:`itmStyle`.
        :param anchorDelay: seconds to sleep after printing each room
            (default 1, the original hard-coded value); pass 0 to disable.
        :param channelDelay: seconds to sleep after finishing each category
            (default 10, the original hard-coded value); pass 0 to disable.
        :return: None; results are only printed.
        """
        for item in styleObjects:
            print("Start analysis" + item["text"] + " ======")
            itemhtml = requests.get(
                item["href"],
                headers=headers,
                timeout=huyaFcukYouSpider.requestTimeout,
            )
            print("Climb" + item["text"] + "return status code======" + str(itemhtml.status_code))
            tree = html.fromstring(itemhtml.text)
            # Three parallel lists read from the "js-live-list" element; they
            # are matched up purely by position below.
            titles = tree.xpath('//*[@id="js-live-list"]/li/a[contains(@class,"title") and contains(@class,"new-clickstat")]/text()')
            nicknames = tree.xpath('//*[@id="js-live-list"]/li[@class="game-live-item"]/span/span[contains(@class,"avatar") and contains(@class,"fl")]/i/text()')
            numbers = tree.xpath('//*[@id="js-live-list"]/li[@class="game-live-item"]/span/span[@class="num"]/i[@class="js-num"]/text()')
            for (title, nickname, number) in zip(titles, nicknames, numbers):
                print("\t\t anchor: " + nickname + "; the channel name is: " + title + " live broadcast, the number of viewers: " + str(number))
                # Pause between rooms; lower anchorDelay if this feels slow.
                time.sleep(anchorDelay)
            # Pause between categories; lower channelDelay if this feels slow.
            time.sleep(channelDelay)

    def go(self):
        """Run the whole crawl: fetch the index, parse categories, print rooms."""
        rootHtml = self.huyaRootHtml()
        styleObjects = self.itmStyle(rootHtml)
        self.itemContent(styleObjects)


# Start the crawl only when this file is executed as a script, so that
# importing the module does not trigger network requests and sleeps.
if __name__ == "__main__":
    huya = huyaFcukYouSpider()
    huya.go()

If the crawl feels too slow, remove the time.sleep calls.

 

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=326304477&siteId=291194637