# Bilibili website crawler

# -*- coding: utf-8 -*-
"""
Created on Mon May 07 11:05:49 2018
Bilibili ("Station B") crawler.
Function: 1. Get the number of comments, the number of likes, the users'
personal information, the comment text, etc.
@author: Alis
"""

import re,time
import requests
import them
import json


# HTTP headers sent with every API request.  The field name must be exactly
# "User-Agent" and the value must not repeat the field name; with any other
# key the browser-like agent string is never sent as the User-Agent.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}


def getavID(pn, rid=24, ps=50):
    """Collect video ids (``aid``) from the region "dynamic" listing API.

    Requests listing pages 1..pn, prints the aid, title and attribute of
    every archive entry, and sleeps 2 seconds after each page.

    Args:
        pn: number of listing pages to fetch; values below 1 fetch nothing.
        rid: value of the ``rid`` query parameter (default 24, the value
            that used to be hard-coded).
        ps: value of the ``ps`` query parameter (default 50, the value
            that used to be hard-coded).

    Returns:
        List of ``aid`` values in the order the API returned them.
    """
    avID = []
    for page in range(1, pn + 1):
        url = ('https://api.bilibili.com/x/web-interface/dynamic/region'
               '?&jsonp=jsonp&pn=%d&ps=%d&rid=%d&_=1525679623909'
               % (page, ps, rid))
        # A timeout keeps one stalled connection from hanging the crawl.
        r = requests.get(url, headers=headers, timeout=10).text
        data = json.loads(r)
        archives = data['data']['archives']
        for ac in archives:
            avID.append(ac['aid'])
            # Single-argument print() gives the same output on Python 2 and 3.
            print('aid:  %s' % (ac['aid'],))
            print(u'title:  %s' % (ac['title'],))
            print('attribute:  %s' % (ac['attribute'],))
        time.sleep(2)
    return avID
    

def _fetch_reply_text(oid, page):
    """Download one page of the reply API for video *oid*; return the body text."""
    url = ("https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=" + str(page)
           + "&type=1&oid=" + str(oid) + "&sort=0&_=1496477384198")
    # A timeout keeps one stalled connection from hanging the crawl.
    return requests.get(url, headers=headers, timeout=10).text


def getHTMLText(i, num=2):
    """Fetch and print the comment pages of video *i*.

    Pages 1..num-1 are requested first; the loop stops as soon as
    ``printTXT`` reports more than one page.  The page count reported by the
    last of those requests then drives a second loop over pages 2..count,
    sleeping 1 second after each.  With the default ``num=2`` this means:
    fetch page 1, learn the page count, fetch the remaining pages.

    Args:
        i: video id placed in the ``oid`` query parameter.
        num: exclusive upper bound for the initial probe loop; with
            ``num <= 1`` nothing is requested at all.

    Returns:
        None.  All output goes to stdout via ``printTXT``.
    """
    # Kept separate from the parameter so the probe bound is not overwritten.
    page_count = num
    for n in range(1, num):
        text = _fetch_reply_text(i, n)
        print("Printing page "+str(n)+" comment!")
        page_count = printTXT(text)
        if page_count > 1:
            break

    for n in range(2, page_count + 1):
        text = _fetch_reply_text(i, n)
        print("Printing page "+str(n)+" comment!")
        printTXT(text)
        time.sleep(1)


def printTXT(text):
    """Print every comment found in one reply-API response.

    For each top-level reply, and each nested reply under it, prints the
    author's nickname and gender followed by the comment message.

    Args:
        text: JSON body of a reply-API response (a string).

    Returns:
        Total number of comment pages as an int, computed from
        ``data.page.count`` at 20 comments per page; never less than 1.

    Raises:
        ValueError: if *text* is not valid JSON.
        KeyError: if the expected ``data`` / ``page`` / ``member`` /
            ``content`` fields are missing.
    """
    data = json.loads(text)
    payload = data['data']
    # NOTE(review): the API appears to send null instead of [] when there are
    # no replies; ``or []`` covers both so an empty page does not raise.
    for t in payload['replies'] or []:
        print(u"User nickname: %s gender %s"
              % (t['member']['uname'], t['member']['sex']))
        print(u'comment information %s' % (t['content']['message'],))

        for t2 in t.get('replies') or []:
            print(u"User nickname: %s gender %s"
                  % (t2['member']['uname'], t2['member']['sex']))
            print(u'comment information %s' % (t2['content']['message'],))

    count = payload['page']['count']
    # Ceiling division: the old ``count / 20 + 1`` reported one page too many
    # for exact multiples of 20 and yields a float on Python 3.
    pageNum = max(1, (count + 19) // 20)  # number of comment pages
    return pageNum


if __name__ == "__main__":

    #i = input(u"Please enter the av number (number):")
    # Wall-clock timing: time.clock() measured CPU time on Unix (close to
    # zero for a crawler that mostly waits) and was removed in Python 3.8.
    begin = time.time()
    avid = getavID(1)
    # Explicit loop instead of map(): map() is lazy on Python 3, so using it
    # only for side effects would fetch nothing there.
    for aid in avid:
        getHTMLText(aid)
    end = time.time()
    print('cost time is:  %s s' % (round(end - begin, 3),))

# Guess you like

# Origin: http://43.154.161.224:23101/article/api/json?id=325682827&siteId=291194637