# -*- coding: utf-8 -*-
"""
Created on Mon May 07 11:05:49 2018

Bilibili crawler.

Lists the most recent videos of one region through the public web API and
prints every comment (plus its nested replies) together with the
commenter's nickname and gender.

The code sticks to constructs that behave identically on Python 2 and
Python 3 (single-argument ``print(...)``, ``//`` division, ``u""`` literals).

@author: Alis
"""

import json
import os
import re
import time

try:
    import requests
except ImportError:
    # Keep the pure parsing helpers importable without the HTTP dependency;
    # _fetch_text() raises a clear error as soon as the network is needed.
    requests = None

# The header name must be exactly "User-Agent"; the previous key
# ("user-agents") was ignored by the server, so the default client UA was sent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
}

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Page size assumed when turning the total comment count into a page count
# (the reply URL does not set one explicitly) -- TODO confirm against the API.
COMMENTS_PER_PAGE = 20


def _fetch_text(url):
    """Return the response body of ``url`` as text, sending ``headers``."""
    if requests is None:
        raise ImportError("the 'requests' package is required for network access")
    return requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text


def getavID(pn):
    """Collect video ids from the first ``pn`` listing pages of region 24.

    Prints the id, title and attribute of every listed video and pauses two
    seconds after each listing page.

    :param pn: number of listing pages to read (50 videos are requested per page).
    :returns: list of ``aid`` values in the order the API returned them.
    """
    avID = []
    for page in range(1, pn + 1):
        url = 'https://api.bilibili.com/x/web-interface/dynamic/region?&jsonp=jsonp&pn=%d&ps=50&rid=24&_=1525679623909' % page
        data = json.loads(_fetch_text(url))
        # "data"/"archives" may be absent or null on an error response.
        archives = (data.get('data') or {}).get('archives') or []
        for ac in archives:
            avID.append(ac['aid'])
            print('aid:  %s' % ac['aid'])
            print('title:  %s' % ac.get('title'))
            print('attribute:  %s' % ac.get('attribute'))
        time.sleep(2)
    return avID


def _print_comment_page(oid, page):
    """Fetch and print one comment page of video ``oid``; return the page count."""
    url = ("https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=" + str(page)
           + "&type=1&oid=" + str(oid) + "&sort=0&_=1496477384198")
    text = _fetch_text(url)
    print("正在打印第" + str(page) + "页评论!")
    return printTXT(text)


def getHTMLText(i, num=2):
    """Print all comment pages of the video with id ``i``.

    The first page is fetched to learn how many pages exist; the remaining
    pages are then fetched with a one second pause after each.

    :param i: video id (``aid``).
    :param num: kept for backward compatibility. Values below 2 fetch
        nothing, as before; otherwise the number of pages is always taken
        from the API response.
    """
    if num < 2:
        return
    page_count = _print_comment_page(i, 1)
    for page in range(2, page_count + 1):
        _print_comment_page(i, page)
        time.sleep(1)


def _print_reply(reply):
    """Print nickname, gender and message text of a single comment dict."""
    member = reply['member']
    print(u"用户昵称 : %s 性别 %s" % (member['uname'], member['sex']))
    print(u'评论信息 %s' % reply['content']['message'])


def printTXT(text):
    """Print the comments contained in one reply-API response.

    :param text: raw JSON text returned by the reply endpoint.
    :returns: number of comment pages (always at least 1), derived from the
        total comment count reported in the response.
    """
    payload = json.loads(text)
    body = payload.get('data') or {}
    # The API uses null instead of an empty list when there are no comments
    # or no nested replies, so normalise both to an empty list.
    for comment in body.get('replies') or []:
        _print_reply(comment)
        for child in comment.get('replies') or []:
            _print_reply(child)
    count = (body.get('page') or {}).get('count') or 0
    # Ceiling division: an exact multiple of the page size must not yield an
    # extra, empty page.
    return max(1, (count + COMMENTS_PER_PAGE - 1) // COMMENTS_PER_PAGE)


def main():
    """Crawl the first listing page and print the comments of every video."""
    # Wall-clock time: time.clock() was removed in Python 3.8 and ignored the
    # time spent sleeping / waiting on the network on Unix.
    begin = time.time()
    # Explicit loop: map() is lazy on Python 3 and would never run the calls.
    for aid in getavID(1):
        getHTMLText(aid)
    end = time.time()
    print('cost time is:  %s s' % round(end - begin, 3))


if __name__ == "__main__":
    main()
bilibili网站爬虫
猜你喜欢
转载自blog.csdn.net/Alis_xt/article/details/80224131
今日推荐
周排行