Crawling Bilibili's popular videos and storing them in MongoDB

Method One (idea only)

Because the data is already present when the page first loads, Bilibili embeds it in a JS object inside the page itself. So you can extract that JS object with a regular expression, then find a way to turn it into a Python dictionary, and you are done.
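As a rough sketch of that idea — assuming the data still sits in a window.__INITIAL_STATE__ assignment, which is how Bilibili embedded it around the time of writing but may have changed since:

import json
import re

import requests

html = requests.get('https://www.bilibili.com/').text
# Grab the embedded JS object; the ";(function" that follows it belongs to
# the page's own script and marks where the JSON literal ends.
match = re.search(r'window\.__INITIAL_STATE__\s*=\s*(.*?);\s*\(function', html, re.S)
if match:
    state = json.loads(match.group(1))  # now an ordinary Python dict
    print(state.keys())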

Method Two

# -*- coding: utf-8 -*-
import scrapy
import json
from pabz.items import PabzItem
import re
import time
import requests
from selenium import webdriver
from time import sleep





class BzSpider(scrapy.Spider):
    name = 'bz'
    # allowed_domains = ['www.com']
    start_urls = ['https://www.bilibili.com/']
    bro = webdriver.Chrome(executable_path=r'F:\爬虫包\通用爬虫selenium\chromedriver.exe')
    page = 1

    def parse(self, response):
        self.bro.get(response.url)
        sleep(2)

        while self.page < 3:
            # read the browser's rendered DOM, so the page flip done in dm()
            # is picked up on the next pass (the raw scrapy response never changes)
            sel = scrapy.Selector(text=self.bro.page_source)
            # the first 8 popular-video cards
            hot_url_list = sel.xpath('//*[@id="reportFirst1"]/div[2]/div')[0:8]
            for url in hot_url_list:
                item = {}
                # dic carries the three API URLs between callbacks
                dic = {}
                need_url = ''.join(url.xpath('./div/a/@href').extract())
                whole_url = 'https:' + need_url  # full video URL
                aid_number = need_url.split('av')[-1]
                # fetch the video page once to dig its cid out of the source
                page_text = requests.get(url=whole_url).text
                need_cid = ''.join(re.findall('","cid":(.*?),"', page_text, re.M))
                # danmaku (bullet comments)
                dm_api = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + need_cid
                # like / coin / favorite counts
                zan_api = 'https://api.bilibili.com/x/web-interface/archive/stat?aid=' + aid_number
                # current online viewer count
                online_url = ('https://api.bilibili.com/x/player.so?id=cid%3A' + need_cid
                              + '&aid=' + aid_number
                              + '&buvid=D7512C54-9EB9-4D8A-ADF9-040A66C06A6C190950infoc')
                dic['online_url'] = online_url
                dic['dm_api'] = dm_api
                dic['zan_api'] = zan_api

                yield scrapy.Request(whole_url, callback=self.title, meta={'dic': dic, 'item': item})
        self.bro.quit()


    # video title
    def title(self, response):
        item = response.meta['item']
        dic = response.meta['dic']
        title = response.xpath('//*[@id="viewbox_report"]/h1/span/text()').extract()
        item['title'] = title
        # the backend rejects the request unless this header is sent along
        header = {
            'Access-Control-Allow-Origin': 'https://www.bilibili.com'
        }
        yield scrapy.Request(dic['online_url'], callback=self.online, headers=header, meta={'dic': dic, 'item': item})
    # get the current online viewer count
    def online(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        body = response.body.decode()
        online_people = ''.join(re.findall('<online_count>(.*?)</online_count>', body, re.M))
        item['online_people'] = online_people
        yield scrapy.Request(dic['zan_api'], callback=self.dianzan, meta={'dic': dic, 'item': item})
    # get the like / coin / favorite counts
    def dianzan(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        all_data = json.loads(response.text)
        detail_data = all_data.get('data')
        # likes
        dian_zan = detail_data.get('like')
        item['dian_zan'] = dian_zan
        # coins
        coins = detail_data.get('coin')
        item['coins'] = coins
        # favorites
        favorite = detail_data.get('favorite')
        item['collect'] = favorite
        yield scrapy.Request(dic['dm_api'], callback=self.dm, meta={'dic': dic, 'item': item})

    # parse the danmaku XML, then emit the finished item
    def dm(self, response):
        dic = response.meta['dic']
        item = response.meta['item']
        d_list = response.xpath('/i/d')
        all_dm = []  # collect the danmaku here and join once at the end, so nothing gets overwritten
        for d in d_list:
            content = ''.join(d.xpath('./text()').extract())
            str_time_base = ''.join(d.xpath('./@p').extract())
            # the 5th comma-separated field of @p is the Unix send time
            unix_time = int(str_time_base.split(',')[4])
            x = time.localtime(unix_time)
            end_finish_time = time.strftime('%Y-%m-%d %H:%M:%S', x)
            all_dm.append(end_finish_time + content)
        item['dm'] = ''.join(all_dm)

        yield PabzItem(
            title=item['title'],
            zan=item['dian_zan'],
            coins=item['coins'],
            collect=item['collect'],
            screen_shoot=item['dm'],
            online_people=item['online_people'])
        # flip to the next page of popular videos, then let parse() pick it up
        self.page += 1
        to_click = self.bro.find_element_by_xpath('//*[@id="reportFirst1"]/div[2]/div[10]/i')
        to_click.click()
        sleep(1)
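For reference, the list.so endpoint returns XML in which each comment is a <d> element whose p attribute packs several comma-separated fields. By the commonly documented layout (playback offset, mode, font size, color, Unix send time, pool, user hash, row id), index 4 is the send timestamp — which is why dm() does split(',')[4]. A minimal standalone sketch of the same parsing, with invented field values:

import time
import xml.etree.ElementTree as ET

# one <d> element as returned by the endpoint (values made up for illustration)
sample = '<i><d p="12.5,1,25,16777215,1581234567,0,abcdef12,123456789">some danmaku text</d></i>'
for d in ET.fromstring(sample).iter('d'):
    fields = d.get('p').split(',')
    sent_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(fields[4])))
    print(sent_at, d.text)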

My items.py

class PabzItem(scrapy.Item):
    title = scrapy.Field()
    zan = scrapy.Field()
    coins = scrapy.Field()
    collect = scrapy.Field()
    screen_shoot = scrapy.Field()
    online_people = scrapy.Field()
  

My pipelines.py

class PabzPipeline(object):
    def process_item(self, item, spider):
        return item


from pymongo import MongoClient  # connect to MongoDB via MongoClient
from pabz.settings import Mongoip, MongoDBname, MongoPort, MongoItem  # connection info configured in settings.py (step one)

class CrawldataToMongoPipline(object):
    def __init__(self):
        host = Mongoip        # MongoDB host
        port = MongoPort      # port
        dbName = MongoDBname  # database name
        client = MongoClient(host=host, port=port)  # create the client
        db = client[dbName]                         # select the database, MongoDBname='mylove1'
        self.post = db[MongoItem]                   # select the collection, MongoItem='PabzItem'

    def process_item(self, item, spider):
        dl_info = dict(item)           # convert the item to a plain dict
        self.post.insert_one(dl_info)  # write it to MongoDB (insert_one replaces the deprecated insert())
        return item

My settings.py

Mongoip = '127.0.0.1'    # MongoDB host IP; 127.0.0.1 works for a local instance (check that a GUI client such as Robo 3T can connect to it)
MongoPort = 27017        # port, 27017 by default
MongoDBname = 'mylove1'  # database name
MongoItem = 'PabzItem'   # collection name
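After a run, a quick sanity check that the items actually landed in MongoDB (a sketch using the database and collection names above):

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
# print a few stored items
for doc in client['mylove1']['PabzItem'].find().limit(3):
    print(doc.get('title'), doc.get('zan'), doc.get('online_people'))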

Finally, enable the item pipelines in settings.py

ITEM_PIPELINES = {
   'pabz.pipelines.PabzPipeline': 300,
   'pabz.pipelines.CrawldataToMongoPipline': 301,  # the MongoDB pipeline runs second (lower numbers run first)
}

Origin: blog.csdn.net/MYLOVEis77/article/details/104264148