Scrapy 框架抓取美拍视频

抓取美拍的数据并不算是很难 关键是他的视频url的加密算法是有点难搞。

打开美拍的网址我们查看一下源代码,他的网页加载方式跟其它的网站差不多,video_url 也在源代码中。但是我们仔细看,诺,就是下边这一串——是人都能猜测这应该是他的 video_url 的地址,只是经过了某种加密或者编码。根据我的经验我猜是 base64;但不仅仅是 base64,还随便给你加点盐、添加一些随机字符串,于是就成了你现在看到的这盘"美味佳肴"——一坨看不懂的字符串。


我把我写的代码贴在下边,有兴趣的同学可以尝试下:

items.py

class MeipaiItem(scrapy.Item):
    """Container for one scraped Meipai video record."""

    cut_url = scrapy.Field()      # cover picture URL
    create_time = scrapy.Field()  # creation timestamp reported by the API
    video_url = scrapy.Field()    # decoded .mp4 URL
    title = scrapy.Field()        # video caption
    author = scrapy.Field()       # uploader's screen name

spider

# -*- coding: utf-8 -*-
import scrapy
import json
import base64
from Meipai.items import MeipaiItem
import logging


class MeipaiSpider(scrapy.Spider):
    """Crawl the Meipai hot_timeline JSON API for each topic channel and
    decode the obfuscated video URLs into plain .mp4 links.

    The API hides the real URL inside a base64 string salted with junk
    characters; the first four characters (reversed, read as hex) encode
    how many junk characters to strip from the head and tail.
    """
    name = 'meipai'
    allowed_domains = ['meipai.com']
    start_urls = ['http://www.meipai.com/']
    # Number of hot_timeline pages to fetch per channel.
    max_page = 10
    # (channel name, topic id) pairs accepted by the hot_timeline endpoint.
    MeiPai = [
        ('搞笑', '13'),
        ('明星', '16'),
        ('高颜值', '474'),
        ('舞蹈', '5872239354896137479'),
        ('精选', '488'),
        ('音乐', '5871155236525660080'),
        ('美食', '5870490265939297486'),
        ('时尚', '27'),
        ('美状', '6161763227134314911'),
        ('吃秀', '5871963671268989887'),
        ('宝宝', '5864549574576746574'),
        ('创意', '5875185672678760586'),
        ('游戏', '5879621667768487138'),
        ('体育', '5872639793429995335'),
        ('娱乐', '6204189999771523532'),
    ]

    def parse(self, response):
        """Yield one hot_timeline request per (channel, page).

        The original code requested page 1 twice (a standalone loop plus a
        while-loop that also started at page 1) and only Scrapy's dupefilter
        hid the redundancy; a single nested loop covers pages 1..max_page.
        """
        for page in range(1, self.max_page + 1):
            for channel, tid in self.MeiPai:
                url = ('http://www.meipai.com/topics/hot_timeline'
                       '?page={}&count=24&tid={}'.format(page, tid))
                yield scrapy.Request(url=url, callback=self.parse_item)

    def system(self, string_num):
        """Interpret *string_num* as hexadecimal and return its decimal
        value as a string (e.g. 'ff' -> '255')."""
        return str(int(string_num.upper(), 16))

    def parse_item(self, response):
        """Parse one hot_timeline JSON page and yield a MeipaiItem per media.

        Medias with a missing title/author or an undecodable video URL are
        skipped individually (the original code used ``return`` here, which
        silently dropped every remaining media on the page).
        """
        medias = json.loads(response.text).get('medias') or []
        for media in medias:
            # Fresh item per media: the original reused one mutable item
            # object across yields, so all yielded references aliased it.
            item = MeipaiItem()
            item['cut_url'] = media.get('cover_pic')
            item['create_time'] = media.get('created_at')

            title = media.get('caption')
            if not title:
                continue
            item['title'] = title

            # ``or {}`` guards against a missing 'user' object, which
            # previously raised outside the try block and killed the page.
            author = (media.get('user') or {}).get('screen_name')
            if not author:
                continue
            item['author'] = author

            try:
                encrypted = media.get('video')
                # First 4 chars, reversed and read as hex, give 4 decimal
                # digits: head length, head junk count, tail length, tail
                # junk count of the salted base64 payload.
                digits = self.system(encrypted[:4][::-1])
                head_len = int(digits[0])
                head_junk = int(digits[1])
                tail_len = int(digits[2])
                tail_junk = int(digits[3])
                head = encrypted[4:4 + head_len]
                middle = encrypted[4 + head_len + head_junk:
                                   -(tail_len + tail_junk)]
                tail = encrypted[-tail_len:]
                item['video_url'] = base64.b64decode(
                    head + middle + tail).decode('utf-8')
            except Exception as e:
                # Best-effort: log and skip medias whose URL fails to decode.
                logging.info(e)
                continue

            if not item['video_url']:
                continue
            yield item

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import logging

class MeipaiPipeline(object):
    """Write each scraped item to data.json, one JSON object per line."""

    def __init__(self):
        # Opened once per spider run; closed in close_spider.
        self.file = open('data.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and pass it on unchanged.

        ``ensure_ascii=False`` keeps Chinese text readable directly. The
        original ``encode('utf-8').decode('unicode-escape')`` round-trip
        achieved the same visual effect but corrupts any literal backslash
        in the data and mis-decodes multi-byte UTF-8 sequences.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        logging.info(type(line))
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy invokes ``close_spider`` (not ``close_item``) at shutdown;
        # with the original method name the file handle was never closed,
        # risking lost buffered output.
        self.file.close()

    # Backward-compatible alias for the original (misnamed) hook.
    close_item = close_spider

我们可以看下日志信息,视频的url,现在是我们常见的MP4格式哈,也是可以在浏览器打开的,可以请求的。

我们随便找一个拿到浏览器,是没有问题的:


然后是这些数据的处理方式:我们可以保存到本地,也可以存入数据库;感兴趣的伙伴可以把视频下载下来。我这里仅仅是以文件的格式保存到了本地:




猜你喜欢

转载自blog.csdn.net/redpintings/article/details/80052427
今日推荐