# 代码部分,日后补充 (code section — to be supplemented later):
# -*- coding: utf-8 -*-
# @Time : 2018/2/25 14:24
# @Author : 蛇崽
# @Email : [email protected]
# @File : WeChatSogou.py(微信公众号爬虫:不包含浏览量跟评论)
import scrapy
import re
import time
import json
import requests
from fake_useragent import UserAgent
from scrapy import Request
from scrapy_redis.spiders import RedisSpider
from News_scrapy.items import NewsItem
from News_scrapy.constants.WeChatSource import WXSource
class WeChatSogouSpider(scrapy.Spider):
    """Crawl WeChat official-account article URLs via top.aiweibang.com.

    Pipeline per configured source ("<type>-<channel>" strings from
    WXSource.get_dict()):
      1. parse            -> search page for the channel keyword
      2. get_target_gzh   -> POST the search form, get JSON account list
      3. get_v_gzh        -> take the first account id, open its article page
      4. get_article_gzh  -> POST for the account's article listing (JSON)
      5. get_all_article  -> for each article id, resolve the wx article URL
      6. get_wx_url       -> final landing URL (currently only logged)

    NOTE(review): view counts and comments are intentionally not collected
    (per the file header). get_wx_url looks unfinished — no item is yielded.
    """
    name = "WeChatAiWeiBang"
    allowed_domains = ["aiweibang.com", 'mp.weixin.qq.com']
    start_urls = ['http://top.aiweibang.com/user/search_advanced']

    def parse(self, response):
        """Emit one search request per configured WX source entry."""
        wx_source = WXSource.get_dict()
        for v_wx_source in wx_source:
            try:
                # Entry format is "<type>-<channel>"; split once instead of
                # twice as the original did.
                parts = v_wx_source.split('-')
                source_type, channel = parts[0], parts[1]
            except IndexError:
                # Malformed entry: log and skip. (The original printed this
                # AFTER `continue`, making the log line unreachable, and used
                # a bare `except:` that swallowed everything.)
                print('wx_source error ===', v_wx_source)
                continue
            print("正在抓取:", source_type, channel)
            url = 'http://top.aiweibang.com/user/search?kw={}'.format(channel)
            yield scrapy.Request(url=url, callback=self.get_target_gzh,
                                 meta={'type': source_type, 'channel': channel})

    def get_target_gzh(self, response):
        """POST the account-search form for this channel (JSON response)."""
        print('get_target_gzh == ', response)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        gzh_url = 'http://top.aiweibang.com/user/getsearch'
        # Paging values are strings because FormRequest sends form-encoded data.
        formdata = {'PageIndex': '1', 'PageSize': '10', 'Kw': channel}
        yield scrapy.FormRequest(url=gzh_url, callback=self.get_v_gzh,
                                 formdata=formdata,
                                 meta={'type': source_type, 'channel': channel})

    def get_v_gzh(self, response):
        """Parse account-search JSON; follow the first matching account."""
        print('get_v_gzh jsondata === ', response.body)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        # `decode` already yields str — the original's extra str() was a no-op.
        jsondatas = json.loads(response.body.decode('utf-8'))
        results = jsondatas['data']['data']
        if not results:
            # No account matched the keyword; original would have crashed on [0].
            return
        first_gzh_Id = results[0]['Id']
        print('first_gzh_ID ==== ', first_gzh_Id)
        if first_gzh_Id:
            gzh_url = 'http://top.aiweibang.com/article/{}'.format(first_gzh_Id)
            yield scrapy.Request(url=gzh_url, callback=self.get_article_gzh,
                                 meta={'type': source_type, 'channel': channel,
                                       'gzh_ID': first_gzh_Id})

    def get_article_gzh(self, response):
        """Request the full article listing for the resolved account id."""
        print('get_article_gzh === ', response.body)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        gzh_ID = response.meta["gzh_ID"]
        # Endpoint returning the account's article list (JSON).
        all_url = 'http://top.aiweibang.com/article/getarticles'
        formdata = {'PageIndex': '1', 'PageSize': '20', 'Type': '0',
                    'Wechat': gzh_ID}
        yield scrapy.FormRequest(url=all_url, formdata=formdata,
                                 callback=self.get_all_article,
                                 meta={'type': source_type, 'channel': channel,
                                       'gzh_ID': gzh_ID})

    def get_all_article(self, response):
        """Walk the article listing JSON and resolve each article's wx URL."""
        print('get_all_article ***** the last *** ', response)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        jsondatas = json.loads(response.body.decode('utf-8'))
        # Each entry carries the aid needed to resolve the real article URL.
        aids = jsondatas['data']['data']
        for aid in aids:
            print('aid -- ', aid)
            v_aid = aid['Id']
            print('v_aid ', v_aid)
            if v_aid:
                detail_url = 'http://top.aiweibang.com/article/url?aid={}'.format(v_aid)
                yield scrapy.Request(url=detail_url, callback=self.get_wx_url,
                                     meta={'type': source_type,
                                           'channel': channel})

    def get_wx_url(self, response):
        """Final hop: the redirected mp.weixin.qq.com article URL.

        TODO(review): only logs the URL — no NewsItem is built or yielded yet.
        """
        print('get_wx_url ----- ', response)
        print('wx_url 00000000000 ', response.url)