# 代码部分,日后补充 (code section — to be supplemented later):
# -*- coding: utf-8 -*-
# @Time : 2018/2/25 14:24
# @Author : 蛇崽
# @Email : [email protected]
# @File : WeChatSogou.py(微信公众号爬虫:不包含浏览量跟评论)
import scrapy
import re
import time
import json
import requests
from fake_useragent import UserAgent
from scrapy import Request
from scrapy_redis.spiders import RedisSpider
from News_scrapy.items import NewsItem
from News_scrapy.constants.WeChatSource import WXSource
class WeChatSogouSpider(scrapy.Spider):
    """Crawl WeChat official-account article URLs via top.aiweibang.com.

    Pipeline per configured source ("<type>-<channel>" strings from
    WXSource.get_dict()):
      1. parse            -> search page for the channel keyword
      2. get_target_gzh   -> POST the search form, get JSON account list
      3. get_v_gzh        -> take the first account id, open its article page
      4. get_article_gzh  -> POST for the account's article listing (JSON)
      5. get_all_article  -> for each article id, resolve the wx article URL
      6. get_wx_url       -> final landing URL (currently only logged)

    NOTE(review): view counts and comments are intentionally not collected
    (per the file header). get_wx_url looks unfinished — no item is yielded.
    """
    name = "WeChatAiWeiBang"
    allowed_domains = ["aiweibang.com", 'mp.weixin.qq.com']
    start_urls = ['http://top.aiweibang.com/user/search_advanced']

    def parse(self, response):
        """Emit one search request per configured WX source entry."""
        wx_source = WXSource.get_dict()
        for v_wx_source in wx_source:
            try:
                # Entry format is "<type>-<channel>"; split once instead of
                # twice as the original did.
                parts = v_wx_source.split('-')
                source_type, channel = parts[0], parts[1]
            except IndexError:
                # Malformed entry: log and skip. (The original printed this
                # AFTER `continue`, making the log line unreachable, and used
                # a bare `except:` that swallowed everything.)
                print('wx_source error ===', v_wx_source)
                continue
            print("正在抓取:", source_type, channel)
            url = 'http://top.aiweibang.com/user/search?kw={}'.format(channel)
            yield scrapy.Request(url=url, callback=self.get_target_gzh,
                                 meta={'type': source_type, 'channel': channel})

    def get_target_gzh(self, response):
        """POST the account-search form for this channel (JSON response)."""
        print('get_target_gzh == ', response)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        gzh_url = 'http://top.aiweibang.com/user/getsearch'
        # Paging values are strings because FormRequest sends form-encoded data.
        formdata = {'PageIndex': '1', 'PageSize': '10', 'Kw': channel}
        yield scrapy.FormRequest(url=gzh_url, callback=self.get_v_gzh,
                                 formdata=formdata,
                                 meta={'type': source_type, 'channel': channel})

    def get_v_gzh(self, response):
        """Parse account-search JSON; follow the first matching account."""
        print('get_v_gzh jsondata === ', response.body)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        # `decode` already yields str — the original's extra str() was a no-op.
        jsondatas = json.loads(response.body.decode('utf-8'))
        results = jsondatas['data']['data']
        if not results:
            # No account matched the keyword; original would have crashed on [0].
            return
        first_gzh_Id = results[0]['Id']
        print('first_gzh_ID ==== ', first_gzh_Id)
        if first_gzh_Id:
            gzh_url = 'http://top.aiweibang.com/article/{}'.format(first_gzh_Id)
            yield scrapy.Request(url=gzh_url, callback=self.get_article_gzh,
                                 meta={'type': source_type, 'channel': channel,
                                       'gzh_ID': first_gzh_Id})

    def get_article_gzh(self, response):
        """Request the full article listing for the resolved account id."""
        print('get_article_gzh === ', response.body)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        gzh_ID = response.meta["gzh_ID"]
        # Endpoint returning the account's article list (JSON).
        all_url = 'http://top.aiweibang.com/article/getarticles'
        formdata = {'PageIndex': '1', 'PageSize': '20', 'Type': '0',
                    'Wechat': gzh_ID}
        yield scrapy.FormRequest(url=all_url, formdata=formdata,
                                 callback=self.get_all_article,
                                 meta={'type': source_type, 'channel': channel,
                                       'gzh_ID': gzh_ID})

    def get_all_article(self, response):
        """Walk the article listing JSON and resolve each article's wx URL."""
        print('get_all_article ***** the last *** ', response)
        source_type = response.meta['type']
        channel = response.meta["channel"]
        jsondatas = json.loads(response.body.decode('utf-8'))
        # Each entry carries the aid needed to resolve the real article URL.
        aids = jsondatas['data']['data']
        for aid in aids:
            print('aid -- ', aid)
            v_aid = aid['Id']
            print('v_aid ', v_aid)
            if v_aid:
                detail_url = 'http://top.aiweibang.com/article/url?aid={}'.format(v_aid)
                yield scrapy.Request(url=detail_url, callback=self.get_wx_url,
                                     meta={'type': source_type,
                                           'channel': channel})

    def get_wx_url(self, response):
        """Final hop: the redirected mp.weixin.qq.com article URL.

        TODO(review): only logs the URL — no NewsItem is built or yielded yet.
        """
        print('get_wx_url ----- ', response)
        print('wx_url 00000000000 ', response.url)