[Python爬虫]使用Scrapy框架爬取图虫图片

启动文件main.py

from scrapy.cmdline import execute

# Programmatic equivalent of running `scrapy crawl tuchong` on the command
# line.  Guarded so importing this module does not immediately start a crawl.
if __name__ == '__main__':
    execute('scrapy crawl tuchong'.split())

spiders 目录下的爬虫文件 tuchong.py

# -*- coding: utf-8 -*-
import scrapy,json
from ..items import tu_baoc   #实例化函数

class TuchongSpider(scrapy.Spider):
    """Crawl the tuchong.com post-list API for site 2634795 and yield one
    item (image URL + post title) per image found."""

    name = 'tuchong'
    # NOTE(review): was 'stock.tuchong.com', but every request targets
    # tuchong.com (API) and photo.tuchong.com (images) — the offsite
    # middleware would have filtered follow-up requests.  The parent
    # domain covers both subdomains.
    allowed_domains = ['tuchong.com']
    # start_urls = ['https://tuchong.com/rest/2/sites/2634795/posts?count=20&page=1&before_timestamp=0']

    def start_requests(self):
        """Entry point: request the first three pages of the JSON API."""
        for page in range(1, 4):
            url = ('https://tuchong.com/rest/2/sites/2634795/posts'
                   '?count=20&page={}&before_timestamp=0'.format(page))
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Decode the JSON response and yield a tu_baoc item per image.

        The API payload has a 'post_list' array; each post carries a
        'title' and an 'images' array of objects with an 'img_id'.
        """
        data = json.loads(response.text)

        for post in data['post_list']:
            title = post['title']

            for image in post['images']:
                # Create a FRESH item for every image.  The original code
                # reused one mutable item instance across yields, so later
                # iterations could overwrite fields of items still queued
                # in the pipeline.
                item = tu_baoc()
                # The API exposes only the image id; rebuild the file URL.
                item['img_url'] = 'http://photo.tuchong.com/2634795/f/{}.jpg'.format(image['img_id'])
                item['name'] = title
                # Hand the item to the pipelines (img_URL_baoc downloads it).
                yield item

items.py

# Item container for one scraped image.  (Note: this is a Scrapy Item
# subclass, not a function/constructor as the original comment claimed.)
class tu_baoc(scrapy.Item):
    # define the fields for your item here like:
    # img_url: full URL of the image file on photo.tuchong.com
    img_url = scrapy.Field()
    # name: title of the post the image belongs to
    name=scrapy.Field()

配置文件settings.py

# NOTE(review): wildcard relative import from a local module — presumably
# supplies shared settings or headers; verify what `piaot` actually exports.
from ..piaot import *

# Do not honour robots.txt, so the JSON API endpoints can be fetched.
ROBOTSTXT_OBEY = False

# Default request headers: present a desktop Chrome User-Agent.
DEFAULT_REQUEST_HEADERS = {

  "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}

# Enable the image-download pipeline ('project.module.ClassName': priority).
ITEM_PIPELINES = {
   'tu_chong.pipelines.img_URL_baoc': 300,
}

存储文件pipelines.py

# -*- coding: utf-8 -*-
import os
import time
import uuid          # unique suffix for downloaded file names

import requests      # third-party HTTP client used to download the images
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TuChongPipeline(object):
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to do here — just hand the item to the next stage.
        return item


class img_URL_baoc(object):
    """Pipeline that downloads each item's image to a local folder.

    Expects items with 'img_url' (direct image URL) and 'name' (post
    title, used as the filename prefix) fields.
    """

    # Target directory for downloads.  Kept as the original hard-coded
    # default for backward compatibility; override on the class if needed.
    save_dir = "C:/Users/黑神/Desktop/pc_zy/图虫/"

    def process_item(self, item, spider):
        """Fetch item['img_url'], save the bytes to disk, return the item.

        Raises requests.HTTPError on a non-2xx response instead of
        silently writing an error page to a .jpg file.
        """
        # Download the image bytes; bound the wait and fail loudly on
        # HTTP errors.
        resp = requests.get(url=item['img_url'], timeout=30)
        resp.raise_for_status()

        # Make sure the target directory exists before writing.
        os.makedirs(self.save_dir, exist_ok=True)

        # The original second-resolution timestamp collides when several
        # images arrive within the same second (which the spider does);
        # a short uuid suffix makes each file name unique.
        stamp = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
        path = "{}{}{}_{}.jpg".format(
            self.save_dir, item['name'], stamp, uuid.uuid4().hex[:8])

        # Save the image.
        with open(path, "wb") as f:
            f.write(resp.content)

        # Scrapy pipelines must return the item so later pipeline stages
        # still receive it (the original dropped it, returning None).
        return item

猜你喜欢

转载自blog.csdn.net/Black_God1/article/details/82147157