Scraping pictures from Xiaohua Net (the "campus belle" site) with Scrapy

Part 1: Basic version (scrape the pictures on the home page)

Spider file (Python) code:

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 import sys
 4 import io
 5 from scrapy.selector import Selector
 6 from scrapy.http import Request
 7 from ..items import Day96XiaohuaItem
 8 import re
 9 sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')  # re-wrap stdout so it encodes as UTF-8; presumably needed so print() of non-ASCII titles in parse() does not fail on a non-UTF-8 console -- TODO confirm
10 
11 
class XiaohuaSpider(scrapy.Spider):
    """Spider that collects the title and URL of each picture on the start page.

    For every ``<img>`` found under ``div.item_t > div.img > a`` it yields a
    ``Day96XiaohuaItem`` whose ``title`` is the image's ``alt`` text plus
    ``.jpg`` and whose ``src`` is the absolute image URL.
    """

    name = 'xiaohua'
    # allowed_domains must contain bare host names. A URL with a path (as in
    # the original 'www.xueshengmai.com/hua/') never matches a request host.
    allowed_domains = ['www.xueshengmai.com']
    start_urls = ['http://www.xueshengmai.com/hua/']

    def parse(self, response):
        """Yield one item per picture found in ``response``.

        Args:
            response: The downloaded listing page.

        Yields:
            Day96XiaohuaItem: ``title`` (file name ending in ``.jpg``) and
            ``src`` (absolute picture URL) for each usable ``<img>`` tag.
        """
        # ------------ build items to be persisted by the pipeline ------------
        images = response.xpath(
            "//div[@class='item_t']/div[@class='img']/a/img"
        )
        for img in images:
            # Read the attributes directly instead of regex-matching the
            # serialised tag: that approach depends on attribute order,
            # captures trailing attributes and leaves HTML entities escaped.
            alt = img.xpath("@alt").extract_first()
            link = img.xpath("@src").extract_first()
            if not alt or not link:
                # Skip tags without a usable title or source rather than crash.
                continue

            title = alt + ".jpg"
            # urljoin resolves root-relative paths against the page URL and
            # leaves already-absolute URLs untouched.
            src = response.urljoin(link)
            print(title, src)
            yield Day96XiaohuaItem(title=title, src=src)

items.py code:

1 import scrapy
2 
3 
class Day96XiaohuaItem(scrapy.Item):
    """Data container for a single scraped picture."""

    # File name under which the picture is saved.
    title = scrapy.Field()
    # URL the picture is downloaded from.
    src = scrapy.Field()

pipelines.py code:

import os

import requests

class Day96XiaohuaPipeline(object):
    """Item pipeline that downloads each scraped picture into ``imgs/``."""

    def process_item(self, item, spider):
        """Download the picture described by ``item`` and save it to disk.

        Args:
            item: Mapping with ``"title"`` (target file name) and ``"src"``
                (picture URL).
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines and feed exporters
            still receive it.

        Raises:
            requests.RequestException: If the download fails, times out or
                the server answers with an HTTP error status.
            OSError: If the target directory or file cannot be written.
        """
        # The title comes from scraped markup; keep only the last path
        # component so it cannot point outside the imgs/ directory.
        file_name = os.path.basename(item["title"])
        file_path = "imgs/%s" % file_name
        file_src = item["src"]

        # Download before opening the file so a failed request does not leave
        # an empty file behind. The timeout stops a stalled server from
        # hanging the crawl indefinitely.
        img_data = requests.get(file_src, timeout=30)
        img_data.raise_for_status()

        os.makedirs("imgs", exist_ok=True)
        with open(file_path, "wb") as f:
            f.write(img_data.content)

        return item

Part 2: Scraping pictures from Xiaohua Net across multiple pages (pagination)

You may also like

Source: www.cnblogs.com/sun-10387834/p/12723029.html