絮叨一下
主要就是爬取图赏中的图片,然后分类保存出来
1.新建项目:
scrapy startproject hanfu_project
2.新建爬虫
scrapy genspider hanfu_spider 'aihanfu.com'
3.修改settings.py
# Don't honor robots.txt, so the gallery pages can be fetched.
ROBOTSTXT_OBEY = False
# Headers sent with every request; a browser-like User-Agent avoids
# trivial bot blocking.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'
}
# Enable the image-saving pipeline (priority 300; lower runs earlier).
ITEM_PIPELINES = {
'hanfu_project.pipelines.HanfuProjectPipeline': 300,
}
4.hanfu_spider.py
# -*- coding: utf-8 -*-
import scrapy
from hanfu_project.items import HanfuProjectItem
import os
class HanfuSpiderSpider(scrapy.Spider):
    """Crawl the photo-gallery listing on aihanfu.com and yield one item
    per gallery, carrying the gallery title and its image URLs.

    NOTE(review): only the first listing page is crawled (start URL ends
    in "tushang-1"); no pagination follow-up is implemented.
    """

    name = 'hanfu_spider'
    allowed_domains = ['aihanfu.com']
    start_urls = ['http://www.aihanfu.com/zixun/tushang-1/']

    def parse(self, response):
        """Collect every gallery link on the listing page and request it.

        Each detail page is handled by :meth:`img`.
        """
        # Original code shadowed the list with the loop variable
        # (`for url in url`); keep the two names distinct.
        detail_urls = response.xpath("//h3[@class='yh']/a/@href").getall()
        for detail_url in detail_urls:
            yield scrapy.Request(detail_url, callback=self.img)

    def img(self, response):
        """Extract the gallery title and image URLs from a detail page.

        Yields a ``HanfuProjectItem`` with ``img_name`` (title text, may
        be ``None`` if the header is missing) and ``img_url`` (list of
        image ``src`` attributes).
        """
        articles = response.xpath('//*[@id="main_article"]')
        for article in articles:
            # Gallery title from the page header.
            img_name = article.xpath(".//header/h1/text()").get()
            # All image sources inside the article body.
            img_url = article.xpath(".//div[@class='arc_body']/figure/img/@src").getall()
            yield HanfuProjectItem(img_name=img_name, img_url=img_url)
5.items.py
import scrapy
class HanfuProjectItem(scrapy.Item):
    """One scraped gallery: its title and the list of image URLs."""

    # Gallery title; used by the pipeline as the sub-directory name.
    img_name = scrapy.Field()
    # List of image source URLs found in the gallery page.
    img_url = scrapy.Field()
6.pipelines.py
import os
from urllib import request
class HanfuProjectPipeline:
    """Save every image of an item under ``汉服/<gallery title>/``.

    Images are fetched with ``urllib.request.urlretrieve``; the file name
    is taken from the last path segment of each image URL.
    """

    def __init__(self):
        # Root output directory "汉服" placed next to the project package
        # (one level above this file's directory).
        self.img_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '汉服')
        # makedirs(exist_ok=True) replaces the exists()-then-mkdir pair:
        # no race, and it also creates missing parents.
        os.makedirs(self.img_path, exist_ok=True)

    def process_item(self, item, spider):
        """Download the item's images into a per-gallery sub-directory.

        Returns the item unchanged so later pipelines / feed exports
        still receive it (Scrapy's pipeline contract).
        """
        img_name = item["img_name"]
        img_url = item["img_url"]
        # One sub-directory per gallery, named after its title.
        # NOTE(review): the title is used verbatim as a directory name —
        # characters illegal in file names would break this; confirm the
        # site's titles are safe or sanitize here.
        img_name_path = os.path.join(self.img_path, img_name)
        os.makedirs(img_name_path, exist_ok=True)
        for url in img_url:
            # Keep the original file name from the URL; don't reuse
            # img_name (the original code shadowed it here).
            file_name = url.split('/')[-1]
            request.urlretrieve(url, os.path.join(img_name_path, file_name))
        return item
# 这里下载图片使用的是urllib库进行的下载,后期说一下怎么使用scrapy正确的下载方式
7.打开终端进入项目目录输入命令:
scrapy crawl hanfu_spider
然后开始运行
查看效果图
然后就可以慢慢欣赏了
写在最后
朋友你好感谢你看到了最后
公众号:Linux下撸python
期待和你再次相遇
愿你学的愉快