Python learning (four): using the Scrapy framework to crawl pictures

Knock on the blackboard: this post reworks the previous article to crawl pictures with the Scrapy framework.
This time the site's search function is used, so you can set the search keyword and save the pictures from the search results. The main spider script is
pic.py

# -*- coding: utf-8 -*-
import scrapy
import json
import random
import os
from urllib import parse
from BeautyPic.items import BeautypicItem

class PicSpider(scrapy.Spider):
    kw="大长腿"
    name = "pic"
    allowed_domains = ["laosiji.com"]
    start_urls = ['https://www.laosiji.com/proxy/api']
    cookies_str = "UM_distinctid=16a85bc719e11-0291ba522d6f78-39395704-1fa400-16a85bc719f8b2; _ga=GA1.2.457987246.1557021881; LSJLOGCOOKIE=11911911946108971111151051061054699111109-11273461-1557021880937; OdStatisticsToken=a2bd510b-6855-457b-87fa-89e9c0a729a9-1557021880936; tgw_l7_route=83a50c6e17958c25ad3462765ddb8a87; JSESSIONID=B83B7D9AE6CF4F35E0D95C5C3DCDE0AB; _gid=GA1.2.38970883.1559541613; CNZZDATA1261736092=756492161-1557017437-%7C1559539536; Hm_lvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557903441,1558061382,1558658949,1559541613; _gat_gtag_UA_132849965_2=1; Hm_lpvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1559541757"
    cookies = {i.split("=")[0]: i.split("=")[1] for i in cookies_str.split(";")}
    total_count = 21  # placeholder so the first request fires; updated from the API response in parse()
    page_count = 1
    post_data = {
        "method": "/search/ywf/indexapi",
        "cityid": "131",
        "search": kw,
        "page": str(page_count),
        "type": "1"
    }
    def start_requests(self):
        if self.total_count > self.page_count * 20:
            yield scrapy.FormRequest(
                self.start_urls[0],
                cookies=self.cookies,
                callback=self.parse,
                formdata=self.post_data,
                dont_filter=True
            )

    def parse(self, response):
        # note: this assignment does not feed headers into Scrapy's requests;
        # default request headers would normally be set via DEFAULT_REQUEST_HEADERS in settings.py
        self.settings.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive"
        }
        cookies = "UM_distinctid=16a85bc719e11-0291ba522d6f78-39395704-1fa400-16a85bc719f8b2; _ga=GA1.2.457987246.1557021881; LSJLOGCOOKIE=11911911946108971111151051061054699111109-11273461-1557021880937; OdStatisticsToken=a2bd510b-6855-457b-87fa-89e9c0a729a9-1557021880936; tgw_l7_route=83a50c6e17958c25ad3462765ddb8a87; JSESSIONID=B83B7D9AE6CF4F35E0D95C5C3DCDE0AB; _gid=GA1.2.38970883.1559541613; CNZZDATA1261736092=756492161-1557017437-%7C1559539536; Hm_lvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557903441,1558061382,1558658949,1559541613; _gat_gtag_UA_132849965_2=1; Hm_lpvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1559541757"
        # cookies = "UM_distinctid=16a85bc719e11-0291ba522d6f78-39395704-1fa400-16a85bc719f8b2; _ga=GA1.2.457987246.1557021881; LSJLOGCOOKIE=11911911946108971111151051061054699111109-11273461-1557021880937; OdStatisticsToken=a2bd510b-6855-457b-87fa-89e9c0a729a9-1557021880936;gid=GA1.2.798960997.1558061382; Hm_lvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1557021882,1557817656,1557903441,1558061382; tgw_l7_route=b1a3cc9000bfce8fe74cd67462fc2144; JSESSIONID=9C3AEFD9D445FFA1C9502B9EDE5B599B; CNZZDATA1261736092=756492161-1557017437-%7C1558069156; Hm_lpvt_9fa8070d0f1a747dc1fd8cc5bdda4088=1558073069"
        cookies = {i.split("=")[0]: i.split("=")[1] for i in cookies.split(";")}
        json_list = json.loads(response.body.decode())
        self.total_count = int(json_list["body"]["search"]["sns"]["count"])
        json_list = json_list["body"]["search"]["sns"]["list"]
        url_list = [parse.urljoin( "https://www.laosiji.com/thread/", str(i["resourceid"]) + ".html")  for i in json_list if i is not None]
        for url in url_list:
            print(url)
            yield scrapy.Request(
                url,
                callback=self.parse_detail,
                cookies=cookies,
                dont_filter=True
            )
        self.page_count += 1
        self.post_data = {
            "method": "/search/ywf/indexapi",
            "cityid": "131",
            "search": self.kw,
            "page": str(self.page_count),
            "type": "1"
        }
        if self.total_count > self.page_count * 20:
            print("-" * 100)
            print(self.post_data)
            yield scrapy.FormRequest(
                self.start_urls[0],
                cookies=self.cookies,
                callback=self.parse,
                formdata=self.post_data,
                dont_filter=True
            )

    def parse_detail(self, response):
        img_list = response.xpath("//div[@class='threa-main-box']/li")
        for img_url in img_list:
            img_id = img_url.xpath("./div[@class='img-box']/@id").extract_first()
            url = img_url.xpath("./div[@class='img-box']/a/img/@src").extract_first()
            title=response.xpath("//h1[@class='title']/text()").extract_first()
            if url is not None:
                item=BeautypicItem()
                item["id"]=img_id
                item["title"]=title
                item["url"]=url
                yield item

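The spider fills a BeautypicItem with id, title and url fields. The original items.py is not listed in the post, but a minimal version matching those fields could look like this (a sketch, not necessarily the author's exact file):

# -*- coding: utf-8 -*-
# items.py (assumed minimal version matching the fields used in pic.py)
import scrapy

class BeautypicItem(scrapy.Item):
    id = scrapy.Field()     # id of the image block on the page
    title = scrapy.Field()  # thread title, later used as the folder name
    url = scrapy.Field()    # image src url
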

This is the middlewares.py script; remember to enable it in settings.py. Its main job is to add a User-Agent pool and attach a random User-Agent to every request.

import random
from BeautyPic import settings  # assumed: the project settings module holding the User_Agents list

class BeautypicSpiderMiddleware(object):
    def process_request(self, request, spider):
        # used as a downloader middleware: pick a random User-Agent for each outgoing request
        request.headers["User-Agent"] = random.choice(settings.User_Agents)
        print(request.headers["User-Agent"])
        return None
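
Enabling the middleware and defining the User-Agent pool happens in settings.py. A rough sketch (the priority value and the User_Agents entries are placeholders, not the author's exact values):

DOWNLOADER_MIDDLEWARES = {
    "BeautyPic.middlewares.BeautypicSpiderMiddleware": 543,
}

User_Agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
]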

Finally, the image download logic is added in pipelines.py (the item pipeline).
Remember to enable the pipeline in settings.py, for example as shown below.
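
A possible ITEM_PIPELINES entry (the priority 300 is just a common default, not necessarily the author's value):

ITEM_PIPELINES = {
    "BeautyPic.pipelines.BeautypicPipeline": 300,
}

The pipeline itself: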

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import requests
import os
import re
import string

class BeautypicPipeline(object):

    def process_item(self, item, spider):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
        }
        url = item["url"]
        title=item["title"]
        print(url)
        if url:
            try:
                response = requests.get(url, headers=headers, timeout=4)
            except Exception:
                print("invalid url: " + url)
            else:
                str_path = os.path.abspath(r"D:\BaiduNetdiskDownload\pic")
                if not os.path.exists(str_path):
                    os.mkdir(str_path)
                # strip characters that are not allowed in file/folder names
                title = title.replace(",", "").replace("。", "").replace(":", "").replace(".", "").replace("?", "").replace("!", "").replace("/", "").strip()
                title = re.sub('[%s]' % re.escape(string.punctuation), '', title)
                # title = "".join([i for i in title if i not in string.punctuation])
                dir_path = os.path.join(str_path, title)
                print(dir_path)
                if title != "":
                    if not os.path.exists(dir_path):
                        os.mkdir(dir_path)
                    file_name = item["id"]
                    with open(os.path.join(dir_path, file_name + ".png"), "wb") as f:
                        f.write(response.content)
                        print("saved successfully {%s}" % file_name)
        return item  # keep returning the item so later pipelines/exporters still receive it


Finally, when naming the pictures, special characters that do not conform to file-naming rules are removed with the help of string.punctuation.
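
For example, a quick standalone sketch of what that sanitizing line does (the sample title is made up):

import re
import string

title = "Long legs! Photo #3: best?"
clean = re.sub('[%s]' % re.escape(string.punctuation), '', title)
print(clean)  # -> Long legs Photo 3 best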
The finished result is for your reference. Corrections are welcome.
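The spider can then be run from the project root with the usual command, scrapy crawl pic (the name comes from the spider's name attribute).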


Origin blog.csdn.net/m_cainiaokuaifei/article/details/92797549