Goal: crawl the images from the site www.dbmeinv.com.
Part 1: crawling the images from the first page
"""Download every image found on the first page of www.dbmeinv.com.

Steps: build the page URL, fetch it with a browser-like User-Agent
(which alone gets past most basic anti-scraping checks), extract the
image URLs with xpath, then download each image into a local directory.
"""
import os

import requests
from lxml import etree

# 1. URL of the first page.
url = "https://www.dbmeinv.com/?pager_offset=1"

# A browser-like User-Agent header defeats the site's basic
# anti-scraping check (roughly 70-80% of such blocks).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
}

# 2. Send the request and receive the response.
response = requests.get(url, headers=headers)

# 3. Extract the data: build an xpath parser over the HTML and collect
#    the src attribute of every <div>/<a>/<img> element.
selector = etree.HTML(response.text)
url_list = selector.xpath("//div/a/img/@src")

# 4. Download each image. Create the target directory once, up front
#    (the original called the misspelled os.makdirs inside the loop).
path = "newpath"
os.makedirs(path, exist_ok=True)
for img_url in url_list:
    data = requests.get(img_url, headers=headers)
    # Save as raw binary; the last 7 characters of the URL serve as
    # the file name (e.g. "xyz.jpg").
    with open(path + "/" + img_url[-7:], "wb") as f:
        f.write(data.content)
Note: to get past anti-scraping checks, add request headers. For example, setting a browser-like User-Agent alone defeats roughly 70% to 80% of anti-scraping blocks.
Part 2: crawling the images from all pages of the site
"""Download every image found on pages 1-9 of www.dbmeinv.com.

Same approach as the single-page version, but iterates over a list of
page URLs built from the pager_offset query parameter.
"""
import os

import requests
from lxml import etree

# 1. Build the URLs of pages 1 through 9.
url_list = ["https://www.dbmeinv.com/?pager_offset={}".format(i) for i in range(1, 10)]

# A browser-like User-Agent header defeats the site's basic
# anti-scraping check.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
}

# Create the output directory once, up front (the original re-checked
# os.path.exists for every single image inside the inner loop).
path = "newpath"
os.makedirs(path, exist_ok=True)

# 2. Fetch each page and receive the response.
for page_url in url_list:
    response = requests.get(page_url, headers=headers)

    # 3. Extract the data: build an xpath parser over the HTML and
    #    collect the image URLs on this page.
    selector = etree.HTML(response.text)
    url_list2 = selector.xpath("//div/a/img/@src")

    # 4. Download each image. The inner variable is renamed so it no
    #    longer shadows the outer loop's URL.
    for img_url in url_list2:
        data = requests.get(img_url, headers=headers)
        # Save as raw binary; the last 7 characters of the URL serve
        # as the file name (e.g. "xyz.jpg").
        with open(path + "/" + img_url[-7:], "wb") as f:
            f.write(data.content)