Case I: crawling website images

The content to crawl: the images on the site www.dbmeinv.com.

First, crawling the images of one page

import requests
from lxml import etree
import os

# 1. Get the url of the first page
url = "https://www.dbmeinv.com/?pager_offset=1"
# Add request headers to deal with the anti-crawling mechanism; a User-Agent alone solves 70% to 80% of anti-crawling problems
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}
# 2. Send the request, receive the response
response = requests.get(url, headers=headers)
# 3. Extract the data
# Generate an xpath parsing object
selector = etree.HTML(response.text)
# Get the list of all image URLs on the first page
url_list = selector.xpath("//div/a/img/@src")
# Send requests again to fetch the image content
# Define a path variable for where the images are stored
path = "newpath"
# Traverse all image URLs on the first page
for url in url_list:
    if not os.path.exists(path):
        os.makedirs(path)
    # Send a request, receive the response
    data = requests.get(url, headers=headers)
    # Save the data; open the file in binary write mode
    with open(path + "/" + url[-7:], "wb") as f:
        f.write(data.content)
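A quick way to verify that the xpath expression //div/a/img/@src matches what you expect is to print a few of the extracted URLs before downloading anything. A minimal check, assuming the variables from the script above are still in scope:

print(len(url_list), "image URLs found")
# Show the first few matches so a wrong or too-broad xpath is caught early
for u in url_list[:3]:
    print(u)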

  Description: to deal with anti-crawling measures, add request headers. For example, a User-Agent header alone solves 70% to 80% of anti-crawling problems.
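As a quick sanity check (a minimal sketch, assuming the site still treats bare requests differently from browser-like ones), the snippet below compares the status codes of a request sent without and with the User-Agent header:

import requests

url = "https://www.dbmeinv.com/?pager_offset=1"
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}

# Request with the default requests User-Agent: often rejected or answered with an error page
plain = requests.get(url)
# Request disguised as a browser: normally returns the real HTML page
disguised = requests.get(url, headers=headers)

print("without User-Agent:", plain.status_code)
print("with User-Agent:", disguised.status_code)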

Second, crawling the images of all pages of the website

import requests
from lxml import etree
import os
#1. Build the list of page urls
url_list=["https://www.dbmeinv.com/?pager_offset={}".format(i) for i in range(1,10)]
headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}
#2. Send the request, receive the response
for url in url_list:
	response=requests.get(url,headers=headers)

#3. Extract the data
#Generate an xpath parsing object
	selector=etree.HTML(response.text)

#Get the list of image URLs
	url_list2=selector.xpath("//div/a/img/@src")
#Send requests again to fetch the image content
	path="newpath"

	for img_url in url_list2:
		if not os.path.exists(path):
			os.makedirs(path)
		#Send a request, receive the response
		data = requests.get(img_url, headers=headers)
		#Save the data; open the file in binary write mode
		with open(path + "/" + img_url[-7:], "wb") as f:
			f.write(data.content)
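The slice url[-7:] used as a file name in both scripts can collide or pick up odd characters when two image URLs end the same way. As a hedged variation on the code above (the helper name save_image is my own, not part of the original post), the sketch below derives the name from the URL path with os.path.basename and skips images that fail to download:

import os
import requests

def save_image(img_url, folder, headers):
    # Hypothetical helper, not part of the original post
    # Derive the file name from the URL path instead of a fixed-length slice
    name = os.path.basename(img_url)
    os.makedirs(folder, exist_ok=True)
    resp = requests.get(img_url, headers=headers)
    if resp.status_code != 200:
        return False  # skip images that could not be fetched
    with open(os.path.join(folder, name), "wb") as f:
        f.write(resp.content)
    return True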

  

 
