Article Directory
Recorded for reference only; the code runs as-is.
Comprehensive version
After running, enter the number of pages you want to crawl
(each page is one AJAX batch of 30 images):
enter 1 to crawl 1 page (30 images),
enter 2 to crawl 2 pages (60 images),
enter 3 to crawl 3 pages (90 images)
import requests
from lxml import etree  # NOTE(review): unused in this script; kept in case other parts of the file rely on it

# Baidu image crawler ("comprehensive" version).
# Asks the user for a number of pages, then fetches that many 30-image
# batches of cat thumbnails from Baidu's acjson AJAX endpoint and saves
# each one to disk as <n>.jpg.
page = input('请输入要爬取多少页:')
page = int(page) + 1  # range() end is exclusive, so offset by one

# UA spoofing so the request looks like a normal desktop browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

n = 0   # running counter used to name the saved files: 0.jpg, 1.jpg, ...
pn = 1  # index of the first image of the current batch; Baidu serves 30 per request

for m in range(1, page):
    url = 'https://image.baidu.com/search/acjson?'
    # Query parameters copied from the browser's XHR request for the keyword "猫".
    param = {
        'tn': 'resultjson_com',
        'logid': '11941270206720072198',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '猫',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '',
        'z': '',
        'ic': '',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': '猫',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '',
        'istype': '',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        # BUG FIX: 'pn' was hard-coded to '30', so every loop iteration
        # requested the exact same batch of images. Use the running offset
        # so each page fetches the next batch.
        'pn': str(pn),
        'rn': '30',
        'gsm': '1e',
    }
    response = requests.get(url=url, headers=header, params=param)
    response.encoding = 'utf-8'
    # 'data' holds one dict per image; the endpoint pads the list with a
    # trailing empty dict, which we drop before extracting thumbnail URLs.
    info_list = response.json()['data']
    del info_list[-1]
    img_path_list = [info['thumbURL'] for info in info_list]
    for img_url in img_path_list:
        # .content gives the raw image bytes.
        img_data = requests.get(url=img_url, headers=header).content
        save_path = r'E:\code\python\20191201\b8catimg\catpro\ ' + str(n) + '.jpg'
        with open(save_path, 'wb') as fp:
            fp.write(img_data)
        n = n + 1
    # BUG FIX: was `pn += 29`, which made consecutive batches overlap by one
    # image. Advance by a full batch of 30 (matching 'rn').
    pn += 30
Step version
Changing the value of pn in the program sets which image the crawl starts from (each request returns the 30 images after it):
if pn is set to 1, crawling starts from the first image and fetches the 30 images that follow.
Changing the value of n in the program sets the file names of the saved images:
if n = 0, the saved images are named 0.jpg, 1.jpg, 2.jpg, ..., 29.jpg
import requests

# Single-batch version: pull one AJAX page (30 cat thumbnails) from Baidu
# image search and write each image to disk as <n>.jpg.

# Spoof a desktop browser User-Agent so Baidu serves the normal JSON payload.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

url = 'https://image.baidu.com/search/acjson?'

# Query parameters copied from the browser's XHR request for the keyword "猫".
param = {
    'tn': 'resultjson_com',
    'logid': '11941270206720072198',
    'ipn': 'rj',
    'ct': '201326592',
    'is': '',
    'fp': 'result',
    'queryWord': '猫',
    'cl': '2',
    'lm': '-1',
    'ie': 'utf-8',
    'oe': 'utf-8',
    'adpicid': '',
    'st': '',
    'z': '',
    'ic': '',
    'hd': '',
    'latest': '',
    'copyright': '',
    'word': '猫',
    's': '',
    'se': '',
    'tab': '',
    'width': '',
    'height': '',
    'face': '',
    'istype': '',
    'qc': '',
    'nc': '1',
    'fr': '',
    'expermode': '',
    'force': '',
    'pn': '30',
    'rn': '30',
    'gsm': '1e',
}

# Fetch the JSON response, decoded as utf-8.
resp = requests.get(url=url, headers=header, params=param)
resp.encoding = 'utf-8'
payload = resp.json()

# 'data' holds one dict per image. The endpoint appends a trailing empty
# dict, so drop the last element before collecting the thumbnail URLs.
info_list = payload['data']
del info_list[-1]
img_path_list = [entry['thumbURL'] for entry in info_list]

# Download every thumbnail; the enumeration index doubles as the file name.
for n, thumb_url in enumerate(img_path_list):
    # .content gives the raw image bytes.
    content = requests.get(url=thumb_url, headers=header).content
    target = r'E:\code\python\20191201\b8catimg\catstep\ ' + str(n) + '.jpg'
    with open(target, 'wb') as fp:
        fp.write(content)

print('ok')
Reprint
Understanding
The images in Baidu's gallery are no longer paginated the way they used to be (page 1, page 2, ...); they are now lazy-loaded onto a single page as you scroll. Because of that, the crawl is done through AJAX requests.
The difference from the earlier approach is the extra param dict:
param = {
}
page_text = requests.get(url=url, headers=header, params=param)
page_text = page_text.json()
Finally, take the thumbURL field of each entry in page_text's data.
(Effect: the structure of the response can be seen at a glance.)
About the XHR tab