The following code crawls images from two major image search engines (Baidu and Sogou), which is useful for collecting image datasets for artificial intelligence and deep-learning projects.
1. Crawl Baidu pictures
The following script downloads images matching any custom keyword from Baidu Images:
import requests
import re
import time
import os
def saveImg(imgurlList, imgOs):
    """Download every image URL in *imgurlList* into the directory *imgOs*.

    Args:
        imgurlList: iterable of image URL strings; empty strings are skipped
            (Baidu's "hoverURL" field is sometimes empty).
        imgOs: destination directory path; expected to end with a path
            separator, since the filename is appended directly to it.
    """
    for url in imgurlList:
        if not url:
            # Skip empty hoverURL entries instead of requesting "".
            continue
        try:
            # timeout prevents a dead URL from hanging the whole crawl.
            content = requests.get(url=url, timeout=10).content
        except requests.RequestException as exc:
            # Narrow catch: only network/HTTP errors, not programming bugs.
            print(f"error downloading {url}: {exc}")
            continue
        # NOTE(review): filename is a fixed slice of the URL, kept from the
        # original scheme — collisions are possible; hashing the URL would
        # be safer.
        imgName = url[28:36]
        with open(imgOs + imgName + ".jpg", "wb") as file:
            file.write(content)
        print(url + " 下载完成!!")
def get_asjson(page, gsm, word):
    """Fetch one page of Baidu image-search results (30 results per page).

    Args:
        page: page index; the request offset is ``30 * page``.
        gsm: paging token returned by the previous call ("1e" for the first).
        word: search keyword.

    Returns:
        tuple ``(next_gsm, urls)``: the paging token for the next request
        and the list of "hoverURL" thumbnail URLs found on this page.
    """
    url = f"https://image.baidu.com/search/acjson?tn=resultjson_com&logid=9123806616981181340&ipn=rj&ct=201326592&is=&fp=result&fr=&word={word}&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=©right=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={str(30 * int(page))}&rn=30&gsm={gsm}&{str(int(time.time() * 1000))}="
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1637758492843_R&pv=&ic=&nc=1&z=&hd=&latest=©right=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MCwzLDYsMiw0LDEsNSw4LDcsOQ%3D%3D&ie=utf-8&sid=&word=hello',
        'Cookie': 'BDqhfp=hello%26%26-10-1undefined%26%2628989%26%2635; BAIDUID=0C2336F5F3D356371C46DF079632E0C8:FG=1; BAIDUID_BFESS=0C2336F5F3D356371C46DF079632E0C8:FG=1; BIDUPSID=0C2336F5F3D356371C46DF079632E0C8; __yjs_duid=1_32693704d239fea9266064fc8a3d25631637737833661; PSTM=1637737880; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=null; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; delPer=0; PSINO=6; __yjs_st=2_ZGU4ODA5ZTdmNzczMzgxNzRiZWZhNTdkODVkY2E5MzQ3NzM3Nzc2MzZlNjYzZmRiMWVjOTlmNWQzZDA3NWY1MzM2M2NkNjNmMjMzZWVlYzQxNGQ2ODIzYjlkNTdhYTUyZjdhNWQwNjQxZWE1YTI0MWZiNzQ1NTE0N2NlNTgwNjZjODlkNWVlZWI2ZDBkNjUzNmNiZDE3NzUyYTA4ZjkxYjI1NzNhODBjOGZhZTBmMzZkY2IwOWJmNjMxNjEzNmUxYjQxZmZhM2M1ODUzYTFkNTM4NTE5MzZjZjRkODliMTE1MmRmMDY1MjI4OGJiM2I3ZGMzMDdiNjI4MWE3NDgxZV83XzQyODU3N2M0; H_PS_PSSID=35295_34446_35104_31254_35237_35049_34584_34505_35245_34578_34872_26350_35210_35145_22160; indexPageSugList=%5B%22hello%22%2C%22bello%22%2C%22hello%20%22%5D; cleanHistoryStatus=0; ab_sr=1.0.1_MTJmNTIwNGNlNmI5NDg2YmZiZTI1OTM1MGZhNTJhZTZlMzVmODE2NmEwZjg5MjNlZWZjZWY1YTY3ZjQ2Yzc2MWZiNGRlODY2ZDJjOGE3N2RhMzg2NjcxZjEzY2ZiMDQ4ODNjYzgyZTZlNWM2NGQ4YjlhMzBlMWE1ZjU0ZTY2NzAxYmM0ZGRkOTM0MGI3NzUwOWZjODY2ODE5NmU1N2E1Yw=='
    }
    # The original appended a junk sentinel ('+ "1111"') to the response
    # text; it had no effect on either regex and has been dropped.
    response = requests.get(url=url, headers=headers).text
    tokens = re.findall('"gsm":"(.*?)",', response)
    # If Baidu returns no gsm token (e.g. request blocked), fall back to the
    # caller's token instead of raising IndexError on tokens[0].
    next_gsm = tokens[0] if tokens else gsm
    data = re.findall('"hoverURL":"(.*?)",', response)
    return next_gsm, data
if __name__ == "__main__":
a = "1e"
key_word = "阳台" # 修改你要爬取的关键字
img = key_word + "_img\\"
os.mkdir(img)
for i in range(1, 2): #通过改变第二个数,修改要爬取的页数
asjson1 = get_asjson(i, a, key_word)
saveImg(asjson1[1], img)
a = asjson1[0]
while True:
asjson2 = get_asjson(int(i) + 1, a, key_word)
saveImg(asjson2[1], img)
a = asjson2[0]
break
2. Crawl Sogou pictures
The following script downloads images matching any custom keyword from Sogou Images:
from urllib.parse import quote
import requests
import os


def crawl_sogou(keyword, pages, save_dir):
    """Download thumbnails from Sogou image search.

    Args:
        keyword: raw (un-encoded) search term.
        pages: number of result pages to fetch; each page holds 48 images.
            (The original `range(1, page)` fetched one page fewer than
            configured and shadowed the page-count variable.)
        save_dir: directory where images are saved as ``page<P>-<N>.jpg``.
    """
    os.makedirs(save_dir, exist_ok=True)
    encoded = quote(keyword)
    for page in range(1, pages + 1):
        start = (page - 1) * 48
        url = ('https://pic.sogou.com/napi/pc/searchList'
               '?mode=1&start={}&xml_len=48&query={}').format(start, encoded)
        try:
            # timeout added; a failed listing skips the page instead of
            # aborting the whole crawl.
            items = requests.get(url, timeout=5).json()['data']['items']
        except (requests.RequestException, KeyError, TypeError, ValueError) as exc:
            print('page', page, 'failed:', exc)
            continue
        for num, item in enumerate(items):
            try:
                data = requests.get(item['thumbUrl'], timeout=5).content
            except (requests.RequestException, KeyError) as exc:
                print(num, 'failed:', exc)
                continue
            path = os.path.join(save_dir, 'page{}-{}.jpg'.format(page, num))
            with open(path, 'wb') as file:
                file.write(data)
            print(num, '下载完成!!')


if __name__ == "__main__":
    # Edit the keyword, page count, and save path as needed.
    crawl_sogou('阳台', 50, 'C:/Users/wbl/Desktop/AI/pc/L/')