import requests
from lxml import etree
from urllib.request import quote
from multiprocessing import Pool
from urllib.parse import urlencode
from json.decoder import JSONDecodeError
# NOTE: this could be switched to multiprocess crawling
# NOTE: urlencode could also be used when passing params
# Browser-like User-Agent so Baidu serves the normal JSON response instead of blocking the script.
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5383.400 QQBrowser/10.0.1313.400'}
def gethtml(num, keyword='薛之谦'):
    """Fetch one page of Baidu image-search results as parsed JSON.

    Parameters:
        num: result offset ('pn' parameter); Baidu pages results in steps of 30.
        keyword: search term (default kept for backward compatibility).

    Returns:
        The 'data' list from the JSON response, or None on any failure
        (non-200 status, network error, or a non-JSON body).
    """
    params = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': num,
        'rn': 30,  # results per page
        'gsm': hex(num),  # hex() already returns a str; str(hex(num)) was redundant
        '1488942260214': '',  # timestamp-like key the site sends; kept as-is
    }
    url = 'https://image.baidu.com/search/acjson?'
    try:
        # timeout prevents the crawl from hanging forever on a stalled connection
        html = requests.get(url, params=params, headers=headers, timeout=10)
    except requests.RequestException:
        print('页面请求失败')
        return None
    html.encoding = 'utf-8'
    print(url)
    if html.status_code == 200:
        print('页面请求成功')
        try:
            return html.json().get('data')
        except JSONDecodeError:
            print('遇到json异常')
            return None
    else:
        print('页面请求失败')
        return None  # was an implicit None; made explicit
def geturl(html):
    """Yield {'url', 'name'} dicts for each Baidu image metadata entry.

    'url' comes from the entry's 'thumbURL', 'name' from 'fromPageTitleEnc';
    either may be None when the key is absent.
    """
    for entry in html:
        yield {
            'url': entry.get('thumbURL'),
            'name': entry.get('fromPageTitleEnc'),
        }
def xiazai(url, name):
    """Download one image to D:/ceshi/ under a name derived from title + URL.

    Parameters:
        url: thumbnail URL; may be None when the entry had no 'thumbURL'.
        name: page title; may be None when the entry had no 'fromPageTitleEnc'.

    Failures are reported but never raised: this is a best-effort downloader.
    """
    # The original swallowed missing url/name via a bare except; skip explicitly.
    if not url or name is None:
        return
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # was a bare `except: pass`, which also hid KeyboardInterrupt etc.
        return
    if response.status_code != 200:
        return
    # Ad-hoc filename from title prefix + URL slice (original author's choice).
    filename = 'D:/ceshi/' + name[0:3] + url[-24:-15] + '.jpg'
    print('正在保存图片', name, url)
    try:
        with open(filename, 'wb') as f:
            f.write(response.content)
    except OSError:
        # invalid characters in the derived filename, missing directory, etc.
        print('文件名字错误')
def main(i):
    """Fetch the result page at offset *i* and download every image on it."""
    page_data = gethtml(num=i, keyword='薛之谦')  # 这里输入你要查的
    if page_data is None:
        return
    for info in geturl(page_data):
        xiazai(info['url'], info['name'])
if __name__ == '__main__':
    # Multiprocess variant (disabled) — fan the page offsets out over a Pool:
    #   pool = Pool(processes=4)
    #   pool.map(main, range(90, 3000, 30))
    #   pool.close()
    offset = 30
    while offset < 3000:
        main(offset)
        offset += 30  # Baidu pages results 30 at a time
# python3 scraping of Baidu Ajax-rendered images
# Adapted from: blog.csdn.net/weixin_42557907/article/details/81196081