爬取百度图片搜索中关键词“大数据”的 350 张图片并存入本地文件夹

from urllib.parse import urlencode
import requests 
import re
import os
import json

from requests.exceptions import RequestException
def getindex(pn,rn):
    """Fetch one page of Baidu image-search results for the keyword '大数据'.

    Args:
        pn: Value sent as the ``pn`` query parameter (presumably the result
            offset -- confirm against the endpoint's behavior).
        rn: Value sent as the ``rn`` query parameter (presumably the number
            of results per page -- confirm against the endpoint's behavior).

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` on any other status code or when the request itself fails.
    """
    data = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '大数据',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '',
        'z': '',
        'latest': '',
        'copyright': '',
        'word': '大数据',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'pn': pn,
        'rn': rn,
        'gsm': '1e',
        '1557667455319': '',
    }
    # Encode the dict into a query string and append it to the endpoint.
    url = 'http://image.baidu.com/search/acjson?' + urlencode(data)
    try:
        # The request must live inside the try block: connection errors and
        # timeouts are raised by get() itself, not by reading status_code.
        # The timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        print("请求失败")
        return None
    if response.status_code == 200:
        return response.text
    return None
def geturl(html):
    """Yield the thumbnail URL of every result in a search-response body.

    Args:
        html: JSON text of a search response, or ``None``/empty string when
            the page could not be fetched.

    Yields:
        The ``thumbURL`` value of each entry in the response's ``data`` list.
        Entries that carry no ``thumbURL`` (such as an empty placeholder
        object) are skipped, so callers never receive ``None``.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    # A failed page fetch hands us None; json.loads(None) would raise.
    if not html:
        return
    data = json.loads(html)
    if not isinstance(data, dict):
        return
    # "data" may be absent or null; treat both as "no results".
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        thumb = item.get('thumbURL')
        if thumb:
            yield thumb
def download(url):
    """Save one image into the ``images`` directory unless it is already full.

    The directory must already exist. The file is named after the last
    ``/``-separated segment of ``url``; an existing file with the same name
    is overwritten.

    Args:
        url: Address of the image to fetch.

    Returns:
        ``0`` when the directory already holds 350 (or more) files, in which
        case nothing is downloaded. ``None`` otherwise, whether the image
        was saved or skipped because the request failed.
    """
    DIR = 'images'
    limit = 350

    # Check the cap before touching the network so no request is wasted, and
    # use >= so the stop signal still fires if the folder ever exceeds it.
    saved = sum(
        1 for name in os.listdir(DIR)
        if os.path.isfile(os.path.join(DIR, name))
    )
    if saved >= limit:
        return 0

    filename = url.split('/')[-1]
    if not filename:
        # A URL ending in "/" gives no usable file name.
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Referer': 'http://image.baidu.com/search',
    }
    try:
        req = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        # Same best-effort policy as getindex: report and move on.
        print("请求失败")
        return None
    if req.status_code != 200:
        # Do not store an error page as if it were an image.
        return None

    # The context manager closes the file even if the write fails.
    with open(os.path.join(DIR, filename), 'wb') as f:
        f.write(req.content)
    return None
def main():
    """Crawl result pages and download thumbnails until the folder is full.

    Requests up to 70 result pages. Every thumbnail URL found is passed to
    ``download``; crawling stops as soon as ``download`` returns ``0``.
    """
    rn = 8
    for i in range(70):
        pn = i * 30
        html = getindex(pn, rn)
        if html is None:
            # Page fetch failed; try the next page instead of crashing.
            continue
        # geturl() is a generator and cannot be sliced, so placeholder
        # entries are filtered here instead of dropping "the last one".
        for url in geturl(html):
            if not url:
                continue
            if download(url) == 0:
                # Cap reached: leave both loops, not just the inner one,
                # so no further pages are requested.
                return

if __name__=="__main__":
    # Create the target folder before crawling; exist_ok lets the script be
    # re-run without failing because the folder is already there.
    DIR='images'
    os.makedirs(DIR, exist_ok=True)
    main()
    

猜你喜欢

转载自blog.csdn.net/weixin_43323333/article/details/90170957