用python爬取百度图片

爬取的图片默认放在代码文件同目录的文件夹下(第一次运行代码时,本人的代码是放在桌面,爬取一千多张照片导致桌面当场崩溃,后来在桌面新建了文件夹效果就好多了)
上代码:

import os
import re
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request
from os.path import join

import requests
def getPages(keyword, pages=5):
    """Build the per-page Baidu image-search URLs for *keyword*.

    Baidu's image search pages results with ``pn`` (offset) and ``rn``
    (page size, 30).  The original implementation issued a real HTTP GET
    per page just to read back the merged ``response.url`` — and it read
    a ``headers`` global that only exists when run as a script.  Here the
    query string is composed locally with ``urllib.parse.urlencode``, so
    no network traffic (and no global) is needed.

    Args:
        keyword: search term, sent as both ``queryWord`` and ``word``.
        pages: number of result pages to generate (30 results each).

    Returns:
        list[str]: one fully-composed request URL per page.
    """
    params = []
    for i in range(30, 30 * pages + 30, 30):
        # Not every field is strictly required by the endpoint, but the
        # full set mirrors what the browser sends.
        params.append({
                      'tn': 'resultjson_com',
                      'ipn': 'rj',
                      'ct': 201326592,
                      'is': '',
                      'fp': 'result',
                      'queryWord': keyword,
                      'cl': 2,
                      'lm': -1,
                      'ie': 'utf-8',
                      'oe': 'utf-8',
                      'adpicid': '',
                      'st': -1,
                      'z': '',
                      'ic': 0,
                      'word': keyword,
                      's': '',
                      'se': '',
                      'tab': '',
                      'width': '',
                      'height': '',
                      'face': 0,
                      'istype': 2,
                      'qc': '',
                      'nc': 1,
                      'fr': '',
                      'pn': i,
                      'rn': 30,
                  })
    url = 'https://image.baidu.com/search/index?ct=201326592&z=&tn=baiduimage&ipn=r&word=%E8%B7%91%E8%BD%A6&pn=0&istype=2&ie=utf-8&oe=utf-8&cl=2&lm=-1&st=-1&fr=&fmq=&ic=0&se=&sme=&width=1920&height=1080&face=0'
    urls = []
    for param in params:
        # The base URL already carries a query string, so additional
        # parameters are appended with '&' (same as requests would do).
        urls.append(url + '&' + urllib.parse.urlencode(param))
    return urls
def get_Img_url(keyword, pages=5):
    """Fetch each search-result page and extract thumbnail image URLs.

    Thumbnails appear in the page JSON as
    ``"thumbURL":"https://....jpg"``; a regex pulls those spans out and
    the ``"thumbURL":"`` prefix is stripped off.

    Args:
        keyword: search term, forwarded to :func:`getPages`.
        pages: number of result pages to scan.

    Returns:
        list[str]: all thumbnail .jpg URLs found across the pages.
    """
    pageUrls = getPages(keyword, pages)
    # Compile once, outside the loop.  (The character class is loose but
    # matches the observed thumbURL format.)
    exp = re.compile(r'"thumbURL":"[\:\,\_\w\d\=\.\+\s\/\%\$\&]+\.jpg')
    imgUrls = []
    for pageUrl in pageUrls:
        # Keep the try body minimal: only the network read can fail.
        # URLError subclasses OSError, so OSError covers socket/HTTP
        # failures; UnicodeDecodeError covers a non-UTF-8 response.
        try:
            with urllib.request.urlopen(pageUrl, timeout=3) as page:
                content = page.read().decode('utf-8')
        except (OSError, UnicodeDecodeError):
            print('SomePage is not opened!')
            continue
        for match in exp.findall(content):
            # Drop the leading  "thumbURL":"  marker.
            imgUrls.append(match.replace('"thumbURL":"', ''))
    return imgUrls
def getImg(urlList, localPath):
    """Download each URL in *urlList* into *localPath* as numbered .jpg files.

    Fixes two defects in the original:
      * files were opened in the current working directory instead of
        ``localPath`` (which was created but never used);
      * the output file was opened *before* the download, so a failed
        fetch left an empty .jpg behind.  Now the bytes are fetched
        first and only written on success.

    Args:
        urlList: iterable of image URLs.
        localPath: destination directory; created if missing.
    """
    if not os.path.exists(localPath):
        os.makedirs(localPath)
    # Filename prefix comes from the module-level `keyword`, which is
    # only set when run as a script — fall back to '' when imported.
    prefix = globals().get('keyword', '')
    x = 1
    for url in urlList:
        try:
            # Fetch first; only open the target file once we have data.
            img = urllib.request.urlopen(url, timeout=3).read()
            target = os.path.join(localPath, prefix + str(x) + '.jpg')
            with open(target, 'wb') as f:
                f.write(img)
            print('%d.jpg is downloaded!' % x)
            x += 1
        except Exception:
            # Best-effort: report and move on (number is reused so the
            # saved files stay contiguously numbered).
            print("\n  Failed downloading NO. %d image\n" % x)
if __name__ == '__main__':
    # Runtime configuration.  NOTE: `headers` and `keyword` are module
    # globals read by getPages/getImg, so their names must not change.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
    keyword = '超跑'
    pages = 500
    localPath = 'd:/pythonCode/day1001/'
    # Phase 1: collect every thumbnail URL; phase 2: download them all.
    urlList = get_Img_url(keyword, pages)
    getImg(urlList, localPath)
点击打开   

猜你喜欢

转载自blog.csdn.net/eacxzm/article/details/80189754
今日推荐