根据关键字爬取搜狗图库图片

说明:关键字是从excel表格文件中读取的
如果想看爬虫分析过程的,可以参考博主之前写的博客,完整代码如下:

'''
@Time    : 2019/10/24 15:38
@Software: PyCharm
'''
import json
import urllib
import jsonpath
import requests
import os
import xlrd

def search(startPage, endPage, path, keywords):
    """Crawl Sogou image search and save result images to disk.

    For each keyword, pages ``startPage``..``endPage`` (inclusive) of the
    Sogou pics AJAX endpoint are fetched; every ``ori_pic_url`` found in
    the JSON response is downloaded into ``path/<keyword>/``.

    Args:
        startPage: first result page number to fetch (1-based).
        endPage:   last result page number to fetch (inclusive).
        path:      base directory under which one folder per keyword is created.
        keywords:  iterable of search keywords (e.g. read from an excel file).
    """
    url = "https://pic.sogou.com/pics"
    # Headers copied from a real browser session so the AJAX endpoint
    # answers with JSON instead of an anti-bot page.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'pic.sogou.com',
        'Referer': 'https://pic.sogou.com/pics?query=%B1%ED%B8%F1%CD%BC%C6%AC&p=40230500&st=255&mode=255&policyType=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    datas = {"mode": 1, "reqType": "ajax", "reqFrom": "result", "tn": 0}
    for keyword in keywords:
        datas['query'] = keyword
        # os.path.join instead of hard-coded "\\" so the script also works
        # on non-Windows systems.
        temp = os.path.join(path, keyword)
        os.makedirs(temp, exist_ok=True)
        for num in range(startPage, (endPage + 1)):
            datas['start'] = num
            images = requests.get(url, headers=headers, params=datas, timeout=5)
            jsonObjs = json.loads(images.text)
            images_urls = jsonpath.jsonpath(jsonObjs, '$.items..ori_pic_url')
            # jsonpath.jsonpath returns False (not []) when nothing matches;
            # iterating False would raise TypeError.
            if not images_urls:
                continue
            i = 1
            for image_url in images_urls:
                try:
                    print('*' * 10 + '正在下载——'+keyword+'——第' + str((num - 1) * 48 + i) + '张图片' + '*' * 10)
                    # Original code used urllib.request.urlopen, but only
                    # `import urllib` was present — in Python 3 the submodule
                    # is not loaded by that import, raising AttributeError.
                    # requests is already imported, so use it here instead.
                    res = requests.get(image_url, timeout=5).content
                    with open(os.path.join(temp, keyword + '_' + str((num - 1) * 48 + i) + '.jpg'), 'wb') as file:
                        file.write(res)
                except Exception as e:
                    # Best-effort: a single broken image must not abort the crawl.
                    print(keyword+'——第' + str((num - 1) * 48 + i) + '张图片下载出错,错误信息如下:')
                    print(' ' * 10 + str(e))
                    print('')
                    continue
                finally:
                    i += 1
        print('*' * 15 + '下载完成' + '*' * 15)

def read_excel(excel_path):
    """Return the keywords stored in column 0 of the workbook's first sheet."""
    workbook = xlrd.open_workbook(excel_path)
    first_sheet = workbook.sheets()[0]
    return first_sheet.col_values(0)

def main(excel_path, startPage, endPage, path):
    """Load keywords from the excel file and crawl images for each of them."""
    search(startPage, endPage, path, read_excel(excel_path))

if __name__ == '__main__':
    # Crawl configuration: keyword workbook, page range, and output directory.
    source_excel = "D:\\My Documents\\Desktop\\搜狗爬虫\\10.xlsx"
    first_page = 1
    last_page = 2
    save_dir = 'd:\\download\\搜狗\\'
    main(source_excel, first_page, last_page, save_dir)

看完了,随手点个赞呗!

发布了12 篇原创文章 · 获赞 43 · 访问量 5431

猜你喜欢

转载自blog.csdn.net/weixin_40481076/article/details/102736355
今日推荐