# Python下载知乎单独答案的图片和所有答案的图片
# (Download images from a single Zhihu answer, or from all answers of a question.)

#方法一:下载知乎单独答案图片
# coding=utf-8
#
# from urllib import request as rr
# from bs4 import BeautifulSoup
# from collections import Counter
# import os
# import re
#
# url = "https://www.zhihu.com/question/29814297"  # 指定的URL
#
# #下载图片并保存到本地
# def download(_url, file_name):
#     if (_url == None):  #地址若为None则pass
#         pass
#     result = rr.urlopen(_url)  # 打开链接
#     if (result.getcode() != 200):  # 如果链接不正常则pass
#         pass
#     else:
#
#         data = result.read() #链接正常的话则进行下载
#         with open(file_name, "wb") as f:
#             f.write(data)
#             f.close()
#
# if __name__ == '__main__':
#     res = rr.urlopen(url)   #打开目标地址
#     content = res.read()    #获取网页内容
#     cnt = 0 #计数器
#     soup = BeautifulSoup(content)   #实例化一个BeautifulSoup对象
#     link_list = []  #创建一个list来存放链接
#     # print(content)
#     for link in soup.find_all('img'):   #获取img标签中的内容
#         addr = link.get('data-actualsrc')    #属性data-original对应的值即为图片的地址
#         link_list.append(addr)  # 添加到list中
#         link_set = set(link_list)   #去重
#     for addr in link_set:
#         if (addr != None):
#             pathName = r'C:\Users\41174\AppData\Local\Temp\change.py\shrinkImage\\' + str(cnt + 1) + '.jpg'  #设置文件路径
#             cnt = cnt + 1
#             print("Downloading the " + str(cnt) + "th picture")
#             download(addr, pathName)  # 调用下载函数


#方法二:下载知乎单独答案图片
# from urllib import request
# from bs4 import BeautifulSoup
# import re
# import time
#
# url = 'https://www.zhihu.com/question/22918070'
# html = request.urlopen(url).read().decode('utf-8')
# soup = BeautifulSoup(html, 'html.parser')
# # print(soup.prettify())
# # 使用BeautifulSoup结合正则表达式来提取包含所有图片链接(img标签中,class='origin_image zh-lightbox-thumb',以.jpg结尾的链接)的语句
# links = soup.find_all('img', 'origin_image zh-lightbox-thumb', src=re.compile(r'.jpg$'))
# print(links)
# # 设置图片保存路径,否则会保存程序当前路径
# path = r'C:\Users\41174\AppData\Local\Temp\change.py\shrinkImage'  # r保持字符串的原始值,不进行转义
# for link in links:
#     print(link.attrs['src'])
#     # 保存链接并命名,time.time()返回当前时间戳以防止命名冲突
#     request.urlretrieve(link.attrs['src'], path + '\%s.jpg' % time.time())

#方法三:下载知乎所有答案图片
# -*- coding:utf-8 -*-
import re
import requests
import os


# Shared HTTP request headers: spoof a desktop Chrome user agent so Zhihu
# serves regular HTML instead of blocking the scraper.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    'Accept-Encoding': 'gzip, deflate'}

def get_image_url(qid, headers):
    """Page through all answers of Zhihu question *qid* and return a list of
    content-image URLs found in the answer HTML.

    Uses the QuestionAnswerListV2 node endpoint, 10 answers per request,
    until an empty page is returned. Only URLs ending in 'r.jpg' are kept
    (filters out avatars/thumbnails).

    :param qid: numeric Zhihu question id
    :param headers: HTTP headers dict for the requests session
    :return: list of image URL strings
    """
    tmp_url = "https://www.zhihu.com/node/QuestionAnswerListV2"
    # NOTE(review): offset starts at 10, which looks like it skips the first
    # page of answers — confirm against the endpoint's semantics.
    size = 10
    image_urls = []
    session = requests.Session()
    # Compile once outside the loop; matches the full-size image attribute.
    imgreg = re.compile('data-original="(.*?)"', re.S)
    while True:
        # The endpoint expects the params value as a JSON-formatted string;
        # keep the exact payload shape the original code sent.
        postdata = {'method': 'next',
                    'params': '{"url_token":' + str(qid) + ',"pagesize": "10",' +
                              '"offset":' + str(size) + "}"}
        page = session.post(tmp_url, headers=headers, data=postdata)
        # Parse the response as JSON instead of eval(): eval() on an HTTP
        # response is unsafe (arbitrary code execution) and crashes on JSON
        # literals such as true/false/null.
        ret = page.json()
        answers = ret['msg']
        print("答案数 : %d " % (len(answers)))
        size += 10
        if not answers:
            print("图片URL获取完毕, 页数: ", (size - 10) / 10)
            return image_urls
        for answer in answers:
            # Strip the escaping backslashes embedded in the returned HTML.
            found = [item.replace("\\", "") for item in imgreg.findall(answer)]
            # De-duplicate within the answer; keep only content images.
            for item in set(found):
                if item.endswith('r.jpg'):
                    image_urls.append(item)
        print('size: %d, num : %d' % (size, len(image_urls)))

def download_pic(img_lists, dir_name):
    """Download every URL in *img_lists* into folder *dir_name* as <index>.jpg.

    Non-200 responses are skipped; IOError on write is reported and the
    loop continues with the next image.

    :param img_lists: iterable of image URL strings
    :param dir_name: target directory (created if missing)
    """
    print("一共有 {} 张照片".format(len(img_lists)))
    if not os.path.exists(dir_name):  # create the target folder on first use
        os.mkdir(dir_name)
    for i, image_url in enumerate(img_lists):
        response = requests.get(image_url)
        if response.status_code != 200:  # skip broken/expired links
            continue
        file_name = dir_name + os.sep + '%d.jpg' % i
        try:
            # 'with' guarantees the file handle is closed; the original's
            # 'finally: picture.close' was a no-op (missing call parens).
            with open(file_name, "wb") as picture:
                picture.write(response.content)
            # Report the file name, not the (closed) file object.
            print("下载 {} 完成!".format(file_name))
        except IOError:
            print("IO Error\n")
            continue

def mkdir(path):
    """Create directory *path* (including parents) when it does not exist.

    :param path: directory path to ensure
    :return: True if the directory was created, False if it already existed
    """
    if os.path.exists(path):
        print("图片存放于:", os.getcwd() + os.sep + path)
        return False
    print('新建文件夹:', path)
    os.makedirs(path)
    return True


if __name__ == '__main__':
    # question_id = 30061914
    question_id = 26037846  # Zhihu question id to scrape
    path = 'zhihu_pic'  # local folder for downloaded images
    # mkdir(path)  # download_pic creates the folder itself if missing
    # Removed unused 'zhihu_url' local: it was formatted but never referenced.
    img_list = get_image_url(question_id, headers)  # collect image URLs from all answers
    print(img_list)
    download_pic(img_list, path)  # fetch and save the images

# 参考 (reference): https://blog.csdn.net/zuochao_2013/article/details/77899190

# 猜你喜欢
#
# 转载自 (reposted from) blog.csdn.net/weixin_42323337/article/details/84021557