A Python script that crawls the images in a Baidu Tieba thread and downloads them to a specified directory, with support for downloading multiple pages of the thread.

# -*- coding: utf-8 -*-

import requests
import re
import os

#url = 'http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=图片&fr=ala&ala=1&alatpl=others&pos=0'

url = 'https://tieba.baidu.com/p/5580939290'  # URL of the Tieba thread whose images will be scraped

downloadDirPath = '/Users/zhoujinglin/Desktop/download'  # local directory the downloaded images are saved to


# Request headers used when fetching the page source, so the scrape is not rejected
webHeaders = {
    'Host': 'tieba.baidu.com',
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'cookie': 'TIEBA_USERTYPE=6e38cd16e89501afe2f17ce4; FP_UID=88cd11aa371a693bafd977c8950b1f01; PSTM=1508392484; BIDUPSID=7DFA1B875D5271A5BAAA5561AA5918E6; BAIDUID=E8466CF472ECBD1C3AFE02033D3B0EC8:FG=1; FP_LASTTIME=1508484287746; TIEBAUID=cb23caae14130a0d384a57f1; H_PS_PSSID=1435_25810_21096_20929; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; PSINO=1; wise_device=0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1521629325; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1521629325',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

# Headers used when downloading the image bytes, so the saved files are not corrupted and unopenable
imgHeaders = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}



# Fetch the HTML source of the given url
def getHtmlText(url):
    try:
        resp = requests.get(url, headers=webHeaders)
        print(str(resp.encoding) + '\n\n')
        return resp.text
    except requests.RequestException as e:
        print('failed to fetch ' + url + ': ' + str(e))
        return ''


# Parse the page source and return a list of image URLs
def getUrlsFromHTML(html):
    # regex: a src attribute whose value starts with http and ends in .jpg, .png or .gif,
    # with anything except a closing double quote in between
    reg = r'src="(http[^"]+?\.(?:jpg|png|gif))"'
    imgre = re.compile(reg)
    urls = imgre.findall(html)
    return urls
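
# A quick, hypothetical illustration of what getUrlsFromHTML returns; the HTML snippet
# and the image URL below are made up for this example, not taken from a real Tieba page:
#
#   sample = '<img class="BDE_Image" src="https://imgsa.baidu.com/forum/pic/item/example.jpg" width="560">'
#   getUrlsFromHTML(sample)   # -> ['https://imgsa.baidu.com/forum/pic/item/example.jpg']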


i = 0  # global counter used to number the downloaded files

# Download every image in the list of URLs
def download(List):
    global i
    for url in List:
        r = requests.get(url, headers=imgHeaders, timeout=30)  # headers keep the downloaded images from being unopenable
        if not os.path.exists(downloadDirPath):  # create the download directory if it does not exist yet
            os.makedirs(downloadDirPath)
        if '.png' in url:  # work out the image format from its URL and use it when naming the saved file
            path = downloadDirPath + '/' + str(i) + '.png'  # join the download directory and the file name into the save path
        elif '.jpg' in url:
            path = downloadDirPath + '/' + str(i) + '.jpg'
        else:
            path = downloadDirPath + '/' + str(i) + '.gif'
        if not os.path.exists(path):
            with open(path, 'wb') as f:
                f.write(r.content)  # write the image bytes into the file
                print(str(i) + ' saved successfully')
        else:
            print('file already exists')
        i += 1
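
# A more compact alternative to the if/elif extension check above, sketched here only as
# an illustration: because getUrlsFromHTML only returns URLs ending in .jpg, .png or .gif,
# os.path.splitext can take the extension straight from the URL. buildSavePath is a
# hypothetical helper, not part of the original script, and is never called above.
def buildSavePath(imgUrl, index):
    ext = os.path.splitext(imgUrl)[1]  # '.jpg', '.png' or '.gif'
    return downloadDirPath + '/' + str(index) + ext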


def main():
    html = getHtmlText(url)
    urls = getUrlsFromHTML(html)
    for imgUrl in urls:  # print every image URL that was found
        print(imgUrl + '\n')
    download(urls)

# A thread can span many pages; as an example, this loop downloads a two-page thread,
# fetching and downloading one page at a time
for index in range(1, 3):
    print('\n' + 'downloading images from page ' + str(index))
    url = 'https://tieba.baidu.com/p/5580939290?pn=' + str(index)
    main()



Reposted from blog.csdn.net/u013857988/article/details/79656344