Scraping Tieba images and comments with Python 3, downloading and saving the pictures

Scraping Baidu Tieba
Goals:
    1. Get the post title, the total page count, the comments and the images
    2. Write the images to files and save them
    3. Print the various pieces of information along the way (for test tracing)
    4. Do all of the above from just a post number, so it works for other posts too (the URL pattern this relies on is sketched right below)
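
As used by getAllPage further down, a post is addressed by http://tieba.baidu.com/p/ plus the post number, and each of its pages by a ?pn= query parameter. A minimal sketch of that URL layout:

post_id = 4252370485                                # the post number from the thread URL
base = "http://tieba.baidu.com/p/" + str(post_id)
page_url = base + '?pn=' + str(2)                   # page 2 of the thread
print(page_url)                                     # http://tieba.baidu.com/p/4252370485?pn=2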

First version:

# -*-coding:utf-8-*-
import random
import re
import os
import urllib
import requests
import urllib.request
import time
from PIL import Image
from io import BytesIO

from bs4 import BeautifulSoup

"""初始化查询的网址"""
siteURL = "http://tieba.baidu.com/p/"


def replace(x):
    """
    方便用replace方法把换行符等删除
    :param: x
    :return: x.strip
    """
    # 将list 转化为字符串 否则报错expected string or bytes-like object
    x = ''.join(x)
    removeImg = re.compile('<img.*?>|{7}| ')  # 去除img标签,1-7位空格,
    removeAddr = re.compile('<a.*?>|</a>')  # 删除超链接标签
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # 把换行的标签换位\n
    replaceTD = re.compile('<td>')  # 把表格制表<td>换为\t
    replaceBR = re.compile('<br><br>|<br>|</br>|</br></br>')  # 把换行符或者双换行符换为\n
    removeExtraTag = re.compile('.*?')  # 把其余标签剔除
    removeNoneLine = re.compile('\n+')  # 把多余空行删除
    removeNoneLine = re.compile('\n+')  # 把多余空行删除
    x = re.sub(removeImg, "", x)
    x = re.sub(removeAddr, "", x)
    x = re.sub(replaceLine, "\n", x)
    x = re.sub(replaceTD, "\t", x)
    x = re.sub(replaceBR, "\n", x)
    x = re.sub(removeExtraTag, "", x)
    x = re.sub(removeNoneLine, "\n", x)
    return x.strip()  # 把strip()前后多余内容删除


def getSource(url):
    """
    获取网页源码
    :param: url
    :return: result
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)'
    ]

    # pick a user agent at random
    index = random.randint(0, len(user_agents) - 1)
    user_agent = user_agents[index]
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def saveImage(imageURL, path, title, name, pageName):
    """
    保存图片写入文件
    :param: imageURL, path, title, name, pageName
    :return:
    """
    try:
        # 命名格式有问题 不能用网址命名 因为有'/' 命名格式不行 下面在做一个过滤器 下周一来搞
        # 解决方案 在调用前传入名称就好
        proDir = os.path.split(os.path.realpath(__file__))[0]
        fileName = name + '.' + 'jpg'
        filePath = os.path.join(proDir, "photo", title, path, pageName)
        urllib.request.urlretrieve(imageURL, filePath + fileName)
        # urllib.request.urlretrieve(imageURL, filePath)
        # urllib.request.urlretrieve(imageURL, filePath + '\\%s.jpg' % imageURL)
    except Exception as e:
        print(e)


def getTitle(url):
    """
    获取帖子的标题,并打印输出
    :param: url
    :return: iteam
    """
    result = getSource(url)
    pattern = re.compile('<h1.*?title.*?>(.*?)</h1>', re.S)
    iteam = re.findall(pattern, result)
    text = replace(iteam)
    print(u'这篇文章的标题为------' + text)
    return text


def getPageNumber(url):
    """
    获取该帖子的总页数,并打印输出
    :param: url
    :return:iteams
    """
    result = getSource(url)
    soup = BeautifulSoup(result, 'lxml')
    # pattern = re.compile('<div class="pb_footer".*?<ul class="l_posts_num".*?<li class="l_reply_num".*?<span.*?red.*?></span>', re.S)
    # 若存在多个pattern子串 只返回第一个
    # iteams = re.findall(pattern, result)
    iteams = soup.find_all('span', attrs={'class': 'red'})
    iteam = iteams[-1].get_text()
    print(iteam)
    page = replace(iteam)
    print(u'这篇文章的总页数为------' + page)
    return page


def getContent(url):
    """
    获取评论
    :param: url
    :return:items
    """
    result = getSource(url)
    pattern = re.compile('<a data-field.*?p_author_name.*?>(.*?)</a>.*?<div id="post_content_.*?>(.*?)</div>', re.S)
    items = re.findall(pattern, result)
    number = 1
    for item in items:
        data = str(number) + u'楼\t楼主:' + item[0] + '\n内容:' + item[1] + '\n'
        print(data)
        number += 1
    return items


def makeDir(path, title):
    """
    创建目录
    :param: path,title
    :return: path
    """
    # 获取 proDir 主路径
    proDir = os.path.split(os.path.realpath(__file__))[0]
    # filePath 配置文件路径地址
    filePath = os.path.join(proDir, "photo", title)
    # strip() 作用是去掉换行符之类的
    # 为每一页创建路径
    path = path.strip()
    # os.path.exists() 用来检查路径是否存在 false true
    E = os.path.exists(os.path.join(filePath, path))
    if not E:
        # 创建新目录,若想将内容保存至别的路径(非系统默认),需要更环境变量 os.makedir如果子目录创建失败或者已经存在
        # 更改环境变量用os.chdir() 切换路径
        os.makedirs(os.path.join(filePath, path))
        os.chdir(os.path.join(filePath, path))
        print(u'正在创建名为', path, u'的文件夹')
        return path
    else:
        print(u'名为', path, u'的文件夹已经存在...')
        return path


def getImage(url):
    """
    获取晒图,清洗获得链接并保存入list
    :param: url
    :return: images
    """
    result = getSource(url)
    soup = BeautifulSoup(result, 'lxml')
    # 此处用BeautifulSoup显然更高效
    # find_all()返回一个list,find()返回一个元素
    # 注意class属性和python内置的重合,所以加_变成class_
    # items = soup.find_all('img', class_="BDE_Image")
    items = soup.find_all('img', attrs={'class': 'BDE_Image'})
    images = []
    number = 0
    for item in items:
        print(u'发现一张图,链接为------', item['src'])
        images.append(item['src'])
        number += 1
    if number >= 1:
        print(u'\n', u'共晒图', number, u'张,厉害了我的哥!!!')
    else:
        print(u'喏,没有图......')
    return images


def getAllPage(Num, siteURL=siteURL):
    """
    :param: Num,siteURL
    :return:
    """
    siteURL = siteURL + str(Num)
    # 获取帖子标题
    title = getTitle(siteURL)
    # 获取帖子页数
    numbers = getPageNumber(siteURL)
    # 浏览全部页面
    for page in range(1, int(numbers) + 1):
        # 格式化索引链接
        url = siteURL + '?pn=' + str(page)
        print(u'\n\n', u'正准备获取第', page, u'页的内容...')
        # 获取评论
        print(u'\n', u'正准备获取评论...')
        getContent(url)
        # 保存图片
        # 每一页创建一个文件 page1 page2 page3
        path = makeDir(path='page' + str(page), title=title)
        # 获取图片
        print(u'\n', u'正准备获取图片...')
        images = getImage(url)
        print(images)
        print(u'\n', u'正准备保存图片...')
        number = 1
        # 保存图片,先从之前的list中找链接
        for detailURL in images:
            pageName = str(page) + str(number)
            name = 'page' + str(page) + 'num' + str(number)
            saveImage(detailURL, path, title, name, pageName)
            time.sleep(0.1)
            number += 1

        print(u'\n\n', u'完成第', page, u'页'
              )

    print(u'\n\n', u'恭喜,圆满成功!')


def main():
    """主函数  Num填写帖子号 打开帖子查看网址最后一串数字"""
    Num = 4252370485
    items = getAllPage(Num)


if __name__ == "__main__":
    main()
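
A side note on the naming caveat mentioned in saveImage: an image URL cannot be used directly as a file name because it contains '/', which is why the script builds a pageXnumY name before the call. If you did want to derive a file name from the URL itself, a minimal sketch of such a filter could look like the following (sanitize_name is a hypothetical helper, not part of the script above):

import re

def sanitize_name(url):
    # keep letters, digits, dots, dashes and underscores; replace everything else (including '/') with '_'
    return re.sub(r'[^0-9A-Za-z._-]', '_', url)

# e.g. sanitize_name('http://example.com/forum/pic/item/abc.jpg')
# -> 'http___example.com_forum_pic_item_abc.jpg'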

After some optimization, with the duplicated bits removed:

'''
Scraping Baidu Tieba (optimized)
Goals:
    1. Get the post title, the total page count, the comments and the images
    2. Write the images to files and save them
    3. Print the various pieces of information along the way (for test tracing)
    4. Do all of the above from just a post number, so it works for other posts too
'''
# -*-coding:utf-8-*-
import random
import re
import os
import urllib
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

"""初始化查询的网址"""
siteURL = "http://tieba.baidu.com/p/"


def replace(x):
    """
    方便用replace方法把换行符等删除
    :param: x
    :return: x.strip
    """
    # 将list 转化为字符串 否则报错expected string or bytes-like object
    x = ''.join(x)
    removeImg = re.compile('<img.*?>|{7}| ')  # 去除img标签,1-7位空格,
    removeAddr = re.compile('<a.*?>|</a>')  # 删除超链接标签
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # 把换行的标签换位\n
    replaceTD = re.compile('<td>')  # 把表格制表<td>换为\t
    replaceBR = re.compile('<br><br>|<br>|</br>|</br></br>')  # 把换行符或者双换行符换为\n
    removeExtraTag = re.compile('.*?')  # 把其余标签剔除
    removeNoneLine = re.compile('\n+')  # 把多余空行删除
    x = re.sub(removeImg, "", x)
    x = re.sub(removeAddr, "", x)
    x = re.sub(replaceLine, "\n", x)
    x = re.sub(replaceTD, "\t", x)
    x = re.sub(replaceBR, "\n", x)
    x = re.sub(removeExtraTag, "", x)
    x = re.sub(removeNoneLine, "\n", x)
    return x.strip()  # 把strip()前后多余内容删除


def getSource(url):
    """
    获取网页源码
    :param: url
    :return: result
    """
    # 设置18个浏览头 防止重复被封
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:43.0) Gecko/20100101 Firefox/43.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)'
    ]

    length = len(user_agents)
    # pick one of the 18 at random: a random integer n with 0 <= n <= length - 1
    index = random.randint(0, length - 1)
    user_agent = user_agents[index]
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def get_Title_Numbers(url):
    """
    获取帖子的标题和总页数,并打印输出
    :param: url
    :return:title,page
    """
    result = getSource(url)
    # 下面的用正则来查找 总页数用正则查不到 因此就用了 beautifulsoup ,将两个合并在一块干脆全部使用soup吧
    # pattern = re.compile('<h1.*?title.*?>(.*?)</h1>', re.S)
    # iteam = re.findall(pattern, result)
    # text = replace(iteam)
    # print(u'这篇文章的标题为------' + text)
    soup = BeautifulSoup(result, 'lxml')
    titles = soup.find_all('h1', attrs={'class': 'core_title_txt'})
    title = titles[0].get_text()
    print(u'这篇文章的标题为------' + title)
    pages = soup.find_all('span', attrs={'class': 'red'})
    page = pages[-1].get_text()
    print(u'这篇文章的总页数为------' + page)
    return title, page


def makeDir(title):
    """
    创建目录
    :param:title
    :return:filePath
    缺陷:若是已经创建该文件夹 重复创建会失败 考虑下已经创建的情况(已改)
    """
    # 获取 proDir 主路径
    proDir = os.path.split(os.path.realpath(__file__))[0]
    filePath = os.path.join(proDir, "photo", title)
    # os.path.exists() 用来检查路径是否存在 false true
    E = os.path.exists(filePath)
    if not E:
        # 创建新目录,若想将内容保存至别的路径(非系统默认),需要更环境变量 os.makedir如果子目录创建失败或者已经存在
        # 更改环境变量用os.chdir() 切换路径
        os.makedirs(os.path.join(filePath))
        os.chdir(os.path.join(filePath))
        print(u'正在创建名为', title, u'的文件夹')
        return filePath+"\\"
    else:
        print(u'名为', title, u'的文件夹已经存在...')
        return filePath+"\\"


def get_Content_Images(url, page, path):
    """
    获取图片与评论,返回图片打印输出评论
    :param: url, page, title
    :return:
    """
    result = getSource(url)
    pattern = re.compile('<a data-field.*?p_author_name.*?>(.*?)</a>.*?<div id="post_content_.*?>(.*?)</div>', re.S)
    items = re.findall(pattern, result)
    print(u'\n', u'正准备获取评论...')
    number = 1
    for item in items:
        data = str(number) + u'楼\t楼主:' + item[0] + '\n内容:' + item[1] + '\n'
        print(data)
        number += 1
    soup = BeautifulSoup(result, 'lxml')
    items = soup.find_all('img', attrs={'class': 'BDE_Image'})
    images = []
    number = 0
    print(u'\n', u'正准备获取图片...')
    for item in items:
        print(u'发现一张图,链接为------', item['src'])
        images.append(item['src'])
        number += 1
    if number >= 1:
        print(u'\n', u'共晒图', number, u'张!')
    else:
        print(u'emm,没有图......')
    print(u'\n', u'正准备保存图片...')
    number = 1
    # 保存图片,先从之前的list中找链接
    for detailURL in images:
        name = 'page' + str(page) + 'num' + str(number)
        fileName = name + '.' + 'jpg'
        urllib.request.urlretrieve(detailURL, path+fileName)
        time.sleep(0.1)
        number += 1
    print(u'\n\n', u'获取第', page, u'页图片已经完成!')


def getAllPage(Num, siteURL=siteURL):
    """
    得到全部页面
    :param: Num,siteURL
    :return:
    """
    siteURL = siteURL + str(Num)
    # 获取帖子标题和页数可以合并为一个
    title, numbers = get_Title_Numbers(siteURL)
    # 创建文件夹
    path = makeDir(title)
    # 浏览全部页面 从第一页开始到最后一页
    for page in range(1, int(numbers) + 1):
        # 格式化索引链接 查看方法 看URL地址,找规矩
        url = siteURL + '?pn=' + str(page)
        print(u'\n\n', u'正准备获取第', page, u'页的内容...')
        # 获取评论 Because每一页都不同 So都要重新 getSource(url) 可以将获取图片和评论两个结合在一块
        get_Content_Images(url, page, path)
    print(u'\n\n', u'恭喜,圆满成功!')


def shuru():
    """
    输入函数
    :param:
    :return: x
    """
    print("请输入百度贴吧的帖子号:")
    x = input("请输入......")
    return x


def main():
    """主函数  Num填写帖子号 打开帖子查看网址最后一串数字"""
    # 4252370485
    Num = 4252370485
    items = getAllPage(Num)


if __name__ == "__main__":
    main()
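
One loose end in the optimized version: shuru() exists for goal 4 (typing in the post number) but main() still hard-codes Num, so the input function is never called. A minimal sketch of wiring the two together, assuming the interactive behaviour is what you want (this is not how the original main() is written):

def main():
    """Read the post number from the keyboard and crawl that post."""
    num = shuru()            # e.g. type 4252370485 at the prompt
    getAllPage(int(num))     # getAllPage appends the number to siteURL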

Reprinted from my.oschina.net/u/2672404/blog/1611792