Sharing a crawler that grabs all the images from a Baidu Tieba using XPath (quick and dirty, source code attached), plus a regex-based version. As a bonus at the end there is a script that can crawl every image in the 头像 (avatar) bar — I pulled down close to ten thousand images with it.

"""
    level1:
    贴吧图片爬虫
    输入贴吧名,起始页数,终止页数,爬取帖子中的图片,保存到images文件夹下,图片命名
    贴吧名_xx.jpg

"""

from urllib import request
from urllib import parse
from urllib import error
from lxml import etree
import string
import os

def tiebaSpider(url, beginPage, endPage):
    """
        Builds the full URL for each list page and sends the request.
        url: the base URL to start from
        beginPage: first page to crawl
        endPage: last page to crawl
    """
    os.makedirs('./images', exist_ok=True)
    for page in range(beginPage, endPage + 1):
        # Tieba list pages advance 50 threads at a time, so pn grows by 50 per page
        pn = (page - 1) * 50

        filename = "page_" + str(page) + ".html"
        # Build the complete URL for this page
        fullurl = url + "&pn=" + str(pn)
        # Send the request and fetch the HTML page
        print("Downloading " + filename)
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        req = request.Request(fullurl, headers=headers)
        response = request.urlopen(req)
        html = response.read()
        html = etree.HTML(html)

        results = html.xpath('//div/a[@class="grbm_ele_a grbm_ele_big"]/img/@src')
        for i in range(len(results)):
            # Use a separate name here: rebinding `url` would corrupt the next page's URL
            img_url = results[i]
            imgresponse = request.urlopen(img_url)
            images = imgresponse.read()
            # Write the image to disk; include the page number so pages don't overwrite each other
            with open('./images/%s_%s_%s.jpg' % (kw, page, i), 'wb') as file:
                file.write(images)

# main entry point
if __name__ == "__main__":
    proxy = {"http": "118.31.220.3:8080"}
    proxy_support = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
    kw = input("Enter the Tieba to crawl: ")
    # Read the start and end pages and convert them from str to int
    beginPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    # Combined URL looks like: http://tieba.baidu.com/f?kw=lol
    url = url + key
    url = parse.quote(url, safe=string.printable)
    tiebaSpider(url, beginPage, endPage)
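
Tieba's list-page markup changes fairly often, so if the class-based XPath above comes back empty, it helps to sanity-check the expression against a saved or hand-written HTML snippet before pointing the spider at the live site. A minimal sketch — the sample HTML below is made up for illustration, not real Tieba markup:

# Sketch: testing an XPath expression offline with lxml.
# The HTML below is an invented snippet, not real Tieba markup.
from lxml import etree

sample = '''
<div>
  <a class="grbm_ele_a grbm_ele_big" href="#">
    <img src="http://example.com/pic1.jpg"/>
  </a>
</div>
'''

tree = etree.HTML(sample)
srcs = tree.xpath('//div/a[@class="grbm_ele_a grbm_ele_big"]/img/@src')
print(srcs)  # expected: ['http://example.com/pic1.jpg']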

The regex approach (I've tested this myself and it works):

"""
5、输入贴吧名,起始页数,终止页数,爬取帖子中的图片,保存到images文件夹下,图片命名 贴吧名_xx.jpg
"""
"""
    level1:
    贴吧图片爬虫
    输入贴吧名,起始页数,终止页数,爬取帖子中的图片,保存到images文件夹下,图片命名
    贴吧名_xx.jpg

"""

from urllib import request
from urllib import parse
from urllib import error
import string, re, os

def tiebaSpider(url, beginPage, endPage):
    """
        Builds the full URL for each list page and sends the request.
        url: the base URL to start from
        beginPage: first page to crawl
        endPage: last page to crawl
    """
    os.makedirs('./zhende', exist_ok=True)
    for page in range(beginPage, endPage + 1):
        # Tieba list pages advance 50 threads at a time, so pn grows by 50 per page
        pn = (page - 1) * 50

        filename = "page_" + str(page) + ".html"
        # Build the complete URL for this page
        fullurl = url + "&pn=" + str(pn)
        # Send the request and fetch the HTML page
        print("Downloading " + filename)
        headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        req = request.Request(fullurl, headers=headers)
        response = request.urlopen(req)
        html = response.read()
        resHtml = html.decode("utf-8", 'ignore')
        # Extract the image URLs from the bpic attribute with a regex
        img = r'<a .*?><img .*? bpic="(.*?)" .*?/></a>'
        img_pattern = re.compile(img, re.I | re.S | re.M)
        image = img_pattern.findall(resHtml)
        for i in range(len(image)):
            # Use a separate name here: rebinding `url` would corrupt the next page's URL
            img_url = image[i]
            imgresponse = request.urlopen(img_url)
            images = imgresponse.read()
            # Write the image to disk; include the page number so pages don't overwrite each other
            with open('./zhende/%s_%s_%s.jpg' % (kw, page, i), 'wb') as file:
                file.write(images)


# main entry point
if __name__ == "__main__":
    proxy = {"http": "118.31.220.3:8080"}
    proxy_support = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
    kw = input("Enter the Tieba to crawl: ")
    # Read the start and end pages and convert them from str to int
    beginPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    # Combined URL looks like: http://tieba.baidu.com/f?kw=lol
    url = url + key
    url = parse.quote(url, safe=string.printable)
    tiebaSpider(url, beginPage, endPage)
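
The regex here keys on a bpic="..." attribute in the list-page HTML. Before letting the spider loose it is worth testing the pattern against a small hand-written snippet — a minimal sketch, with invented HTML rather than real Tieba markup:

# Sketch: testing the bpic-extraction regex offline.
# The HTML string is invented for illustration; real Tieba markup may differ.
import re

sample = '<a class="thumbnail" href="#"><img width="200" bpic="http://example.com/a.jpg" class="x"/></a>'

img_pattern = re.compile(r'<a .*?><img .*? bpic="(.*?)" .*?/></a>', re.I | re.S | re.M)
print(img_pattern.findall(sample))  # expected: ['http://example.com/a.jpg']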

And finally, the crawler that pulls the images from the 头像 (avatar) bar:

"""
/**
 * _ooOoo_
 * o8888888o
 * 88" . "88
 * (| -_- |)
 *  O\ = /O
 * ___/`---'\____
 * .   ' \\| |// `.
 * / \\||| : |||// \
 * / _||||| -:- |||||- \
 * | | \\\ - /// | |
 * | \_| ''\---/'' | |
 * \ .-\__ `-` ___/-. /
 * ___`. .' /--.--\ `. . __
 * ."" '< `.___\_<|>_/___.' >'"".
 * | | : `- \`.;`\ _ /`;.`/ - ` : | |
 * \ \ `-. \_ __\ /__ _/ .-` / /
 * ======`-.____`-.___\_____/___.-`____.-'======
 * `=---='
 *          .............................................
 *           佛曰:bug泛滥,我已瘫痪!
 */
"""

import os

import requests
from lxml import etree

# Proxy settings (note: these only take effect if passed to requests.get via proxies=...)
proxies = {
    "http": "http://115.225.88.99:8118",
}
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

tieba_name = input("Enter the Tieba to crawl: ")
start_page = int(input("Enter the start page: "))
end_page = int(input("Enter the end page: "))

url = "https://tieba.baidu.com/f?"
word = {'kw': tieba_name}

while start_page <= end_page:

    # Convert the user-entered page number into the pn offset Tieba expects
    urlpage = start_page * 50 - 50
    # Move on to the next list page on the next pass
    start_page += 1
    # Put the pn value into the query dict
    word['pn'] = urlpage
    # Request the Tieba list page and get the response object
    response = requests.get(url, params=word, headers=headers)
    # Read the response body
    html = response.text
    # Parse the HTML
    html = etree.HTML(html)
    # XPath match for the thread-title containers
    results = html.xpath('//div[contains(@class,"threadlist_title pull_left j_th_tit")]')
    # Prefix for thread links
    tieba_url = "https://tieba.baidu.com"

    # Counter: which title on this list page we are on
    title_page = 0
    for result in results:
        title_page += 1
        # Relative link to the thread
        html_url = result.xpath('./a/@href')[0]
        # Title text of the thread
        html_title = result.xpath('./a')[0].text
        # Build the full thread URL
        full_tieba = tieba_url + html_url + '?'
        # Start from the first page of the thread
        page_now = {'pn': 1}
        # Request the first page of the thread
        response = requests.get(full_tieba, params=page_now, headers=headers)
        text = response.content.decode("utf-8", errors='ignore')
        html = etree.HTML(text)
        # Total number of pages in this thread
        total_pages = html.xpath("//li[@style='margin-left:8px']/span[last()]")
        print(total_pages)
        total_pages = total_pages[0].text

        # Counter: how many images this thread has yielded so far
        a = 0
        for each_page in range(1, int(total_pages) + 1):
            # Request the current page of the thread
            page_now2 = {'pn': each_page}
            response = requests.get(full_tieba, params=page_now2, headers=headers)
            text = response.content.decode("utf-8", errors='ignore')
            html = etree.HTML(text)
            # Collect the image URLs on this page
            res_image = html.xpath('//div[contains(@class,"p_content")]//img[@class="BDE_Image"]/@src')

            if len(res_image) != 0:
                # Walk the list of image URLs
                for each_image in res_image:
                    try:
                        # Running count of images saved for this thread
                        a += 1
                        response1 = requests.get(each_image, headers=headers)
                        # Raw image bytes
                        img_data = response1.content
                        # Work out which list page we are on (for the filename)
                        page_num = int(word['pn']) / 50 + 1
                        print("Crawling... list page:", int(page_num),
                              "images on this thread page:", len(res_image),
                              "title no.:", title_page, html_title,
                              "thread page:", each_page)

                        new_path = ("./touxiang/" + word['kw'] + "_page" + str(int(page_num)) +
                                    "_title" + str(title_page) + "_p" + str(each_page) +
                                    "_img" + str(a) + ".jpg")
                        # Create the folder if needed and write the image
                        parent_path = os.path.dirname(new_path)
                        if not os.path.exists(parent_path):
                            os.makedirs(parent_path)
                        with open(new_path, "wb") as file_o:
                            file_o.write(img_data)

                    except Exception as e:
                        pass

print("Done crawling!")
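
One thing worth noting about the script above: the proxies dict at the top is defined but never handed to requests, so every request actually goes out directly. If you do want the traffic routed through the proxy, it has to be passed explicitly on each call — a minimal sketch (the proxy address is just the placeholder from the script, substitute a live one):

# Sketch: actually routing a requests call through the proxy dict.
# The proxy address below is the example value from the script, not a working proxy.
import requests

proxies = {"http": "http://115.225.88.99:8118"}
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

response = requests.get("https://tieba.baidu.com/f",
                        params={"kw": "头像", "pn": 0},
                        headers=headers,
                        proxies=proxies,
                        timeout=10)
print(response.status_code)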





Reposted from blog.csdn.net/lzz781699880/article/details/81127951