Use Python to crawl pictures in Baidu Tieba

First of all, a crawler, as I understand it, is a combination of a web-fetching tool and an information-filtering tool.
The working principle is simple: first obtain the source code of the web page, then filter out the information you want with a filtering tool (regular expressions, in this example).

The following is the source code of the program:

__author__ = 'Liqifeng'
# -*- coding:utf-8 -*-

from urllib import request
import urllib
import re
import os

#爬取贴吧图片类
class heiheihei:
    """Fetch one web page and pull link texts and image URLs out of it.

    The page at ``url`` is downloaded with a browser-like ``User-Agent``
    header and scanned with regular expressions. The original author wrote
    it for Baidu Tieba thread pages.
    """

    # Anchor tags that carry a ``data-field`` attribute and
    # ``target="_blank"``; group 1 is the link text. ``re.S`` lets ``.``
    # cross line breaks inside a tag.
    _NAME_PATTERN = re.compile(
        r'<a data-field=.*?target="_blank">(.*?)</a>', re.S)
    # ``<img class="BDE_Image" ...>`` tags; group 1 is the ``src`` URL.
    _IMAGE_PATTERN = re.compile(
        r'<img class="BDE_Image" src="(.*?)" style=.*?>')

    def __init__(self, url):
        """Remember the address of the page to crawl.

        Args:
            url: URL of the page that the other methods download.
        """
        self.url = url

    def getPage(self):
        """Download ``self.url`` and return its body decoded as UTF-8.

        Returns:
            The page source as a ``str``.

        Raises:
            urllib.error.URLError: if the page cannot be opened.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        # Send a browser-like User-Agent instead of urllib's default one.
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        header = {'User-Agent': user_agent}
        req = request.Request(self.url, headers=header)
        # The context manager closes the response even if read() or
        # decode() raises, so the connection is never leaked.
        with request.urlopen(req) as response:
            return response.read().decode('UTF-8')

    def getName(self):
        """Print the text of every link matched by the name pattern.

        The original author intended these to be the names of the users
        posting in the thread. Each match is printed on its own line;
        nothing is returned.
        """
        content = self.getPage()
        for item in self._NAME_PATTERN.findall(content):
            print(item)

    def getImage(self):
        """Print and download every ``BDE_Image`` picture on the page.

        Each matched image URL is printed and then saved into the current
        working directory via :meth:`saveImg`. Nothing is returned.
        """
        content = self.getPage()
        # File names are made unique by appending one more '1' per image:
        # liqifeng.jpg, liqifeng1.jpg, liqifeng11.jpg, ...
        name = 'liqifeng'
        for item in self._IMAGE_PATTERN.findall(content):
            print(item)
            self.saveImg(item, name + '.jpg')
            name += '1'

    def saveImg(self, ImageUrl, fileName):
        """Download ``ImageUrl`` and write its raw bytes to ``fileName``.

        Args:
            ImageUrl: URL of the image to fetch.
            fileName: Path of the file to create or overwrite.

        Raises:
            urllib.error.URLError: if the image cannot be opened.
            OSError: if the file cannot be written.
        """
        # Close the response as soon as the payload has been read.
        with request.urlopen(ImageUrl) as u:
            data = u.read()
        # ``with`` guarantees the file is closed even if write() fails.
        with open(fileName, 'wb') as f:
            f.write(data)

# Address of the Tieba thread page to crawl.
url = 'http://tieba.baidu.com/p/2125354565?pn=2'
test = heiheihei(url)
# getImage() prints every image URL itself and has no return value, so
# wrapping the call in print() would only emit a stray "None".
test.getImage()

The results are as follows:

Write picture description here

Write picture description here

Guess you like

Origin blog.csdn.net/mrliqifeng/article/details/77972438