爬取贴吧图片(静态网页)【bs解析网页+re正则匹配】

# Parsing tag content: use get_text() to read a tag's text, and get('<attribute name>') to read one of its attribute values.

import requests
from bs4 import BeautifulSoup
import re

# Tieba thread page that get_html() downloads.
url = 'http://tieba.baidu.com/p/4178314700'

def get_html():
    """Fetch the page at the module-level ``url`` and return its HTML text.

    Returns:
        The response body decoded to ``str`` by requests.

    Raises:
        requests.RequestException: on timeout, connection failure, or an
            HTTP error status.
    """
    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(url, timeout=10)
    # Fail loudly instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text

def getimage(html, save_dir='C:/Users/Lenovo/Desktop/Pic'):
    """Download every JPEG image referenced by ``<img>`` tags in *html*.

    The page is parsed with BeautifulSoup. Each ``<img>`` tag whose ``src``
    attribute starts with ``https://`` and contains ``.jpg`` is printed and
    then saved as ``<save_dir>/<n>.jpg``, where ``n`` counts up from 0 in
    document order.

    Args:
        html: HTML source of the page, as a string.
        save_dir: Directory the images are written to; created if missing.
            Defaults to the folder that was previously hard-coded.

    Raises:
        requests.RequestException: if an image download times out, fails to
            connect, or returns an HTTP error status.
    """
    import os  # local import: only this function needs it

    soup = BeautifulSoup(html, 'lxml')

    # The dot before "jpg" is escaped so it matches a literal '.', and the
    # scheme requires both slashes. re.match only anchors at the start, so
    # URLs with a trailing query string after ".jpg" still match.
    pattern = re.compile(r'https://.*\.jpg')

    # Tag.get() returns None for <img> tags without a src attribute; those
    # must be skipped because re.match() raises TypeError on None.
    sources = (img.get('src') for img in soup.find_all('img'))
    jpg_urls = [src for src in sources if src and pattern.match(src)]

    for link in jpg_urls:
        print(link)

    os.makedirs(save_dir, exist_ok=True)
    for index, link in enumerate(jpg_urls):
        # Download first so a failed request never leaves an empty or
        # error-page file behind under a .jpg name.
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        with open(os.path.join(save_dir, '%s.jpg' % index), 'wb') as file:
            file.write(response.content)
    

if __name__ == '__main__':
    # Fetch the thread page, then download the images it references.
    getimage(get_html())

猜你喜欢

转载自blog.csdn.net/qq_41333844/article/details/85252843
今日推荐