版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/u013716535/article/details/78955634
背景
上一节说到,要求自学使用python爬图,但是上次的代码不具有通用性,所以使用了BeautifulSoup。
- 安装
pip install beautifulsoup4 - 搜索BeautifulSoup用法
很方便地提取出HTML或XML标签中的内容
https://cuiqingcai.com/1319.html - 改编上节代码
- 解释
if((head!='//tb2') & (head!='//tb1')):
因为 "http://tieba.baidu.com/p/2460150866" 这个地址里面有些图片地址很奇怪,是以 //tb2 开头的,代码解析不了,所以我直接用判断做了过滤。
fTail = 'jpg'
因为图片格式很多png…所以决定统一处理成jpg格式
# -*- coding: utf-8 -*-
import urllib
import urllib2
import os
from bs4 import BeautifulSoup
def getContentFromUrl(url):
    """Fetch *url* over HTTP and return the parsed page as a BeautifulSoup tree.

    :param url: absolute URL of the page to download.
    :return: BeautifulSoup object built with the stdlib "html.parser" backend.
    :raises urllib2.URLError: if the page cannot be retrieved.
    """
    html = urllib2.urlopen(url)
    page = html.read()
    # Close the response once the body is buffered; the original leaked
    # the socket and also left a stray debug statement (`print 2`) here.
    html.close()
    soup = BeautifulSoup(page, "html.parser")
    return soup
def mkdir(path):
path = path.strip()
isExists = os.path.exists(path)
if not isExists:
print u'新建了名字叫做',path,u'的文件夹'
os.makedirs(path)
return True
else:
print u'名为',path,u'的文件夹已经创建成功'
return False
def getInfoFromContent(soup,name):
number = 1
imglist = soup.find_all('img')
for imageURL in imglist:
imageURL=imageURL.get('src')
splitPath = imageURL.split('.')
fTail = splitPath.pop()
head=splitPath[0]
if((head!='//tb2')& (head!='//tb1')):
if len(fTail)>=3:
fTail = 'jpg'
number2=1
fileName = name + "/" + str(number2) + "." + fTail
print fileName
# 对于每张图片地址,进行保存
try:
u = urllib2.urlopen(imageURL)
data = u.read()
f = open(fileName,'wb+')
f.write(data)
print u'正在保存的一张图片为',fileName
f.close()
except urllib2.URLError as e:
print (e.reason)
number += 1
if __name__ == "__main__":
    # Crawl one Baidu Tieba thread and dump all of its images into a
    # freshly created folder.
    thread_url = "http://tieba.baidu.com/p/2460150866"
    folder = u'图片2'
    mkdir(folder)
    getInfoFromContent(getContentFromUrl(thread_url), folder)
这是爬的百度贴吧里面图片分享的帖子。下面是网易的,有细微的差别。
# -*- coding: utf-8 -*-
import urllib
import urllib2
import os
from bs4 import BeautifulSoup
def getContentFromUrl(url):
    """Retrieve *url* and parse the response body into a BeautifulSoup tree.

    :param url: absolute URL of the page to download.
    :return: BeautifulSoup object (stdlib "html.parser" backend).
    :raises urllib2.URLError: if the request fails.
    """
    response = urllib2.urlopen(url)
    page = response.read()
    # The original left the response open and contained a stray debug
    # statement (`print 2`); both are removed here.
    response.close()
    return BeautifulSoup(page, "html.parser")
def mkdir(path):
path = path.strip()
isExists = os.path.exists(path)
if not isExists:
print u'新建了名字叫做',path,u'的文件夹'
os.makedirs(path)
return True
else:
print u'名为',path,u'的文件夹已经创建成功'
return False
def getInfoFromContent(soup,name):
imglist = soup.find_all('img')
number=1
for imageURL in imglist:
imageURL=imageURL.get('src')
print imageURL
if imageURL:
splitPath = imageURL.split('.')
fTail = splitPath.pop()
if len(fTail)>=3:
fTail = 'jpg'
fileName = name + "/" + str(number) + "." + fTail
print fileName
# 对于每张图片地址,进行保存
try:
u = urllib2.urlopen(imageURL)
data = u.read()
f = open(fileName,'wb+')
f.write(data)
print u'正在保存的一张图片为',fileName
f.close()
except urllib2.URLError as e:
print (e.reason)
number+=1
if __name__ == "__main__":
    # Crawl the NetEase front page and save every inline image into a
    # dedicated folder.
    front_page = "http://www.163.com/"
    folder = u'图片3'
    mkdir(folder)
    getInfoFromContent(getContentFromUrl(front_page), folder)