import os
import re
from urllib.error import URLError
from urllib.request import urlopen
def get_page(url):
    """Fetch the raw body of *url*.

    :param url: absolute URL to download.
    :return: response body as ``bytes`` (not decoded — binary files such as
             images cannot be decoded), or ``None`` when the request fails.
    """
    try:
        # Context manager closes the connection; the original leaked the
        # response object on every call.
        with urlopen(url) as resp:
            return resp.read()
    except URLError:
        print("爬取%s失败...." % (url))
        return None
def parser_content(content):
    """Parse a tieba post page and collect all image links.

    :param content: page body as ``bytes`` (as returned by ``get_page``).
    :return: list of ``https://...jpg`` URLs taken from ``BDE_Image`` tags,
             in document order; empty list when none match.
    """
    # errors='replace' keeps a page with a few invalid UTF-8 bytes parseable
    # instead of raising UnicodeDecodeError and aborting the whole crawl.
    # Newlines are flattened so the non-DOTALL regex can span tag lines.
    text = content.decode('utf-8', errors='replace').replace('\n', ' ')
    pattern = re.compile(r'<img class="BDE_Image".*? src="(https://.*?\.jpg)".*?">')
    return pattern.findall(text)
def get_page_img(page):
    """Download every image on one page of the tieba thread into ``img/``.

    :param page: 1-based page number of thread 5752826839.
    :return: None. Side effect: writes one file per image under ``img/``.
    """
    url = "https://tieba.baidu.com/p/5752826839?pn=%s" % (page)
    content = get_page(url)
    if not content:
        # Page fetch failed (get_page already logged it) — nothing to do.
        return
    # Original crashed with FileNotFoundError when img/ did not exist.
    os.makedirs('img', exist_ok=True)
    for imgUrl in parser_content(content):
        imgContent = get_page(imgUrl)
        if imgContent is None:
            # Skip images whose download failed instead of writing None.
            continue
        # Last path component is used as the local file name.
        imgName = imgUrl.split('/')[-1]
        with open('img/%s' % (imgName), 'wb') as f:
            f.write(imgContent)
        print("下载图片%s成功...." % (imgName))
if __name__ == '__main__':
    # Crawl pages 1 through 10 of the thread, one page at a time.
    page = 1
    while page <= 10:
        print("正在爬取第%s页的图片...." % (page))
        get_page_img(page)
        page += 1
# Implements a simple image scraper for a website (baidu tieba thread).
# Reposted from blog.csdn.net/qq_43279936/article/details/88093650