前言
因为煎蛋网的妹子图加入了反爬手段,网页中图片的链接已经加密,需要分析 js 来找出图片的请求地址,所以就用了 selenium 对它进行简单爬取
导入包
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import os
模拟谷歌浏览器
broswer = webdriver.Chrome()
将页面中的图片的url放入image_url_list列表
def get_one_page(url):
    """Open *url* in the shared Selenium browser and collect image URLs.

    The page is rendered by the browser (the site obfuscates image links
    with JavaScript), then parsed with BeautifulSoup.  Anchors with class
    ``view_img_link`` hold the real image addresses.

    :param url: page URL to crawl
    :return: list of protocol-relative image hrefs (GIFs excluded)
    """
    image_url_list = []
    # Let the real browser execute the page's JS, then grab the rendered HTML.
    broswer.get(url)
    soup = BeautifulSoup(broswer.page_source, 'lxml')
    for anchor in soup.select("a.view_img_link"):
        href = anchor.get('href')
        # Guard against anchors without an href (original code would append
        # None and crash later in "http:" + None), and skip GIF links.
        if href and 'gif' not in href:
            image_url_list.append(href)
    return image_url_list
将页面中的图片保存到本地
def download_one_page(image_url_list, toPath):
    """Download every image in *image_url_list* into directory *toPath*.

    :param image_url_list: protocol-relative hrefs ("//host/...") as
        collected by ``get_one_page``
    :param toPath: destination directory; created if missing
    """
    # Original code assumed the directory existed; urlretrieve would fail
    # with FileNotFoundError otherwise.
    os.makedirs(toPath, exist_ok=True)
    for partial_url in image_url_list:
        # The scraped hrefs are protocol-relative, so prepend the scheme.
        http_url = "http:" + partial_url
        filename = os.path.split(partial_url)[1]
        path = os.path.join(toPath, filename)
        try:
            print("照片正在下载......", http_url)
            urllib.request.urlretrieve(http_url, filename=path)
        except urllib.error.URLError as e:
            # Best-effort: report the failure and continue with the next image.
            print(e)
爬取多页目标
def main():
    """Crawl pages 38-39 of jandan.net/ooxx and save their images locally."""
    toPath = r"E:\python\requests和正则爬取图片\jiandan"
    # format() stringifies its argument itself; no need for str(i).
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(i)
            for i in range(38, 40)]
    try:
        for url in urls:
            download_one_page(get_one_page(url), toPath)
    finally:
        # quit() terminates the whole chromedriver session; the original
        # close() only closed the window and leaked the driver process.
        # The finally block guarantees cleanup even if a page fails.
        broswer.quit()
完整代码展示
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import os
broswer = webdriver.Chrome()
def get_one_page(url):
    """Render *url* in the shared browser and return its non-GIF image links.

    :param url: page URL to crawl
    :return: list of href strings taken from ``a.view_img_link`` anchors
    """
    # Drive the real browser so the site's JS runs, then parse the result.
    broswer.get(url)
    page_html = broswer.page_source
    parsed = BeautifulSoup(page_html, 'lxml')

    collected = []
    for anchor in parsed.select("a.view_img_link"):
        link = anchor.get('href')
        # Keep everything except GIF links.
        if 'gif' not in str(link):
            collected.append(link)
    return collected
def download_one_page(image_url_list, toPath):
    """Fetch each image URL in *image_url_list* and store it under *toPath*.

    :param image_url_list: protocol-relative hrefs ("//host/...")
    :param toPath: existing destination directory
    """
    for relative_url in image_url_list:
        # Hrefs are protocol-relative; prepend the scheme before fetching.
        full_url = "http:" + relative_url
        target = os.path.join(toPath, os.path.split(relative_url)[1])
        try:
            print("图片正在下载......", full_url)
            urllib.request.urlretrieve(full_url, filename=target)
        except urllib.error.URLError as err:
            # Report the failure and move on to the next image.
            print(err)
def main():
    """Crawl a small range of jandan.net/ooxx pages and save their images."""
    toPath = r"E:\python\requests和正则爬取图片\jiandan"
    page_numbers = range(38, 40)
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(str(n))
            for n in page_numbers]
    print(type(urls))
    for page_url in urls:
        links = get_one_page(page_url)
        download_one_page(links, toPath)
    broswer.close()


if __name__ == "__main__":
    main()