python3通过selenium对煎蛋网妹子图的简单爬取

自行参考

前言

因为煎蛋网的妹子图加入了反爬手段，网页中图片的链接已经加密，需要分析 js 才能找出图片的真实请求地址，所以就用了 selenium 对它进行简单爬取。

导入包

from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import os

模拟谷歌浏览器

# Launch a real Chrome window driven by Selenium (requires chromedriver on PATH).
# NOTE(review): "broswer" is a typo for "browser"; every function below reads
# this module-level name, so it is kept as-is here.
broswer = webdriver.Chrome()

将页面中的图片的url放入image_url_list列表

def get_one_page(url):
    """Collect the image URLs from one gallery page.

    Opens *url* in the module-level Selenium browser (``broswer``) so the
    site's JavaScript runs and the obfuscated image links are resolved in
    the DOM, then parses the rendered HTML.

    :param url: gallery page URL to scrape
    :return: list of protocol-relative image URLs (``//host/path/name.jpg``),
             GIF animations excluded
    """
    image_url_list = []
    # Let the real browser execute the page's JS before reading the source.
    broswer.get(url)
    soup = BeautifulSoup(broswer.page_source, 'lxml')
    # Each full-size image is wrapped in an <a class="view_img_link"> anchor.
    for image in soup.select("a.view_img_link"):
        hf = image.get('href')
        # BUG FIX: get('href') may return None; the original str() comparison
        # let None slip into the list and crash later in "http:" + None.
        # Also skip GIF animations.
        if hf and 'gif' not in hf:
            image_url_list.append(hf)
    return image_url_list

将页面中的图片保存到本地

def download_one_page(image_url_list, toPath):
    """Download every image in *image_url_list* into directory *toPath*.

    :param image_url_list: protocol-relative URLs (``//host/path/name.jpg``)
    :param toPath: destination directory; created if it does not exist
    """
    # Robustness: the original crashed when the target directory was missing.
    os.makedirs(toPath, exist_ok=True)
    for relative_url in image_url_list:
        http_url = "http:" + relative_url
        # os.path.split(...)[1] is the whole file name (the original called
        # it "extension", which was misleading); reuse it locally.
        filename = os.path.split(relative_url)[1]
        path = os.path.join(toPath, filename)
        try:
            print("照片正在下载......", http_url)
            urllib.request.urlretrieve(http_url, filename=path)
        except urllib.error.URLError as e:
            # Best-effort: report the failure and continue with the next image.
            print(e)

爬取多页目标

def main():
    """Scrape pages 38-39 of jandan.net/ooxx and download their images."""
    toPath = r"E:\python\requests和正则爬取图片\jiandan"
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(i)
            for i in range(38, 40)]
    print(type(urls))
    try:
        for url in urls:
            image_url_list = get_one_page(url)
            download_one_page(image_url_list, toPath)
    finally:
        # BUG FIX: the original leaked the browser window whenever a page
        # raised; always close it.
        broswer.close()

完整代码展示

import os
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver


# Launch a real Chrome window driven by Selenium (requires chromedriver on PATH).
# NOTE(review): "broswer" is a typo for "browser"; every function below reads
# this module-level name, so it is kept as-is here.
broswer = webdriver.Chrome()


def get_one_page(url):
    """Collect the image URLs from one gallery page.

    Opens *url* in the module-level Selenium browser (``broswer``) so the
    site's JavaScript runs and the obfuscated image links are resolved in
    the DOM, then parses the rendered HTML.

    :param url: gallery page URL to scrape
    :return: list of protocol-relative image URLs (``//host/path/name.jpg``),
             GIF animations excluded
    """
    image_url_list = []
    # Let the real browser execute the page's JS before reading the source.
    broswer.get(url)
    soup = BeautifulSoup(broswer.page_source, 'lxml')
    # Each full-size image is wrapped in an <a class="view_img_link"> anchor.
    for image in soup.select("a.view_img_link"):
        hf = image.get('href')
        # BUG FIX: get('href') may return None; the original str() comparison
        # let None slip into the list and crash later in "http:" + None.
        # Also skip GIF animations.
        if hf and 'gif' not in hf:
            image_url_list.append(hf)
    return image_url_list

def download_one_page(image_url_list, toPath):
    """Download every image in *image_url_list* into directory *toPath*.

    :param image_url_list: protocol-relative URLs (``//host/path/name.jpg``)
    :param toPath: destination directory; created if it does not exist
    """
    # Robustness: the original crashed when the target directory was missing.
    os.makedirs(toPath, exist_ok=True)
    for relative_url in image_url_list:
        http_url = "http:" + relative_url
        # os.path.split(...)[1] is the whole file name (the original called
        # it "extension", which was misleading); reuse it locally.
        filename = os.path.split(relative_url)[1]
        path = os.path.join(toPath, filename)
        try:
            print("图片正在下载......", http_url)
            urllib.request.urlretrieve(http_url, filename=path)
        except urllib.error.URLError as e:
            # Best-effort: report the failure and continue with the next image.
            print(e)

def main():
    """Scrape pages 38-39 of jandan.net/ooxx and download their images."""
    toPath = r"E:\python\requests和正则爬取图片\jiandan"
    urls = ["http://jandan.net/ooxx/page-{}#comments".format(i)
            for i in range(38, 40)]
    print(type(urls))
    try:
        for url in urls:
            image_url_list = get_one_page(url)
            download_one_page(image_url_list, toPath)
    finally:
        # BUG FIX: the original leaked the browser window whenever a page
        # raised; always close it.
        broswer.close()

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

文件保存的截图

猜你喜欢

转载自blog.csdn.net/xyl180808/article/details/81512009