python爬虫爬取网站图片

本文出处:http://blog.csdn.net/qq_27512671/article/details/78022625

效果图

让一让，让一让——老司机先来一发效果图（斜眼笑）。源码在文章最下方。

“斗鱼爬取结果效果图:)”

实现思路分为三步走:
1. 获取网页数据源
2. 解析网页源数据,获得所有的图片地址列表
3. 遍历列表,并将图片保存到本地

实现步骤

获取网页数据

def gethemltext(url, timeout=10):
    """Fetch *url* and return the response body as text.

    NOTE(review): the name is a typo for "gethtmltext" but is kept for
    compatibility with the callers shown later in the article.

    :param url: page URL to download.
    :param timeout: seconds before the request is aborted; the original had
        no timeout, so a stalled server would hang the crawler forever.
    :returns: decoded HTML text, using the encoding detected from the body.
    :raises requests.HTTPError: on a non-2xx status (via raise_for_status).
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # apparent_encoding sniffs the real charset from the bytes, which is
    # more reliable than the (often missing) HTTP header for Chinese sites.
    r.encoding = r.apparent_encoding
    return r.text

解析网页源数据,获得所有的图片地址列表

def getImageList(html, lst):
    """Append the 'src' attribute of every <img> tag in *html* to *lst*.

    Mutates *lst* in place and returns None.

    :param html: HTML document as a string.
    :param lst: output list that receives the image URLs.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all('img'):
        # attrs.get replaces the original's bare `except: continue`, which
        # silently swallowed every exception, not just the missing key.
        src = tag.attrs.get('src')
        if src is not None:
            lst.append(src)

遍历列表,并将图片保存到本地

# Fragment: download every image URL collected earlier.
# NOTE(review): `list`, `root` and `tmp` are defined by the enclosing script
# (see the full example below); `list` shadows the builtin of the same name.
for src in list:
    try:
        # Log the absolute URL, then save it as an incrementing .jpg file.
        print(root + src)
        urllib.request.urlretrieve(root + src, r'D:\pythonPath\%s.jpg' % tmp)
        # Counter only advances on success, so saved files are numbered densely.
        tmp = tmp + 1
        print('成功')  # "success"
    except:  # NOTE(review): bare except hides all errors, even KeyboardInterrupt
        print('失败')  # "failed"
print('下载完毕')  # "download finished"

实现案例

获取全景网首页所有图片数据

import os
import re
import urllib
import uuid

import requests
from bs4 import BeautifulSoup
from requests import request

urlPath = 'http://www.quanjing.com/'  # base URL of the site to crawl
localPath = 'd:\\pythonPath'  # NOTE(review): defined but never read below — the save path is hard-coded in start()

def gethemltext(url, timeout=10):
    """Download *url* and return its body decoded as text.

    NOTE(review): the name is a typo for "gethtmltext"; kept so the call
    sites in start() keep working.

    :param url: page URL to fetch.
    :param timeout: seconds before the request aborts (new, defaulted —
        the original could hang forever on a stalled connection).
    :returns: response text with the charset sniffed from the body.
    :raises requests.HTTPError: on a non-2xx HTTP status.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Header charsets are frequently wrong/missing on Chinese sites; use
    # the encoding detected from the payload instead.
    r.encoding = r.apparent_encoding
    return r.text


def getImageList(html, lst):
    """Collect the 'src' attribute of every <img> tag in *html* into *lst*.

    Mutates *lst* in place and returns None.

    :param html: HTML document as a string.
    :param lst: output list that receives the image URLs.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all('img'):
        # dict.get replaces the bare `except: continue` of the original,
        # which would also have hidden unrelated failures.
        src = tag.attrs.get('src')
        if src is not None:
            lst.append(src)


def start():
    """Crawl the quanjing.com landing page and save every <img> it links
    as a numbered .jpg under D:\\pythonPath.

    A failure on one image is reported ('失败') and the remaining images are
    still attempted; prints '下载完毕' when the whole list was processed.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded; import it explicitly so urlretrieve is reliably available.
    import urllib.request

    root = "http://www.quanjing.com/"
    html = gethemltext("http://www.quanjing.com/?audience=151316")
    image_urls = []  # renamed from `list`, which shadowed the builtin
    getImageList(html, image_urls)
    # The original crashed on every download if D:\pythonPath did not exist.
    os.makedirs(r'D:\pythonPath', exist_ok=True)
    count = 0
    for src in image_urls:
        # NOTE(review): assumes src is site-relative; an absolute URL would
        # be mangled by this concatenation — confirm against the page markup.
        target = root + src
        print(target)
        try:
            urllib.request.urlretrieve(target, r'D:\pythonPath\%s.jpg' % count)
        except Exception:  # narrowed from bare except; still best-effort
            print('失败')
        else:
            # Counter advances only on success, matching the original.
            count += 1
            print('成功')
    print('下载完毕')
# Start crawling
start()

获取斗鱼神秘主播间头像

import os
import re
import urllib
import uuid

import requests
from bs4 import BeautifulSoup
from requests import request

urlPath = 'http://www.quanjing.com/'  # NOTE(review): leftover from the previous example; never read below
localPath = 'd:\\pythonPath'  # NOTE(review): defined but never read — the save path is hard-coded in start()


def gethemltext(url, timeout=10):
    """Return the text body of *url*.

    NOTE(review): the name is a typo for "gethtmltext"; kept unchanged so
    start() below still resolves it.

    :param url: page URL to fetch.
    :param timeout: seconds before aborting the request (new, defaulted —
        without it a dead server would block the crawler indefinitely).
    :returns: decoded response text using the sniffed encoding.
    :raises requests.HTTPError: for non-2xx responses.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Trust the charset detected from the bytes over the HTTP header.
    r.encoding = r.apparent_encoding
    return r.text


def getImageList(html, lst):
    """Append the 'data-original' attribute of every <img> in *html* to *lst*.

    Douyu lazy-loads avatars, so the real image URL lives in
    'data-original' rather than 'src'. Mutates *lst* in place.

    :param html: HTML document as a string.
    :param lst: output list that receives the image URLs.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all('img'):
        # attrs.get replaces the bare `except: continue`, which silently
        # discarded every kind of error, not just a missing attribute.
        href = tag.attrs.get('data-original')
        if href is not None:
            lst.append(href)


def start():
    """Crawl the Douyu 'yz' directory page and save each streamer avatar
    as a numbered .jpg under D:\\pythonPath.

    A failure on one image prints '失败' and the loop continues; prints
    '下载完毕' once every collected URL has been attempted.
    """
    # `import urllib` alone does not guarantee `urllib.request` is loaded;
    # import it explicitly so urlretrieve is reliably available.
    import urllib.request

    html = gethemltext("https://www.douyu.com/directory/game/yz")
    image_urls = []  # renamed from `list`, which shadowed the builtin
    getImageList(html, image_urls)
    # The original crashed on every download if D:\pythonPath did not exist.
    os.makedirs(r'D:\pythonPath', exist_ok=True)
    count = 0
    for src in image_urls:
        # Bug fix: the original printed `root + src` (a quanjing.com prefix
        # left over from the previous example) while downloading plain `src`;
        # log the URL that is actually retrieved.
        print(src)
        try:
            urllib.request.urlretrieve(src, r'D:\pythonPath\%s.jpg' % count)
        except Exception:  # narrowed from bare except; still best-effort
            print('失败')
        else:
            # Counter advances only on success, matching the original.
            count += 1
            print('成功')
    print('下载完毕')
# Start crawling
start()
发布了46 篇原创文章 · 获赞 62 · 访问量 30万+

猜你喜欢

转载自blog.csdn.net/qq_27512671/article/details/78022625