# Alex - 爬取图片 (image crawler)

# 导包
import requests
import urllib.parse
import re
import os
from os.path import join

def get_url_one_page(url):
    """Fetch one Baidu Image result page and extract picture URLs from it.

    Args:
        url: URL of a Baidu Image "flip" search-result page.

    Returns:
        A tuple ``(url_pic_this_page, url_next_page)``: the list of picture
        URLs scraped from the page's embedded ``"objURL"`` fields, and the
        absolute URL of the next result page, or ``None`` when this is the
        last page.
    """
    # Timeout keeps the crawler from hanging forever on a dead connection.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    html = response.text
    url_pic_this_page = re.findall(r'"objURL":"(.*?)",', html)
    # Bug fix: the original pattern used '(.*)?' — a greedy group that could
    # swallow everything up to the last '"' before ' class="n"'. The
    # non-greedy '(.*?)' captures only the href of the "next page" anchor.
    url_next_page_prefix = re.findall(r'<a href="(.*?)" class="n" >下一页', html)
    if url_next_page_prefix:
        url_next_page = 'http://image.baidu.com' + url_next_page_prefix[0]
    else:
        print("已经到达最后一页")  # reached the last page
        url_next_page = None
    return url_pic_this_page, url_next_page




def fetch_pictures(key, num_pics):
    """Crawl Baidu Image search results, collecting picture URLs page by page.

    Args:
        key: Search keyword (will be URL-quoted).
        num_pics: Number of picture URLs to collect before stopping.

    Returns:
        The list of collected picture URLs, trimmed to at most ``num_pics``
        entries (fewer if the result pages are exhausted first). The original
        discarded the list; returning it is a backward-compatible addition.
    """
    print('开始爬虫:关键字「%s」,爬取图片数量「%d」'%(key, num_pics))
    url_init_base = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_base + urllib.parse.quote(key)
    url_pic_list = []  # picture URLs accumulated across all pages
    # 1. Crawl page by page until we have enough URLs or run out of pages.
    while True:
        url_pic_this_page, url_next_page = get_url_one_page(url=url_init)
        url_pic_list += url_pic_this_page
        if len(url_pic_list) >= num_pics:  # clearer than the original '> num_pics - 1'
            print("已满足你的数量")
            break
        if url_next_page is None:
            print('图片页数已经达到最后')
            break
        url_init = url_next_page
    # Bug fix: the original could keep up to one extra page's worth of URLs;
    # trim to exactly the requested count.
    url_pic_list = url_pic_list[:num_pics]
    print(url_pic_list)
    # 2. Saving the images to disk is left unimplemented, as in the original.
    return url_pic_list


if __name__ == '__main__':
    # Demo entry point: crawl 10 picture URLs for the given keyword.
    # Dead code removed: the unused SAVE_DIR variable and the commented-out
    # os.mkdir call served no purpose (image saving is not implemented).
    key = "蔡徐坤打篮球"
    num_pics = 10
    fetch_pictures(key, num_pics)


发布了254 篇原创文章 · 获赞 16 · 访问量 9505

猜你喜欢

转载自blog.csdn.net/houlaos/article/details/103995893