python 2.7 图片下载爬虫

写图片爬虫的一些心得

1.先到所要下载图片的网址看看,页面请求的网址是哪个(我用的是Google浏览器)


2.点击所要下载的图片,查看其具体位置,(方便查找img链接)

3.找好之后就可以写代码了

4.主要难度是找到img=“”的具体位置,需要正则表达式搜索一下
不会正则的或是beautifulsoup的小伙伴可以参考一下这两个视频
beautifulsoup:https://www.youtube.com/watch?v=KLq0W1wUVmw&index=3&list=PLXO45tsB95cIuXEgV-mvYWRd_hVC43Akk
正则:https://www.youtube.com/watch?v=l1MAW1z641E
4.搜索成功后将其下载到本地文件中

以下是小编我自己写的代码

未改良版的:

#coding=utf-8
import requests
import os
from bs4 import BeautifulSoup

url = "http://www.ngchina.com.cn/magazine/2018/10/1337.html"
html = requests.get(url).text
soup = BeautifulSoup(html,'lxml')

all_img = soup.find_all('a',{'class':'img_btn'})

root = "C://img222//"
os.makedirs(root,mode=0o777)

for ul in all_img:
    imgs = ul.find_all('img')

    for ull in imgs:
        imgss = ull['src']

        r=requests.get(imgss,stream=True)
        path =root + imgss.split('/')[-1]
        try:
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=100):
                    f.write(chunk)
            print path
        except:
            print "ERRor"

改良版的:

#coding=utf-8
import requests
import os
from bs4 import BeautifulSoup

def get_url(url, timeout=30):
    """Fetch *url* and return the requests Response object.

    Sends a browser-like User-Agent and a Referer header so the site
    does not reject the request as coming from a bot.

    :param url: page URL to download
    :param timeout: seconds before giving up on the connection/read;
        without it requests can block forever on a dead server
    :return: requests.Response
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
        "referer": "http://www.ngchina.com.cn/magazine/2018/10/1337.html"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    return res


def main():
    url = "http://www.ngchina.com.cn/magazine/2018/10/1337.html"
    res = get_url(url)
    html = res.text
    soup = BeautifulSoup(html, 'lxml')

    all_imgs = soup.find_all('a', {'class': 'img_btn'})

    for ul in all_imgs:
        imgs = ul.find_all('img')
        for l in imgs:
             imgss = l['src']
             r = requests.get(imgss, stream=True)
             root = "C://img222//"
             path = root + imgss.split('/')[-1]
             try:
                with open (path,"wb") as f:
                    for chunk in r.iter_content(chunk_size=128):
                        f.write(chunk)
                print path
             except:
                print ERROE

if __name__ == "__main__":
    main()



猜你喜欢

转载自blog.csdn.net/qq_42133828/article/details/83590641