# python3 爬煎蛋ooxx妹子图 (crawler for jandan.net OOXX picture pages)

import re
import urllib.request
import random
import os
import http.server
import http.client
from urllib.error import URLError, HTTPError
import urllib.parse
proxy = []  # shared pool of "ip:port" proxy strings, filled by get_proxy()


def change_proxy():
    """Pick a random entry from the module-level `proxy` pool and install
    a urllib opener that routes HTTP requests through it."""
    chosen = random.choice(proxy)
    handler = urllib.request.ProxyHandler({"http": chosen})
    new_opener = urllib.request.build_opener(handler)
    # Present a desktop-browser User-Agent so the target site serves normal pages.
    new_opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')]
    urllib.request.install_opener(new_opener)
    print("代理IP: %s" % chosen)

def url_open(url, max_retries=5):
    """Fetch *url* and return the raw response body as bytes.

    On connection/protocol errors, switches to a new random proxy via
    change_proxy() and retries.  After *max_retries* consecutive
    failures the last error is re-raised.

    Bug fixed: the original compared the int counter against the string
    "5" (never true) and, even if it had matched, only printed a message
    without stopping — so a persistently bad URL retried forever.
    The three except branches were also identical; URLError is a
    subclass of OSError, so one clause covers both.
    """
    attempts = 0
    while True:
        try:
            response = urllib.request.urlopen(url)
            return response.read()
        except (OSError, http.client.BadStatusLine, http.client.IncompleteRead) as e:
            attempts += 1
            print("链接出问题了,智能切换新的代理IP\n出错的问题是:" + str(e))
            if attempts >= max_retries:
                print("已经失败了5次,程序退出,重新执行")
                raise  # give up: propagate the last error to the caller
            change_proxy()

def get_pagenum(url):
    """Return the current jandan.net comment-page number as a string
    (e.g. "2305"), scraped from the page at *url*.

    Generalized: the original required exactly four digits (\\d{4}) and
    needed a second regex pass to extract them; a capture group with
    \\d+ handles any page count in one search.
    """
    html = url_open(url).decode("utf-8")
    match = re.search(r'<span\sclass="current-comment-page">\[(\d+)\]</span>', html)
    return match.group(1)

def get_imgurl(url):
    """Return the list of scheme-relative image URLs ("//ww....jpg")
    found on the page at *url*.

    Fixed: the original greedy ".*\\.jpg" could swallow everything up to
    the LAST ".jpg" on a line, merging several tags into one bogus URL,
    and then needed a second regex pass to trim the prefix.  A single
    non-greedy capture group does both correctly.
    """
    html = url_open(url).decode("utf-8")
    return re.findall(r'<img src="(//ww.*?\.jpg)', html)

def save_img(img):
    """Download every image URL in *img* (scheme-relative "//..." form)
    and write each one into the current working directory, named after
    the last path component of its URL."""
    for i, each in enumerate(img, 1):
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(url_open("http:%s" % each))
            print("下载本页的第%s张图片,名称为%s" % (i, filename))


def get_proxy():
    """Scrape www.xicidaili.com for proxy servers, append every
    HTTP-protocol entry to the module-level `proxy` list as "ip:port",
    and return that list."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    request = urllib.request.Request(url="http://www.xicidaili.com", headers=headers)
    page = urllib.request.urlopen(request).read().decode("utf-8")
    # One <tr>...</tr> table row per proxy entry.
    row_pattern = re.compile(r'''<tr\sclass=.+>\s+
                                    <td\s.+</td>\s+
                                    <td>.+</td>\s+
                                    <td>.+</td>\s+
                                    <td>.+</td>\s+
                                    <td\s.+?</td>\s+
                                    <td>.+</td>\s+
                                    <td>.+</td>\s+
                                    <td>.+</td>\s+
                                    </tr>
                                    ''', re.VERBOSE)
    for row in row_pattern.findall(page):
        # Split the row into whitespace-separated cell fragments, then
        # peel "<td>value</td>" down to the bare value for each field.
        cells = row.split()
        scheme = cells[-4].split(">")[1].split("<")[0]  # HTTP / HTTPS column
        host = cells[7].split(">")[1].split("<")[0]     # IP address column
        port = cells[8].split(">")[1].split("<")[0]     # port column
        if scheme == "HTTP":
            proxy.append(host + ":" + port)
    return proxy

def download(dir, url, pages=10):
    """Download *pages* pages of images from the gallery at *url* into
    directory *dir* (created if missing), walking backwards from the
    current page number.

    dir:   target directory name (parameter kept as `dir` for
           backward compatibility even though it shadows the builtin).
    url:   gallery base URL, e.g. "http://jandan.net/ooxx/".
    pages: how many pages to fetch (generalizes the hard-coded 10).

    Cleaned up: the isdir/mkdir/chdir branches duplicated os.chdir in
    both arms (makedirs(exist_ok=True) covers both cases), the
    self-assignment `url = url` was dead code, and save_img's None
    return was pointlessly bound to a variable.
    """
    os.makedirs(dir, exist_ok=True)
    os.chdir(dir)
    page_num = int(get_pagenum(url))
    for _ in range(pages):
        page_num -= 1
        pageurl = url + "page-" + str(page_num) + "#comments"
        imgurl = get_imgurl(pageurl)
        print("下载第%s页图片" % page_num)
        save_img(imgurl)

if __name__ == "__main__":
    get_proxy()
    change_proxy()
    dir = "ooxx"
    url = "http://jandan.net/ooxx/"
    download(dir,url)

# --- blog-page boilerplate from the original paste (kept for attribution) ---
# 猜你喜欢 ("you may also like")
# Reposted from blog.csdn.net/Jonnter/article/details/54141065
# 今日推荐 ("today's recommendation")