Improved crawler: scraping pretty girl pictures, part 2

I switched to fetching through proxies, but it doesn't work and I can't figure out why. Any advice from the experts would be appreciated. The code is below, followed by a sketch for checking whether the proxies are actually alive:

import urllib.request
import os
import random

pagedict = { }
def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36')

    # The added proxy code (the part shown in red in the original post)
    proxies = ['223.199.25.235:9999','183.166.139.189:9999','223.199.19.229:9999']
    proxy = random.choice(proxies)
    proxy_support = urllib.request.ProxyHandler({'http':proxy})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    response = urllib.request.urlopen(req)   # open the prepared Request so the User-Agent header is actually sent
    html = response.read()
    return html

def get_page(url, pages=2):
    html = url_open(url).decode('utf-8')
    c = 0
    for i in range(pages + 1):
        a = html.find('//jandan.net/ooxx/', c)   # next page link after position c
        if a == -1:                              # no more page links on this page
            break
        b = html.find('">', a)    # first '">' after a: end of the href attribute
        c = html.find('</a>', b)  # closing tag after b: end of the link text
        page = 'http:' + html[a:b]
        pagenum = html[b+2:c].strip()
        pagedict[pagenum] = page
    
def save_imgs(folder, img_addrs):
    for each in img_addrs:
        filename = each.split('/')[-1]   # use the last path segment as the file name
        print('http:' + each)
        with open(filename, 'wb') as f:
            img = url_open('http:' + each)
            f.write(img)
        
    

def find_imgs(url):
    html = url_open(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg', a, a + 255)   # look for '.jpg' within 255 chars of a
        if b != -1:   # found a jpg address, add it to the list
            img_addrs.append(html[a+9 : b+4])
        else:         # not found: skip past this 'img src=' and keep searching
            b = a + 9
        a = html.find('img src=', b)   # search for the next 'img src=' after b
    return img_addrs
    
def download_mm(folder='OOXX', pages=10):
    os.makedirs(folder, exist_ok=True)   # don't crash if the folder already exists
    os.chdir(folder)

    url = "http://jandan.net/ooxx"
    get_page(url, pages)   # fills pagedict with page-number -> page-url entries
    print(pagedict)
    for key in pagedict:
        page_url = pagedict[key]
        print(page_url)
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)

if __name__ == '__main__':
    download_mm()
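
One common reason a setup like this fails is that free public proxies go stale within hours, so even otherwise-correct code will hang or error out once they stop answering. Below is a minimal sketch that filters the list down to proxies that actually respond before the crawl starts; the test URL http://httpbin.org/ip, the 5-second timeout, and the check_proxies name are illustrative assumptions, not part of the original code.

import urllib.request

def check_proxies(proxies, test_url='http://httpbin.org/ip', timeout=5):
    # Keep only the proxies that can fetch test_url within the timeout.
    alive = []
    for proxy in proxies:
        handler = urllib.request.ProxyHandler({'http': proxy})
        opener = urllib.request.build_opener(handler)
        try:
            opener.open(test_url, timeout=timeout)  # raises on connection failure
            alive.append(proxy)
        except Exception as e:
            print('proxy %s failed: %s' % (proxy, e))
    return alive

good = check_proxies(['223.199.25.235:9999', '183.166.139.189:9999', '223.199.19.229:9999'])
print('usable proxies:', good)

Calling opener.open directly, instead of install_opener, keeps the check independent of the global opener that url_open installs, so it can run safely before download_mm().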