爬取煎蛋网妹子图

煎蛋网妹子图网址:'http://jandan.net/ooxx/'

版本:Python 3.x


import urllib.request
import os
import random 

def url_open(url):
	"""Fetch *url* and return the raw response body as bytes.

	The request is sent with a desktop Firefox ``User-Agent`` header instead
	of urllib's default one.

	Args:
		url: Address to fetch, as a string.

	Returns:
		The undecoded response body (``bytes``).

	Raises:
		urllib.error.URLError: propagated from ``urllib.request.urlopen``
			when the request fails.
	"""
	req = urllib.request.Request(url)
	req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0')

	# NOTE: no proxy is configured; requests go out directly.

	# Open the Request object (not the bare URL string) so the header added
	# above is actually sent. The with-block closes the response after the
	# body has been read.
	with urllib.request.urlopen(req) as response:
		return response.read()
	
# Page number
def get_page(url):
	"""Return the text found after the first ``#comments">`` marker on a page.

	The page at *url* is downloaded and decoded as UTF-8. The returned slice
	starts 33 characters after the beginning of the first ``#comments">``
	occurrence and ends just before the next run of sixteen spaces followed
	by ``</a>``.

	Args:
		url: Address of the page to inspect.

	Returns:
		The extracted substring (``str``). The caller in this file passes it
		to ``int()``.

	Raises:
		ValueError: if either the start marker or the closing marker is not
			present in the page.
	"""
	html = url_open(url).decode('UTF-8')

	start = html.find('#comments">')
	if start == -1:
		# Without this check the -1 would be turned into offset 32 and an
		# arbitrary slice of the page would be returned.
		raise ValueError('start marker \'#comments">\' not found in ' + url)

	# NOTE(review): 33 is a layout-specific offset (the marker itself is 11
	# characters long); presumably it skips markup/whitespace before the
	# page number -- confirm against the current page source.
	a = start + 33
	b = html.find('                </a>',a)
	if b == -1:
		raise ValueError('closing marker not found after \'#comments">\' in ' + url)

	return html[a:b]

# Image addresses
def find_imgs(url):
	"""Collect ``.jpg`` image addresses referenced by the page at *url*.

	The page is downloaded, decoded as UTF-8 and scanned for ``img src=``.
	For each occurrence, ``.jpg`` is searched for within the following 255
	characters; on a hit, the text from 9 characters after the occurrence
	through the end of ``.jpg`` is prefixed with ``http:`` and recorded.

	Args:
		url: Address of the page to scan.

	Returns:
		A list of address strings, in page order.
	"""
	page = url_open(url).decode('UTF-8')
	marker = 'img src='
	found = []

	pos = page.find(marker)
	while pos != -1:
		# 9 = len('img src=') plus one character (presumably the opening quote).
		start = pos + 9
		end = page.find('.jpg', pos, pos + 255)
		if end == -1:
			# No .jpg close enough: resume just past this attribute start.
			resume = start
		else:
			found.append('http:' + page[start:end + 4])
			resume = end
		pos = page.find(marker, resume)

	return found
	
# Save images
def save_imgs(folder,img_addrs):
	"""Download each address in *img_addrs* and write it to a local file.

	Every file is named after the last ``/``-separated component of its
	address and is written relative to the current working directory.

	Args:
		folder: Not used by this function; kept so existing callers keep
			working. ``download_mm`` changes into the target folder before
			calling, so joining *folder* onto the path here would nest the
			output one level too deep.
		img_addrs: Iterable of image address strings.

	Returns:
		None.
	"""
	for each in img_addrs:
		filename = each.split('/')[-1]
		# Fetch before opening the file so a failed download does not leave
		# an empty file behind.
		img = url_open(each)
		with open(filename,'wb') as f:
			f.write(img)

def download_mm(folder = 'OOXX',pages = 100):
	"""Download images from the newest *pages* pages of jandan.net/ooxx.

	Creates *folder* if needed, makes it the current working directory, reads
	the newest page number via ``get_page`` and then walks backwards one page
	at a time, saving every image address that ``find_imgs`` reports.

	Args:
		folder: Directory to store the images in. An existing directory is
			reused.
		pages: Maximum number of pages to visit, counting down from the
			newest one. Iteration stops at page 1 if fewer pages exist.

	Returns:
		None.

	Side effects:
		Changes the process's current working directory to *folder*.
	"""
	# exist_ok lets the script be re-run without failing on the directory.
	os.makedirs(folder, exist_ok=True)
	os.chdir(folder)

	url = 'http://jandan.net/ooxx/'
	newest = int(get_page(url))

	# Step down exactly one page per iteration. The previous cumulative
	# ``page_num -= i`` skipped pages (N, N-1, N-3, N-6, ...) and could go
	# below page 1.
	stop = max(newest - pages, 0)
	for page_num in range(newest, stop, -1):
		page_url = url + 'page-' + str(page_num) + '#comments'
		img_addrs = find_imgs(page_url)
		save_imgs(folder,img_addrs)
		
# Script entry point: run the downloader with its default arguments when this
# file is executed directly (not when it is imported).
if __name__=='__main__':
	download_mm()
		


猜你喜欢

转载自blog.csdn.net/weixin_37267014/article/details/78393729