Automatically downloading images with Python

Sometimes you need to download images automatically, and when there are a lot of them, saving each one by hand is tedious. This script crawls a site for images and saves them locally.

Feel free to grab it if you need it.

#-*- coding: utf-8 -*-
'''
author: Derry
date:2015.1.19
'''
from HTMLParser import HTMLParser
import urllib
import time
import random
import os
import re
def saveImage(host,url):
	try:
		# make sure the output directory exists before calling curl
		if not os.path.isdir('./img'):
			os.makedirs('./img')
		splitPath = url.split('/')
		f_name = "%d_" % random.randint(1,99999) + splitPath.pop()
		# prepend the host for relative image paths
		if re.match('^http',url) is None:
			url = 'http://'+host+"/"+url
			print 'fixed image url=',url
		# download the image into ./img via curl
		cmd = 'curl -o ./img/%s "%s"' % (f_name,url)
		os.system(cmd)
	except Exception,e:
		print "[Error]couldn't download %s: %s" % (url,e)
		
def getHost(url):
	# return the host part of an absolute url, or "" for relative urls
	if re.match('^http',url) is None:
		return ""
	# strip the scheme (works for both http:// and https://)
	segs = url.split('//',1)[1].split('/')
	print segs
	# segs[0] is the host
	return segs[0]
		
class MyParser(HTMLParser):
	def __init__(self,url):
		HTMLParser.__init__(self)
		self.url_list=[]
		self.url = url
		self.host=getHost(url)
		
	def handle_starttag(self,tag,attrs):
		# collect links from <a href=...> and download images from <img src=...>
		if tag == 'a' and attrs:
			for key, value in attrs:
				if key == 'href':
					if len(value) < 10 or value.find('javascript') != -1:
						continue
					self.url_list.append(value)
		if tag == 'img' and attrs:
			for key, value in attrs:
				if key == 'src':
					print 'img url=',value
					saveImage(self.host,value)
					
	def getUrlList(self):
		v_list = []
		for url in self.url_list:
			if url.find(self.url) == -1:
				# turn relative links into absolute ones before queueing them
				if re.match('^http',url) is None:
					url = 'http://'+self.host+"/"+url
					print 'fixed url=',url
				v_list.append(url)
			else:
				# the link points back into the page we are already visiting
				print 'skipping url',url
		return v_list


all_urls = ['http://www.hao123.com'] # seed url to start crawling images from
history_urls = []
all_hosts = []
while len(all_urls) > 0:
	# pop the next url that has not been visited yet
	cur_url = ''
	while len(all_urls) > 0:
		cur_url = all_urls.pop(0)
		if cur_url in history_urls:
			cur_url = ''
			continue
		history_urls.append(cur_url)
		break
	if cur_url == '':
		break
	print 'visiting url [%s]' % (cur_url)
	try:
		page = urllib.urlopen(cur_url).read()
	except Exception,e:
		print 'url open error',e
		continue
	parser = MyParser(cur_url)
	try:
		parser.feed(page)
	except Exception,e:
		print 'feed error',e
		continue
	# queue the newly discovered links and keep crawling
	cur_url_list = parser.getUrlList()
	all_urls.extend(cur_url_list)
	parser.close()
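The download step above shells out to curl with os.system. If curl is not available, the same step can be done in pure Python with urllib.urlretrieve. Below is a minimal sketch of such a drop-in replacement for saveImage, kept in the same Python 2 style as the script; the name saveImagePurePython is mine, not part of the original post.

# -*- coding: utf-8 -*-
import os
import random
import re
import urllib

def saveImagePurePython(host, url):
	# pure-Python variant of saveImage: no external curl dependency
	if re.match('^http', url) is None:
		url = 'http://' + host + "/" + url
	f_name = "%d_" % random.randint(1, 99999) + url.split('/').pop()
	if not os.path.isdir('./img'):
		os.makedirs('./img')
	try:
		urllib.urlretrieve(url, os.path.join('./img', f_name))
	except Exception, e:
		print "[Error] couldn't download %s: %s" % (url, e)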

Source: blog.csdn.net/dxt1107/article/details/50549156