Sometimes you need to download images automatically, and when there are a lot of them, saving each one by hand is tedious. The script below crawls a site starting from a seed URL, follows its links, and saves every image it finds.
Grab it and use it if you need it.
#-*- coding: utf-8 -*-
'''
author: Derry
date: 2015.1.19
'''
# Python 2 script: HTMLParser and urllib.urlopen were renamed in Python 3.
from HTMLParser import HTMLParser
import urllib
import random
import os
import re

def saveImage(host, url):
    # Prefix a random number so images with the same file name don't collide.
    f_name = "%d_" % random.randint(1, 99999) + url.split('/').pop()
    try:
        # Turn a relative src into an absolute URL.
        if re.match('^http', url) is None:
            url = 'http://' + host + "/" + url
            print 'fixed image url=', url
        # Requires curl on the PATH; the URL is passed to the shell
        # unescaped, so only point this at sites you trust.
        cmd = 'curl -o ./img/%s %s' % (f_name, url)
        os.system(cmd)
    except Exception, e:
        print "[Error]couldn't download %s: %s" % (f_name, e)

def getHost(url):
    # Extract the host from an absolute URL ('http://' is 7 characters).
    if re.match('^http', url) is None:
        return ""
    return url[7:].split('/')[0]

class MyParser(HTMLParser):
    def __init__(self, url):
        HTMLParser.__init__(self)
        self.url_list = []
        self.url = url
        self.host = getHost(url)

    def handle_starttag(self, tag, attrs):
        # Queue links for the crawl, skipping javascript: and very short hrefs.
        if tag == 'a' and attrs:
            for key, value in attrs:
                if key == 'href':
                    if len(value) < 10 or value.find('javascript') != -1:
                        continue
                    self.url_list.append(value)
        # Download every <img src=...> as soon as it is seen.
        if tag == 'img' and attrs:
            for key, value in attrs:
                if key == 'src':
                    print 'img url=', value
                    saveImage(self.host, value)

    def getUrlList(self):
        # Return the collected links, fixing relative ones; links containing
        # the current page URL are skipped so it isn't queued again.
        v_list = []
        for url in self.url_list:
            if url.find(self.url) == -1:
                if re.match('^http', url) is None:
                    url = 'http://' + self.host + "/" + url
                    print 'fixed url=', url
                v_list.append(url)
            else:
                print 'skipping url under current page', url
        return v_list

all_urls = ['http://www.hao123.com']  # seed URL for the image crawl
history_urls = []

if not os.path.isdir('img'):
    os.makedirs('img')  # curl writes downloads into ./img

while all_urls:
    # Pop the next URL; history_urls prevents revisiting pages in a loop.
    cur_url = all_urls.pop(0)
    if cur_url in history_urls:
        continue
    history_urls.append(cur_url)
    print 'visiting url [%s]' % cur_url
    try:
        page = urllib.urlopen(cur_url).read()
    except Exception, e:
        print 'url open error', e
        continue
    parser = MyParser(cur_url)
    try:
        parser.feed(page)
    except Exception, e:
        print 'feed error', e
        continue
    # Breadth-first: append this page's links to the crawl queue.
    all_urls.extend(parser.getUrlList())
    parser.close()
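To try it, save the script as, say, img_crawler.py (any file name works), make sure Python 2 and curl are installed, and run python img_crawler.py; downloaded files land in ./img.

If you are on Python 3, the same idea still works, just with the renamed stdlib modules (html.parser and urllib.request). Below is a minimal single-page sketch of my own, not part of the original script: the class name ImgParser and the page URL reuse are my choices, and urljoin replaces the hand-rolled host prefixing above.

from html.parser import HTMLParser
from urllib.request import urlopen, urlretrieve
from urllib.parse import urljoin
import os

class ImgParser(HTMLParser):
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.img_urls = []

    def handle_starttag(self, tag, attrs):
        # Collect every <img src=...>, resolving relative paths
        # against the page URL instead of prepending the host by hand.
        if tag == 'img':
            for key, value in attrs:
                if key == 'src' and value:
                    self.img_urls.append(urljoin(self.base_url, value))

if __name__ == '__main__':
    page_url = 'http://www.hao123.com'   # same start page as above
    os.makedirs('img', exist_ok=True)
    html = urlopen(page_url).read().decode('utf-8', errors='ignore')
    parser = ImgParser(page_url)
    parser.feed(html)
    for i, img_url in enumerate(parser.img_urls):
        # The index prefix keeps file names unique, like the random
        # prefix in the Python 2 version.
        name = os.path.join('img', '%d_%s' % (i, img_url.split('/')[-1]))
        try:
            urlretrieve(img_url, name)   # stdlib download, no curl needed
        except Exception as e:
            print("couldn't download %s: %s" % (img_url, e))

Using urlretrieve instead of shelling out to curl also avoids the shell-injection risk noted in the comments above.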