A web crawler implemented in Python, with multi-threading support and a configurable crawl depth.
The URL of each crawled page and its keyword information (title, keywords, description) are stored in a database (MongoDB).
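For reference, each crawled page ends up as one document in the 'crawl' database's 'htmlpage' collection. Below is a minimal sketch of what such a document looks like and how it could be queried; the field, database, and collection names follow the code later in this file, while the concrete values and the example URL are invented:

# -*- coding: utf-8 -*-
# Sketch only: shape of one stored document and a lookup by URL.
# Field names match the crawler code below; the values are invented.
import pymongo

client = pymongo.MongoClient(host="localhost", port=27017)
coll = client['crawl']['htmlpage']

example_doc = {
    "url": "http://www.example.com/",        # page address
    "title": "Example Domain",               # <title> text
    "keywords": "example, domain",           # <meta name="keywords">
    "description": "An illustrative page",   # <meta name="description">
}
coll.insert(example_doc)

# look the page up again by its URL
print coll.find_one({"url": "http://www.example.com/"})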
Environment:
Ubuntu 14.04
Dependencies: Python 2, MongoDB, pymongo, chardet
Areas to optimize:
1. Database insert optimization (see the sketch after this list)
2. Encoding/decoding optimization
3. Saving full-page snapshots (one option is sketched after the code listing)
4. Deduplication (covered by the same sketch below)
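For items 1 and 4, one possible direction, not part of the current code, is to let MongoDB itself reject duplicate URLs via a unique index and to write pages in batches instead of one insert per page. A sketch, assuming pymongo 3.x; the save_entries helper is hypothetical:

# -*- coding: utf-8 -*-
# Sketch for items 1 and 4: batched writes plus URL-based deduplication.
# Assumes pymongo 3.x; database/collection names follow the crawler below.
import pymongo
from pymongo import UpdateOne

client = pymongo.MongoClient(host="localhost", port=27017)
coll = client['crawl']['htmlpage']

# a unique index makes the database reject duplicate URLs
coll.create_index("url", unique=True)

def save_entries(entries):
    # entries: list of dicts shaped like
    # {"url": ..., "title": ..., "keywords": ..., "description": ...}
    ops = [UpdateOne({"url": e["url"]}, {"$set": e}, upsert=True) for e in entries]
    if ops:
        # one round trip instead of one insert per page;
        # ordered=False keeps going even if a single write fails
        coll.bulk_write(ops, ordered=False)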
# -*- coding: utf-8 -*-
'''
author: Derry
date: 2016.1.19
'''
from HTMLParser import HTMLParser
import urllib
import time
import random
import os
import re
import Queue
import threading
import sys
import urllib2
import chardet
# MongoDB Python driver
import pymongo

# URL queue for the current level, about to be parsed; Queue is thread-safe
CurLevelUrlQueue = Queue.Queue()
NextLevelUrlQueue = Queue.Queue()
# queue of URLs that have already been parsed
HistoryUrlQueue = Queue.Queue()
TorrentQueue = Queue.Queue()
HistoryUrlMap = {}
HtmlEntryList = []
FailedUrlList = []

# connect to MongoDB and return the collection used to store crawled pages
def get_collection(server):
    try:
        client = pymongo.MongoClient(host=server, port=27017)
    except Exception as e:
        print 'connect database error', e
        return None
    db = client['crawl']
    coll = db['htmlpage']
    print '## connect mongodb.....ok @%s' % (server)
    return coll

# download an image via curl into ./img/ (currently unused)
def saveImage(host, url):
    try:
        splitPath = url.split('/')
        f_name = "%d_" % random.randint(1, 99999) + splitPath.pop()
        res = re.match('^http', url)
        if res is None:
            url = 'http://' + host + "/" + url
        cmd = 'curl -o ./img/%s %s' % (f_name, url)
        os.system(cmd)
    except Exception as e:
        print "[Error]couldn't download: %s:%s" % (f_name, e)

# extract the host part from an absolute URL
def getHost(url):
    res = re.match('^http', url)
    if res is None:
        return ""
    else:
        segs = url[7:].split('/')
        # segs[0] is the host
        return segs[0]

def enqueueByList(q, items):
    for elem in items:
        q.put(elem)

# HtmlEntry stores the information of a crawled page, such as title and keywords
class HtmlEntry():
    def __init__(self, depth, url):
        self.depth = depth
        self.url = url
        self.encoding = ""
        self.title = ""
        self.keywords = ""
        self.description = ""

    def setKeywords(self, keywords):
        self.keywords = keywords

    def setDescription(self, description):
        self.description = description

    def setTitle(self, title):
        self.title = title

    def getUrl(self):
        return self.url

    def getKeywords(self):
        return self.keywords

    def getDescription(self):
        return self.description

    def getTitle(self):
        return self.title

class MyParser(HTMLParser):
    def __init__(self, url, depth):
        HTMLParser.__init__(self)
        self.url_list = []
        self.pure_url_list = []
        self.url = url
        self.host = getHost(url)
        self.depth = depth  # current depth
        self.processing = 0
        self.title_flag = 0
        self.encoding = 'utf-8'
        self.title = ""
        self.keywords = ""
        self.description = ""
        self.html_entry = HtmlEntry(depth, url)

    def checkVisitedStatus(self, url):
        length = len(url)
        if length in HistoryUrlMap and url in HistoryUrlMap[length]:
            #print 'url = %s already visited.' % (url)
            return True
        return False

    # detect the text encoding with chardet and convert it to utf-8
    def format(self, data):
        code = chardet.detect(data)
        if code.get('encoding') is not None:
            return data.decode(code.get('encoding')).encode(self.encoding)
        else:
            return data

    def handle_data(self, data):
        if self.title_flag == 1:
            self.html_entry.setTitle(self.format(data))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.processing = 1
            for key, value in attrs:
                if key == 'href':
                    #print 'dep:%d,url:%s' % (self.depth, value)
                    if len(value) < 10 or value.find('javascript') != -1:
                        continue
                    if value.find('123456bt') != -1:
                        #print 'torrent =', value
                        TorrentQueue.put(value)
                    # enqueue the link only if it has not been visited yet
                    if False == self.checkVisitedStatus(value):
                        self.url_list.append(value)
        if tag == 'img' and attrs:
            for key, value in attrs:
                if key == 'src':
                    #print 'img url=', value
                    #saveImage(self.host, value)
                    pass
        if tag == "title":
            self.title_flag = 1
        if tag == 'meta' and attrs:
            for key, value in attrs:
                if key == 'name' and (value == 'Description' or value == 'description'):
                    for k, v in attrs:
                        if k == 'content':
                            #print 'desc', v
                            self.html_entry.setDescription(self.format(v))
                if key == 'name' and (value == 'Keywords' or value == 'keywords'):
                    for k, v in attrs:
                        if k == 'content':
                            self.html_entry.setKeywords(self.format(v))

    def handle_endtag(self, tag):
        if tag == 'a':
            self.processing = 0
        if tag == 'title':
            self.title_flag = 0

    # turn relative links into absolute URLs and drop links pointing back to the current page
    def getUrlList(self):
        for url in self.url_list:
            if url.find(self.url) == -1:
                res = re.match('^http', url)
                if res is None:
                    url = 'http://' + self.host + "/" + url
                self.pure_url_list.append(url)
        return self.pure_url_list

    def getHtmlEntry(self):
        return self.html_entry

class CrawlThread(threading.Thread):
    def __init__(self, num, depth):
        threading.Thread.__init__(self)
        self.num = num
        self.depth = depth  # current depth
        self.total = CurLevelUrlQueue.qsize()

    def run(self):
        while CurLevelUrlQueue.qsize() > 0:
            cur_url = CurLevelUrlQueue.get(block=False)
            #print '%d/%d torrent:%d' % (CurLevelUrlQueue.qsize(), self.total, TorrentQueue.qsize())
            #print '[thread %d]visiting [%s]' % (self.num, cur_url)
            HistoryUrlQueue.put(cur_url)
            url_len = len(cur_url)
            # bucket visited URLs by string length to speed up the membership check
            if url_len not in HistoryUrlMap:
                HistoryUrlMap[url_len] = []
            HistoryUrlMap[url_len].append(cur_url)
            try:
                req_header = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                    'Accept': 'text/html;q=0.9,*/*;q=0.8',
                    'Connection': 'close'
                }
                req = urllib2.Request(cur_url, None, req_header)
                resp = urllib2.urlopen(req, None, 5)
                #page = resp.read().decode('gb2312').encode('utf-8')
                page = resp.read()
                resp.close()
            except Exception as e:
                print 'url open error', e
                FailedUrlList.append(cur_url)
                continue
            parser = MyParser(cur_url, self.depth)
            try:
                parser.feed(page)
            except Exception as e:
                #print 'feed error', e
                continue
            entry = parser.getHtmlEntry()
            HtmlEntryList.append(entry)
            print 'failed %4d success %d total:%5d title=%s' % (len(FailedUrlList), len(HtmlEntryList), HistoryUrlQueue.qsize(), parser.html_entry.getTitle())
            # save to the database; could be handled by a dedicated thread,
            # and duplicate detection still needs to be added (see "Areas to optimize")
            html_entry = {"url": entry.getUrl(), "title": entry.getTitle(), "keywords": entry.getKeywords(), "description": entry.getDescription()}
            g_collection.insert(html_entry)
            # enqueue the extracted links for the next level
            enqueueByList(NextLevelUrlQueue, parser.getUrlList())
            enqueueByList(HistoryUrlQueue, parser.getUrlList())

class MyCrawl:
    def __init__(self, urls, depth, thread_num):
        self.depth = depth
        self.url_list = urls
        self.thread_num = thread_num
        self.threads = []
        self.dep = 0
        enqueueByList(CurLevelUrlQueue, self.url_list)

    # move the URLs collected for the next level into the current-level queue
    # and dump them to a text file for inspection
    def updateQueue(self):
        #while TorrentQueue.qsize() > 0:
        #    CurLevelUrlQueue.put(TorrentQueue.get(block=False))
        fd = open('urls_%d.txt' % (self.dep), 'w')
        while NextLevelUrlQueue.qsize() > 0:
            url = NextLevelUrlQueue.get(block=False)
            CurLevelUrlQueue.put(url)
            fd.write(url)
            fd.write('\n')
        fd.close()
        print 'update queue success, next queue size [%d],cur queue size[%d]' % (NextLevelUrlQueue.qsize(), CurLevelUrlQueue.qsize())

    def wait_allcomplete(self):
        for item in self.threads:
            if item.isAlive():
                item.join()

    def process(self):
        start = time.time()
        self.dep = 0
        while self.dep < self.depth:
            if self.dep != 0:
                self.updateQueue()
            for i in range(self.thread_num):
                thread = CrawlThread(i, self.dep)
                thread.start()
                self.threads.append(thread)
            # main thread waits for all crawl threads of this level to finish
            self.wait_allcomplete()
            self.threads = []
            self.dep = self.dep + 1
            print 'self.dep=', self.dep
        end = time.time()
        print 'time:%ds,urls=%d' % (end - start, HistoryUrlQueue.qsize())

reload(sys)
sys.setdefaultencoding('utf-8')

url_list = ["http://www.2345.com/"]
# connect to the database
g_collection = get_collection("localhost")
if g_collection is None:
    exit()
# start 100 threads with a crawl depth of 3
crawl = MyCrawl(url_list, 3, 100)
crawl.process()
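Item 3 of the to-do list above (saving full-page snapshots) could be handled with MongoDB's GridFS, which stores large blobs alongside the metadata collection. The following is only a sketch: the save_snapshot helper is hypothetical and not part of the crawler, and it assumes pymongo's bundled gridfs module is available.

# -*- coding: utf-8 -*-
# Sketch for item 3: keep the raw HTML of each crawled page in GridFS,
# next to the metadata in the 'htmlpage' collection.
# save_snapshot() is a hypothetical helper, not part of the crawler above.
import pymongo
import gridfs

client = pymongo.MongoClient(host="localhost", port=27017)
db = client['crawl']
fs = gridfs.GridFS(db)

def save_snapshot(url, page):
    # 'page' would be the raw bytes returned by resp.read() in CrawlThread.run();
    # the returned file id could be stored in the page's metadata document
    return fs.put(page, filename=url)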