A web crawler implemented in Python, with multi-threading support and a configurable crawl depth.
The URL of each crawled page and its keyword information (title, keywords, description) are stored in a database (MongoDB).
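For reference, each crawled page ends up as one document in the 'crawl' database's 'htmlpage' collection. Below is a minimal sketch of what such a document looks like and how it could be queried; the field, database, and collection names follow the code later in this file, while the concrete values and the example URL are invented:

# -*- coding: utf-8 -*-
# Sketch only: shape of one stored document and a lookup by URL.
# Field names match the crawler code below; the values are invented.
import pymongo

client = pymongo.MongoClient(host="localhost", port=27017)
coll = client['crawl']['htmlpage']

example_doc = {
    "url": "http://www.example.com/",        # page address
    "title": "Example Domain",               # <title> text
    "keywords": "example, domain",           # <meta name="keywords">
    "description": "An illustrative page",   # <meta name="description">
}
coll.insert(example_doc)

# look the page up again by its URL
print coll.find_one({"url": "http://www.example.com/"})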
Environment:
Ubuntu 14.04
Dependencies: Python 2, MongoDB, pymongo, chardet
Areas to optimize:
1. Database insert optimization (see the sketch after this list)
2. Encoding/decoding optimization
3. Saving full-page snapshots (one option is sketched after the code listing)
4. Deduplication (covered by the same sketch below)
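For items 1 and 4, one possible direction, not part of the current code, is to let MongoDB itself reject duplicate URLs via a unique index and to write pages in batches instead of one insert per page. A sketch, assuming pymongo 3.x; the save_entries helper is hypothetical:

# -*- coding: utf-8 -*-
# Sketch for items 1 and 4: batched writes plus URL-based deduplication.
# Assumes pymongo 3.x; database/collection names follow the crawler below.
import pymongo
from pymongo import UpdateOne

client = pymongo.MongoClient(host="localhost", port=27017)
coll = client['crawl']['htmlpage']

# a unique index makes the database reject duplicate URLs
coll.create_index("url", unique=True)

def save_entries(entries):
    # entries: list of dicts shaped like
    # {"url": ..., "title": ..., "keywords": ..., "description": ...}
    ops = [UpdateOne({"url": e["url"]}, {"$set": e}, upsert=True) for e in entries]
    if ops:
        # one round trip instead of one insert per page;
        # ordered=False keeps going even if a single write fails
        coll.bulk_write(ops, ordered=False)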
# -*- coding: utf-8 -*-
'''
author: Derry
date: 2016.1.19
'''
from HTMLParser import HTMLParser
import urllib
import time
import random
import os
import re
import Queue
import threading
import sys
import urllib2
import chardet
# MongoDB Python driver
import pymongo

# URL queue for the current level, about to be parsed; Queue is thread-safe
CurLevelUrlQueue = Queue.Queue()
NextLevelUrlQueue = Queue.Queue()
# queue of URLs that have already been parsed
HistoryUrlQueue = Queue.Queue()
TorrentQueue = Queue.Queue()
HistoryUrlMap = {}
HtmlEntryList = []
FailedUrlList = []

# connect to MongoDB and return the collection used to store crawled pages
def get_collection(server):
    try:
        client = pymongo.MongoClient(host=server, port=27017)
    except Exception as e:
        print 'connect database error', e
        return None
    db = client['crawl']
    coll = db['htmlpage']
    print '## connect mongodb.....ok @%s' % (server)
    return coll

# download an image via curl into ./img/ (currently unused)
def saveImage(host, url):
    try:
        splitPath = url.split('/')
        f_name = "%d_" % random.randint(1, 99999) + splitPath.pop()
        res = re.match('^http', url)
        if res is None:
            url = 'http://' + host + "/" + url
        cmd = 'curl -o ./img/%s %s' % (f_name, url)
        os.system(cmd)
    except Exception as e:
        print "[Error]couldn't download: %s:%s" % (f_name, e)

# extract the host part from an absolute URL
def getHost(url):
    res = re.match('^http', url)
    if res is None:
        return ""
    else:
        segs = url[7:].split('/')
        # segs[0] is the host
        return segs[0]

def enqueueByList(q, items):
    for elem in items:
        q.put(elem)

# HtmlEntry stores the information of a crawled page, such as title and keywords
class HtmlEntry():
    def __init__(self, depth, url):
        self.depth = depth
        self.url = url
        self.encoding = ""
        self.title = ""
        self.keywords = ""
        self.description = ""

    def setKeywords(self, keywords):
        self.keywords = keywords

    def setDescription(self, description):
        self.description = description

    def setTitle(self, title):
        self.title = title

    def getUrl(self):
        return self.url

    def getKeywords(self):
        return self.keywords

    def getDescription(self):
        return self.description

    def getTitle(self):
        return self.title

class MyParser(HTMLParser):
    def __init__(self, url, depth):
        HTMLParser.__init__(self)
        self.url_list = []
        self.pure_url_list = []
        self.url = url
        self.host = getHost(url)
        self.depth = depth  # current depth
        self.processing = 0
        self.title_flag = 0
        self.encoding = 'utf-8'
        self.title = ""
        self.keywords = ""
        self.description = ""
        self.html_entry = HtmlEntry(depth, url)

    def checkVisitedStatus(self, url):
        length = len(url)
        if length in HistoryUrlMap and url in HistoryUrlMap[length]:
            #print 'url = %s already visited.' % (url)
            return True
        return False

    # detect the text encoding with chardet and convert it to utf-8
    def format(self, data):
        code = chardet.detect(data)
        if code.get('encoding') is not None:
            return data.decode(code.get('encoding')).encode(self.encoding)
        else:
            return data

    def handle_data(self, data):
        if self.title_flag == 1:
            self.html_entry.setTitle(self.format(data))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.processing = 1
            for key, value in attrs:
                if key == 'href':
                    #print 'dep:%d,url:%s' % (self.depth, value)
                    if len(value) < 10 or value.find('javascript') != -1:
                        continue
                    if value.find('123456bt') != -1:
                        #print 'torrent =', value
                        TorrentQueue.put(value)
                    # enqueue the link only if it has not been visited yet
                    if False == self.checkVisitedStatus(value):
                        self.url_list.append(value)
        if tag == 'img' and attrs:
            for key, value in attrs:
                if key == 'src':
                    #print 'img url=', value
                    #saveImage(self.host, value)
                    pass
        if tag == "title":
            self.title_flag = 1
        if tag == 'meta' and attrs:
            for key, value in attrs:
                if key == 'name' and (value == 'Description' or value == 'description'):
                    for k, v in attrs:
                        if k == 'content':
                            #print 'desc', v
                            self.html_entry.setDescription(self.format(v))
                if key == 'name' and (value == 'Keywords' or value == 'keywords'):
                    for k, v in attrs:
                        if k == 'content':
                            self.html_entry.setKeywords(self.format(v))

    def handle_endtag(self, tag):
        if tag == 'a':
            self.processing = 0
        if tag == 'title':
            self.title_flag = 0

    # turn relative links into absolute URLs and drop links pointing back to the current page
    def getUrlList(self):
        for url in self.url_list:
            if url.find(self.url) == -1:
                res = re.match('^http', url)
                if res is None:
                    url = 'http://' + self.host + "/" + url
                self.pure_url_list.append(url)
        return self.pure_url_list

    def getHtmlEntry(self):
        return self.html_entry

class CrawlThread(threading.Thread):
    def __init__(self, num, depth):
        threading.Thread.__init__(self)
        self.num = num
        self.depth = depth  # current depth
        self.total = CurLevelUrlQueue.qsize()

    def run(self):
        while CurLevelUrlQueue.qsize() > 0:
            cur_url = CurLevelUrlQueue.get(block=False)
            #print '%d/%d torrent:%d' % (CurLevelUrlQueue.qsize(), self.total, TorrentQueue.qsize())
            #print '[thread %d]visiting [%s]' % (self.num, cur_url)
            HistoryUrlQueue.put(cur_url)
            url_len = len(cur_url)
            # bucket visited URLs by string length to speed up the membership check
            if url_len not in HistoryUrlMap:
                HistoryUrlMap[url_len] = []
            HistoryUrlMap[url_len].append(cur_url)
            try:
                req_header = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                    'Accept': 'text/html;q=0.9,*/*;q=0.8',
                    'Connection': 'close'
                }
                req = urllib2.Request(cur_url, None, req_header)
                resp = urllib2.urlopen(req, None, 5)
                #page = resp.read().decode('gb2312').encode('utf-8')
                page = resp.read()
                resp.close()
            except Exception as e:
                print 'url open error', e
                FailedUrlList.append(cur_url)
                continue
            parser = MyParser(cur_url, self.depth)
            try:
                parser.feed(page)
            except Exception as e:
                #print 'feed error', e
                continue
            entry = parser.getHtmlEntry()
            HtmlEntryList.append(entry)
            print 'failed %4d success %d total:%5d title=%s' % (len(FailedUrlList), len(HtmlEntryList), HistoryUrlQueue.qsize(), parser.html_entry.getTitle())
            # save to the database; could be handled by a dedicated thread,
            # and duplicate detection still needs to be added (see "Areas to optimize")
            html_entry = {"url": entry.getUrl(), "title": entry.getTitle(), "keywords": entry.getKeywords(), "description": entry.getDescription()}
            g_collection.insert(html_entry)
            # enqueue the extracted links for the next level
            enqueueByList(NextLevelUrlQueue, parser.getUrlList())
            enqueueByList(HistoryUrlQueue, parser.getUrlList())

class MyCrawl:
    def __init__(self, urls, depth, thread_num):
        self.depth = depth
        self.url_list = urls
        self.thread_num = thread_num
        self.threads = []
        self.dep = 0
        enqueueByList(CurLevelUrlQueue, self.url_list)

    # move the URLs collected for the next level into the current-level queue
    # and dump them to a text file for inspection
    def updateQueue(self):
        #while TorrentQueue.qsize() > 0:
        #    CurLevelUrlQueue.put(TorrentQueue.get(block=False))
        fd = open('urls_%d.txt' % (self.dep), 'w')
        while NextLevelUrlQueue.qsize() > 0:
            url = NextLevelUrlQueue.get(block=False)
            CurLevelUrlQueue.put(url)
            fd.write(url)
            fd.write('\n')
        fd.close()
        print 'update queue success, next queue size [%d],cur queue size[%d]' % (NextLevelUrlQueue.qsize(), CurLevelUrlQueue.qsize())

    def wait_allcomplete(self):
        for item in self.threads:
            if item.isAlive():
                item.join()

    def process(self):
        start = time.time()
        self.dep = 0
        while self.dep < self.depth:
            if self.dep != 0:
                self.updateQueue()
            for i in range(self.thread_num):
                thread = CrawlThread(i, self.dep)
                thread.start()
                self.threads.append(thread)
            # main thread waits for all crawl threads of this level to finish
            self.wait_allcomplete()
            self.threads = []
            self.dep = self.dep + 1
            print 'self.dep=', self.dep
        end = time.time()
        print 'time:%ds,urls=%d' % (end - start, HistoryUrlQueue.qsize())

reload(sys)
sys.setdefaultencoding('utf-8')

url_list = ["http://www.2345.com/"]
# connect to the database
g_collection = get_collection("localhost")
if g_collection is None:
    exit()
# start 100 threads with a crawl depth of 3
crawl = MyCrawl(url_list, 3, 100)
crawl.process()
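Item 3 of the to-do list above (saving full-page snapshots) could be handled with MongoDB's GridFS, which stores large blobs alongside the metadata collection. The following is only a sketch: the save_snapshot helper is hypothetical and not part of the crawler, and it assumes pymongo's bundled gridfs module is available.

# -*- coding: utf-8 -*-
# Sketch for item 3: keep the raw HTML of each crawled page in GridFS,
# next to the metadata in the 'htmlpage' collection.
# save_snapshot() is a hypothetical helper, not part of the crawler above.
import pymongo
import gridfs

client = pymongo.MongoClient(host="localhost", port=27017)
db = client['crawl']
fs = gridfs.GridFS(db)

def save_snapshot(url, page):
    # 'page' would be the raw bytes returned by resp.read() in CrawlThread.run();
    # the returned file id could be stored in the page's metadata document
    return fs.put(page, filename=url)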