The project is split into five components that together carry out the crawl: a URL manager, an HTML downloader, an HTML parser, a data storage component, and a crawler scheduler.
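Assuming one file per component (the first four module names are taken from the imports used in the scheduler below; the scheduler's own file name is only an example), the project layout looks roughly like this:

URLManager.py       # UrlManager
HTMLDownloader.py   # HtmlDownloader
HTMLParser.py       # HtmlParser
DATAOutput.py       # DataOutput
crawl_scheduler.py  # Crawl (scheduler; file name assumed)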
The code for each component follows.
URL manager:
import hashlib
import pickle


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()
        self.error_urls = set()

    def get_new_url(self):
        """
        Pop a new url from the container; its md5 digest is stored in old_urls to save memory.
        :return:
        """
        new_url = self.new_urls.pop()
        m = hashlib.md5()
        m.update(new_url.encode('utf-8'))
        md5_url = m.hexdigest()
        self.old_urls.add(md5_url)
        return new_url

    def old_urls_size(self):
        return len(self.old_urls)

    def new_urls_size(self):
        return len(self.new_urls)

    def add_new_url(self, url):
        """
        Add a single url.
        :param url:
        :return:
        """
        if url is None:
            print('url is None!')
            return
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        md5_url = m.hexdigest()
        if md5_url not in self.old_urls and url not in self.new_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """
        Add multiple urls; urls is an iterable.
        :param urls:
        :return:
        """
        if urls is None:
            print('urls is None!')
            return
        for url in urls:
            self.add_new_url(url)

    def add_error_urls(self, url):
        """
        Collect urls whose response was an error.
        :param url:
        :return:
        """
        return self.error_urls.add(url)

    def save_progress(self, path, data):
        """
        Save the crawl progress to a local file.
        :return:
        """
        with open(path, 'wb') as f:
            pickle.dump(data, f)

    def load_progress(self, path):
        '''
        Load the crawl progress from a local file.
        :return: a set()
        '''
        try:
            with open(path, 'rb') as f:
                tmp = pickle.load(f)
                print('Resuming progress from %s' % path)
                return tmp
        except FileNotFoundError as e:
            print(e, 'no progress file, creating: %s' % path)
            return set()
The URL manager deduplicates: a url that has already been crawled is never fetched again, and it stores md5 digests instead of the full urls to cut memory use.
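A minimal sketch of the dedup behaviour when the manager is used on its own (the urls below are made up for illustration):

manager = UrlManager()
manager.add_new_urls(['https://www.liepin.com/job/1.shtml',
                      'https://www.liepin.com/job/2.shtml'])
url = manager.get_new_url()     # the popped url's md5 is recorded in old_urls
manager.add_new_url(url)        # re-adding the same url is silently ignored
print(manager.new_urls_size())  # 1 -- only the remaining url is still pending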
HTML downloader:
import requests
import random
from URLManager import UrlManager


class HtmlDownloader(object):
    def __init__(self):
        self.url_manager = UrlManager()
        # Candidate User-Agent strings; one is picked at random for each request.
        self.user_agents = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        ]

    def downloader(self, url):
        headers = {'User-Agent': random.choice(self.user_agents)}
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        if 200 <= response.status_code < 210:  # any 20x status counts as success
            return response
        else:
            self.url_manager.add_error_urls(url)
            print('response.status_code is %d:%s' % (response.status_code, url))
            return response
The HTML downloader rotates its User-Agent header, picking one of five browser strings at random for each request, which makes the crawler a little harder to block.
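A quick sketch of how the downloader is called on its own; the listing url is only an example:

downloader = HtmlDownloader()
response = downloader.downloader('https://www.liepin.com/zhaopin/?key=python')
print(response.status_code, len(response.text))  # a 20x status means the page was fetched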
HTML parser:
from lxml import etree
from collections import defaultdict
from urllib.parse import urljoin


class HtmlParser(object):
    def __init__(self):
        self.data = defaultdict(list)

    def parser(self, response):
        '''
        Parse a job detail page.
        :param response:
        :return:
        '''
        try:
            company_info = self.parse_company_1(response)
        except Exception as e:
            print(e, ' ; parse_company_1 failed, falling back to parse_company_2...')
            company_info = self.parse_company_2(response)
        job_requests = self.parse_job_info(response)
        self.data['job_info'] = job_requests
        self.data['company_info'] = company_info
        return self.data

    def parse_job_urls(self, response):
        '''
        Extract the job detail links from a listing page.
        :param response:
        :return:
        '''
        html = etree.HTML(response.text)
        links = html.xpath('//div[@class="job-info"]/h3[@title]/a/@href')
        job_urls = []
        for link in links:
            if link.find('www.liepin.com') != -1:
                job_urls.append(link)
            else:
                link = self.url_join(response.url, link)
                job_urls.append(link)
        return job_urls

    def url_join(self, base_url, url):
        '''
        Turn a relative url into an absolute one.
        :param base_url:
        :param url:
        :return:
        '''
        abs_url = urljoin(base_url, url)
        return abs_url

    def parse_next_page(self, response):
        '''
        Extract the link to the next listing page.
        :param response:
        :return:
        '''
        html = etree.HTML(response.text)
        next_page = html.xpath('//a[contains(.,"下一页") and contains(@href,"zhaopin")]/@href')
        if next_page:
            abs_url = self.url_join(response.url, next_page[0])
            return abs_url
        else:
            return None

    def parse_company_1(self, response):
        """
        Extract the company information for a job (first page layout).
        :param response:
        :return:
        """
        data = {}
        html = etree.HTML(response.text)
        company_info = html.xpath('//div[@class="new-compwrap"]')[0]
        company_url = company_info.xpath('.//p/a/@href')[0]
        company_basic_info = company_info.xpath('string(.//ul)')
        company_introduction = html.xpath('string(//div[@class="info-word"])')
        data['company_url'] = company_url
        data['company_basic_info'] = company_basic_info
        data['company_introduction'] = company_introduction
        return data

    def parse_company_2(self, response):
        """
        Extract the company information for a job (second page layout).
        :param response:
        :return:
        """
        data = {}
        html = etree.HTML(response.text)
        company_basic_info = html.xpath('string(//h3[contains(.,"其他信息")]/following-sibling::div[@class="content content-word"])')
        company_introduction = html.xpath('string(//h3[contains(.,"企业介绍")]/following-sibling::div[@class]/div[1])')
        data['company_url'] = 'None'
        data['company_basic_info'] = company_basic_info
        data['company_introduction'] = company_introduction
        return data

    def parse_job_info(self, response):
        """
        Extract the job information.
        :param response:
        :return:
        """
        data = {}
        html = etree.HTML(response.text)
        job_title = html.xpath('//div[contains(@class,"title-info")]/h1[@title]/@title')[0]
        job_basic_info = html.xpath('string(//div[@class="job-title-left"])')
        job_description = html.xpath('string(//h3[contains(.,"职位描述")]/following-sibling::div[@class="content content-word"])')
        data['job_url'] = response.url
        data['job_title'] = job_title
        data['job_basic_info'] = job_basic_info
        data['job_description'] = job_description
        return data
The HTML parser extracts data with XPath and can cope with the two different page layouts used by the job detail pages.
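Putting the downloader and parser together, a job detail page could be parsed roughly like this (the job url is illustrative):

downloader = HtmlDownloader()
parser = HtmlParser()
response = downloader.downloader('https://www.liepin.com/job/123456.shtml')  # example url
data = parser.parser(response)
print(data['job_info']['job_title'])
print(data['company_info']['company_basic_info'])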
Data storage:
import json
import pymongo
import time


class DataOutput(object):
    def output_html_headers(self, path):
        with open(path, 'a+', encoding='utf-8') as f:
            f.write('<html>\n<head>\n<title>猎聘python招聘信息</title>\n<meta charset="UTF-8">\n</head>\n')
            f.write('<body>\n<table width="960" align="center" border="1" rules="all" cellpadding="15">\n')
            f.write('<tr bgcolor="#ccc">\n<th>%s</th>\n<th>%s</th>\n<th>%s</th>\n<th>%s</th>\n' % ('job_url', 'job_title', 'job_basic_info', 'job_description'))
            f.write('<th>%s</th>\n<th>%s</th>\n<th>%s</th>\n</tr>\n' % ('company_url', 'company_basic_info', 'company_introduction'))

    def output_html(self, data, path):
        self.clean_data(data)
        with open(path, 'a+', encoding='utf-8') as f:
            f.write('<tr align="center">\n')
            f.write('<td><a href="{0}" target="_blank">{0}</a></td>\n'.format(data["job_info"]["job_url"]))
            f.write('<td>%s</td>\n' % data["job_info"]["job_title"])
            f.write('<td>%s</td>\n' % data["job_info"]["job_basic_info"])
            f.write('<td>%s</td>\n' % data["job_info"]["job_description"])
            f.write('<td><a href="{0}" target="_blank">{0}</a></td>\n'.format(data["company_info"]["company_url"]))
            f.write('<td>%s</td>\n' % data["company_info"]["company_basic_info"])
            f.write('<td>%s</td>\n' % data["company_info"]["company_introduction"])
            f.write('</tr>\n')

    def output_html_end(self, path):
        with open(path, 'a+', encoding='utf-8') as f:
            f.write('</table>\n</body>\n</html>\n')

    def open_mongodb(self):
        self.client = pymongo.MongoClient('localhost:27017')
        self.db = self.client['lieping_job']

    def close_mongodb(self):
        self.client.close()

    def output_mongodb(self, data, collection):
        data = self.clean_data(data)
        data['_id'] = time.time()
        self.db[collection].insert_one(data if isinstance(data, dict) else dict(data))

    def output_json_start(self, path):
        # The leading empty string is a placeholder so that every record in
        # output_json can be written with a leading ',' and the file stays valid JSON.
        with open(path, 'w', encoding='utf-8') as f:
            f.write('[""')

    def output_json(self, data, path):
        '''
        Save as json.
        :param data:
        :return:
        '''
        data = self.clean_data(data)
        with open(path, 'a', encoding='utf-8') as f:
            f.write(',\n')
            json.dump(data if isinstance(data, dict) else dict(data), f, indent=4)

    def output_json_end(self, path):
        with open(path, 'a', encoding='utf-8') as f:
            f.write(']')

    def output_text(self, data, path):
        '''
        Save as plain text.
        :param data:
        :return:
        '''
        with open(path, 'a', encoding='utf-8') as f:
            clean_data = self.clean_data(data)
            f.write(str(clean_data) + '\n')

    def clean_data(self, data):
        '''
        Clean the scraped data.
        :param data:
        :return:
        '''
        company_introduction = data['company_info']['company_introduction']
        data['company_info']['company_introduction'] = company_introduction.replace('\r\n', '').replace(' ', '').replace('\xa0', '')
        new_compintro = data['company_info']['company_basic_info']
        data['company_info']['company_basic_info'] = new_compintro.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
        job_item = data['job_info']['job_basic_info']
        data['job_info']['job_basic_info'] = job_item.replace('\r\n', '').replace(' ', '')
        job_info = data['job_info']['job_description']
        data['job_info']['job_description'] = job_info.replace('\r\n', '').replace(' ', '')
        return data
The data storage component cleans the scraped text with Python's str.replace and can write the results as HTML, to MongoDB, as JSON, or as plain text.
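A minimal sketch of the HTML output flow, assuming data is a dict returned by HtmlParser.parser; the file name is only an example:

output = DataOutput()
output.output_html_headers('python_jobs.html')  # write the <table> header once
output.output_html(data, 'python_jobs.html')    # one table row per parsed job
output.output_html_end('python_jobs.html')      # close the table after the loop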
Scheduler:
import os
import time
from URLManager import UrlManager
from HTMLDownloader import HtmlDownloader
from HTMLParser import HtmlParser
from DATAOutput import DataOutput


class Crawl(object):
    def __init__(self):
        timestamp = str(time.time()).split('.')[0]
        self.html_path = 'html_data%s.html' % timestamp
        self.json_path = 'json_data%s.json' % timestamp
        self.txt_path = 'text_data%s.txt' % timestamp
        self.collection = 'python%s' % timestamp
        self.page_num = 0
        self.max_page_num = 21
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_output = DataOutput()

    def crawl_job_urls(self, base_url):
        '''
        Crawl the job detail links from the listing pages.
        :param base_url:
        :return:
        '''
        response = self.html_downloader.downloader(base_url)
        job_links = self.html_parser.parse_job_urls(response)
        self.url_manager.add_new_urls(job_links)
        next_page = self.html_parser.parse_next_page(response)
        # Limit the number of listing pages crawled
        while next_page and self.page_num < self.max_page_num:
            try:
                print('Fetching job links from page %d' % (self.page_num + 1))
                r = self.html_downloader.downloader(next_page)
                job_links = self.html_parser.parse_job_urls(r)
                self.url_manager.add_new_urls(job_links)
                # parse_next_page already returns an absolute url
                next_page = self.html_parser.parse_next_page(r)
                self.page_num += 1
            except Exception as e:
                self.url_manager.add_error_urls(next_page)
                print(e)
                break  # stop paging rather than retrying the same url forever

    def crawl_info(self):
        '''
        Crawl the detail information for every collected job url.
        :return:
        '''
        # Open the data stores
        # self.data_output.open_mongodb()
        # self.data_output.output_json_start(self.json_path)
        self.data_output.output_html_headers(self.html_path)
        while self.url_manager.new_urls_size() != 0:
            try:
                new_url = self.url_manager.get_new_url()
                print('Parsing job_url #%d: %s' % (self.url_manager.old_urls_size(), new_url))
                response = self.html_downloader.downloader(new_url)
                data = self.html_parser.parser(response)
                # Save as HTML
                self.data_output.output_html(data, self.html_path)
                # Save as txt
                # self.data_output.output_text(data, self.txt_path)
                # Save as json
                # self.data_output.output_json(data, self.json_path)
                # Save to MongoDB
                # self.data_output.output_mongodb(data, self.collection)
            except Exception as e:
                self.url_manager.add_error_urls(new_url)
                print(e)
        # Save the crawl progress
        self.url_manager.save_progress('python_job_old_urls.txt', self.url_manager.old_urls)
        if self.url_manager.error_urls:
            self.url_manager.save_progress('python_job_error_urls.txt',
                                           self.url_manager.error_urls)
        # Close the data stores
        self.data_output.output_html_end(self.html_path)
        # self.data_output.output_json_end(self.json_path)
        # self.data_output.close_mongodb()
        print('crawl is over!')


if __name__ == '__main__':
    crawl = Crawl()
    base_url = 'https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&d_sfrom=search_fp&key=python'
    # Resume from a previous run if progress files exist
    if os.path.exists('python_job_old_urls.txt'):
        old_urls = crawl.url_manager.load_progress('python_job_old_urls.txt')
        crawl.url_manager.old_urls = old_urls
        error_urls = crawl.url_manager.load_progress('python_job_error_urls.txt')
        crawl.url_manager.error_urls = error_urls
    crawl.crawl_job_urls(base_url)
    crawl.crawl_info()
The scheduler is the most important of the five components: it coordinates the other four, saves the crawl progress, and on the next run it skips pages that have already been crawled.
This project is a good way to understand how a crawler framework fits together. I hope it helps you, thanks!