Packages to import
import random
from random import randint
import requests
from fake_useragent import UserAgent  # anti-anti-crawling: fake a random User-Agent request header
from retrying import retry  # retry decorator
import hashlib  # message digest (MD5)
import queue  # queue
import re  # regular expressions
from urllib import robotparser  # parse the site's robots.txt file
from urllib.parse import urlparse, urljoin, urldefrag  # parse and join URLs
from threading import Thread  # multithreading
from datetime import datetime
import time
import Mongo_cache
1. Define the maximum crawl depth
MAX_DEP = 2
2. Some sites use their robots.txt file to declare which user agents are not allowed to crawl them. Since the target site sets such rules, we should respect them.
#Parse robots.txt first, then check each page we want to download against it to decide whether it may be crawled.
def get_robots(url):
"""
解析robots.txt文件
:param url:
:return:
"""
rp = robotparser.RobotFileParser()
rp.set_url(urljoin(url,'robots.txt'))
rp.read()
return rp
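For reference, the returned parser is queried with can_fetch(user_agent, url); whether a given path is allowed naturally depends on what the site's robots.txt actually contains, so the result below is only illustrative:

rp = get_robots("http://fashion.163.com/")
print(rp.can_fetch("*", "http://fashion.163.com/lady/"))  # True or False, per the site's rules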
3. Save the downloaded content
#First an MD5 digest of the content is computed (it could be stored in the database); the file name is then derived from the download URL and the page is written into the download directory.
def save_url(html_content,url_str):
"""
存储下载内容
:param html_content:
:param url_str:
:return:
"""
md5 = hashlib.md5()
md5.update(html_content)
#file_path = "./download/" + md5.hexigest()+".html"
file_path = "./download/" + gen_html_name(url_str)+".html"
with open(file_path,"wb") as f:
f.write(html_content)
4. Split the html file name out of the URL path. The urlparse module parses a URL into its components, so URLs can be split or joined in a standard way.
def gen_html_name(url_str):
"""
得到html的名字
:param url_str:
:return:
"""
path = urlparse(url_str).path
path_array = path.split('/')
return path_array[len(path_array) - 1]
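For example, on a made-up article URL the path is split on '/' and the last segment becomes the file name:

# urlparse("http://fashion.163.com/photo/2018/spring.html").path  ->  "/photo/2018/spring.html"
# "/photo/2018/spring.html".split('/')  ->  ['', 'photo', '2018', 'spring.html']
print(gen_html_name("http://fashion.163.com/photo/2018/spring.html"))  # spring.html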
5. Extract the other links from the page. re.IGNORECASE makes the match case-insensitive.
def extractor_url_lists(html_content):
"""
抽取网页中的其他链接
:param html_content:
:return:
"""
#^出现在[]里面 表示非 ()里面分组匹配 有用的信息
url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']',re.IGNORECASE)
return url_regex.findall(html_content)
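A quick sanity check on a hand-written snippet (the HTML below is only illustrative):

html = '<a href="/lady/1.html">A</a> <A HREF=\'/fashion/2.html\'>B</A>'
print(extractor_url_lists(html))  # ['/lady/1.html', '/fashion/2.html']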
6. Python has a ready-made third-party library for generating a random UA (User-Agent): fake-useragent.
class CrawlerCommon(Thread):
"""
实现一个通用爬虫,涵盖基本的爬虫功能及涉及一些反爬虫技术
"""
def __init__(self,init_url):
# super(CrawlerCommon,self).__init__()
super().__init__()
__ua = UserAgent() #随机User-Agent
self.seed_url = init_url #初始化爬取种子网址
self.crawler_queue = queue.Queue() # 先进先出 使用不同的队列会造成BFS和DFS的效果
self.crawler_queue.put(init_url) #将种子网址放入队列
self.visited = {init_url:0} #初始化爬虫深度为0
self.rp = get_robots(init_url) #初始化robots解析器
self.headers = {'User-Agent':__ua.random} #生成一个随机user-agent
self.link_regex = '/(lady|fashion)' # 抽取网址的过滤条件
self.throttle = Throttle(5.0) #下载限流器间隔为5秒
self.mcache = Mongo_cache.MongoCache() #初始化mongo-cache
self.time_sleep = 3
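As the queue comment above hints, the traversal order is decided purely by the queue type; a minimal standard-library sketch of the difference:

import queue

fifo = queue.Queue()      # first in, first out  -> breadth-first crawl (BFS)
lifo = queue.LifoQueue()  # last in, first out   -> depth-first crawl (DFS)
for u in ["a", "b", "c"]:
    fifo.put(u)
    lifo.put(u)
print(fifo.get(), fifo.get(), fifo.get())  # a b c
print(lifo.get(), lifo.get(), lifo.get())  # c b a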
7. A download method that retries on failure, using a decorator
@retry(stop_max_attempt_number=3)
def retry_download(self, url_str, data, method, proxies):
    # use requests.post for POST requests, requests.get otherwise
    if method == "POST":
        result = requests.post(url_str, data=data, headers=self.headers, proxies=proxies)
    else:
        result = requests.get(url_str, headers=self.headers, timeout=3, proxies=proxies)
    assert result.status_code == 200  # assert: raise (and trigger a retry) unless the status code is 200
    return result.content
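The retrying decorator simply calls the function again whenever it raises, up to the configured number of attempts; a standalone sketch of the same idea (stop_max_attempt_number and wait_fixed are real retrying parameters, flaky_fetch is a made-up function):

from retrying import retry

attempts = {"n": 0}

@retry(stop_max_attempt_number=3, wait_fixed=1000)  # at most 3 attempts, 1 second apart
def flaky_fetch():
    attempts["n"] += 1
    raise IOError("simulated network error")

try:
    flaky_fetch()
except IOError:
    print("gave up after", attempts["n"], "attempts")  # gave up after 3 attempts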
8. The actual download method
def download(self,url_str,data=None,method='GET',proxies={}):
print("download url is ::::::",url_str)
try:
result = self.retry_download(url_str,data,method,proxies)
except Exception as e:
print("异常",e)
result = None
return result
9. Complete (normalize) the extracted links
def normalize(self, url_str):
    # strip the fragment (#...) and join the relative link with the seed URL
    real_url, _ = urldefrag(url_str)
    return urljoin(self.seed_url, real_url)
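What the two calls do, shown on a made-up relative link:

url, frag = urldefrag("/lady/1.html#comments")
print(url, frag)                                           # /lady/1.html comments
print(urljoin("http://fashion.163.com/", "/lady/1.html"))  # http://fashion.163.com/lady/1.html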
10. Save the result into the database, checking whether the content already exists before writing it
def save_result(self,html_content,url_str):
    if url_str not in self.mcache:
        self.mcache[url_str] = html_content
    else:
        data_from_mongo = self.mcache[url_str]
        # initialise the md5 objects
        md5_func_mongo = hashlib.md5()
        md5_func_download = hashlib.md5()
        # digest of the record already stored in the database
        md5_func_mongo.update(data_from_mongo)
        mongo_md5_str = md5_func_mongo.hexdigest()
        # digest of the freshly downloaded data
        md5_func_download.update(html_content)
        download_md5_str = md5_func_download.hexdigest()
        # only overwrite the database record when the download differs from it
        if download_md5_str != mongo_md5_str:
            self.mcache[url_str] = html_content
11. The main crawl loop
def run(self):
    while not self.crawler_queue.empty():
        url_str = self.crawler_queue.get()
        # print('url_str is:::::', url_str)
        # check the URL against the site's robots.txt rules
        if self.rp.can_fetch(self.headers["User-Agent"], url_str):
            self.throttle.wait_url(url_str)
            # add a little random jitter around time_sleep so requests are not evenly spaced
            random_oper = randint(0, 5)
            if random_oper == 1:
                time.sleep(self.time_sleep + randint(0, 2))
            else:
                time.sleep(self.time_sleep - randint(0, 2))
            depth = self.visited[url_str]
            if depth < MAX_DEP:
                # download the page
                html_content = self.download(url_str)
                # store it only when the download succeeded
                if html_content is not None:
                    self.mcache[url_str] = html_content
                    save_url(html_content, url_str)
                    # extract all links from the page
                    url_list = extractor_url_lists(html_content.decode('gb18030'))
                    # keep only the links that match the filter
                    filter_urls = [link for link in url_list if re.search(self.link_regex, link)]
                    for url in filter_urls:
                        # complete the link
                        real_url = self.normalize(url)
                        # only enqueue links that have not been visited yet
                        if real_url not in self.visited:
                            self.visited[real_url] = depth + 1
                            self.crawler_queue.put(real_url)
        else:
            print("robots.txt forbids downloading:", url_str)
12. Download throttle
class Throttle(object):
    def __init__(self, delay):
        self.domains = {}   # last-access time per domain (could also live in a database)
        self.delay = delay  # minimum interval between two downloads from the same domain

    def wait_url(self, url_str):
        # throttle per netloc (domain)
        domain_url = urlparse(url_str).netloc         # the domain part of the URL
        last_accessed = self.domains.get(domain_url)  # when this domain was last downloaded
        if self.delay > 0 and last_accessed is not None:
            # time elapsed since the last download; if it is shorter than the required delay,
            # sleep for the remaining time, otherwise continue with the next link straight away
            sleep_interval = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        self.domains[domain_url] = datetime.now()     # record the current time for this domain
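A quick way to see the throttle in action (two made-up pages on the same domain; the second call blocks for roughly the remaining delay):

throttle = Throttle(5.0)
throttle.wait_url("http://fashion.163.com/lady/1.html")  # first hit on the domain: no sleep
throttle.wait_url("http://fashion.163.com/lady/2.html")  # same netloc within 5 s: sleeps the remainder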
13. Random proxy
class RandomProxy(object):
"""
随机代理
"""
def __init__(self):
self.proxies = []
self.headers = {
"User-Agent":"wsf"
}
def crawl_proxies(self):
"""
抓取生成代理
:return:
"""
self.proxies.append('192.168.1.1')
self.proxies.append('192.168.1.2')
def verify_proxies(self):
"""
校验每一个代理是否可用
:return:
"""
invalid_ip = []
for ip_str in self.proxies:
proxies = {"http":ip_str}
r = requests.get("http://www.baidu.com",proxies=proxies,headers =self.headers)
if r.status_code ==200:
continue
else:
invalid_ip.append(ip_str)
for remove_ip in invalid_ip:
self.proxies.remove(remove_ip)
def get_one_proxy(self):
return random.choice(self.proxies)
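To route a request through one of these proxies, requests expects a scheme-keyed dict; a sketch of how the pieces fit together (the addresses appended above are placeholders, a real proxy entry would normally include a port such as 'http://1.2.3.4:8080'):

random_proxy = RandomProxy()
random_proxy.crawl_proxies()
random_proxy.verify_proxies()
proxy_dict = {"http": random_proxy.get_one_proxy()}
# the dict can then be passed straight through to CrawlerCommon.download(url, proxies=proxy_dict)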
if __name__ == "__main__":
crawler = CrawlerCommon("http://fashion.163.com/")
crawler.run()