mini-spider
- Description:
- A multithreaded web crawler that collects image URLs from web pages (it can also extract URLs matching other patterns)
- mini_spider.py is a small targeted crawler written in Python. It crawls breadth-first starting from the seed links and saves pages whose URL matches a specific pattern to disk.
- Running the program:
- python mini_spider.py -c spider.conf
- Configuration file
- spider.conf:
[spider]
- feedfile: ./urls # feedfile path
- result: ./result.data # file where crawl results are stored, one per line
- max_depth: 6 # maximum crawl depth (seeds are level 0)
- crawl_interval: 1 # crawl interval, in seconds
- crawl_timeout: 2 # crawl timeout, in seconds
- thread_count: 8 # number of crawler threads
- filter_url: .*\.(gif|png|jpg|bmp)$ # URL pattern
- Seed file urls:
- Crawl strategy
- Breadth-first page crawling
- Multithreaded crawling
- Collect links that match the pattern and store them in a file (e.g. URLs with a gif|png|jpg|bmp extension)
- Store the absolute URL of each link in result.data, one per line (images can also be saved directly to local disk)
- Handle both relative and absolute paths when extracting links from HTML (illustrated in the sketch after this list)
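A short sketch of the two URL rules above (the URLs are made-up examples; the pattern is the one from spider.conf):

import re
from urllib.parse import urljoin

# same pattern as filter_url in spider.conf
filter_url = re.compile(r'.*\.(gif|png|jpg|bmp)$')
print(bool(filter_url.match('http://example.com/img/logo.png')))  # True  -> saved to result.data
print(bool(filter_url.match('http://example.com/index.html')))    # False -> only crawled for sub-links

# relative links extracted from HTML are resolved against the page URL
print(urljoin('http://example.com/a/index.html', '../img/logo.png'))
# http://example.com/img/logo.png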
mini_spider.py
#!/usr/bin/env python
################################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
This module is the main module
@Time : 2020/11/09
@File : mini_spider.py
@Author : [email protected]
"""
import log
from worker.SpiderWorker import SpiderWorker
from worker.param_parser import parm_parser
def main():
"""
Main method to run mini spider
"""
# get input params
args = parm_parser.get_args()
# init log config
log.init_log('./log/mini_spider')
if args:
# read config file spider.conf
conf_params = parm_parser.set_config_by_file(args.conf)
# use config set up spider initial params
spider = SpiderWorker(conf_params)
# init result_path, make it complete
spider.set_path()
# init url queue
spider.set_url_queue()
# start to crawl url
spider.start_crawl_work()
return
if __name__ == '__main__':
main()
spider.conf
[spider]
feedfile: http://xxx.xxx.com
result: ./result.data
max_depth: 6
crawl_interval: 1
crawl_timeout: 2
thread_count: 8
filter_url: .*\.(gif|png|jpg|bmp)$
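For reference, these values can be read back with the standard configparser module, which is how param_parser.set_config_by_file (shown further below) loads them:

import configparser

config = configparser.ConfigParser()
config.read('spider.conf', encoding='utf-8')
section = config['spider']
print(section['feedfile'], int(section['max_depth']), section['filter_url'])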
SpiderThread.py (multithreading module)
#!/usr/bin/env python
################################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
This module is the threading module; it enables multithreaded processing of crawl requests
@Time : 2020/11/09
@File : SpiderThread.py
@Author : [email protected]
"""
import logging
import re
import time
import threading
from worker.UrlHandler import UrlHandler
class SpiderThread(threading.Thread):
    """
    Provide multi thread for mini spider
    """
    # lock shared by all SpiderThread instances; it protects total_urlset
    # (a per-instance lock would not serialize access across threads)
    lock = threading.Lock()
def __init__(self, urlqueue, result_path, max_depth, interval, timeout, filter_url, total_urlset):
threading.Thread.__init__(self)
self.urlqueue = urlqueue
self.result_path = result_path
self.max_depth = max_depth
self.interval = interval
self.timeout = timeout
self.filter_url = filter_url
self.total_urlset = total_urlset
def can_download(self, url):
"""
        Judge whether the url can be downloaded; write your download rules here.
:param url: target url
:return: True, False
"""
if not UrlHandler.is_url(url):
return False
try:
# Regular expression matching image URL
pattern = re.compile(self.filter_url)
except Exception as e:
logging.error("the filter url %s is not re..compile fail: %s" % (self.filter_url, e))
return False
# if url length < 1 or url is not image type url
if len(url.strip(' ')) < 1 or not pattern.match(url.strip(' ')):
return False
# if url has been in total url set (avoid repeat downloads)
if url in self.total_urlset:
return False
return True
def run(self):
"""
Run crawling thread
Get task from queue and add sub url into queue, crawling page strategy -- BFS.
:return: no return
"""
while True:
try:
# get url and the page level
url, level = self.urlqueue.get(block=True, timeout=self.timeout)
except Exception as e:
                logging.error('url queue is empty, worker exits: %s' % e)
break
            # mark the task as done as soon as it is taken off the queue
            self.urlqueue.task_done()
# sleep interval
time.sleep(self.interval)
# judge if url can be download
if self.can_download(url):
UrlHandler.download_url(self.result_path, url)
                # hold the shared lock while adding the url to total_urlset
self.lock.acquire()
self.total_urlset.add(url)
self.lock.release()
# get the sub urls from url
suburls = UrlHandler.get_urls(url)
suburl_level = level + 1
# if sub url level larger than max_depth, stop crawling page deeper
if suburl_level > self.max_depth:
continue
for suburl in suburls:
self.urlqueue.put((suburl, suburl_level))
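A minimal usage sketch of this class, assuming a single thread and one seed URL (example.com is a placeholder); normally SpiderWorker (next section) creates and joins the threads:

from queue import Queue
from worker.SpiderThread import SpiderThread

urlqueue = Queue()
urlqueue.put(('http://example.com', 0))   # seed URL at depth 0
total_urlset = set()
worker = SpiderThread(urlqueue, './result.data', max_depth=1, interval=1,
                      timeout=2, filter_url=r'.*\.(gif|png|jpg|bmp)$',
                      total_urlset=total_urlset)
worker.start()
worker.join()   # the thread exits once the queue stays empty for `timeout` seconds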
SpiderWorker.py (main worker module)
#!/usr/bin/env python
################################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
This module is the main worker, the central module for crawling tasks
@Time : 2020/11/09
@File : SpiderWorker.py
@Author : [email protected]
"""
import os
from queue import Queue
import logging
from worker.SpiderThread import SpiderThread
class SpiderWorker(object):
    """
    Central worker: sets up the url queue and drives the crawler threads
    """
    def __init__(self, *args, **kwargs):
        # args[0] is the param tuple returned by parm_parser.set_config_by_file
        params = args[0]
self.urls = params[0]
self.result_path = params[1]
self.maxdepth = params[2]
self.interval = params[3]
self.timeout = params[4]
self.thread_count = params[5]
self.filter_url = params[6]
self.total_urlset = set()
self.urlqueue = Queue()
    def set_abs_dir(self, path):
        """
        Build the absolute result path and create its parent directory if it does not exist
        :param path: result path from the config file
        :return: absolute result path
        """
        file_path = os.path.join(os.getcwd(), path)
        file_dir = os.path.dirname(file_path)
        if not os.path.exists(file_dir):
            try:
                os.makedirs(file_dir)
            except os.error as err:
                logging.error("failed to create result directory: %s. " % err)
        return str(file_path)
def set_path(self):
"""
Complete the path
:return:
"""
self.result_path = self.set_abs_dir(self.result_path)
def set_url_queue(self):
"""
Set url queue
:return: True or False
"""
try:
self.urlqueue.put((self.urls, 0))
except Exception as e:
logging.error(e)
return False
return True
def start_crawl_work(self):
"""
Start to work
:return: nothing
"""
thread_list = []
for i in range(self.thread_count):
thread = SpiderThread(self.urlqueue, self.result_path, self.maxdepth, self.interval,
self.timeout, self.filter_url, self.total_urlset)
thread_list.append(thread)
logging.info("%s start..." % thread.name)
thread.start()
for thread in thread_list:
thread.join()
logging.info("thread %s work is done " % thread.name)
self.urlqueue.join()
logging.info("queue is all done")
return
UrlHandler.py (URL handling and HTTP request module)
#!/usr/bin/env python
################################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
This module is used to handle URL and HTTP related requests
@Time : 2020/11/09
@File : UrlHandler.py
@Author : [email protected]
"""
import os
from urllib import parse, request
import logging
import chardet
from bs4 import BeautifulSoup
import requests
class UrlHandler(object):
"""
    Public url tools for handling urls
"""
@staticmethod
def is_url(url):
"""
        Ignore urls that start with javascript
:param url:
:return: True or False
"""
if url.startswith("javascript"):
return False
return True
@staticmethod
def get_content(url, timeout=10):
"""
Get html contents
:param url: the target url
:param timeout: request timeout, default 10
:return: content of html page, return None when error happens
"""
try:
response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
logging.error("url %s request error : %s" % (url, e))
return None
except Exception as e:
logging.error(e)
return None
return UrlHandler.decode_html(response.content)
@staticmethod
def decode_html(content):
"""
Decode html content
:param content: origin html content
        :return: decoded html content, or None on error
"""
        encoding = chardet.detect(content)['encoding']
        if encoding == 'GB2312':
            encoding = 'GBK'
        elif not encoding:
            # fall back to utf-8 when chardet cannot detect the encoding
            encoding = 'utf-8'
try:
content = content.decode(encoding, 'ignore')
except Exception as err:
logging.error("Decode error: %s.", err)
return None
return content
@staticmethod
def get_urls(url):
"""
Get all suburls of this url
:param url: origin url
:return: the set of sub_urls
"""
urlset = set()
if not UrlHandler.is_url(url):
return urlset
content = UrlHandler.get_content(url)
if content is None:
return urlset
        # parse the page once and collect tags that may carry links
        soup = BeautifulSoup(content, 'html.parser')
        tag_list = ['img', 'a', 'style', 'script']
        linklist = []
        for tag in tag_list:
            linklist.extend(soup.find_all(tag))
        # collect urls from 'src' and 'href' attributes
for link in linklist:
if link.has_attr('src'):
urlset.add(UrlHandler.parse_url(link['src'], url))
if link.has_attr('href'):
urlset.add(UrlHandler.parse_url(link['href'], url))
return urlset
@staticmethod
def parse_url(url, base_url):
"""
Parse url to make it complete and standard
:param url: the current url
:param base_url: the base url
:return: completed url
"""
if url.startswith('http') or url.startswith('//'):
url = parse.urlparse(url, scheme='http').geturl()
else:
url = parse.urljoin(base_url, url)
return url
@staticmethod
def download_image_file(result_dir, url):
"""
Download image as file, save in result dir
:param result_dir: base_path
:param url: download url
:return: succeed True, fail False
"""
if not os.path.exists(result_dir):
try:
os.mkdir(result_dir)
except os.error as err:
logging.error("download to path, mkdir errror: %s" % err)
try:
path = os.path.join(result_dir, url.replace('/', '_').replace(':', '_')
.replace('?', '_').replace('\\', '_'))
logging.info("download url..: %s" % url)
request.urlretrieve(url, path, None)
except Exception as e:
logging.error("download url %s fail: %s " % (url, e))
return False
return True
@staticmethod
def download_url(result_file, url):
"""
        Save the URL that matches the pattern into the result file, one per line
:param result_file: base_path
:param url: download url
:return: succeed True, fail False
"""
try:
path = os.path.join(os.getcwd(), result_file)
logging.info("download url..: %s" % url)
with open(path, 'a') as f:
f.write(url + '\n')
except Exception as e:
logging.error("download url %s fail: %s " % (url, e))
return False
return True
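A quick illustration of UrlHandler.parse_url with a hypothetical page URL:

from worker.UrlHandler import UrlHandler

base = 'http://example.com/gallery/index.html'
print(UrlHandler.parse_url('pics/cat.jpg', base))               # http://example.com/gallery/pics/cat.jpg
print(UrlHandler.parse_url('//cdn.example.com/dog.png', base))  # http://cdn.example.com/dog.png
print(UrlHandler.parse_url('http://example.com/a.gif', base))   # absolute URLs pass through unchanged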
param_parser.py (parameter parsing module)
#!/usr/bin/env python
################################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
This module is used to parse params
@Time : 2020/11/09
@File : param_parser.py
@Author : [email protected]
"""
import argparse
import logging
import configparser
class parm_parser(object):
    """
    Parse command line arguments and the spider config file
    """
@staticmethod
def set_config_by_file(config_file):
"""
Set spiderworker params by config file
:param : config file
:return: True, False
"""
config = configparser.ConfigParser()
config.read(config_file, encoding='utf-8')
        urls = config['spider']['feedfile']  # feedfile path
        result_path = config['spider']['result']  # result storage file
        max_depth = config['spider']['max_depth']  # max crawl depth
        crawl_interval = config['spider']['crawl_interval']  # crawl interval
        crawl_timeout = config['spider']['crawl_timeout']  # crawl timeout
        thread_count = config['spider']['thread_count']  # number of crawl threads
        filter_url = config['spider']['filter_url']  # URL pattern
return urls, result_path, int(max_depth), int(crawl_interval), int(crawl_timeout), int(thread_count), filter_url
@staticmethod
def get_args():
"""
Get console args and parse
:return: nothing
"""
try:
            parser = argparse.ArgumentParser(prog='other_mini_spider',
                                             usage='python mini_spider.py -c spider.conf [-v]',
                                             description='other_mini_spider is a multithreaded crawler')
            parser.add_argument('-c', '--conf', help='path of the config file')
            parser.add_argument('-v', '--version', help='print version and exit', action="store_true")
except argparse.ArgumentError as e:
logging.error("get option error : %s." % e)
return
args = parser.parse_args()
if args.version:
parm_parser.version()
if args.conf:
return args
@staticmethod
def version():
"""
Print mini spider version
"""
print("other_mini_spider version 1.0.0")