一、实现原理
爬虫调度器首先要做的是初始化各个模块,然后通过 crawl(start_url) 方法传入入口 URL,方法内部实现按照运行流程控制各个模块的工作。
二、代码如下
from UrlManager import UrlManager
from HtmlDownloader import HtmlDownloader
from HtmlParser import HtmlParser
from DataOutput import DataOutput


class SpiderManager:
    """Scheduler that coordinates the crawler modules.

    Wires together the URL manager (frontier), HTML downloader, HTML
    parser, and data output, and drives them in a loop via ``crawl``.
    """

    def __init__(self):
        # Instantiate each collaborating module once; ``crawl`` reuses them.
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, start_url):
        """Run the crawl loop seeded with *start_url*.

        Repeatedly takes a pending URL from the frontier, downloads it,
        parses out new URLs and data records, feeds the URLs back into
        the frontier, and writes each record to CSV. Stops when the
        frontier is empty, then closes the output file.

        :param start_url: entry-point URL to seed the URL manager
        :return: None
        """
        self.manager.add_new_url(start_url)
        while self.manager.has_new_url():
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                # NOTE(review): the original passes start_url (not new_url)
                # to the parser — presumably HtmlParser only needs a base
                # URL to resolve relative links; confirm against HtmlParser
                # before changing.
                new_urls, new_datas = self.parser.parser(start_url, html)
                self.manager.add_new_urls(new_urls)
                for data in new_datas:
                    self.output.output_csv(data)
            except Exception as e:
                # Fix: include the exception in the message. The original
                # printed only a fixed string, silently discarding the
                # cause and making failures undiagnosable.
                print(f'爬取失败: {e}')
        # Frontier exhausted — flush and close the CSV output file.
        self.output.close_file()


if __name__ == '__main__':
    sm = SpiderManager()
    sm.crawl('https://movie.douban.com/top250?start=0')