对前篇实现的python爬虫进行模块化、组件化。分为过滤器去重模块,提取链接模块,页面处理模块、登录模块和主流程模块,目前非常粗糙,有待完善,代码如下:
主流程模块,程序入口:main.py
"""Crawler entry point: BFS over mm.taobao.com pages, saving every image found."""
from collections import deque

import requests

from extracter.myextracter import myextracter
from filter.setfilter import setfilter
from handler.myhandler import myhandler

# Initialize components: URL queue, de-dup filter, link extractor, page handler.
queue = deque()
myfilter = setfilter()
handler = myhandler()
extracter = myextracter()

# Seed the crawl with the start page.
init_url = "https://mm.taobao.com/"
queue.append(init_url)
myfilter.add(init_url)
file_path = 'E:/mm/'  # directory where downloaded images are written
count = 0             # pages crawled so far
i = 1                 # sequence number for saved image files
s = requests.session()

while queue:
    url = queue.popleft()
    print('已经抓取:' + str(count) + '个,正在抓取-->' + url)
    count += 1
    # Download the page; skip it on any network/timeout error.
    try:
        r = s.get(url, timeout=2)
        r.encoding = 'UTF-8'
    except requests.RequestException:
        continue
    # Enqueue unseen same-site links. NOTE: the filter must store and test the
    # same (scheme-prefixed) form of the URL, otherwise duplicates are never
    # detected and pages get re-crawled endlessly.
    urls = extracter.extract_urls(r.text, 'a', 'href')
    for x in urls:
        full_url = "https:" + x
        if 'mm.taobao.com' in x and not myfilter.contains(full_url):
            queue.append(full_url)
            myfilter.add(full_url)
    # Download and save every image referenced by the page.
    links = extracter.extract_urls(r.text, 'img', 'src')
    for x in links:
        print("正在保存图片" + str(i) + "-->https:" + x)
        try:
            handler.save_file_binary(file_path + str(i) + ".jpg",
                                     s.get("https:" + x, timeout=2).content)
            i += 1
        except requests.RequestException:
            continue
过滤器去重模块:setfilter.py:
# Set-based filter used for URL de-duplication.
class setfilter(object):
    """Remembers links it has seen so the crawler never visits one twice."""

    def __init__(self):
        # Per-instance set. (A class-level `myfilter = set()` would be shared
        # by every setfilter instance — a classic mutable-class-attribute bug.)
        self.myfilter = set()

    def add(self, link):
        """Record *link* as seen."""
        self.myfilter.add(link)

    def contains(self, link):
        """Return True if *link* was added before, else False."""
        return link in self.myfilter

    def clear(self):
        """Forget every recorded link."""
        self.myfilter.clear()
提取链接模块:myextracter.py:
# Default page-link extraction, built on BeautifulSoup.
class myextracter:

    def extract_urls(self, page, tag, attr):
        """Return the set of *attr* values over all *tag* elements in *page*.

        page: HTML text to parse; tag: element name such as 'a';
        attr: attribute name such as 'href'. Elements that lack the
        attribute are silently skipped.
        """
        urls = set()
        data = BeautifulSoup(page, "html.parser")
        for element in data.findAll(tag):
            try:
                urls.add(element[attr])
            except KeyError:
                # Tag subscription raises KeyError when the attribute is
                # absent; only that case should be skipped, not every error.
                continue
        return urls
页面处理模块,需用户自定义:myhandler.py:
# Page handler; meant to be customized per crawl.
class myhandler:

    def save_file_binary(self, file_path, data):
        """Write the bytes *data* to *file_path* (e.g. a downloaded image)."""
        with open(file_path, 'wb') as f:
            f.write(data)

    def save_file_str(self, file_path, data):
        """Write the text *data* to *file_path* as UTF-8.

        The encoding is explicit: the built-in default is the platform
        locale, which breaks the non-ASCII page text this crawler handles.
        """
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(data)
登录模块,用于保持session:login.py:
# Custom login module: performs the login POST so the shared session
# carries the authenticated cookies afterwards.
class loginer:

    # Header set that makes the session present itself as a desktop
    # Firefox-era browser instead of the default requests user agent.
    headinfo = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Connection': 'Keep-Alive',
    }

    def login(self, session, url, data):
        """POST the credential payload *data* to *url* with the browser-like
        headers, then hand the (now cookie-carrying) session back."""
        session.post(url, data, headers=self.headinfo)
        return session