#-*- coding: UTF-8 -*-
# 通过继承方式获取网站标题
from multiprocessing import Process
import requests,re
import logging
# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
class Spider(Process):
    """Worker process that downloads pages and logs each page's <title> text.

    URLs are consumed from the front of the list passed to the constructor
    when the process runs.
    """

    def __init__(self, l):
        """Store the work list and prepare the title regex and request headers.

        Args:
            l: Mutable list of URL strings; ``run`` pops entries off its front.
        """
        super().__init__()
        # Case-insensitive match for a bare <title>...</title> element.
        self.title_re = re.compile(r'<title>([^<]+)</title>', re.I)
        self.urls = l
        # Headers sent with every request (a Baiduspider user-agent string).
        self.headers = {
            "user-agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
        }

    def run(self):
        """Process entry point (overrides ``Process.run``): fetch and log titles."""
        while self.urls:
            url_s = self.urls.pop(0)
            html = self.down_url(url_s)
            if html is None:
                # down_url has already logged the failure; nothing to parse.
                continue
            title = self.get_title(html)
            logging.info(f'获得标题{title}')

    def get_title(self, html):
        """Return the concatenated text of every <title> match in ``html``.

        Args:
            html: Page source as a string, or None when the download failed.

        Returns:
            The matched title text joined into one string, or ``''`` when
            ``html`` is None/empty or contains no match.
        """
        if not html:
            return ''
        # findall returns a list of captured groups; join flattens it to a str.
        return ''.join(self.title_re.findall(html))

    def down_url(self, l, reserv=3):
        """Download ``l`` and return its body decoded as UTF-8.

        Args:
            l: URL to fetch.
            reserv: Number of extra attempts allowed after a timeout, so at
                most ``reserv + 1`` requests are made. Values below zero are
                treated as zero.

        Returns:
            The response text, or None when every attempt timed out or the
            request failed for another reason (the error is logged).
        """
        last_err = None
        for _ in range(max(reserv, 0) + 1):
            try:
                res = requests.get(l, headers=self.headers, timeout=5)
            except requests.exceptions.Timeout as err:
                # requests raises its own Timeout type, not the builtin
                # TimeoutError; only timeouts are worth retrying.
                last_err = err
                continue
            except requests.exceptions.RequestException as err:
                # Connection errors, invalid URLs, etc.: retrying will not help.
                last_err = err
                break
            # The body is always decoded as UTF-8, regardless of what the
            # server declared.
            res.encoding = 'utf-8'
            return res.text
        logging.error(f'下载url={l},输出err={last_err}')
        return None
if __name__ == '__main__':
    # Read one URL per line from ss.txt. Blank lines are skipped so that an
    # empty string is never handed to the downloader as a URL.
    with open('ss.txt', 'r', encoding='utf-8') as f:
        url = [links.strip() for links in f if links.strip()]
    # Run the spider in a child process and wait for it to finish.
    p = Spider(url)
    p.start()
    p.join()
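# Summary: the page title is obtained by subclassing Process.
# You may also like
# Reposted from blog.csdn.net/haohaomax1/article/details/109675834
# Today's recommendations
# Weekly ranking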