通过继承 Process 类,获取页面的标题

#-*- coding: UTF-8 -*-
# 通过继承方式获取网站标题
from multiprocessing import Process
import requests,re
import logging
# Configure the root logger's level: INFO and above are emitted, which
# includes the title messages logged by the spider.
logging.basicConfig(level=logging.INFO)

class Spider(Process):
    """Worker process that downloads pages and logs their ``<title>`` text.

    Every URL in the supplied list is fetched with ``requests``; the text of
    the ``<title>`` element(s) found in the response is logged at INFO level.
    """

    def __init__(self, l):
        """Store the URL list and prepare the title regex and HTTP headers.

        Args:
            l: List of URL strings to crawl. ``run`` pops entries off the
                front of this list as it works through it.
        """
        super().__init__()
        # Case-insensitive so that <TITLE> is matched as well.
        self.title_re = re.compile(r'<title>([^<]+)</title>', re.I)
        self.urls = l
        self.headers = {
            "user-agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
        }

    def run(self):
        """Fetch every queued URL and log its title (overrides ``Process.run``).

        A URL whose download failed is skipped with a warning so that one bad
        entry does not abort the remaining work.
        """
        while self.urls:
            url_s = self.urls.pop(0)
            html = self.down_url(url_s)
            if html is None:
                logging.warning(f'skip url={url_s}: download failed')
                continue
            title = self.get_title(html)
            logging.info(f'获得标题{title}')

    def get_title(self, html):
        """Return the concatenated text of all ``<title>`` matches in *html*.

        Args:
            html: Page source as a string, or ``None`` if the download failed.

        Returns:
            The joined title text, or ``''`` when *html* is ``None``/empty or
            contains no match.
        """
        if not html:
            return ''
        # findall yields a list of captured strings; join flattens it to str.
        return ''.join(self.title_re.findall(html))

    def down_url(self, l, reserv=3):
        """Download *l* and return its body decoded as UTF-8.

        Args:
            l: URL to fetch.
            reserv: Number of retries allowed after a timeout, in addition to
                the initial attempt.

        Returns:
            The response text, or ``None`` if the request timed out on every
            attempt or failed with any other request error.
        """
        retries_left = reserv
        while True:
            try:
                res = requests.get(l, headers=self.headers, timeout=5)
            except (requests.exceptions.Timeout, TimeoutError) as err:
                # requests raises its own Timeout class, which is not a
                # subclass of the builtin TimeoutError; catch both.
                if retries_left > 0:
                    retries_left -= 1
                    continue
                logging.error(f'下载url={l},输出err={err}')
                return None
            except requests.exceptions.RequestException as err:
                # Connection failures, malformed/empty URLs, etc. are not
                # worth retrying; report and give up on this URL.
                logging.error(f'下载url={l},输出err={err}')
                return None
            # The response body is always decoded as UTF-8.
            res.encoding = 'utf-8'
            return res.text

if __name__ == '__main__':
    # Read one URL per line from ss.txt. Blank lines are dropped so that an
    # empty string is never handed to the spider as a URL.
    with open('ss.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f if line.strip()]

    # Crawl in a child process and wait for it to finish.
    p = Spider(urls)
    p.start()
    p.join()

猜你喜欢

转载自blog.csdn.net/haohaomax1/article/details/109675834