利用selenium并使用gevent爬取动态网页数据

首先要下载相应的库

gevent协程库：pip install gevent

selenium模拟浏览器访问库：pip install selenium

selenium 需要配置与浏览器对应的驱动，配置方法参见：https://www.cnblogs.com/Niuxingyu/p/10490882.html

# Imports
import gevent
# Monkey patch the standard library so blocking calls cooperate with gevent;
# this is done before the remaining imports, in particular requests.
from gevent import monkey
monkey.patch_all()
import os
import re
import time
import requests
from lxml import etree
# Browser automation
from selenium import webdriver

# Module-level list of links; GeventSpider.get_xpath rebinds it to the hrefs
# it extracts from the most recently parsed page.
global_url_list = []

# Spider class
class GeventSpider(object):
    """Scrape a listing page with Selenium and its inner pages with requests.

    ``run`` renders a page in Chrome (or reuses a copy cached on disk) and
    extracts the article links from it; ``get_inner`` downloads one inner
    page and prints the text of its ``<h1>`` elements.
    """

    # Links extracted by the most recent get_xpath() call. get_xpath() rebinds
    # this name on the instance, so the class-level list itself is not mutated.
    encloseing_url_list = []

    # The listing page, which gets its own cache file.
    _HOME_URL = 'http://military.cctv.com/'

    # Compiled once here instead of on every get_inner() call.
    _TITLE_RE = re.compile(r'<h1>(.+?)</h1>', re.I)

    @classmethod
    def _cache_file_name(cls, url):
        """Return the name of the cache file used for *url*."""
        # NOTE(review): every URL other than the home page maps to the same
        # file, so a cached inner page would be reused for any other inner URL.
        if url == cls._HOME_URL:
            return 'test_cctv.html'
        return 'inner_cctv.html'

    @staticmethod
    def _render_page(url):
        """Load *url* in Chrome and return the rendered page source."""
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            # Pause before taking the snapshot so scripts on the page get a
            # moment to fill in the DOM.
            time.sleep(1)
            return browser.page_source
        finally:
            # Always close the browser, even if loading the page failed.
            browser.quit()

    # Fetch (or load from cache) a page and extract its links
    def run(self, url):
        """Obtain the HTML for *url* and pass it to ``get_xpath``.

        If the cache file for *url* does not exist, the page is rendered in
        Chrome and the HTML is written to that file; otherwise the cached
        file is read and no browser is started.
        """
        file_name = self._cache_file_name(url)
        cache_path = './' + file_name
        if os.path.exists(file_name):
            # Reuse the cached copy.
            with open(cache_path, encoding='utf-8') as f:
                html_content = f.read()
        else:
            html_content = self._render_page(url)
            # Write the cache with an explicit encoding.
            with open(cache_path, 'w', encoding='utf-8') as f:
                f.write(html_content)
        self.get_xpath(html_content)

    # Extract the links from a page
    def get_xpath(self, html):
        """Parse *html* and store the hrefs found under ``span.l > a``.

        The result is stored both in the module-level ``global_url_list`` and
        in ``self.encloseing_url_list``.
        """
        tree = etree.HTML(html)
        html_data_url = tree.xpath('//span[@class="l"]/a/@href')
        # Publish the result through the module-level variable...
        global global_url_list
        global_url_list = html_data_url
        # ...and through the instance attribute.
        self.encloseing_url_list = html_data_url

    # Crawl one inner page
    def get_inner(self, url, timeout=10):
        """Download *url* and print the list of its ``<h1>`` contents.

        Args:
            url: Address of the inner page.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot block this greenlet indefinitely.
        """
        r = requests.get(url, timeout=timeout)
        html = r.content.decode('utf-8')
        print(self._TITLE_RE.findall(html))


if __name__ == "__main__":
    # Create the spider and crawl the landing page directly; a single
    # request gains nothing from running in a greenlet.
    spider = GeventSpider()
    start_urls = ['http://military.cctv.com/']
    spider.run(start_urls[0])

    # Take the inner-page links from the instance attribute. The same list is
    # also available through the module-level global_url_list.
    inner_urls = spider.encloseing_url_list
    # inner_urls = global_url_list

    # Spawn one greenlet per inner page and collect them in a list.
    jobs = []
    for link in inner_urls:
        jobs.append(gevent.spawn(spider.get_inner, link))

    # Block until every greenlet has finished.
    gevent.joinall(jobs)

利用selenium并使用gevent爬取动态网页数据

猜你喜欢