[Selenium in Practice] How to Automatically Monitor the Number of Links Baidu Has Indexed for a Site

A while ago I wrote an article about using a Python crawler to scrape the results of Baidu's site: command automatically. That approach had two problems: it was unstable, and it only told you whether a site was indexed at all, not how many links were indexed, so it couldn't be used to track whether the indexed count was growing over time. I therefore rewrote the solution with Selenium, which monitors the indexed count precisely.

Install the dependencies
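The script pulls in three third-party packages from PyPI (selenium, requests, and lxml; json, re, and time ship with Python), so install them first if needed:

pip install selenium requests lxml

Then import everything the script uses: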

import json
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import requests
from lxml import etree
import time

Create a browser instance

myService = Service(r'./../chromedriver')
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # uncomment to run without opening a browser window
myChrome = webdriver.Chrome(service=myService, options=options)
myChrome.implicitly_wait(10)
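As a side note, if you are on Selenium 4.6 or newer, Selenium Manager can resolve a matching chromedriver for you, so the explicit Service path above becomes optional. A minimal sketch, assuming Selenium 4.6+:

options = webdriver.ChromeOptions()
myChrome = webdriver.Chrome(options=options)  # Selenium Manager locates a matching driver automatically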

Visit Baidu, type site:<domain> into the search box, and click the search button automatically

domain = 'jentian.com'
myChrome.get('https://www.baidu.com')
keywordInput = myChrome.find_element(By.ID, 'kw')  # 'kw' is the id of Baidu's search box
keywordInput.send_keys('site:' + domain)
searchBtn = myChrome.find_element(By.ID, 'su')  # 'su' is the id of the search button
searchBtn.click()
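An equivalent shortcut, if you prefer to skip the simulated typing, is to open the results page directly through Baidu's wd query parameter (a sketch, assuming that parameter keeps its current behavior):

from urllib.parse import quote_plus

myChrome.get('https://www.baidu.com/s?wd=' + quote_plus('site:' + domain))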

Extract the indexed-count string with XPath, then pull the number out with a regex

time.sleep(3)  # give the page a moment to redirect and render before parsing it
dom = etree.HTML(myChrome.page_source)
resultStringArr = dom.xpath('//*[@id="content_left"]/div[1]/div/p[1]/b/text()')
resultCount = 0
if len(resultStringArr) > 0:
    resultCountString = resultStringArr[0]
    # Baidu formats large counts with thousands separators (e.g. "1,230"),
    # so collect every digit group and join them back together
    resultCountGroup = re.compile(r'\d+').findall(resultCountString)
    if resultCountGroup:
        resultCount = ''.join(resultCountGroup)
if int(resultCount) > 0:
    msg = 'Baidu has indexed ' + domain + ', indexed link count: ' + str(resultCount)
else:
    msg = 'Baidu has not indexed ' + domain
print('Crawl finished!', msg, '\n')
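The fixed time.sleep(3) works but is brittle: too short and the results haven't rendered, too long and you wait for nothing. If you want to block only until the results container actually appears, Selenium's standard explicit-wait API can replace the sleep; a minimal sketch:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the results container referenced by the XPath above
WebDriverWait(myChrome, 10).until(
    EC.presence_of_element_located((By.ID, 'content_left'))
)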

Finally, send the result to a WeCom (Enterprise WeChat) group via a webhook, so the indexing data is reported automatically

qiWeiWebHook = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=**'  # replace with your own webhook URL
postHeaders = {
    'Content-Type': 'application/json'
}
msgData = {
    "msgtype": "text",
    "text": {
        "content": msg
    }
}
requests.post(qiWeiWebHook, headers=postHeaders, data=json.dumps(msgData))
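The call above fires and forgets. WeCom's bot endpoint answers with a JSON body whose errcode field is 0 on success (worth double-checking against the current WeCom docs), so a small hedged extension makes failures visible:

resp = requests.post(qiWeiWebHook, headers=postHeaders, data=json.dumps(msgData))
resp.raise_for_status()             # surface HTTP-level failures
result = resp.json()
if result.get('errcode') != 0:      # surface API-level failures
    print('WeCom webhook error:', result)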

Finally, to get a scheduled report, I wrapped everything in a loop that crawls and posts the result once an hour. The complete code:

# Check whether a domain is indexed by Baidu by scraping the results of a site: query
import json
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import requests
from lxml import etree
import time

def crawlBaiduPickupData():
    myService = Service(r'./../chromedriver')
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # uncomment to run without opening a browser window
    myChrome = webdriver.Chrome(service=myService, options=options)
    myChrome.implicitly_wait(10)
    domain = 'jentian.com'
    while True:
        myChrome.get('https://www.baidu.com')
        keywordInput = myChrome.find_element(By.ID, 'kw')
        keywordInput.send_keys('site:' + domain)
        searchBtn = myChrome.find_element(By.ID, 'su')
        searchBtn.click()
        time.sleep(3)  # give the page a moment to redirect and render before parsing it
        dom = etree.HTML(myChrome.page_source)
        resultStringArr = dom.xpath('//*[@id="content_left"]/div[1]/div/p[1]/b/text()')
        resultCount = 0
        if len(resultStringArr) > 0:
            resultCountString = resultStringArr[0]
            resultCountGroup = re.compile(r'\d+').findall(resultCountString)
            if resultCountGroup:
                resultCount = ''.join(resultCountGroup)
        if int(resultCount) > 0:
            msg = 'Baidu has indexed ' + domain + ', indexed link count: ' + str(resultCount)
        else:
            msg = 'Baidu has not indexed ' + domain
        print('Crawl finished!', msg, '\n')
        qiWeiWebHook = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=*'
        postHeaders = {
            'Content-Type': 'application/json'
        }
        msgData = {
            "msgtype": "text",
            "text": {
                "content": msg
            }
        }
        requests.post(qiWeiWebHook, headers=postHeaders, data=json.dumps(msgData))
        time.sleep(3600)  # check again every hour

if __name__ == '__main__':
    crawlBaiduPickupData()
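As a quick sanity check of the count-extraction logic, here is how the regex behaves on a sample string (the sample is hypothetical, modeled on the "找到相关结果约…个" wording of Baidu's results page):

import re

sample = '百度为您找到相关结果约1,230个'        # hypothetical sample result string
print(''.join(re.findall(r'\d+', sample)))  # -> '1230'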

Reposted from blog.csdn.net/one_and_only4711/article/details/126557452