[Selenium application practice] Automatically monitoring the number of links Baidu has indexed for a site

Some time ago I wrote an article about using a Python crawler to automatically scrape the results of Baidu's site: command. That solution had two problems: it was unstable, and it only determined whether the site was indexed at all, not how many links were indexed, so it could not be used to track whether the number of indexed pages was growing over time. This article presents an implementation based on Selenium that can accurately monitor the indexed-link count.

Import the dependencies

The selenium, requests, and lxml packages must be installed first (e.g. via pip), along with a chromedriver that matches your local Chrome version.

import json
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import requests
from lxml import etree
import time

Create a browser instance

myService = Service(r'./../chromedriver')  # path to the chromedriver executable
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # uncomment to run without opening a browser window
myChrome = webdriver.Chrome(service=myService, options=options)
myChrome.implicitly_wait(10)  # wait up to 10 seconds for elements to appear
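
If you want to run without a visible browser window, a minimal headless sketch looks like this (the --window-size flag is my own addition, since some pages render differently at the default headless viewport):

# Headless setup: Chrome runs without opening a window
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--window-size=1920,1080')
myChrome = webdriver.Chrome(service=Service(r'./../chromedriver'), options=options)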

Visit the Baidu search engine, enter site:[domain name] into the search box, and click the search button

domain = 'jentian.com'
myChrome.get('https://www.baidu.com')
keywordInput = myChrome.find_element(By.ID, 'kw')  # the search input box
keywordInput.send_keys('site:' + domain)
searchBtn = myChrome.find_element(By.ID, 'su')  # the "Baidu Search" button
searchBtn.click()
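
As an aside, the same search can be triggered without simulating keystrokes by loading Baidu's results URL directly. A sketch, assuming the standard wd query parameter:

from urllib.parse import quote

# Load the results page for "site:<domain>" in a single request;
# wd carries the URL-encoded search keywords
myChrome.get('https://www.baidu.com/s?wd=' + quote('site:' + domain))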

Extract the result-count string with XPath and pull out the number with a regular expression

time.sleep(3)  # wait a moment after clicking search: the redirect and results need time to load
dom = etree.HTML(myChrome.page_source)
# the <b> element above the first result holds the total result count
resultStringArr = dom.xpath('//*[@id="content_left"]/div[1]/div/p[1]/b/text()')
resultCount = 0
if len(resultStringArr) > 0:
    resultCountString = resultStringArr[0]
    resultCountGroup = re.compile(r'\d+').findall(resultCountString)
    if resultCountGroup:
        # the count may contain thousands separators, so join all digit runs
        resultCount = int(''.join(resultCountGroup))
if resultCount > 0:
    msg = 'Baidu has indexed ' + domain + ', number of indexed links: ' + str(resultCount)
else:
    msg = 'Baidu has not indexed ' + domain
print('Crawl finished!!!', msg, '\n')
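
The fixed three-second sleep is fragile: on a slow connection the results may not have loaded yet, and on a fast one it wastes time. A sketch using Selenium's explicit waits instead, assuming the results container keeps the id content_left:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until the results container appears,
# then parse the page as before
WebDriverWait(myChrome, 10).until(
    EC.presence_of_element_located((By.ID, 'content_left'))
)
dom = etree.HTML(myChrome.page_source)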

Finally, send the crawl result to an enterprise WeChat (WeCom) group via a webhook, so the indexing data is reported automatically.

qiWeiWebHook = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=**'  # replace with your own webhook URL
postHeaders = {
    'Content-Type': 'application/json'
}
msgData = {
    "msgtype": "text",
    "text": {
        "content": msg
    }
}
requests.post(qiWeiWebHook, headers=postHeaders, data=json.dumps(msgData))
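
The webhook call can fail silently, e.g. with an invalid key. The WeCom webhook responds with a JSON body containing an errcode field (0 on success), so a minimal check could look like this:

resp = requests.post(qiWeiWebHook, headers=postHeaders, data=json.dumps(msgData))
result = resp.json()
if result.get('errcode') != 0:
    # a non-zero errcode means the push was rejected
    print('WeCom push failed:', result)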

Finally, to report on a schedule, I wrapped the whole flow in a loop that crawls and pushes the indexing result once an hour. The complete code is as follows:

# Check whether a domain is indexed by Baidu by scraping the results of its site: command
import json
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import re
import requests
from lxml import etree
import time

def crawlBaiduPickupData():
    myService = Service(r'./../chromedriver')  # path to the chromedriver executable
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # uncomment to run without opening a browser window
    myChrome = webdriver.Chrome(service=myService, options=options)
    myChrome.implicitly_wait(10)
    domain = 'jentian.com'
    while True:
        myChrome.get('https://www.baidu.com')
        keywordInput = myChrome.find_element(By.ID, 'kw')
        keywordInput.send_keys('site:' + domain)
        searchBtn = myChrome.find_element(By.ID, 'su')
        searchBtn.click()
        time.sleep(3)  # wait a moment after clicking search: the redirect and results need time to load
        dom = etree.HTML(myChrome.page_source)
        resultStringArr = dom.xpath('//*[@id="content_left"]/div[1]/div/p[1]/b/text()')
        resultCount = 0
        if len(resultStringArr) > 0:
            resultCountString = resultStringArr[0]
            resultCountGroup = re.compile(r'\d+').findall(resultCountString)
            if resultCountGroup:
                resultCount = int(''.join(resultCountGroup))
        if resultCount > 0:
            msg = 'Baidu has indexed ' + domain + ', number of indexed links: ' + str(resultCount)
        else:
            msg = 'Baidu has not indexed ' + domain
        print('Crawl finished!!!', msg, '\n')
        qiWeiWebHook = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=*'  # replace with your own webhook URL
        postHeaders = {
            'Content-Type': 'application/json'
        }
        msgData = {
            "msgtype": "text",
            "text": {
                "content": msg
            }
        }
        requests.post(qiWeiWebHook, headers=postHeaders, data=json.dumps(msgData))
        time.sleep(3600)  # check again every hour

if __name__ == '__main__':
    crawlBaiduPickupData()
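
One caveat with the loop above: if Baidu is slow to load or changes its page structure, an uncaught exception kills the monitor, and the browser process is never released. A hardening sketch (my own addition, not part of the original code) that keeps the loop alive and quits the driver on exit:

def crawlBaiduPickupDataSafely():
    myService = Service(r'./../chromedriver')
    myChrome = webdriver.Chrome(service=myService, options=webdriver.ChromeOptions())
    myChrome.implicitly_wait(10)
    try:
        while True:
            try:
                pass  # one crawl-and-report pass, same steps as in crawlBaiduPickupData
            except Exception as e:
                print('Crawl failed, will retry next hour:', e)  # keep the monitor alive
            time.sleep(3600)
    finally:
        myChrome.quit()  # always release the browser process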
