爬虫-最终版

#!/usr/bin/env python3
# -*- coding:utf-8 -*-

import json
from datetime import date, timedelta
from re import search, findall, compile
from time import sleep

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Locators and patterns used while walking the portal.
_PORTAL_URL = "https://www.cuecp.cn/portal/index.jhtml"
_EXPECTED_TITLE = "中国联通合作方门户"
_NOTICE_LINK_XPATH = '//*[@id="notic_content"]/div[2]/div[1]/span[2]/a'
_SEARCH_BUTTON_XPATH = '//*[@id="content_list"]/div[1]/table/tbody/tr[3]/td[4]/img'
_ENTRY_LINK_XPATH = '//*[@id="title_list"]/ul/li[%d]/span[1]/div/a'
_DETAIL_BODY_XPATH = '//*[@id="content_list"]/div[3]'
_ENTRY_TEXT_PATTERN = r'<a onclick.*>(.*)</a>'
_FAILED_TENDER_PATTERN = r'不足|失败'
# The title text is surrounded by whitespace inside the div, hence the \s*.
_DETAIL_TITLE_PATTERN = r'<div class="content_title">\s*(.*)\s*</div>'


def _open_filtered_list(driver, province, query_day):
    """Open the notice list in a new window and filter it by province and day.

    Leaves the driver focused on the newly opened list window.
    """
    # The link that opens the list lives on the first (portal) window.
    driver.switch_to.window(driver.window_handles[0])
    driver.find_element(By.XPATH, _NOTICE_LINK_XPATH).click()
    sleep(1)  # give the new window time to appear before switching to it
    driver.switch_to.window(driver.window_handles[-1])

    Select(driver.find_element(By.NAME, "attribute4")).select_by_visible_text(province)
    for field in ("start_time_from", "start_time_to"):
        # The date inputs are read-only; drop the attribute so keys can be sent.
        driver.execute_script("$('input[name=%s]').removeAttr('readonly')" % field)
        driver.find_element(By.NAME, field).send_keys(query_day)
    driver.find_element(By.XPATH, _SEARCH_BUTTON_XPATH).click()


def _read_entry(driver, position):
    """Open the list entry at 1-based ``position`` and return its formatted text.

    The detail window opened by the click is closed again before returning.
    Raises IndexError if the detail page has no ``content_title`` div.
    """
    driver.find_element(By.XPATH, _ENTRY_LINK_XPATH % position).click()
    driver.switch_to.window(driver.window_handles[-1])
    html = driver.execute_script("return document.documentElement.outerHTML")
    title = findall(_DETAIL_TITLE_PATTERN, html)[0]
    content = driver.find_element(By.XPATH, _DETAIL_BODY_XPATH).text
    driver.close()
    return title + '\r\n' + content + '\r\n\r\n'


def get_messages(province, days_back=2):
    """Collect one day's notices for ``province`` from the cuecp.cn portal.

    Drives a Chrome window: opens the portal, opens the notice list, filters
    it by province and by a single publication day, then opens every listed
    entry whose link text does not match ``不足|失败`` and gathers its title
    and body text.

    Args:
        province: Visible text of the option to pick in the ``attribute4``
            drop-down (e.g. "浙江").
        days_back: How many days before today the queried day lies.
            NOTE(review): the default of 2 keeps the original offset, although
            the original local variable was named ``yesterday`` -- confirm
            whether 1 was intended.

    Returns:
        A string starting with ``发布日期:<day>`` followed either by one
        ``title / content`` section per entry or by ``<day>无招标信息`` when
        nothing matched.

    Raises:
        AssertionError: if the portal page title is not the expected one.
        IndexError: if a detail page lacks the expected title element.
    """
    query_day = str(date.today() - timedelta(days=days_back))
    parts = ['发布日期:' + query_day + '\r\n\r\n']

    # NOTE: this opens a visible browser; the original author's headless
    # Options setup was left commented out and marked as misconfigured.
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(5)
        driver.get(_PORTAL_URL)
        # Explicit raise instead of `assert` so the check survives `python -O`;
        # the exception type is unchanged.
        if _EXPECTED_TITLE not in driver.title:
            raise AssertionError("unexpected portal title: %r" % (driver.title,))

        _open_filtered_list(driver, province, query_day)
        listing = driver.find_element(By.ID, "title_list").get_attribute('innerHTML')
        # 1-based positions (XPath li[] indexing) of the entries worth opening.
        wanted = [
            position
            for position, link_text in enumerate(findall(_ENTRY_TEXT_PATTERN, listing), 1)
            if not search(_FAILED_TENDER_PATTERN, link_text)
        ]

        if not wanted:
            parts.append(query_day + '无招标信息')
        for count, position in enumerate(wanted):
            if count:
                # The first entry is read from the list opened above; for each
                # later one a fresh, re-filtered list window is opened.
                _open_filtered_list(driver, province, query_day)
            parts.append(_read_entry(driver, position))
    finally:
        # Always end the session: closes every window that was opened and
        # stops the driver process, even when scraping fails midway.
        driver.quit()

    return ''.join(parts)

class Send_Message():
    """Send a text message through the qyapi.weixin.qq.com (WeChat Work) API.

    Args:
        text: Message body to send.
        corpid: Corp id used to obtain the access token; defaults to CORP_ID.
        corpsecret: App secret used to obtain the access token; defaults to
            CORP_SECRET.
        agentid: Id of the application the message is sent from; defaults to
            AGENT_ID.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    # SECURITY: these credentials are committed in source. They remain the
    # defaults so existing callers keep working, but they should be rotated
    # and supplied through the constructor arguments instead.
    CORP_ID = 'wwb1783c01ce91f3c2'
    CORP_SECRET = r'7YSBxSF31WkqrwI7_-oqztCM77BJlHXAye7OuEa718Y'
    AGENT_ID = '1000003'
    API_BASE = 'https://qyapi.weixin.qq.com/cgi-bin'

    def __init__(self, text, corpid=None, corpsecret=None, agentid=None, timeout=10):
        self.text = text
        self.corpid = self.CORP_ID if corpid is None else corpid
        self.corpsecret = self.CORP_SECRET if corpsecret is None else corpsecret
        self.agentid = self.AGENT_ID if agentid is None else agentid
        self.timeout = timeout

    def a_token(self):
        """Request and return a fresh access token.

        Raises:
            KeyError: if the response carries no ``access_token``; the message
                includes the decoded response so the API error is visible.
        """
        params = {'corpid': self.corpid, 'corpsecret': self.corpsecret}
        # A timeout keeps a stalled connection from blocking the script forever.
        r = requests.get(url=self.API_BASE + '/gettoken', params=params,
                         timeout=self.timeout)
        payload = json.loads(r.text)
        try:
            return payload['access_token']
        except KeyError:
            raise KeyError(
                'gettoken response has no access_token: %r' % (payload,)
            ) from None

    def _build_payload(self):
        """Return the JSON-serialisable request body for message/send."""
        # NOTE(review): the API documentation reportedly truncates text
        # content beyond 2048 bytes -- confirm before sending long scrapes.
        return {
            "touser": "@all",
            "toparty": "first",
            "msgtype": "text",
            "agentid": self.agentid,
            "text": {"content": str(self.text)},
            "safe": 0,
        }

    def send_message(self):
        """Send ``self.text`` and return the raw response body as a string."""
        token = self.a_token()
        r = requests.post(
            self.API_BASE + '/message/send',
            params={'access_token': token},
            data=json.dumps(self._build_payload()),
            timeout=self.timeout,
        )
        return r.text

if __name__ == '__main__':
    # Script entry point: scrape the notices for Zhejiang province and push
    # the resulting text out as a single message.
    Send_Message(get_messages("浙江")).send_message()











猜你喜欢

转载自blog.csdn.net/YWF331/article/details/81017233