Error: warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless ...')

Problem: Selenium has dropped support for PhantomJS; use a headless Firefox or Chrome browser instead.

Solution 1:

selenium version downgrade

pip installs the latest version by default (check the installed version with `pip show selenium`).
Uninstall it with `pip uninstall selenium`, then reinstall pinned to an older version: `pip install selenium==2.48.0`.

Solution 2:

Using a different browser, Chrome I used here;

Installation Guide online a lot https://segmentfault.com/a/1190000013940356

Example (collecting the contents of letters from the Beijing municipal government's public-mail page):

 

from lxml import etree
import requests
import csv
from selenium import webdriver
import time
import os
from selenium.webdriver.chrome.webdriver import WebDriver

# Create the output CSV, removing any file left over from a previous run.
outPath = 'D://xinfang_data.csv'
if os.path.exists(outPath):
    os.remove(outPath)
fp = open(outPath, 'wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('kind', 'time', 'processingDepartment', 'content'))

# Request headers (not used by the Selenium driver itself; kept for any
# requests-based calls).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}

# Create the browser object.  NOTE: must be lowercase `driver` — the functions
# below reference the global as `driver`, so `Driver` would raise NameError.
driver = webdriver.Chrome()

# Get information page 
def get_info(num):
    """Load the mail-list page, jump to page *num*, and return its HTML.

    Uses the module-level globals ``driver`` (Selenium Chrome driver) and
    ``url`` (set in the ``__main__`` block).

    :param num: 1-based page number to navigate to.
    :return: the page source HTML as a string.
    """
    driver.get(url)
    # Implicit wait: poll up to 10 s for elements to appear while the page's
    # JavaScript renders, instead of a fixed sleep.
    driver.implicitly_wait(10)
    driver.find_element_by_id('pageNum').clear()                        # clear the page-number input box
    driver.find_element_by_xpath('//*[@id="pageNum"]').send_keys(num)   # type the page number
    driver.find_element_by_xpath('//*[@id="judgeFlag"]/a').click()      # click the confirm button
    # Pause so the newly requested page finishes loading; without this the
    # first page's content is read back instead of page `num`.
    time.sleep(1)
    return driver.page_source

# Parse HTML documents, access to data 
def get_data(html):
    """Parse one page of the mail list and append each entry to the CSV.

    Named ``get_data`` (lowercase) to match the call site in ``__main__``;
    the original garbled definition read ``get_Data``.

    Uses the module-level ``writer`` (csv.writer) global.

    :param html: page-source HTML string as returned by ``get_info``.
    """
    selector = etree.HTML(html)
    infos = selector.xpath('//*[@id="mailul"]/div')
    for info in infos:
        kind = info.xpath('div[1]/a/font/text()')[0]
        # Local name `date` deliberately avoids shadowing the `time` module.
        date = info.xpath('div[2]/div[1]/div[1]/text()')[0]
        processing_department = info.xpath('div[2]/div[1]/div[2]/span/text()')[0]
        content = info.xpath('div[1]/a/span/text()')[0]
        # Clean up the scraped strings before writing them out.
        parsekind = kind.strip().strip('·[').strip(']')
        parsetime = date.strip().strip('Started:').replace('-', '/')
        parsepd = processing_department.strip().strip('Authority:')
        parsecontent = content.strip()
        # Append one row to the CSV.
        writer.writerow((parsekind, parsetime, parsepd, parsecontent))

if __name__ == '__main__':
    # Entry point of the Beijing public-mail search flow.
    url = 'http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow'
    # Crawl pages 1..999 of the mail list.
    for i in range(1, 1000):
        html = get_info(i)
        get_data(html)
        time.sleep(1)  # throttle requests to be polite to the server
View Code

 

Guess you like

Origin www.cnblogs.com/sengzhao666/p/12343920.html