Problem: Selenium has dropped support for PhantomJS; it is recommended to use headless Firefox or headless Chrome instead.
Solution 1:
Downgrade the selenium version.
`pip install selenium` installs the latest version by default (check the installed
version with `pip show selenium`).
Uninstall it with `pip uninstall selenium`, then reinstall a specific version with `pip install selenium==2.48.0`.
Solution 2:
Use a different browser; Chrome is used here.
There are many installation guides online, e.g. https://segmentfault.com/a/1190000013940356
Example: collecting the contents of letters from the Beijing municipal government's public-letters page.
from lxml import etree import requests import csv from selenium import webdriver import time import os from selenium.webdriver.chrome.webdriver import WebDriver #创建csv outPath = 'D://xinfang_data.csv' if (os.path.exists(outPath)): os.remove(outPath) fp = open(outPath, 'wt', newline='', encoding='utf-8') # 创建csv writer = csv.writer(fp) writer.writerow(('kind', 'time', 'processingDepartment', 'content')) #请求头 headers = { 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36' } # Create a browser object Driver = webdriver.Chrome () # Get information page DEF get_info (NUM): driver.get(url) driver.implicitly_wait ( 10) # Implicit Wait n seconds, JavaScript interpretation takes time, if short data can not be normally acquired, the waste of time if the long; implicitly_wait () waits for a given time the smart # driver.find_element_by_xpath ( '/ / * [@ ID = "pageNum"] '). Clear () driver.find_element_by_id ( ' pageNum ' ) .clear () # clear input frame # driver.find_element_by_id (' pageNum '). send_keys (NUM) driver.find_element_by_xpath ( ' // * [@ ID = "pageNum"] ' ) .send_keys (NUM) # input pages driver.find_element_by_xpath ( ' // * [@ ID = "judgeFlag"] / A ' ) .click () # click confirmation box time.sleep(1)#Be sure to pause, otherwise it has no load output of the first page # Print (driver.current_window_handle) # handle the current page HTML = driver.page_source # Print (driver.page_source) return HTML # Parse HTML documents, access to data DEF get_Data (HTML): selector = etree.HTML(html) infos=selector.xpath('//*[@id="mailul"]/div') for info in infos: kind=info.xpath('div[1]/a/font/text()')[0] time=info.xpath('div[2]/div[1]/div[1]/text()')[0] processingDepartment = info.xpath('div[2]/div[1]/div[2]/span/text()')[0] Content = info.xpath ( ' div [. 1] / A / span / text () ' ) [0] # string processing obtained parsekind kind.strip = (). 
Strip ( ' · [ ' ) .strip ( ' ] ' ) # Print (parsekind) parsetime = time.strip () Strip (. ' Started: ' ) .replace ( " - " , " / " ) # Print (parsetime) parsepd = processingDepartment.strip () Strip (. ' Authority: ' ) #print(parsepd) parsecontent = content.strip() #print(parsecontent) #写入csv writer.writerow((parsekind,parsetime,parsepd,parsecontent)) if __name__ == '__main__': url = 'http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow' for i in range(1,1000): html=get_info(i) get_data(html) time.sleep(1)