Python Selenium Chrome Headless 爬取企查查数据

  1 # -*- coding:utf-8 -*-
  2 import os, pymysql,csv,configparser,pickle
  3 from selenium import webdriver
  4 from user_agent import generate_user_agent
  5 
  6 
# Module-wide state shared by the scraper functions below.
# NOTE(review): 'global' at module top level is a no-op in Python; these
# declarations only serve as documentation of which names the functions
# treat as module globals.
global csvpath
global companypath
global cookiedumped,csvinited
global debugmode
global browser_loaded
global export
global chromedriver

browser_loaded=0   # 1 while a Chrome WebDriver instance is alive
csvinited=0        # 1 once the output CSV header row has been written

# Read the configuration file
config=configparser.RawConfigParser()
config.read('config.cfg')
debugmode=int(config.get("config",'debugmode'))        # 0 => run Chrome headless
cookiedumped=int(config.get("config",'cookiedumped'))  # 1 => cookies.pkl already saved
csvpath=config.get("config",'csvpath')                 # output CSV path
export=int(config.get("config",'export'))              # 0 => write to local CSV (MySQL path unimplemented)
companypath=config.get("config",'companypath')         # input company-list file path
chromedriver=config.get("config","chromedriver")       # path to the chromedriver executable
 27 
 28 
 29 import time
 30 def dur( op=None, clock=[time.time()] ):
 31   if op != None:
 32     duration = time.time() - clock[0]
 33     print ('%s finished. Duration %.6f seconds.' % (op, duration))
 34   clock[0] = time.time()
 35 
 36 def durt( op=None, clock=[time.time()] ):
 37   if op != None:
 38     duration = time.time() - clock[0]
 39     print ('%s finished. Duration %.6f seconds.' % (op, duration))
 40   clock[0] = time.time()
 41 
def init_db():
    """Open a module-global MySQL connection via pymysql.

    The positional arguments are host, user, password, database; the
    Chinese strings below ("地址"=host, "用户名"=user, "密码"=password,
    "数据库"=database) are placeholders to be filled in before use.
    NOTE(review): not called anywhere in this file as shown — the
    export==1 (MySQL) path is unimplemented.
    """
    global CONNECTION
    CONNECTION = pymysql.connect("地址", "用户名", "密码", "数据库", use_unicode=True, charset="utf8")
 45 
 46 
def close_db():
    """Close the module-global MySQL connection opened by init_db()."""
    CONNECTION.close()
 49 
 50 
 51 def init_web_driver(opt1=0):
 52     global DRIVER, browser_loaded
 53     user_agent = generate_user_agent()
 54     co = webdriver.ChromeOptions()
 55     # Chrome driver default setting under Windows OS
 56     co.add_argument('--disable-gpu')
 57 
 58     if opt1 == 0:
 59         # Set the Chrome in headless mode
 60         co.add_argument('--headless')
 61         # Disable images loading
 62     co.add_argument('blink-settings=imagesEnabled=false')
 63 
 64     # Add User-Agent Profile
 65     co.add_argument('--user-agent={}'.format(user_agent))
 66 
 67     # Initialize Chrome
 68     DRIVER = webdriver.Chrome(
 69         chrome_options=co,
 70         executable_path=chromedriver,
 71         service_log_path=os.path.devnull
 72     )
 73     browser_loaded=1
 74     print('Chrome process loaed.')
 75 
 76 
def close_web_driver():
    """Quit the module-global Chrome WebDriver (closes all its windows)."""
    DRIVER.quit()
 79 
 80 
 81 def spider_create_cookie():
 82     init_web_driver(debugmode)
 83     DRIVER.get('https://www.qichacha.com/user_login')
 84     DRIVER.find_element_by_xpath('//*[@id="verifyLoginPanel"]/div[1]/a').click()
 85     time.sleep(10)
 86     print(DRIVER.current_url)
 87     cookie = [item["name"] + "=" + item["value"] for item in DRIVER.get_cookies()]
 88     print('Cookies Loaded' + '/n' + cookie)
 89     pickle.dump(DRIVER.get_cookies(), open("cookies.pkl", "wb"))
 90     close_web_driver()
 91     browser_loaded=0
 92     print('Cookies created.')
 93 
 94 
 95 def write_csv(inputstr, filename='result.csv',opt='a+'):
 96 
 97     if filename.strip()=='':
 98          filename='result.csv'
 99     #with open(filename, 'a+',newline='') as f:
100     with open(filename, opt, newline='') as f:
101         writer = csv.writer(f, dialect='excel')
102         writer.writerow(inputstr)
103     f.close()
104     print('CSV writed.')
105 
def init_csv():
    """Write the header row to the result CSV (truncating it) and mark
    the output file as initialised via the csvinited flag."""
    global csvinited
    header_columns = [
        '搜索项', '企业名称', '电话', '官网', '地址', '注册资本', '实缴资本',
        '经营状态', '成立日期', '统一社会信用代码', '纳税人识别号',
        '注册号', '组织机构代码', '公司类型', '所属行业', '核准日期',
        '登记机关', '所属地区', '英文名', '曾用名', '经营方式', '人员规模',
        '营业期限', '企业地址', '经营范围',
    ]
    write_csv(header_columns, csvpath, 'w+')
    csvinited = 1
    print('Output CSV ready.')
116 
117 
118 
119 #def write_sql():
120 
121 
122 
def get_companylist(filename='company.csv'):
    """Read one company name (or unified credit code) per line.

    filename: path to the list file.
    Returns a list of the lines with newline characters removed.
    """
    # BUG FIX: use a context manager so the file handle is closed (the
    # original leaked it), and drop the print() that sat unreachably
    # after the return statement.
    with open(filename, 'r') as f:
        return [line.replace('\n', '') for line in f]
133 
def table_reduction(searchitem,table, opt=1):
    """Extract the company fields from the detail page and export the row.

    searchitem: the original query string; written as the first CSV column.
    table:      Selenium WebElement for the company-info table
                ('//*[@id="Cominfo"]/table[2]', see visit_webpage).
    opt:        unused.

    The 25 appended values line up positionally with the header row
    written by init_csv. NOTE(review): every xpath and row/column index
    below is hard-coded against qichacha's page layout at the time of
    writing; any site redesign silently breaks the mapping.
    """
    table_rows = table.find_elements_by_tag_name('tr')

    query_result = []
    query_result.append(searchitem)
    # 企业名称 (company name):
    query_result.append(DRIVER.find_element_by_xpath('//*[@id="company-top"]/div/div[2]/div[1]/h1').text)
    # 电话 (phone):
    query_result.append(DRIVER.find_element_by_xpath('//*[@id="company-top"]/div[1]/div[2]/div[2]/span[1]/span[2]/span').text)
    # 官网 (website):
    query_result.append(DRIVER.find_element_by_xpath('//*[@id="company-top"]/div[1]/div[2]/div[2]/span[3]').text)
    # 地址 (address):
    query_result.append(DRIVER.find_element_by_xpath('//*[@id="company-top"]/div[1]/div[2]/div[3]/span[3]/a[1]').text)

    # 注册资本 (registered capital):
    query_result.append(table_rows[0].find_elements_by_tag_name('td')[1].text)

    # 实缴资本 (paid-in capital):
    query_result.append(table_rows[0].find_elements_by_tag_name('td')[3].text)

    # 经营状态 (operating status):
    query_result.append(table_rows[1].find_elements_by_tag_name('td')[1].text)

    # 成立日期 (date established):
    query_result.append(table_rows[1].find_elements_by_tag_name('td')[3].text)

    # 统一社会信用代码 (unified social credit code):
    query_result.append(table_rows[2].find_elements_by_tag_name('td')[1].text)

    # 纳税人识别号 (taxpayer ID):
    query_result.append(table_rows[2].find_elements_by_tag_name('td')[3].text)

    # 注册号 (registration number):
    query_result.append(table_rows[3].find_elements_by_tag_name('td')[1].text)

    # 组织机构代码 (organization code):
    query_result.append(table_rows[3].find_elements_by_tag_name('td')[3].text)

    # 公司类型 (company type):
    query_result.append(table_rows[4].find_elements_by_tag_name('td')[1].text)

    # 所属行业 (industry):
    query_result.append(table_rows[4].find_elements_by_tag_name('td')[3].text)

    # 核准日期 (approval date):
    query_result.append(table_rows[5].find_elements_by_tag_name('td')[1].text)

    # 登记机关 (registration authority):
    query_result.append(table_rows[5].find_elements_by_tag_name('td')[3].text)

    # 所属地区 (region):
    query_result.append(table_rows[6].find_elements_by_tag_name('td')[1].text)

    # 英文名 (English name):
    query_result.append(table_rows[6].find_elements_by_tag_name('td')[3].text)

    # 曾用名 (former names):
    query_result.append(table_rows[7].find_elements_by_tag_name('td')[1].text)

    # 经营方式 (business mode):
    query_result.append(table_rows[7].find_elements_by_tag_name('td')[3].text)

    # 人员规模 (staff size):
    query_result.append(table_rows[8].find_elements_by_tag_name('td')[1].text)

    # 营业期限 (business term):
    query_result.append(table_rows[8].find_elements_by_tag_name('td')[3].text)

    # 企业地址 (company address):
    query_result.append(table_rows[9].find_elements_by_tag_name('td')[1].text)

    # 经营范围 (business scope) — the original comment wrongly repeated
    # "注册资本"; by position this is the 25th column of the CSV header.
    query_result.append(table_rows[10].find_elements_by_tag_name('td')[1].text)

    #if export == 1:  # Write in MYSQL

    if export == 0:  # Write in local csv
        write_csv(query_result,csvpath)
215 
216 #使用前获取Cookie
def spider_create_cookie():
    """Interactively log in to qichacha (visible browser window) and
    persist the session cookies to cookies.pkl; sets cookiedumped = 1.

    NOTE(review): duplicate definition — this shadows the earlier
    spider_create_cookie defined above; consider removing one of them.
    """
    global cookiedumped, browser_loaded
    init_web_driver(1)  # opt1=1: non-headless so the user can log in
    DRIVER.get('https://www.qichacha.com/user_login')
    # Switch the login panel tab (presumably to QR-code login — verify xpath).
    DRIVER.find_element_by_xpath('//*[@id="verifyLoginPanel"]/div[1]/a').click()
    time.sleep(10)  # give the user time to complete the login
    print(DRIVER.current_url)
    # BUG FIX: the file handle passed to pickle.dump was never closed;
    # also dropped the 'cookie' list that was built but never used.
    with open("cookies.pkl", "wb") as fh:
        pickle.dump(DRIVER.get_cookies(), fh)
    print('Cookies loaded.')
    cookiedumped = 1
    DRIVER.close()
    browser_loaded = 0
def visit_webpage(company_name):
    """Search qichacha for *company_name*, open the first result's detail
    page, and hand its info table to table_reduction() for export.

    Drives a small state machine over the module globals cookiedumped
    and browser_loaded: first run creates login cookies; a warm browser
    searches via the header box; a cold start relaunches Chrome and
    restores the pickled cookies.
    """
    # No cookies saved yet: perform the interactive login first.
    if cookiedumped==0:
        spider_create_cookie()
    # Browser already running: reuse the session via the header search box.
    if browser_loaded==1:
        DRIVER.find_element_by_id("headerKey").send_keys(company_name)
        DRIVER.find_element_by_xpath('/html/body/header/div/form/div/div/span/button').click()

    # Cold start: launch Chrome, restore cookies, search from the home page.
    if cookiedumped==1 and browser_loaded==0:
        init_web_driver(debugmode)
        DRIVER.get('https://www.qichacha.com/')
        # NOTE(review): this file handle is never closed.
        cookies = pickle.load(open("cookies.pkl", "rb"))
        for cookie in cookies:
            DRIVER.add_cookie(cookie)
        DRIVER.find_element_by_id("searchkey").send_keys(company_name)
        DRIVER.find_element_by_id("V3_Search_bt").click()

    # Follow the first search result ("ma_h1" link) to the detail page.
    DRIVER.get(DRIVER.find_element_by_class_name("ma_h1").get_attribute("href"))
    table = DRIVER.find_element_by_xpath('//*[@id="Cominfo"]/table[2]')
    # Write the CSV header once, lazily, before the first data row.
    if csvinited==0:
        init_csv()
    table_reduction(company_name,table)
def main():
    """Scrape every entry from the configured company list.

    Loads the list from companypath, visits each company's page, and
    appends any failure to a timestamped log file under ./log/
    (the directory must already exist). Leaves the list in the module
    global 'companys' for the summary printed by the __main__ block.
    """
    global companys
    # BUG FIX: removed the unused 'import array'.
    logname = './log/' + time.strftime('%Y-%m-%d_%H-%M', time.localtime()) + '_ERROR.log'
    companys = get_companylist(companypath)
    amount = len(companys)
    # BUG FIX: manage the log file with a context manager so it is closed
    # even if the loop raises.
    with open(logname, 'a+') as fp:
        for i, item in enumerate(companys, start=1):
            try:
                dur()
                visit_webpage(item)
                dur(str(i) + ' of ' + str(amount) + ' ' + item)
            except Exception:
                # BUG FIX: narrowed the bare 'except' (which also swallowed
                # KeyboardInterrupt/SystemExit); the original also logged the
                # literal string ' items ' instead of the failing company
                # name, and wrote no newline so log entries ran together.
                print(item + ' FAILED TO CATCH')
                fp.write(time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime())
                         + ' ' + item + ' FAILED TO LOAD\n')
279 
280 
281 
282 #
283 #
284 #
285 #
286 #
287 
288 
289 
if __name__ == '__main__':
    durt()  # start the overall run timer
    main()
    # Shut the browser down. NOTE(review): quit() alone closes all
    # windows; the preceding close() is redundant but harmless, so the
    # original call order is kept.
    DRIVER.close()
    DRIVER.quit()
    # BUG FIX: typos in the summary strings ('finieshed' -> 'finished',
    # 'TOTALY' -> 'TOTAL').
    print(str(len(companys)) + ' items finished! ')
    durt('TOTAL')

本地配置文件

[config]
debugmode=0
cookiedumped=0
csvpath=Result.csv
companypath=CompanyList.txt
chromedriver=.\chromedriver.exe
export=0
[sqlcon]
ip_port=
username=
pwd=
dbname=

本地企业列表

CompanyList.txt,每行放置一个企业名称或统一信用代码

猜你喜欢

转载自www.cnblogs.com/bionexit/p/9120147.html