目标网站:
这个网站如果不登录的话，就会一直跳验证码。先注册一个账号并登录，拿到 cookie，获取 .ASPXAUTH 后面的值，在 selenium 发起请求的时候带上。
登录之后 可以拿到一页列表的数据
进入详情页后,会发现一个公司会有多个年份的报告,写个循环,逐个获取
获取的数据保存到mongo中
内容部分保存的是html格式,附件保存的是链接地址
完整代码:
import time
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from pymongo import MongoClient
START_URL = "http://www.ipe.org.cn/IndustryRecord/Regulatory.html?keycode=4543j9f9ri334233r3rixxxyyo12"

# .ASPXAUTH auth cookie copied from a logged-in browser session; anonymous
# visitors are bounced to a captcha, so the value must be filled in by hand.
AUTH_COOKIE = {'name': '.ASPXAUTH', 'value': '写自己或取的'}


def extract_name(raw_name):
    """Return the company name from a table row's text.

    The row text is space-separated; the name is the token between the
    first and second space. Raises AttributeError when the text does not
    contain two spaces (re.search returns None).
    """
    return re.search('.*? (.*?) .*?', raw_name).group(1)


def parse_detail(page_source):
    """Parse one report page.

    Returns a (content_html, attachment_href, source) tuple:
    content_html  -- prettified HTML of the report body <div>
    attachment_href -- href of the first <a> inside it, or '' when absent
    source        -- text of the page's <h2>, with the 2-char label stripped
    """
    soup = BeautifulSoup(page_source, 'lxml')
    div_box = soup.find('div', class_='record-content record-information record-content_j')
    content_html = div_box.prettify()
    try:
        attachment = div_box.find('a').attrs['href']
    except AttributeError:
        # Report has no attachment link; store an empty string.
        attachment = ''
    source = soup.find('h2').text[2:]
    return content_html, attachment, source


def main():
    """Scrape every company's yearly compliance reports into MongoDB."""
    browser = webdriver.Chrome('./chromedriver')
    browser.get(START_URL)
    # Log in by injecting the auth cookie, then reload so it takes effect --
    # cookies added via add_cookie() only apply to subsequent requests.
    browser.add_cookie(AUTH_COOKIE)
    browser.refresh()
    time.sleep(1)

    # One Mongo connection for the whole run (not one per record).
    conn = MongoClient('localhost', 27017)
    collection = conn.demo.demo_set

    rows = browser.find_elements_by_xpath('//*[@id="table_con0"]/div[2]/table/tbody/tr')
    for row in rows:
        name = extract_name(row.text)
        print(name)
        row.click()  # opens the company's detail page in a new window
        browser.switch_to.window(browser.window_handles[-1])
        time.sleep(3)

        # One tab per report year; walk every report link under each tab.
        year_tabs = browser.find_elements_by_xpath('//*[@id="uitab_con"]/div[1]/div[1]/ul/li')
        for tab in year_tabs:
            tab.click()
            time.sleep(2)
            links = tab.find_elements_by_xpath('./div/a')
            for idx, link in enumerate(links):
                if idx:
                    # Navigating away collapsed the tab; re-open it before
                    # clicking the next report link.
                    tab.click()
                    time.sleep(1)
                link.click()
                time.sleep(2)

                content_html, attachment, source = parse_detail(browser.page_source)
                print(content_html)
                print(attachment)
                print('来源', source)
                collection.insert_one({
                    'name': name,
                    'html': content_html,
                    'attachment': attachment,
                    'source': source,
                })

        # Close the detail window (don't leak one handle per company),
        # then return to the listing window for the next row.
        browser.close()
        browser.switch_to.window(browser.window_handles[0])

    browser.quit()


if __name__ == '__main__':
    main()