本次爬虫爬取的是天天基金网上华泰柏瑞沪深300ETF基金各年度、各季度的股票投资明细。

由于所需数据要经过 JavaScript 渲染，直接用 requests 获取的网页源代码中并不包含这些数据，因此使用 Selenium 的 WebDriver 模拟真实浏览器，来解决 JavaScript 渲染问题。

主程序，使用selenium打开网页, webdriver.Chrome()声明使用的浏览器

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import requests
import urllib3
import time


# Silence urllib3 warnings; main() sends its probe request with verify=False.
urllib3.disable_warnings()
# Create the Chrome browser instance that the functions in this script share.
browser = webdriver.Chrome()


def main(max_retries=3):
    """Crawl the stock-investment details of fund 510300 year by year.

    The holdings page is probed once with ``requests``; if the server answers
    200, the page is opened in the shared ``browser`` for every year in the
    list and ``open_full_page`` is asked to expand and scrape it. The browser
    window is closed when the function finishes, even if an error escapes.

    Args:
        max_retries: How many times a single year is attempted when the page
            times out. Only the failing year is retried, so years that were
            already scraped are not processed (and stored) a second time.
    """
    # Imported here because Selenium's explicit waits raise their own
    # TimeoutException, which is unrelated to the built-in TimeoutError.
    from selenium.common.exceptions import TimeoutException

    # Holdings page of the Huatai-PineBridge CSI 300 ETF (510300)
    page_url = 'http://fundf10.eastmoney.com/ccmx_510300.html'
    years = ('2018年', '2017年', '2016年', '2015年', '2014年', '2013年', '2012年')
    attempts_per_year = max(1, max_retries)

    try:
        # HTTP status returned by the server; it does not depend on the year,
        # so it is checked once instead of on every loop iteration.
        page_code = requests.get(page_url, verify=False).status_code
        if page_code != 200:
            return

        for year in years:
            for _ in range(attempts_per_year):
                browser.get(page_url)
                try:
                    open_full_page(year)
                except (TimeoutError, TimeoutException):
                    # Reload the page and try the same year again.
                    print('超时...刷新页面...')
                else:
                    break
    finally:
        browser.close()  # close the browser window


if __name__ == '__main__':
    # Time the whole crawl and report the elapsed wall-clock seconds.
    started_at = time.time()
    main()
    finished_at = time.time()
    elapsed = finished_at - started_at
    print('耗时：', elapsed)

如图所示，本页使用了 Ajax 技术：页面加载到浏览器之后，页面内的部分元素要等到选择不同年度、或点击“显示全部持仓明细”时才会被加载。

这使得定位元素变得困难：直接用 requests 请求时，会发现选定的元素并不在返回的页面源代码之中；而用 Selenium 定位尚未加载出来的元素时，则会抛出 ElementNotVisibleException 异常。

waits提供了一些操作之间的时间间隔- 主要是定位元素或针对该元素的任何其他操作。

官方文档：https://python-selenium-zh.readthedocs.io/zh_CN/latest/5.Waits/

显式的waits等待一个确定的条件触发然后才进行更深一步的执行

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select


# Shared explicit wait bound to `browser`, created with a timeout of 10 seconds.
wait = WebDriverWait(browser,10)


def open_full_page(year):
    """Select a year on the holdings page, expand each quarter and scrape it.

    Args:
        year: Visible text of the option to pick in the page's year
            drop-down, e.g. ``'2018年'``.

    After the year is selected, the "show all" link of each of the four
    quarter blocks is clicked when it becomes clickable; blocks whose link
    cannot be clicked are skipped. Finally ``get_data`` is called to read
    the expanded tables.
    """
    # Pick the year in the drop-down. find_element(By...) is used because the
    # find_element_by_* helpers are no longer available in current Selenium.
    year_select = browser.find_element(By.TAG_NAME, "select")
    Select(year_select).select_by_visible_text(year)

    # Click "show all" to expand the stock-investment details of each quarter.
    for i in (1, 2, 3, 4):
        selector = "#cctable > div:nth-child({}) > div > div.tfoot > font > a".format(i)
        try:
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
            submit.click()
        except Exception:
            # Best effort: a quarter whose link never becomes clickable is
            # skipped (presumably it is missing or already fully shown).
            # Unlike a bare ``except``, this lets Ctrl-C / SystemExit through.
            continue
        print(i)

    time.sleep(1)  # give the expanded tables one second to load
    get_data()  # read the per-quarter stock-investment details

Selenium 查找元素方法

在一个页面中有很多不同的策略可以定位一个元素。

find_element_by_id
find_element_by_name
find_element_by_xpath
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector

官方文档：https://selenium-python-zh.readthedocs.io/en/latest/locating-elements.html

def get_data():
    """Read the heading and column names of each quarter block and scrape it.

    For each of the four quarter blocks under ``#cctable`` the third
    space-separated token of the heading label is used as the table name and
    the space-separated header row as the column names; both are handed to
    ``stocks_details`` together with the block index. A block that is absent
    or cannot be parsed is skipped.
    """
    for i in (1, 2, 3, 4):
        try:
            # Table name for this year/quarter
            e_item_name = browser.find_elements(
                By.XPATH, '//*[@id="cctable"]/div[{}]/div/h4/label[1]'.format(i))
            table_name = e_item_name[0].text.split(' ')[2]

            # Column names of the table
            e_item_columns = browser.find_elements(
                By.XPATH, '//*[@id="cctable"]/div[{}]/div/table/thead/tr'.format(i))
            table_columns = e_item_columns[0].text.split(' ')

            # Individual stock-investment rows
            stocks_details(table_name, table_columns, i)

        except Exception:
            # Best effort: an IndexError here means the block does not exist
            # or its heading has an unexpected shape. Catching Exception
            # rather than everything keeps Ctrl-C / SystemExit working.
            continue


def stocks_details(table_name, table_columns, block):
    """Store every stock row of one quarter table in MySQL.

    Args:
        table_name: Heading text of the quarter table; characters 2..7 are
            stored as the year/quarter of each row.
        table_columns: Column names of the table (not used by this variant).
        block: 1-based index of the quarter block under ``#cctable``.

    Rows are read one by one (at most 399). The first row that is missing or
    cannot be parsed ends the loop and the number of rows handled so far is
    printed.
    """
    row_xpath = '//*[@id="cctable"]/div[{}]/div/table/tbody/tr[{}]'
    # Year/quarter label; named ``period`` so the ``time`` module is not shadowed.
    period = table_name[2:8]

    for i in range(1, 400):
        try:
            e_item_stock = browser.find_elements(By.XPATH, row_xpath.format(block, i))
            table_stock = e_item_stock[0].text.split(' ')

            stock_code = table_stock[1]  # stock code
            stock_name = table_stock[2]  # stock name
            net_worth_ratio = table_stock[4]  # share of net asset value
            shareholdings = table_stock[5]  # number of shares held
            position_hold_values = table_stock[6]  # market value of the position

            mysql(stock_code, stock_name, net_worth_ratio, shareholdings, position_hold_values, period)

        except Exception:
            # An IndexError means there is no row i (or it has too few cells):
            # the table is exhausted.
            print('{}：一共{}支股票'.format(table_name, i-1))
            break

MySQL创建stocks表

stock_code, stock_name, net_worth_ratio, shareholdings, position_hold_values, time 六个字段

将爬取的数据存到本地 MySQL中

import pymysql


class down_mysql:
    """One stock-holding record together with the MySQL connection storing it.

    Creating an instance opens a connection to the local ``hs300etf``
    database; ``save_mysql`` inserts the record into the ``stocks`` table and
    ``close`` releases the cursor and the connection.
    """

    def __init__(self, stock_code, stock_name, net_worth_ratio, shareholdings, position_hold_values, time):
        """Keep the field values and open the database connection.

        Args:
            stock_code: Stock code.
            stock_name: Stock name.
            net_worth_ratio: Share of net asset value.
            shareholdings: Number of shares held.
            position_hold_values: Market value of the position.
            time: Year/quarter label of the record.
        """
        self.stock_code = stock_code
        self.stock_name = stock_name
        self.net_worth_ratio = net_worth_ratio
        self.shareholdings = shareholdings
        self.position_hold_values = position_hold_values
        self.time = time
        # NOTE(review): credentials are hard-coded; consider reading them from
        # configuration or the environment instead.
        self.connect = pymysql.connect(
            host='localhost',
            db='hs300etf',
            port=3306,
            user='root',
            passwd='123456',
            charset='utf8',
            use_unicode=False
        )
        self.cursor = self.connect.cursor()

    # Save the record to MySQL
    def save_mysql(self):
        """Insert the record into ``stocks`` and commit.

        Failures are reported on stdout, including the reason, and the
        transaction is rolled back; no exception is propagated to the caller.
        """
        sql = "insert into stocks(stock_code, stock_name, net_worth_ratio, shareholdings, position_hold_values, time) " \
              "VALUES (%s,%s,%s,%s,%s,%s)"
        try:
            self.cursor.execute(sql, (self.stock_code, self.stock_name, self.net_worth_ratio,
                                      self.shareholdings, self.position_hold_values, self.time))
            self.connect.commit()
            print('数据插入成功')
        except Exception as exc:
            # Still best effort, but show why the insert failed and do not
            # leave a half-finished transaction on the connection.
            print('数据插入错误', exc)
            try:
                self.connect.rollback()
            except Exception:
                # The connection itself may be unusable; nothing left to undo.
                pass

    def close(self):
        """Close the cursor and the connection opened by ``__init__``."""
        self.cursor.close()
        self.connect.close()


# Build a record object from the field values, store it, then release its connection.
def mysql(stock_code, stock_name, net_worth_ratio, shareholdings, position_hold_values, time):
    """Insert one stock-holding row into MySQL.

    Args:
        stock_code: Stock code.
        stock_name: Stock name.
        net_worth_ratio: Share of net asset value.
        shareholdings: Number of shares held.
        position_hold_values: Market value of the position.
        time: Year/quarter label of the record.

    A ``down_mysql`` object opens its own connection, and this function is
    called once per row, so the cursor and connection are closed here to
    avoid leaking one connection per inserted row.
    """
    down = down_mysql(stock_code, stock_name, net_worth_ratio, shareholdings, position_hold_values, time)
    try:
        down.save_mysql()
    finally:
        down.cursor.close()
        down.connect.close()

保存数据到CSV文件

import pandas as pd

def stocks_details(table_name, table_columns, block):
    """Collect every stock row of one quarter table and save it as a CSV file.

    Args:
        table_name: Heading of the quarter table. A plain string is used
            as-is for the file name and the summary message; for any other
            sequence its third item is used.
        table_columns: Column names of the table; they become the CSV header.
        block: 1-based index of the quarter block under ``#cctable``.

    Rows are read one by one (at most 399). The first row that is missing or
    does not fit the columns ends the loop and the number of rows collected
    is printed. The CSV file is written once, after the loop, and only when
    at least one row was collected.
    """
    # DataFrame that accumulates the rows of this table
    htbs_hs300etf = pd.DataFrame(columns=table_columns)

    # Indexing a string with [2] would yield a single character, which made
    # every quarter write to the same one-letter file name.
    label = table_name if isinstance(table_name, str) else table_name[2]
    row_xpath = '//*[@id="cctable"]/div[{}]/div/table/tbody/tr[{}]'

    for i in range(1, 400):
        try:
            e_item_stock = browser.find_elements(By.XPATH, row_xpath.format(block, i))
            table_stock = e_item_stock[0].text.split(' ')
            htbs_hs300etf.loc[i] = table_stock

        except Exception:
            # An IndexError means there is no row i: the table is exhausted.
            print('{}：一共{}支股票'.format(label, i-1))
            break

    if not htbs_hs300etf.empty:
        # Raw string: the Windows path contains backslashes that are not
        # valid escape sequences. The file is written once, not once per row.
        htbs_hs300etf.to_csv(r'C:\F\py_B_data\HS300_ETF\HTBR_HS300ETF\Stock-investment-details\{}.csv'.
                             format(label), encoding='utf_8_sig')

Fargo的火

发布了16 篇原创文章 · 获赞 1 · 访问量 1575

私信关注

Python ：selenium 爬取Ajax技术网页，并存入MySQL数据库和本地CSV文件

Selenium 查找元素方法

猜你喜欢

Python ：selenium 爬取Ajax技术网页，并存入MySQL数据库 和 本地CSV文件

Selenium 查找元素方法

猜你喜欢

Python ：selenium 爬取Ajax技术网页，并存入MySQL数据库和本地CSV文件