Scraping a website (Yahoo Finance, AAPL) that loads its data with JavaScript (using `from selenium import webdriver`)

from selenium import webdriver

# Key-statistics page for the AAPL quote.
url = "https://finance.yahoo.com/quote/AAPL/key-statistics?p=AAPL"

# Launch Chrome through the chromedriver binary at this local path.
# NOTE(review): `executable_path` is the Selenium 3-era keyword; newer Selenium
# releases expect a Service object instead - confirm the installed version.
chromeDriverPath = 'D:/chromedriver/chromedriver'
browserDriver = webdriver.Chrome(executable_path=chromeDriverPath)
browserDriver.get(url)

# Dump the markup of the page as the browser currently holds it.
renderedSource = browserDriver.page_source
print(renderedSource)

#If I want to look for Trailing P/E 17.10

#3 matches

#The first snapshot looks like a JSON document (yes, it is JSON format), i.e. a dictionary

#The second snapshot looks like static HTML code (yes, it is HTML)

 

# Find a certain element: the document root <html>.
# find_element("xpath", ...) is used instead of find_element_by_xpath, which
# was removed in Selenium 4.3; "xpath" is the value of By.XPATH and the call
# is also accepted by Selenium 3.
element = browserDriver.find_element("xpath", "html")

print(element.text)

#pycharm

# Find the same element again, this time reading its textContent property.
element = browserDriver.find_element("xpath", "html")

print(element.get_attribute("textContent")) #without html elements

#3 matches

.

# Find child elements: "/*" selects all children under the element "html".
# find_elements("xpath", ...) replaces find_elements_by_xpath, which was
# removed in Selenium 4.3; "xpath" is the value of By.XPATH.
elements = browserDriver.find_elements("xpath", "html/*")
for childElement in elements:
    print(childElement.tag_name)

# Same listing, one level down: every child of <head>.
elements = browserDriver.find_elements("xpath", "html/head/*")
for childElement in elements:
    print(childElement.tag_name)

#web structure (tags)

#If I want to look for TrailingPE

# Print the 1-based position of every matched element whose text contains
# the "trailingPE" key.
elements = browserDriver.find_elements("xpath", "html")
for counter, element in enumerate(elements, start=1):
    if "trailingPE" in element.get_attribute("textContent"):
        print(counter)

# However, we know the data sits inside a <script> ... </script> element, so we search for the path to that script tag:

def findXPath(element, target, path):
    """Return the tag path of the first <script> element whose text holds target.

    Walks the element tree depth-first starting at ``element`` and prints the
    path of every child it visits as a progress trace.

    Args:
        element: Node to start from; must expose ``tag_name``,
            ``get_attribute(name)`` and ``find_elements(by, value)`` (a
            Selenium WebElement in this script).
        target: Substring searched for in each element's ``textContent``.
        path: Path already accumulated for ``element`` (e.g. ``"html"``).

    Returns:
        ``path`` extended with one ``"/<tag>"`` segment per level down to the
        matching script element, or ``""`` when nothing matches. Segments
        carry no positional index, so several siblings may share the path.
    """
    if target in element.get_attribute("textContent") and element.tag_name == "script":
        return path

    # No match here, so go one level deeper. "./*" selects the direct children
    # of the current element. find_elements("xpath", ...) is used because
    # find_elements_by_xpath was removed in Selenium 4.3; "xpath" is the value
    # of By.XPATH and works on Selenium 3 as well.
    for childElement in element.find_elements("xpath", "./*"):
        childPath = path + "/" + childElement.tag_name
        print(childPath)
        final = findXPath(childElement, target, childPath)
        if final != "":
            return final
    return ""
# Search from the document root for the path to the script holding the data.
# find_element("xpath", ...) / find_elements("xpath", ...) replace the
# find_element(s)_by_xpath helpers removed in Selenium 4.3; "xpath" is the
# value of By.XPATH.
element = browserDriver.find_element("xpath", "html")
print("The final path is: ", findXPath(element, "trailingPE","html"))

# The path has no index, so print the 1-based position of each <script> under
# <body> whose text contains the key.
elements = browserDriver.find_elements("xpath", "html/body/script")
for counter, element in enumerate(elements, start=1):
    if "trailingPE" in element.get_attribute("textContent"):
        print(counter)

#the first one

element = browserDriver.find_element("xpath", "html/body/script[1]")
print(element.get_attribute("textContent"))
#only one match
#"trailingPE":{"raw":17.102716,"fmt":"17.10"}  #json format

from selenium import webdriver

# Key-statistics page for the AAPL quote.
url = "https://finance.yahoo.com/quote/AAPL/key-statistics?p=AAPL"

# Launch Chrome through the chromedriver binary at this local path, then load
# the page.
# NOTE(review): `executable_path` is the Selenium 3-era keyword; newer Selenium
# releases expect a Service object instead - confirm the installed version.
chromeDriverPath = 'D:/chromedriver/chromedriver'
browserDriver = webdriver.Chrome(executable_path=chromeDriverPath)
browserDriver.get(url)

"trailingPE"

def findXPath(element, target, path):
    """Return the tag path of the first <script> element whose text holds target.

    Walks the element tree depth-first starting at ``element`` and prints the
    path of every child it visits as a progress trace.

    Args:
        element: Node to start from; must expose ``tag_name``,
            ``get_attribute(name)`` and ``find_elements(by, value)`` (a
            Selenium WebElement in this script).
        target: Substring searched for in each element's ``textContent``.
        path: Path already accumulated for ``element`` (e.g. ``"html"``).

    Returns:
        ``path`` extended with one ``"/<tag>"`` segment per level down to the
        matching script element, or ``""`` when nothing matches. Segments
        carry no positional index, so several siblings may share the path.
    """
    if target in element.get_attribute("textContent") and element.tag_name == "script":
        return path

    # "./*" selects the direct children of the current element.
    # find_elements("xpath", ...) is used because find_elements_by_xpath was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH and works on
    # Selenium 3 as well.
    for childElement in element.find_elements("xpath", "./*"):
        childPath = path + "/" + childElement.tag_name
        print(childPath)
        final = findXPath(childElement, target, childPath)
        if final != "":
            return final
    return ""
import json

import pandas as pd


def findJsonPath(jsonObject, target, path, matchType):
    """Return the comma-joined key path to the first mapping containing target.

    Defined here because this draft calls it below; without a definition the
    call raises NameError.

    Args:
        jsonObject: Value to inspect; anything that is not a ``matchType``
            instance is treated as a leaf.
        target: Key to look for.
        path: Path accumulated so far ("" at the top level).
        matchType: Container type to descend into (``dict`` for parsed JSON).

    Returns:
        ``path`` plus one ``",<key>"`` segment per level down to the mapping
        that has ``target`` as a key, or ``""`` when no mapping has it.
    """
    if not isinstance(jsonObject, matchType):
        return ""
    if target in jsonObject:
        return path
    for newKey, value in jsonObject.items():
        final = findJsonPath(value, target, path + "," + str(newKey), matchType)
        if final != "":
            return final
    return ""


# The browser is closed in `finally` so a parsing failure does not leave the
# Chrome session running.
try:
    # find_element("xpath", ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH.
    element = browserDriver.find_element("xpath", "html/body/script[1]")
    #the data is after "root.App.main = "
    tempData = element.get_attribute("textContent").split("root.App.main = ")[1]
    # raw_decode parses the one JSON document at the start of the string and
    # reports where it ends, so the JavaScript that follows it is ignored
    # without hand-counted strip()/slice trimming.
    jsonData, jsonEnd = json.JSONDecoder().raw_decode(tempData)
    tempData = tempData[:jsonEnd]
    print(jsonData.keys()) #dict_keys(['context', 'plugins'])

    matchType = type(jsonData)
    print("Final Path is: ", findJsonPath(jsonData, "trailingPE", "", matchType))
    #Final Path is:  ,context,dispatcher,stores,QuoteSummaryStore,summaryDetail

    #whole dictionary
    finalData = jsonData["context"]["dispatcher"]["stores"]["QuoteSummaryStore"]["summaryDetail"]
    print(finalData)

    df = pd.DataFrame(data=finalData)
    print(df)
finally:
    browserDriver.quit()

####################################all codes###############################################

#!/usr/bin/python
#encoding:utf-8

"""
@author: LlQ
@contact:[email protected]
@file:appleSta.py
@time: 7/14/2019 3:18 PM
"""

from selenium import webdriver
import json

# Key-statistics page for the AAPL quote.
url = "https://finance.yahoo.com/quote/AAPL/key-statistics?p=AAPL"

# Launch Chrome through the chromedriver binary at this local path, then load
# the page.
# NOTE(review): `executable_path` is the Selenium 3-era keyword; newer Selenium
# releases expect a Service object instead - confirm the installed version.
chromeDriverPath = 'D:/chromedriver/chromedriver'
browserDriver = webdriver.Chrome(executable_path=chromeDriverPath)
browserDriver.get(url)

"trailingPE"

def findXPath(element, target, path):
    """Return the tag path of the first <script> element whose text holds target.

    Walks the element tree depth-first starting at ``element`` and prints the
    path of every child it visits as a progress trace.

    Args:
        element: Node to start from; must expose ``tag_name``,
            ``get_attribute(name)`` and ``find_elements(by, value)`` (a
            Selenium WebElement in this script).
        target: Substring searched for in each element's ``textContent``.
        path: Path already accumulated for ``element`` (e.g. ``"html"``).

    Returns:
        ``path`` extended with one ``"/<tag>"`` segment per level down to the
        matching script element, or ``""`` when nothing matches. Segments
        carry no positional index, so several siblings may share the path.
    """
    if target in element.get_attribute("textContent") and element.tag_name == "script":
        return path

    # "./*" selects the direct children of the current element.
    # find_elements("xpath", ...) is used because find_elements_by_xpath was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH and works on
    # Selenium 3 as well.
    for childElement in element.find_elements("xpath", "./*"):
        childPath = path + "/" + childElement.tag_name
        print(childPath)
        final = findXPath(childElement, target, childPath)
        if final != "":
            return final
    return ""

def findJsonPath(jsonObject, target, path, matchType):
    """Return the comma-joined key path to the first mapping containing target.

    Searches depth-first, visiting keys in the mapping's iteration order.

    Args:
        jsonObject: Value to inspect; anything that is not a ``matchType``
            instance is treated as a leaf and yields "".
        target: Key to look for.
        path: Path accumulated so far ("" at the top level).
        matchType: Container type to descend into (``dict`` for parsed JSON).

    Returns:
        ``path`` plus one ``",<key>"`` segment per level down to the mapping
        that has ``target`` as a key (so results start with a comma when
        ``path`` is ""), or ``""`` when no mapping has it. A hit in the
        top-level mapping returns ``path`` unchanged, which for an empty
        ``path`` is indistinguishable from "not found".
    """
    # isinstance (rather than an exact type comparison) also descends into
    # subclasses of matchType, e.g. an OrderedDict when matchType is dict.
    if not isinstance(jsonObject, matchType):
        return ""
    if target in jsonObject:
        return path
    for newKey, value in jsonObject.items():
        # str() keeps path building from raising TypeError on non-string keys.
        final = findJsonPath(value, target, path + "," + str(newKey), matchType)
        if final != "":
            return final
    return ""


# print(browserDriver.page_source)

# find certain element
# element = browserDriver.find_element_by_xpath("html")
# print(element.text)

#find certain element
# element = browserDriver.find_element_by_xpath("html")
# print(element.get_attribute("textContent"))

# find childElements                               # "/*":all children under the "html:
# elements = browserDriver.find_elements_by_xpath("html/*")
# for childElement in elements:
#    print(childElement.tag_name)

# elements = browserDriver.find_elements_by_xpath("html/head/*")
# for childElement in elements:
#     print(childElement.tag_name)


#17.10: "trailingPE"

# elements = browserDriver.find_elements_by_xpath("html")
# counter = 1
# for element in elements:
#    if "trailingPE" in element.get_attribute("textContent"):
#        print(counter)
#    counter +=1

# element = browserDriver.find_element_by_xpath("html")
# print("The final path is: ", findXPath(element, "trailingPE","html"))
#The final path is:  html/body/script
    
# elements = browserDriver.find_elements_by_xpath("html/body/script")
# counter = 1
# for element in elements:
#     if "trailingPE" in element.get_attribute("textContent"):
#         print(counter)  #1
#     counter +=1
#the first one

# element = browserDriver.find_element_by_xpath("html/body/script[1]")
# print(element.get_attribute("textContent"))
#only one match
#"trailingPE":{"raw":17.102716,"fmt":"17.10"}  #json format


import pandas as pd

# The browser is closed in `finally` so a parsing failure does not leave the
# Chrome session running.
try:
    # find_element("xpath", ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH.
    element = browserDriver.find_element("xpath", "html/body/script[1]")
    #the data is after "root.App.main = "
    tempData = element.get_attribute("textContent").split("root.App.main = ")[1]
    # raw_decode parses the one JSON document at the start of the string and
    # reports where it ends, so the JavaScript that follows it is ignored
    # without hand-counted strip()/slice trimming.
    jsonData, jsonEnd = json.JSONDecoder().raw_decode(tempData)
    tempData = tempData[:jsonEnd]
    #print(jsonData.keys()) #dict_keys(['context', 'plugins'])

    matchType = type(jsonData)
    #print("Final Path is: ", findJsonPath(jsonData, "trailingPE", "", matchType))
    #Final Path is:  ,context,dispatcher,stores,QuoteSummaryStore,summaryDetail
    #print(jsonData["context"]["dispatcher"]["stores"]["QuoteSummaryStore"]["summaryDetail"])

    finalData = jsonData["context"]["dispatcher"]["stores"]["QuoteSummaryStore"]["summaryDetail"]

    df = pd.DataFrame(data=finalData)
    print(df)
finally:
    browserDriver.quit()

###########api key

https://openweathermap.org/current

{"coord":{"lon":145.77,"lat":-16.92},"weather":[{"id":802,"main":"Clouds","description":"scattered clouds","icon":"03n"}],"base":"stations","main":{"temp":300.15,"pressure":1007,"humidity":74,"temp_min":300.15,"temp_max":300.15},"visibility":10000,"wind":{"speed":3.6,"deg":160},"clouds":{"all":40},"dt":1485790200,"sys":{"type":1,"id":8166,"message":0.2064,"country":"AU","sunrise":1485720272,"sunset":1485766550},"id":2172797,"name":"Cairns","cod":200}
发布了53 篇原创文章 · 获赞 38 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/Linli522362242/article/details/95964826