# Python web scraping with Selenium

import time
from selenium import webdriver
import os
import csv
import datetime

# Start a Chrome session through the locally installed chromedriver binary.
# The path is a raw string so its backslashes are taken literally.
driver = webdriver.Chrome(executable_path=r'C:\Python35\chromedriver.exe')

# Get the dates.

# Reporting window: the two calendar days before today.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
befyesterday = today - datetime.timedelta(days=2)

# Format both dates as compact YYYYMMDD strings, the form that is spliced
# into the report URL's _u.date00 / _u.date01 parameters.
yesterdayf = yesterday.strftime("%Y%m%d")
befyesterdayf = befyesterday.strftime("%Y%m%d")

# Build the report URL.

# Report URL for the content-pages report: the date window goes into
# _u.date00 / _u.date01, the secondary segment is custom dimension 1, and
# the table is asked for 5000 rows starting at row 0.
url = (
    'https://analytics.google.com/analytics/web/#/report/content-pages/'
    'a998944w1747039p1759011/_u.date00=' + befyesterdayf
    + '&_u.date01=' + yesterdayf
    + '&explorer-table.secSegmentId=analytics.customDimension1'
    '&explorer-table.rowStart=0&explorer-table.rowCount=5000/'
)

# Credentials are read from the environment so that no secret is stored in
# the source file. A missing variable raises KeyError before the browser is
# asked to load anything.
ga_email = os.environ['GA_EMAIL']
ga_password = os.environ['GA_PASSWORD']

# The fixed sleeps below give each page time to load before the next lookup.
driver.get(url)
time.sleep(1)

# Sign-in step 1: type the account identifier and continue.
driver.find_element_by_id('identifierId').send_keys(ga_email)
time.sleep(1)
driver.find_element_by_xpath('//*[@id="identifierNext"]/content/span').click()
time.sleep(1)

# Sign-in step 2: type the password and continue.
driver.find_element_by_xpath('//*[@id="password"]/div[1]/div/div[1]/input').send_keys(ga_password)
time.sleep(1)
driver.find_element_by_xpath('//*[@id="passwordNext"]/content/span').click()
time.sleep(15)

# The report elements are looked up inside the page's first frame.
# switch_to.frame replaces the deprecated switch_to_frame spelling.
driver.switch_to.frame(0)
time.sleep(1)

# Open the toolbar's export menu, then choose its CSV entry (4th item).
driver.find_element_by_xpath('//*[@id="ID-reportHeader-reportToolbar"]/div[1]/div[2]/span[2]').click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="ID-reportHeader-reportToolbar-exportControl"]/div/ul/li[4]/span[2]').click()
# Leave time for the download to finish.
time.sleep(10)

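# Click through to the next page in a loop: first read the total row count, then divide it by 5000, rounding up - that is the number of pages to export.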

try:
    # The pager label's text contains "of <total>"; the number after "of "
    # is parsed as the total row count of the report.
    ele_str = driver.find_element_by_xpath("//*[@id=\"ID-explorer-table\"]/div[3]/div[1]/div/span[1]/label").text
    page_num = int(ele_str.split('of ')[1])

    # Number of 5000-row pages, rounded up. Plain "// 5000 + 1" would yield
    # one page too many whenever the total is an exact multiple of 5000.
    page = (page_num + 4999) // 5000

    # The first page is exported before this point, so only pages 2..page
    # remain: advance the table, then repeat the export clicks.
    for _ in range(1, page):
        # Click "next page" and wait for the table to reload.
        driver.find_element_by_xpath('//*[@id="ID-explorer-table"]/div[3]/div[1]/div/span[3]/ul/li[2]/div').click()
        time.sleep(20)
        # Open the export menu.
        driver.find_element_by_xpath('//*[@id="ID-reportHeader-reportToolbar"]/div[1]/div[2]/span[2]').click()
        time.sleep(1)
        # Choose the CSV export entry and leave time for the download.
        driver.find_element_by_xpath('//*[@id="ID-reportHeader-reportToolbar-exportControl"]/div/ul/li[4]/span[2]').click()
        time.sleep(10)
finally:
    # Always close the browser, even when an element lookup fails.
    driver.quit()

# Parse the downloaded CSV files.

# Directories that are scanned for downloaded CSV exports.
paths = [r'C:\Users\zhudong\Downloads']
# Collects the report rows from every export file.
data = []

for path in paths:
    for filename in os.listdir(path):
        # Only files whose last dot-separated part is "csv" are read.
        if filename.split('.')[-1] != 'csv':
            continue
        # os.path.join supplies the path separator. newline='' lets the csv
        # module handle line endings itself, and errors='ignore' drops
        # undecodable bytes instead of aborting. The with-block closes each
        # file as soon as it has been read.
        with open(os.path.join(path, filename), 'r', errors='ignore', newline='') as csvFile:
            for item in csv.reader(csvFile):
                # Keep only non-empty rows whose first cell starts with "/";
                # every other line in the export is skipped.
                if item and item[0].startswith('/'):
                    data.append(item)

# Store the parsed rows in result.csv.

# Header row for the combined output file.
fieldnames = ("page", "cid", "pageviews", "unique_pageviews", "avg_time_on_page",
              "entrances", "bounce_rate", "exit", "page_value")

# The path is a raw string: in an ordinary literal "\U" is an invalid escape
# and "\r" would turn into a carriage return. newline='' stops the csv
# writer's own line terminators from being translated a second time, which
# is what produces blank rows on Windows. The with-block closes the file
# even if a write fails.
with open(r'C:\Users\zhudong\Desktop\result.csv', 'w', newline='') as csvFile2:
    writer = csv.writer(csvFile2)
    writer.writerow(fieldnames)
    writer.writerows(data)

# Delete the CSV files in the downloads folder.

# Directory whose processed exports are cleaned up. Raw string, because
# "\U" in an ordinary literal is an invalid escape.
delDir = r"C:\Users\zhudong\Downloads"
delList = os.listdir(delDir)

for f in delList:
    filePath = os.path.join(delDir, f)
    # Remove only regular files whose last dot-separated part is "csv".
    # Deleting every file in the directory would also destroy unrelated
    # downloads.
    if os.path.isfile(filePath) and f.split('.')[-1] == 'csv':
        os.remove(filePath)
        print(filePath + " was removed!")

# (Blog page footer, not part of the script: "You may also like")

# Reposted from blog.csdn.net/qq_22994783/article/details/81702101