# selenium crawler for an e-commerce ("electricity supplier") website
from selenium import webdriver
import random
import time
import csv
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC  # wait conditions
from selenium.webdriver.support.wait import WebDriverWait  # explicit waits
class TestSpider(object):
    """Selenium crawler for an e-commerce site.

    Walks the first-level product listing page by page, opens each product
    (second level), then scrapes every page of that product's transaction
    history (third level) into a per-product CSV file.  Failures are logged
    to ``error.csv`` so a run can be resumed manually.
    """

    def __init__(self):
        # Entry URL of the target listing page (placeholder text in source).
        self.url = '这里是目标网站链接'
        # Human-readable records of products that failed to scrape.
        self.error = []

    # Enter the first-level (product listing) page.
    def get_page(self):
        options = webdriver.ChromeOptions()
        # Run Chrome without a visible window.
        options.add_argument('--headless')
        self.browser = webdriver.Chrome(options=options)
        self.browser.get(self.url)
        print('已启动浏览器1')
        self.browser.maximize_window()
        self.browser.implicitly_wait(8)
        # A second browser is dedicated to third-level history pages so the
        # listing browser never loses its place.
        self.browser2 = webdriver.Chrome(options=options)
        self.browser2.implicitly_wait(3)
        print('已启动浏览器2')
        WebDriverWait(self.browser, 5).until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="pageTWrap"]/div[4]/ul/li[2]/a'))).click()

    # Iterate over every product node on each first-level page.
    def xunhuan_one_page(self):
        i = 1
        # NOTE(review): to resume after a crash, the page/index stored in
        # error.csv could be used as a starting point -- not implemented yet.
        while True:
            # Product nodes on the current listing page (renamed from `list`,
            # which shadowed the builtin).
            products = self.browser.find_elements(
                By.XPATH, '//div[@id="proList"]//div[@index]')
            if i <= len(products):
                try:
                    # Require a sold-count element; products without one raise
                    # here and are skipped by the outer except below.
                    products[i - 1].find_element(
                        By.XPATH,
                        './/div[@class="pro-sold"]//span[@class="num"]')
                    page_one = self.browser.find_element(
                        By.XPATH, '//div[@class="filter-page"]/span').text
                    # Split the "x/y"-style pager text; keep the second token.
                    page_one = re.split(r'\s|/', page_one)[1]
                    products[i - 1].find_element(
                        By.XPATH, './/div[@class="photo"]//img').click()
                    print('已进入第{}个产品节点'.format(i))
                    i = i + 1
                    time.sleep(random.randint(7, 8))
                    try:
                        self.two_page()
                    except Exception:
                        print('\033[1;35;0m ****************************** \033[0m')
                        a = ['第{}页第{}个产品爬取失败'.format(page_one, i - 1)]
                        self.error.append(a)
                        print(a[0])
                        print('\033[1;35;0m ****************************** \033[0m')
                        self.save_error(a)
                    time.sleep(random.randint(2, 3))
                    # Navigate back until the listing container is visible again.
                    while True:
                        try:
                            self.browser.find_element(
                                By.XPATH, '//div[@class="mainout"]')
                            break
                        except Exception:
                            self.browser.back()
                            time.sleep(random.randint(2, 3))
                except Exception:
                    # Node unusable (e.g. no sold count): skip this product.
                    i = i + 1
            else:
                # 'unnext' marks a disabled next button, i.e. the last page.
                if self.browser.page_source.find('unnext') == -1:
                    self.browser.find_element(
                        By.XPATH,
                        '//div[@class="filter-page"]//a[@class="next"]').click()
                    i = 1  # new page: restart at the first product
                    one_page = self.browser.find_element(
                        By.XPATH, '//div[@class="page"]/span/b').text
                    print('一级页面第{}页'.format(one_page))
                    time.sleep(random.randint(1, 2))
                else:
                    break
        print(self.error)
        self.browser.quit()

    # Actions on the second-level (product detail) page.
    def two_page(self):
        # Try up to 10 times to read the fields needed for level three.
        m = 1
        while True:
            try:
                if m <= 10:
                    product = self.browser.find_element(
                        By.XPATH,
                        '//*[@id="colLeft"]/div[1]/div[1]/div/div[1]/h2').text
                    page = self.browser.find_element(
                        By.XPATH,
                        '//*[@id="transactionHistory"]/div/div[5]/div/span/strong[2]').text
                    three_url = self.browser.find_element(
                        By.XPATH,
                        '//div[@id="transactionHistoryWarp"]').get_attribute('init-src')
                    time.sleep(random.randint(0, 1))
                    m = m + 1
                    if page is not None and three_url is not None and product is not None:
                        print(product)
                        print(three_url)
                        break
                    else:
                        continue
                else:
                    break
            except Exception:
                # History tab not loaded yet: click it and retry.
                self.browser.find_element(
                    By.XPATH, '//li[@id="ctab-transactionHistory"]/span').click()
                time.sleep(random.randint(0, 1))
        # The extracted link is partial; rebuild the full history URL.
        three_url = re.split(r'\?', three_url)
        r_three_url = ('这里是目标网站初始链接' + three_url[0]
                       + '?act=pageload&appid=dh&page={}&' + three_url[-1]
                       + '&lang=en')
        info_list = [['买家', '产品名称', '购买数量', '购买日期', '买方地址']]
        self.three_parse_page(product, info_list, page, r_three_url)

    # Together with parse_page/save_page, collects every transaction-history
    # record for one product node.
    def three_parse_page(self, product, info_list, page, r_three_url):
        j = 1
        while True:
            if j <= int(page):
                self.browser2.get(r_three_url.format(j))
                self.parse_page(info_list)
                print('三级页面第{}页爬取成功'.format(j))
                j = j + 1
                time.sleep(random.randint(1, 2))
            else:
                print('三级页面最后一页爬取成功')
                try:
                    self.save_page(product, info_list)
                    print('产品节点信息存储成功')
                except OSError:
                    # Product names containing '/' are not valid filenames.
                    self.save_page2(product, info_list)
                    print('产品节点信息存储成功')
                time.sleep(random.randint(1, 2))
                break

    # Scrape one third-level (transaction history) page into info_list.
    def parse_page(self, info_list):
        tran_list = self.browser2.find_elements(
            By.XPATH, '//div[@class="transaction-list"]//ul')
        for tran in tran_list:
            # Row text is newline-separated: buyer, product, quantity, date.
            info_two = tran.text.split('\n')
            # Country code is embedded in the flag image URL.
            country = tran.find_element(
                By.XPATH, './/li[@class="col1"]//img').get_attribute('src')
            cou = re.split(r'\.|/', country)[-2]
            info_two.append(cou)  # buyer, product, quantity, date, country
            info_list.append(info_two)
        time.sleep(random.randint(1, 2))

    # Append one product's records to '<product>.csv'.
    def save_page(self, product, info_list):
        with open('{}.csv'.format(product), 'a', newline='') as f:
            writer = csv.writer(f)  # hoisted: one writer for all rows
            for rt in info_list:
                writer.writerow([rt[0].strip(), rt[1].strip(), rt[2].strip(),
                                 rt[3].strip(), rt[4]])

    # Like save_page, but product names containing '/' break the filename,
    # so keep only the part before the first '/'.
    def save_page2(self, product, info_list):
        product = re.split(r'/', product)[0]
        with open('{}.csv'.format(product), 'a', newline='') as f:
            writer = csv.writer(f)
            for rt in info_list:
                writer.writerow([rt[0].strip(), rt[1].strip(), rt[2].strip(),
                                 rt[3].strip(), rt[4]])

    # Record one scrape failure (page number / product index) to error.csv.
    def save_error(self, a):
        # Append mode: the original 'w' overwrote the file on every failure,
        # keeping only the last error instead of all of them.
        with open('error.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([a[0].strip()])

    def main(self):
        """Run the crawl: open the listing, then iterate every product."""
        self.get_page()
        self.xunhuan_one_page()
# Script entry point: build the crawler and start the full run.
if __name__ == '__main__':
    spider = TestSpider()
    spider.main()