Crawling an e-commerce website with Selenium

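The script below drives two headless Chrome browsers with Selenium. The first browser walks the product listing (first-level) pages and opens each product's detail (second-level) page, where it reads the product name, the number of transaction-history pages, and the partial history URL stored in the init-src attribute. The second browser then requests each assembled transaction-history (third-level) page; the records are written to one CSV file per product, and products that fail to crawl are logged to error.csv.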

from selenium import webdriver
import random
import time
import csv
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC  # expected conditions for explicit waits
from selenium.webdriver.support.wait import WebDriverWait  # explicit waits

class TestSpider(object):
    def __init__(self):
        self.url = 'target site URL goes here'
        self.error = []
    # open the first-level (listing) page
    def get_page(self):
        options = webdriver.ChromeOptions()
        # run Chrome headless (no GUI)
        options.add_argument('--headless')
        self.browser = webdriver.Chrome(options=options)
        self.browser.get(self.url)
        print('Browser 1 started')
        self.browser.maximize_window()
        self.browser.implicitly_wait(8)

        self.browser2 = webdriver.Chrome(options=options)
        self.browser2.implicitly_wait(3)
        print('Browser 2 started')
        WebDriverWait(self.browser,5).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="pageTWrap"]/div[4]/ul/li[2]/a'))).click()


    # iterate over the product nodes on each first-level (listing) page
    def xunhuan_one_page(self):
        i = 1
        # If the crawl fails and is restarted, the page number and position of the last
        # product could be recorded here to resume from it; that code is still to be added


        while True:

            product_list = self.browser.find_elements_by_xpath('//div[@id="proList"]//div[@index]')  # product nodes
            if i <= len(product_list):

                try:
                    num = product_list[i-1].find_element_by_xpath('.//div[@class="pro-sold"]//span[@class="num"]')
                    page_one = self.browser.find_element_by_xpath('//div[@class="filter-page"]/span').text
                    page_one = re.split(r'\s|/', page_one)[1]

                    product_list[i-1].find_element_by_xpath('.//div[@class="photo"]//img').click()

                    print('Entered product node {}'.format(i))
                    i = i + 1
                    time.sleep(random.randint(7,8))
                    try:
                        self.two_page()

                    except:
                        print('\033[1;35;0m ****************************** \033[0m')
                        a = ['Page {}, product {}: crawl failed'.format(page_one, i-1)]
                        self.error.append(a)
                        print(a[0])
                        print('\033[1;35;0m ****************************** \033[0m')
                        self.save_error(a)
                    time.sleep(random.randint(2,3))

                    while True:
                        try:
                            MainContent = self.browser.find_element_by_xpath('//div[@class="mainout"]').get_attribute(
                                'id')
                            break
                        except:
                            self.browser.back()
                            time.sleep(random.randint(2,3))
                except:
                    i = i+1
            else:
                if self.browser.page_source.find('unnext') == -1:  # no 'unnext' marker, so there is a next page
                    self.browser.find_element_by_xpath('//div[@class="filter-page"]//a[@class="next"]').click()
                    i = 1  # moved to the next listing page, reset the product counter
                    one_page = self.browser.find_element_by_xpath('//div[@class="page"]/span/b').text
                    print('First-level page {}'.format(one_page))
                    time.sleep(random.randint(1,2))
                else:
                    break
        print(self.error)

        self.browser.quit()

    # operations on the second-level (product detail) page
    def two_page(self):
        # try to scrape a few fields from the second-level page; retry, stopping once
        # the data is obtained or after 10 attempts
        m = 1
        while True:
            try :
                if m <= 10 :

                    product = self.browser.find_element_by_xpath('//*[@id="colLeft"]/div[1]/div[1]/div/div[1]/h2').text  # product name
                    page = self.browser.find_element_by_xpath('//*[@id="transactionHistory"]/div/div[5]/div/span/strong[2]').text  # total number of history pages
                    three_url = self.browser.find_element_by_xpath('//div[@id="transactionHistoryWarp"]').get_attribute('init-src')  # partial URL of the transaction-history data
                    time.sleep(random.randint(0,1))
                    m = m+1
                    if page is not None and three_url is not None and product is not None:
                        print(product)
                        print(three_url)
                        break
                    else:
                        continue
                else:
                    break

            except:
                self.browser.find_element_by_xpath('//li[@id="ctab-transactionHistory"]/span').click()  # click the transaction-history tab
                time.sleep(random.randint(0, 1))
      
        # the URL obtained above is incomplete; the full request URL has to be assembled by hand
        three_url = re.split(r'\?', three_url)
        r_three_url = 'target site base URL goes here' + three_url[0] + '?act=pageload&appid=dh&page={}&' + three_url[-1] + '&lang=en'

        info_list = [['Buyer', 'Product name', 'Quantity', 'Purchase date', 'Buyer address']]
        self.three_parse_page(product, info_list, page, r_three_url)




    # works with parse_page and save_page to crawl all historical transaction records of one product node
    def three_parse_page(self,product,info_list,page,r_three_url):
        j = 1
        while True:
            if j <= int(page) :

                self.browser2.get(r_three_url.format(j))
                self.parse_page(info_list)

                print('Third-level page {} crawled successfully'.format(j))
                j = j+1
                time.sleep(random.randint(1,2))
            else:
                print('Last third-level page crawled successfully')
                try:
                    self.save_page(product,info_list)
                    print('Product node info saved successfully')
                except:
                    self.save_page2(product,info_list)
                    print('Product node info saved successfully')
                time.sleep(random.randint(1,2))
                break

    # crawl function for the third-level (transaction-history) pages
    def parse_page(self,info_list):

        tran_list = self.browser2.find_elements_by_xpath(
            '//div[@class="transaction-list"]//ul'
        )

        for tran in tran_list:
            info_two = tran.text.split('\n')
            country = tran.find_element_by_xpath('.//li[@class="col1"]//img').get_attribute('src')
            country_list = re.split(r'\.|/', country)
            cou = country_list[-2]  # image file name without extension, used as the buyer's country/region
            info_two.append(cou)  # buyer, product name, quantity, date, buyer address
            info_list.append(info_two)

        time.sleep(random.randint(1,2))

    # save the third-level page data
    def save_page(self,product,info_list):
        with open('{}.csv'.format(product), 'a', newline='') as f:
            writer = csv.writer(f)
            for rt in info_list:
                writer.writerow([rt[0].strip(), rt[1].strip(), rt[2].strip(), rt[3].strip(), rt[4]])
   
    # some product names contain '/', which is invalid in a file name, so this variant keeps only the part before the '/'
    def save_page2(self,product,info_list):
        product = re.split(r'/', product)[0]
        with open('{}.csv'.format(product), 'a', newline='') as f:
            writer = csv.writer(f)
            for rt in info_list:
                writer.writerow([rt[0].strip(), rt[1].strip(), rt[2].strip(), rt[3].strip(), rt[4]])
    
    # for every product that failed to crawl, save the error info (page number, product position)
    def save_error(self,a):
        with open('error.csv', 'a', newline='') as f:  # append so earlier errors are not overwritten
            writer = csv.writer(f)
            writer.writerow([a[0].strip()])


    def main(self):
        self.get_page()
        self.xunhuan_one_page()

if __name__ == '__main__':
    spider = TestSpider()
    spider.main()
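Note: the script uses the Selenium 3 style find_element_by_xpath / find_elements_by_xpath helpers, which were removed in Selenium 4; current releases use the By locator that is already imported at the top. A minimal sketch of the same headless setup with the Selenium 4 API, assuming chromedriver is available on PATH and using a placeholder URL:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)
browser.get('https://example.com')  # placeholder for the target site URL

# Selenium 3: browser.find_elements_by_xpath('//div[@id="proList"]//div[@index]')
# Selenium 4 equivalent:
products = browser.find_elements(By.XPATH, '//div[@id="proList"]//div[@index]')
print(len(products))
browser.quit()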

