Crawling an e-commerce website with Selenium

from selenium import webdriver
import random
import time
import csv
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC  # expected conditions for explicit waits
from selenium.webdriver.support.wait import WebDriverWait  # explicit wait helper

class TestSpider(object):
    def __init__(self):
        self.url = 'target site URL goes here'
        self.error = []
    # Open the first-level page (the product listing)
    def get_page(self):
        options = webdriver.ChromeOptions()
        # run Chrome headless (no visible window)
        options.add_argument('--headless')
        self.browser = webdriver.Chrome(options=options)
        self.browser.get(self.url)
        print('Browser 1 started')
        self.browser.maximize_window()
        self.browser.implicitly_wait(8)

        self.browser2 = webdriver.Chrome(options=options)
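        # browser2 is only used later (three_parse_page / parse_page) to load the third-level
        # transaction-history pages, so the main browser keeps its place on the product listing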
        self.browser2.implicitly_wait(3)
        print('Browser 2 started')
        WebDriverWait(self.browser,5).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="pageTWrap"]/div[4]/ul/li[2]/a'))).click()


    # Iterate over the product nodes on the first-level page
    def xunhuan_one_page(self):
        i = 1
        # If the crawl fails and has to be restarted, the page number and position of the last
        # product could be recorded here to resume from it; that code is still to be added.


        while True:

            list = self.browser.find_elements_by_xpath('//div[@id="proList"]//div[@index]')  # product nodes
            if i <= len(list):

                try:
                    num = list[i-1].find_element_by_xpath('.//div[@class="pro-sold"]//span[@class="num"]')
                    page_one = self.browser.find_element_by_xpath('//div[@class="filter-page"]/span').text
                    page_one = re.split(r'\s|/', page_one)[1]

                    list[i-1].find_element_by_xpath('.//div[@class="photo"]//img').click()

                    print('Entered product node {}'.format(i))
                    i = i + 1
                    time.sleep(random.randint(7,8))
                    try:
                        self.two_page()

                    except:
                        print('\033[1;35;0m ****************************** \033[0m')
                        a = ['Page {}, product {}: scrape failed'.format(page_one, i-1)]
                        self.error.append(a)
                        print(a[0])
                        print('\033[1;35;0m ****************************** \033[0m')
                        self.save_error(a)
                        # self.error.append(a)
                    time.sleep(random.randint(2,3))

                    while True:
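                        # keep navigating back until the listing container (div.mainout) is present again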
                        try:
                            MainContent = self.browser.find_element_by_xpath('//div[@class="mainout"]').get_attribute(
                                'id')
                            break
                        except:
                            self.browser.back()
                            time.sleep(random.randint(2,3))
                except:
                    i = i+1
            else:
                if self.browser.page_source.find('unnext') == -1:
                    self.browser.find_element_by_xpath('//div[@class="filter-page"]//a[@class="next"]').click()
                    i = 1  # moved to the next listing page, reset the index to 1
                    one_page = self.browser.find_element_by_xpath('//div[@class="page"]/span/b').text
                    print('First-level page {}'.format(one_page))
                    time.sleep(random.randint(1,2))
                else:
                    break
        print(self.error)

        self.browser.quit()
        # print(self.error)

    # Operations on the second-level (product detail) page
    def two_page(self):
        # Try to fetch certain fields from the second-level page; stop as soon as the data is obtained, giving up after 10 attempts
        m = 1
        while True:
            try:
                if m <= 10:

                    product = self.browser.find_element_by_xpath('//*[@id="colLeft"]/div[1]/div[1]/div/div[1]/h2').text
                    page = self.browser.find_element_by_xpath('//*[@id="transactionHistory"]/div/div[5]/div/span/strong[2]').text
                    three_url = self.browser.find_element_by_xpath('//div[@id="transactionHistoryWarp"]').get_attribute(
                        'init-src')
                    time.sleep(random.randint(0,1))
                    m = m+1
                    if page is not None and three_url is not None and product is not None:
                        print(product)
                        print(three_url)
                        break
                    else:
                        continue
                else:
                    break

            except :
                self.browser.find_element_by_xpath('//li[@id="ctab-transactionHistory"]/span').click()  # click the transaction-history tab
                time.sleep(random.randint(0, 1))
      
        # The URL obtained above is incomplete, so it has to be assembled by hand
        three_url = re.split(r'\?', three_url)
        r_three_url = 'base URL of the target site goes here' + three_url[0] + '?act=pageload&appid=dh&page={}&' + three_url[-1] + '&lang=en'
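        # the '{}' placeholder is later filled with a page number: three_parse_page calls r_three_url.format(j)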

        info_list = [['Buyer', 'Product name', 'Quantity', 'Purchase date', 'Buyer location']]
        self.three_parse_page(product, info_list, page, r_three_url)




    # Together with parse_page and save_page, scrape every historical transaction record of one product node
    def three_parse_page(self,product,info_list,page,r_three_url):
        j = 1
        while True:
            if j <= int(page):

                self.browser2.get(r_three_url.format(j))
                self.parse_page(info_list)

                print('Third-level page {} scraped successfully'.format(j))
                j = j+1
                time.sleep(random.randint(1,2))
            else:
                print('Last third-level page scraped successfully')
                try:
                    self.save_page(product,info_list)
                    print('Product node data saved successfully')
                except:
                    self.save_page2(product,info_list)
                    print('Product node data saved successfully')
                time.sleep(random.randint(1,2))
                break

    # Scrape one third-level (transaction history) page
    def parse_page(self,info_list):

        tran_list = self.browser2.find_elements_by_xpath(
            '//div[@class="transaction-list"]//ul'
        )

        for tran in tran_list:
            info_two = tran.text.split('\n')
            country = tran.find_element_by_xpath('.//li[@class="col1"]//img').get_attribute('src')
            country_list = re.split(r'\.|/', country)
            cou = country_list[-2]
            info_two.append(cou)  # buyer, product name, quantity, date, buyer location
            info_list.append(info_two)

        time.sleep(random.randint(1,2))

    # Save the third-level page data to a CSV file named after the product
    def save_page(self,product,info_list):
        with open('{}.csv'.format(product), 'a', newline='') as f:
            writer = csv.writer(f)
            for rt in info_list:
                writer.writerow([rt[0].strip(), rt[1].strip(), rt[2].strip(), rt[3].strip(), rt[4]])
   
    # Some product names contain '/', which breaks the file name, so this variant keeps only the part before the '/'
    def save_page2(self,product,info_list):
        product = re.split(r'/', product)[0]
        with open('{}.csv'.format(product), 'a', newline='') as f:
            writer = csv.writer(f)
            for rt in info_list:
                writer.writerow([rt[0].strip(), rt[1].strip(), rt[2].strip(), rt[3].strip(), rt[4]])
    
    # For every product that failed to scrape, save the error info (page number and product index)
    def save_error(self,a):
        # open in append mode so earlier errors are not overwritten on the next failure
        with open('error.csv', 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([a[0].strip()])


    def main(self):
        self.get_page()
        self.xunhuan_one_page()

if __name__ == '__main__':
    spider = TestSpider()
    spider.main()
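
A note on the Selenium API: the listing above relies on the Selenium 3 helpers find_element_by_xpath / find_elements_by_xpath, which were removed in Selenium 4. A minimal sketch of the same lookups written against the Selenium 4 By-locator API (the By import is already at the top of the script) might look like this; the XPaths and the placeholder URL are copied from the listing, so treat it as an illustration rather than a drop-in replacement:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')            # same headless setup as get_page()
browser = webdriver.Chrome(options=options)
browser.get('target site URL goes here')      # placeholder, corresponds to self.url above

# Selenium 3 style used above:  browser.find_elements_by_xpath('//div[@id="proList"]//div[@index]')
# Selenium 4 equivalent:
product_nodes = browser.find_elements(By.XPATH, '//div[@id="proList"]//div[@index]')
page_label = browser.find_element(By.XPATH, '//div[@class="filter-page"]/span').text
print(len(product_nodes), page_label)

browser.quit()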


Origin blog.csdn.net/a17695740817/article/details/105094872