用 python 爬取某珠宝网站

GitHub：https://github.com/121812/python
转载请注明出处
详细信息请看：http://www.forever121.cn/?p=138
下面为源代码
import os
import re
import time
import request
import requests
import xlsxwriter
from  selenium import webdriver
from  selenium.webdriver.support.ui import WebDriverWait
# Search-results page to scrape: page 3 of the site search for "珠宝".
url = 'https://www.dior.cn/zh_cn/products/search?page=3&query=珠宝'
#url = 'http://www.baidu.com'


# Raw string so the Windows backslashes are not parsed as escape sequences
# (the non-raw form emits a SyntaxWarning for "\P", "\G", ... on Python 3.12+).
# NOTE(review): this path is not passed to webdriver.Chrome() below, so the
# driver is located some other way (presumably via PATH) -- confirm.
chrome = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument('lang=zh_CN.UTF-8')
# Send a mobile (Android / Nexus 5) user-agent string with every request.
options.add_argument('user-agent="Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Mobile Safari/537.36"')
# Shared browser session used by get_source().
browser = webdriver.Chrome(chrome_options=options)
browser.maximize_window()
# Explicit-wait helper with a 2-second timeout.
wait = WebDriverWait(browser,2)


def get_source(url, max_clicks=20):
    """Open *url* in the shared browser, expand the page, and return its HTML.

    After the page loads, the button located by the XPath below is clicked
    repeatedly, with a one-second pause after each click, until either
    ``max_clicks`` clicks have been made or Selenium reports an error
    (for example because the button can no longer be found).
    NOTE(review): the button is presumably a "load more" control -- confirm
    against the live page.

    Args:
        url: Address to load in the module-level ``browser``.
        max_clicks: Maximum number of clicks to attempt. Defaults to 20.

    Returns:
        The value of ``browser.page_source`` once clicking has stopped.
    """
    # Imported locally so the narrowed ``except`` below does not depend on
    # any change to the file's top-level imports.
    from selenium.common.exceptions import WebDriverException

    browser.get(url)
    try:
        for attempt in range(max_clicks):
            print(attempt)
            button = browser.find_element_by_xpath('//*[@id="main"]/div/div[2]/div[3]/button/span/span')
            button.click()
            # Pause for one second before looking for the button again.
            time.sleep(1)
    except WebDriverException:
        # Best effort: a Selenium failure (button missing, not clickable, ...)
        # ends the clicking, and whatever has loaded so far is returned.
        # Non-Selenium errors and KeyboardInterrupt are no longer swallowed.
        pass
    return browser.page_source


def find_combination(html):
    """Extract (title, image URL) pairs for every product found in *html*.

    A product is recognised by its ``product-legend`` title markup; the image
    URL is the ``src`` of the first ``<img>`` tag that follows that title.

    Args:
        html: Page source as a string.

    Returns:
        A list of ``(title, image_url)`` tuples in document order; an empty
        list when nothing matches.
    """
    pattern = '<div class="product-legend"><span class="title-with-level product-title century-std size-s"><span class="multiline-text multiline-text--is-china">(.*?)</span>.*?<img src="(.*?)"'
    # DOTALL lets ".*?" cross line breaks, so a title and its <img> tag are
    # still paired when the markup between them spans several lines.
    combination = re.findall(pattern, html, re.DOTALL)
    return combination

def find_jpg(combination, image_dir='F:\\python测试\\data\\迪奥', workbook_path='迪奥.xls'):
    """Download each product image and record the products in a workbook.

    For every entry the image is fetched and saved as ``<n>.jpg`` (numbered
    from 1) inside *image_dir*, and one worksheet row is written holding the
    full title, the text before ``750``, the literal ``750/1000``, the text
    after ``750/1000``, and the downloaded picture.

    Args:
        combination: Iterable of sequences whose first item is the product
            title and whose second item is the image URL.
        image_dir: Directory for the downloaded images. It is created if it
            does not exist. Defaults to the original hard-coded location.
        workbook_path: Path of the workbook to create. Defaults to the
            original hard-coded file name.
            NOTE(review): xlsxwriter writes the XLSX format, so Excel may
            warn about the ``.xls`` extension; the default is kept unchanged
            for compatibility.

    Returns:
        None.
    """
    if image_dir:
        # Opening the image file below fails if the directory is missing.
        os.makedirs(image_dir, exist_ok=True)

    # The context manager closes (and thereby saves) the workbook even when a
    # download or write fails part-way through.
    with xlsxwriter.Workbook(workbook_path) as workbook:
        table = workbook.add_worksheet('迪奥')
        for column, heading in enumerate(('总体', '名称', '质量', '用料', '图片')):
            table.write(0, column, heading)

        # Row 0 holds the headings, so data rows and image numbers start at 1.
        for row, item in enumerate(combination, start=1):
            title, image_url = item[0], item[1]

            # A timeout keeps one stalled download from hanging the whole run.
            response = requests.get(image_url, timeout=30)
            time.sleep(1)
            print('正在下载：%s'%image_url)

            image_path = os.path.join(image_dir, '%s.jpg' % row)
            with open(image_path, 'wb') as image_file:
                image_file.write(response.content)

            name = re.findall('^(.*?)750', title)
            quality = re.findall('750/1000(.*?)$', title)
            table.write(row, 0, '%s' % title)
            try:
                table.write(row, 1, '%s' % name[0])
                table.write(row, 2, '750/1000')
                table.write(row, 3, '%s' % quality[0])
            except IndexError:
                # The title lacks "750" or "750/1000": leave the remaining
                # detail cells of this row empty.
                pass
            table.insert_image(row, 4, image_path)



# Run the whole pipeline: load the page, extract (title, image URL) pairs,
# then download the images and write the workbook.
try:
    find_jpg(find_combination(get_source(url)))
    print('下载完成')
finally:
    # Always shut the browser down so the Chrome/chromedriver processes do
    # not outlive the script, even when a step above fails.
    browser.quit()
用 python 爬取某珠宝网站

猜你喜欢

用 python 爬取某珠宝网站