使用python selenium爬取淘宝商品信息 自动登录淘宝和爬取某一宝贝的主图,属性图和详情图等等

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/nick_chain/article/details/102629901

selenium作为一个自动化测试工具非常好用,谁用谁知道啊。

先说如何登录淘宝。淘宝现在直接用会员名和密码登录会出现滑块验证,我试了网上说的几种方法,也自己尝试了一番,效果都不太理想:实测过程中,即使滑块滑动成功了也无法登录,还会报错。限于自身的技术水平,这个问题暂时无法解决。但是方法总比困难多,最后改用了微博账号登录。如果你要使用下文的登录方法,那就快去注册一个微博账号或者绑定淘宝吧。

登录:

首先确保安装了selenium,requests

pip install selenium

pip install requests

然后安装webdriver,具体怎么安装可以网上搜索,这里不做描述。

1.登录:

#encoding=utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ActionChains
import time
import requests
import os
import re

#
# Taobao login page that login() opens.
url = 'https://login.taobao.com/member/login.jhtml'
options = webdriver.ChromeOptions()

# Do not load images, to speed up page loads.  The preference key has to be
# spelled "managed_default_content_settings"; the earlier "mamaged" spelling
# does not name the image content setting, so images were still loaded.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

# Exclude the "enable-automation" switch so that sites are less likely to
# recognise the browser as being driven by Selenium.
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# NOTE(review): `executable_path` is the older way of pointing Selenium at
# chromedriver; newer Selenium releases expect a Service object instead --
# confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path='C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver', options=options)
# Shared explicit wait with a 10 second timeout, used by the functions below.
wait = WebDriverWait(driver, 10)


def login(username='xxxxxx', password='xxxxx'):
    """Log in to Taobao through the Weibo third-party login form.

    Opens the page stored in the module-level ``url``, switches to the
    password-login panel, chooses the Weibo login option, types the
    credentials and clicks the login button.  Uses the module-level
    ``driver`` and ``wait`` objects.

    Args:
        username: Weibo account name.  The default is only a placeholder;
            pass your own account.
        password: Weibo password.  The default is only a placeholder.
    """
    # Open the login page once (it used to be requested twice in a row).
    driver.get(url)

    # Wait until the password-login option can be clicked, then click it.
    password_login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd')))
    password_login.click()

    # Wait until the Weibo-login option can be clicked, then click it.
    weibo_login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.weibo-login')))
    weibo_login.click()

    # Account input box of the Weibo form.
    account_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username> .W_input')))
    account_input.send_keys(username)

    # Password input box of the Weibo form.
    password_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(3) > div > input')))
    password_input.send_keys(password)

    # Login button.
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(7) > div:nth-child(1) > a > span')))
    button.click()
    # Short pause after submitting before the caller continues.
    time.sleep(1)

2.定位:

以“电脑”为搜索关键词(与下面代码中的关键词一致),点击搜索结果中的第一个宝贝,并以它为爬取对象,爬取该宝贝的主图、属性图和详情图信息。

def location(keyword='电脑'):
    """Search for *keyword* and open the first item of the result list.

    Switches to the first browser window, types the keyword into the ``#q``
    search box and presses Enter, then sends Enter to the first ``.pic>a``
    link found on the result page.  Uses the module-level ``driver`` and
    ``wait`` objects.

    Args:
        keyword: search term typed into the search box.  Defaults to the
            term that was previously hard-coded.
    """
    # Give the page a moment after login, then work in the first window.
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[0])
    search = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
    search.send_keys(keyword)
    search.send_keys(Keys.ENTER)

    # Use the first item of the result list as the test subject.
    time.sleep(1)
    first_item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.pic>a')))
    first_item.send_keys(Keys.ENTER)

3.爬取宝贝主图:

def get_main_pic():
    """Download the main pictures of the item page.

    Switches to the second browser window (``window_handles[1]``; presumably
    the item page opened by ``location()`` -- confirm), reads the ``src`` of
    every image under ``#J_UlThumb`` and keeps the part of each address
    matched by the pattern below.  The resulting list is handed to
    ``dowland_pic`` together with ``mainPicDir``.

    Images whose ``src`` is missing or does not match the pattern are
    skipped with a message instead of raising ``IndexError``.
    """
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[1])
    pics = driver.find_elements(By.CSS_SELECTOR, '#J_UlThumb  li a img')

    # "." matches any character, "*" repeats it, "\" escapes the literal dot.
    # The match ends at the first ".xxx" extension after the "O..." file
    # name (presumably cutting off a thumbnail size suffix -- confirm).
    pattern = re.compile(r"https://img.alicdn.com/imgextra/.*/.*/O.*?\..{3}")
    urls = []
    for item in pics:
        src = item.get_attribute("src")
        print(src)
        match = pattern.search(src or "")
        if match is None:
            print('skip main pic, unexpected src: %s' % src)
            continue
        print(match.group(0))
        urls.append(match.group(0))

    dowland_pic(urls, mainPicDir)

4.爬取宝贝属性图:

def get_attribute_pic():
    """Download the property pictures of the item page.

    Reads the ``style`` attribute of every ``.tb-prop dd ul li`` element,
    extracts the address inside its ``url(...)`` expression and hands the
    collected addresses to ``dowland_pic`` together with ``attributePicDir``.

    The ``style`` value is CSS text rather than an address, so it cannot be
    downloaded as-is; elements without a ``url(...)`` part are skipped.
    """
    # NOTE(review): on some page layouts the background image may be set on
    # a child element rather than on the <li> itself -- confirm the selector.
    prop_items = driver.find_elements(By.CSS_SELECTOR, '.tb-prop dd ul li')

    # Captures the address inside url(...), with or without quotes.
    url_in_style = re.compile(r'url\(\s*["\']?(.*?)["\']?\s*\)')
    urls = []
    for item in prop_items:
        style = item.get_attribute("style")
        print(style)
        match = url_in_style.search(style or "")
        if match is None or not match.group(1):
            continue
        link = match.group(1)
        # Protocol-relative addresses need an explicit scheme to be fetched.
        if link.startswith("//"):
            link = "https:" + link
        urls.append(link)

    dowland_pic(urls, attributePicDir)

5.爬取宝贝详情图:

def get_detail_pic():
    """Download the detail (description) pictures of the item page.

    Looks for images with ``#description .content  p img`` first and falls
    back to ``#description .content  div div img`` when the first selector
    finds nothing.  The ``src`` values are handed to ``dowland_pic`` together
    with ``detailPicDir``.

    Both emptiness checks used to be inverted: found images triggered the
    "not find" branch, and an empty result went on to the download step.

    Returns:
        0 when neither selector finds an image; otherwise None.
    """
    detail = driver.find_elements(By.CSS_SELECTOR, '#description .content  p img')
    if not detail:
        # Alternative page layout: images nested in div elements.
        detail = driver.find_elements(By.CSS_SELECTOR, '#description .content  div div img')

    if not detail:
        print('not find detail')
        return 0

    urls = []
    for item in detail:
        src = item.get_attribute("src")
        print(src)
        # An image without a src cannot be downloaded; leave it out.
        if src:
            urls.append(src)

    dowland_pic(urls, detailPicDir)

说明:爬取详情图时,有些样式的宝贝还不能爬取,还要做些适配性工作。另一个不足之处是宝贝视频还不能爬取,后期需要加入这个功能,敬请期待。

6.完整代码:

#encoding=utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver import ActionChains
import time
import requests
import os
import re

#
# Taobao login page that login() opens.
url = 'https://login.taobao.com/member/login.jhtml'
options = webdriver.ChromeOptions()

# Do not load images, to speed up page loads.  The preference key has to be
# spelled "managed_default_content_settings"; the earlier "mamaged" spelling
# does not name the image content setting, so images were still loaded.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

# Exclude the "enable-automation" switch so that sites are less likely to
# recognise the browser as being driven by Selenium.
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# NOTE(review): `executable_path` is the older way of pointing Selenium at
# chromedriver; newer Selenium releases expect a Service object instead --
# confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path='C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver', options=options)
# Shared explicit wait with a 10 second timeout, used by the functions below.
wait = WebDriverWait(driver, 10)

# Output directories for the three kinds of pictures.
mainPicDir = './main/'
detailPicDir = './detail/'
attributePicDir = './attribute/'


def mkdir():
    """Create the three picture output directories if they are missing."""
    # Same order as before: main, detail, attribute.
    for folder in (mainPicDir, detailPicDir, attributePicDir):
        os.makedirs(folder, exist_ok=True)


def dowland_pic(src, dir):
    """Download every address in *src* into *dir* as ``0.jpg``, ``1.jpg``, ...

    The file number is the position of the address in *src*.  A download
    that fails (connection error, timeout, HTTP error status) is reported
    with ``print`` and skipped, so one bad address does not abort the rest
    and no error page is saved as a picture.

    Args:
        src: iterable of picture addresses.
        dir: target directory, with or without a trailing slash; it is
            created when missing.  The name shadows the builtin but is kept
            so that existing callers are unaffected.
    """
    if dir:
        os.makedirs(dir, exist_ok=True)

    for count, link in enumerate(src):
        # Protocol-relative addresses need an explicit scheme to be fetched.
        if link.startswith('//'):
            link = 'https:' + link
        try:
            # A timeout keeps a stalled connection from hanging the script.
            r = requests.get(link, timeout=30)
            r.raise_for_status()
        except requests.RequestException as err:
            print('download failed: %s (%s)' % (link, err))
            continue
        local_path = os.path.join(dir, '%s.jpg' % count)
        with open(local_path, 'wb') as f:
            f.write(r.content)

def login(username='xxxxxx', password='xxxxx'):
    """Log in to Taobao through the Weibo third-party login form.

    Opens the page stored in the module-level ``url``, switches to the
    password-login panel, chooses the Weibo login option, types the
    credentials and clicks the login button.  Uses the module-level
    ``driver`` and ``wait`` objects.

    Args:
        username: Weibo account name.  The default is only a placeholder;
            pass your own account.
        password: Weibo password.  The default is only a placeholder.
    """
    # Open the login page once (it used to be requested twice in a row).
    driver.get(url)

    # Wait until the password-login option can be clicked, then click it.
    password_login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd')))
    password_login.click()

    # Wait until the Weibo-login option can be clicked, then click it.
    weibo_login = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.weibo-login')))
    weibo_login.click()

    # Account input box of the Weibo form.
    account_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username> .W_input')))
    account_input.send_keys(username)

    # Password input box of the Weibo form.
    password_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(3) > div > input')))
    password_input.send_keys(password)

    # Login button.
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(7) > div:nth-child(1) > a > span')))
    button.click()
    # Short pause after submitting before the caller continues.
    time.sleep(1)


def location(keyword='电脑'):
    """Search for *keyword* and open the first item of the result list.

    Switches to the first browser window, types the keyword into the ``#q``
    search box and presses Enter, then sends Enter to the first ``.pic>a``
    link found on the result page.  Uses the module-level ``driver`` and
    ``wait`` objects.

    Args:
        keyword: search term typed into the search box.  Defaults to the
            term that was previously hard-coded.
    """
    # Give the page a moment after login, then work in the first window.
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[0])
    search = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
    search.send_keys(keyword)
    search.send_keys(Keys.ENTER)

    # Use the first item of the result list as the test subject.
    time.sleep(1)
    first_item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.pic>a')))
    first_item.send_keys(Keys.ENTER)


def get_main_pic():
    """Download the main pictures of the item page.

    Switches to the second browser window (``window_handles[1]``; presumably
    the item page opened by ``location()`` -- confirm), reads the ``src`` of
    every image under ``#J_UlThumb`` and keeps the part of each address
    matched by the pattern below.  The resulting list is handed to
    ``dowland_pic`` together with ``mainPicDir``.

    Images whose ``src`` is missing or does not match the pattern are
    skipped with a message instead of raising ``IndexError``.
    """
    time.sleep(1)
    driver.switch_to.window(driver.window_handles[1])
    pics = driver.find_elements(By.CSS_SELECTOR, '#J_UlThumb  li a img')

    # "." matches any character, "*" repeats it, "\" escapes the literal dot.
    # The match ends at the first ".xxx" extension after the "O..." file
    # name (presumably cutting off a thumbnail size suffix -- confirm).
    pattern = re.compile(r"https://img.alicdn.com/imgextra/.*/.*/O.*?\..{3}")
    urls = []
    for item in pics:
        src = item.get_attribute("src")
        print(src)
        match = pattern.search(src or "")
        if match is None:
            print('skip main pic, unexpected src: %s' % src)
            continue
        print(match.group(0))
        urls.append(match.group(0))

    dowland_pic(urls, mainPicDir)



def get_detail_pic():
    """Download the detail (description) pictures of the item page.

    Looks for images with ``#description .content  p img`` first and falls
    back to ``#description .content  div div img`` when the first selector
    finds nothing.  The ``src`` values are handed to ``dowland_pic`` together
    with ``detailPicDir``.

    Both emptiness checks used to be inverted: found images triggered the
    "not find" branch, and an empty result went on to the download step.

    Returns:
        0 when neither selector finds an image; otherwise None.
    """
    detail = driver.find_elements(By.CSS_SELECTOR, '#description .content  p img')
    if not detail:
        # Alternative page layout: images nested in div elements.
        detail = driver.find_elements(By.CSS_SELECTOR, '#description .content  div div img')

    if not detail:
        print('not find detail')
        return 0

    urls = []
    for item in detail:
        src = item.get_attribute("src")
        print(src)
        # An image without a src cannot be downloaded; leave it out.
        if src:
            urls.append(src)

    dowland_pic(urls, detailPicDir)


def get_attribute_pic():
    """Download the property pictures of the item page.

    Reads the ``style`` attribute of every ``.tb-prop dd ul li`` element,
    extracts the address inside its ``url(...)`` expression and hands the
    collected addresses to ``dowland_pic`` together with ``attributePicDir``.

    The ``style`` value is CSS text rather than an address, so it cannot be
    downloaded as-is; elements without a ``url(...)`` part are skipped.
    """
    # NOTE(review): on some page layouts the background image may be set on
    # a child element rather than on the <li> itself -- confirm the selector.
    prop_items = driver.find_elements(By.CSS_SELECTOR, '.tb-prop dd ul li')

    # Captures the address inside url(...), with or without quotes.
    url_in_style = re.compile(r'url\(\s*["\']?(.*?)["\']?\s*\)')
    urls = []
    for item in prop_items:
        style = item.get_attribute("style")
        print(style)
        match = url_in_style.search(style or "")
        if match is None or not match.group(1):
            continue
        link = match.group(1)
        # Protocol-relative addresses need an explicit scheme to be fetched.
        if link.startswith("//"):
            link = "https:" + link
        urls.append(link)

    dowland_pic(urls, attributePicDir)


def main():
    """Run the whole scrape and close the browser afterwards.

    Steps, each announced with a progress line: create the output
    directories, log in, search and open the first item, then download the
    main, detail and property pictures.  The browser session is quit in a
    ``finally`` block so it is not left running when a step fails.
    """
    try:
        print('mkdir')
        mkdir()
        print('login')
        login()
        print('location')
        location()
        print('get_main_pic')
        get_main_pic()
        print('get_detail_pic')
        get_detail_pic()
        print('get_attribute_pic')
        get_attribute_pic()
    finally:
        # Release the browser and the chromedriver process.
        driver.quit()


if __name__ == "__main__":
    main()











猜你喜欢

转载自blog.csdn.net/nick_chain/article/details/102629901