Python3爬虫三大案例实战分享之Selenium+Chrome/Headless Chrome

知识点:

高人:selenium+headless chrome爬虫  

爬虫代码有一点需要注意:需要触发事件(比如 click)时,最好不要直接调用元素的相应方法,而是通过嵌入 JS 脚本的方式来调用。因为爬虫代码执行速度很快,前端元素往往还来不及渲染完成,直接操作容易抛出元素不可见或者不存在的错误。

# Collect every <a> link inside the element with class "city-province".
province_items = DRIVER.find_element_by_class_name("city-province").find_elements_by_tag_name("a")
for province_item in province_items:
    # province_item.click()  # direct click: the front end may not be ready yet
    # Trigger the click through injected JavaScript instead of WebElement.click().
    DRIVER.execute_script('arguments[0].click();', province_item)

Python3爬虫三大案例实战

教程:https://edu.hellobi.com/course/156/play/lesson/2454

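代码分为两部分:主文件 main.py 和 MongoDB 配置文件 config.py(注意:main.py 中写的是 `from configs import *`,配置文件的文件名必须与导入的模块名保持一致,即保存为 configs.py,或者把导入语句改成 `from config import *`)。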

1、main.py

# -*- coding: utf-8 -*-
import re
import time

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from configs import *

'''
1、搜索关键词
2、分析页码并翻页
3、分析提取数据
4、存到数据库
'''

# MongoDB handles shared by the whole script.
# MONGO_URL / MONGO_DB are presumably supplied by `from configs import *`.
client=pymongo.MongoClient(MONGO_URL)
# Database object that save_to_mongo() indexes by collection name.
db=client[MONGO_DB]


# Headless Chrome (no visible window). To use it, uncomment this block and
# comment out the windowed setup below.
# from selenium.webdriver.chrome.options import Options
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# browser = webdriver.Chrome(chrome_options=chrome_options)
# browser.set_window_size(1400, 900)
# wait = WebDriverWait(browser, 10)


# Windowed mode: launches a visible browser so the crawl can be observed.
browser = webdriver.Chrome()
browser.set_window_size(1400, 900)
# Shared explicit wait with a 10-second timeout, used by every page helper.
wait = WebDriverWait(browser, 10)

# Search entry point
def search():
    """Open Taobao, search for '美食' and return the number of result pages.

    Returns:
        int: the first run of digits found in the text of the pager's
        ``div.total`` element.

    If any step times out, the whole search is restarted from the page
    load. There is no retry limit.
    """
    while True:
        try:
            # Baidu variant, kept for reference:
            # browser.get("https://www.baidu.com")
            # input=browser.find_element_by_id("kw")
            # input.send_keys("Python")
            # input.send_keys(Keys.ENTER)
            # wait=WebDriverWait(browser,10)
            # wait.until(EC.presence_of_element_located((By.ID,"content_left")))
            # print(browser.current_url)
            # print(browser.get_cookies())
            # print(browser.page_source)

            # Taobao
            browser.get("http://www.taobao.com")
            # 1. Locate the search box and the submit button.
            search_box = wait.until(
                EC.presence_of_element_located((By.ID, "q")))
            # Wait until the button is actually clickable, not merely present
            # in the DOM, so the click below does not hit an unready element.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            # 2. Type the keyword and submit the form.
            search_box.send_keys('美食')
            submit.click()
            # Wait for the result pager to load before reading the page count.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
            ).text
            # Keep only the digits of the pager text.
            return int(re.search(r'\d+', total).group())
        except TimeoutException:
            # WebDriverWait raises selenium's TimeoutException, which is not
            # related to the builtin TimeoutError; retry from the page load.
            continue

# Automatic pagination
def next_page(page_number):
    """Jump to result page *page_number* and extract its products.

    Args:
        page_number: page index typed into the pager's jump-to-page box.

    If any step times out, the whole jump (including extraction) is
    retried. There is no retry limit.
    """
    while True:
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            # Wait until the confirm button is clickable, not merely present.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
            page_box.clear()  # remove whatever page number is already in the box
            page_box.send_keys(str(page_number))
            submit.click()
            # The jump succeeded once the highlighted (active) pager item
            # shows the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page_number)))
            time.sleep(0.1)
            getproducts()
            return
        except TimeoutException:
            # WebDriverWait raises selenium's TimeoutException, which is not
            # related to the builtin TimeoutError; retry this page.
            continue
# Data extraction
def getproducts():
    """Print one dict per product found on the currently loaded result page.

    Waits until at least one product node is present, parses the page source
    with PyQuery, and prints a dict for each node with these keys:
    'name' (the image's alt attribute), 'price', 'desl' (the '.deal-cnt' text
    with its last two characters removed), 'shop' and 'location'.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    page = pq(browser.page_source)
    # .items() yields each matched node as its own PyQuery object.
    for entry in page('#mainsrp-itemlist .items .item').items():
        # image_url = entry.find('.pic .img').attr('src')
        title = entry.find('.pic .img').attr('alt')
        price_text = entry.find('.price').text()
        deal_text = entry.find('.deal-cnt').text()
        shop_text = entry.find('.shop').text()
        location_text = entry.find('.location').text()
        product = {
            'name': title,
            'price': price_text,
            'desl': deal_text[:-2],
            'shop': shop_text,
            'location': location_text,
        }
        print(product)
        # save_to_mongo(product)
def save_to_mongo(result):
    """Insert one product dict into the MONGO_TABLE collection.

    Args:
        result: the document to store (a dict).

    Prints a success message with the stored document, or a failure message
    with the exception. Errors are reported and not re-raised, so one bad
    record does not stop the crawl.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        if db[MONGO_TABLE].insert_one(result):
            print('存储成功',result)
    except Exception as exc:
        print('存储失败', exc)

# Main entry point
def main():
    """Run the crawl: search, extract page 1, then walk the remaining pages.

    Any exception is reported and ends the crawl. The browser session is
    always shut down afterwards.
    """
    try:
        totalnum = search()
        # search() leaves the browser on page 1 without extracting it, and
        # the loop below starts at page 2, so extract the first page here.
        getproducts()
        # Use range(2, 3) instead for a quick single-page test run.
        for i in range(2, totalnum + 1):
            next_page(i)
    except Exception as exc:
        print('出错啊', exc)
    finally:
        # quit() ends the whole WebDriver session and its driver process;
        # close() would only close the current window.
        browser.quit()

if __name__=='__main__':
    main()

2、config.py

前提:运行前需要先启动 MongoDB 数据库服务。

# MongoDB settings consumed by main.py.
# Host passed to pymongo.MongoClient.
MONGO_URL='localhost'
# Name of the database selected on the client.
MONGO_DB='taobao'
# Name of the collection that save_to_mongo() writes to.
MONGO_TABLE='table'

猜你喜欢

转载自blog.csdn.net/sinat_23880167/article/details/82227536
今日推荐