Python之使用Selenium动态爬取猫眼电影信息并保存到MongoDB

本篇博客仅作为学习交流,不可用于商业用途
要使用Selenium,必须先安装与浏览器版本匹配的浏览器驱动(例如Chrome对应ChromeDriver):下载驱动包并解压,把可执行文件放到Python安装目录下的Scripts目录(或任何已加入PATH的目录)即可

#!/usr/bin/python
# -*- coding: UTF-8 -*-    
# Author: RuiMing Lin
# DateTime: 2021/01/26 17:09
# Description: 使用Selenium动态爬取电影
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import logging
import pymongo

# Configure logging: INFO level, timestamped "<time> - <level>: <message>" lines
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
# Base URL of the board; the page offset is appended after "offset="
base_url = 'http://maoyan.com/board/4?offset='
# Total number of pages
TOTAL_PAGE = 10
# MongoDB configuration
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'maoyan'
# Client -> database -> collection handles used by the rest of the script
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

# Crawl every board page and upsert each movie into MongoDB.
#
# One browser session is reused for all pages and is always shut down with
# quit() in the finally clause, so no Chrome/chromedriver process is left
# behind when a page times out or a lookup raises.
browser = webdriver.Chrome()    # 1. Start Chrome once for the whole crawl
try:
    # Explicit wait: poll every 0.5 s, give up after 10 s
    wait = WebDriverWait(browser, timeout=10, poll_frequency=0.5)
    for page in range(TOTAL_PAGE):
        # The board lists 10 movies per page, so the offset advances by 10
        page_url = base_url + str(page * 10)
        browser.get(page_url)
        logging.info('scraping %s ...', page_url)

        # 2. Wait until the movie entries are present, then parse them
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'movie-item-info')))
        for div in browser.find_elements(By.CLASS_NAME, 'movie-item-info'):
            # Look the title link up once; it carries both the name and the URL
            link = div.find_element(By.CLASS_NAME, 'name').find_element(By.TAG_NAME, 'a')
            data_dict = {
                'name': link.get_attribute('title'),
                'url': link.get_attribute('href'),
                'star': div.find_element(By.CLASS_NAME, 'star').text,
                'release_time': div.find_element(By.CLASS_NAME, 'releasetime').text,
            }

            # The name is the upsert key: without it, unrelated movies would
            # be merged into a single document, so skip such entries.
            if not data_dict['name']:
                logging.warning('skip entry without a name on %s', page_url)
                continue

            # 3. Save to MongoDB (insert, or update the document with this name)
            collection.update_one(
                {'name': data_dict['name']},
                {'$set': data_dict},
                upsert=True,
            )
finally:
    # quit() ends the whole WebDriver session; close() only closes the window
    browser.quit()

# 4. Inspect the stored data: print every document, then the total count
results = collection.find()
for result in results:
    print(result)
# Cursor.count() was removed in PyMongo 4; count_documents() is the
# supported way to count documents matching a filter (here: all of them).
print(collection.count_documents({}))

猜你喜欢

转载自blog.csdn.net/Orange_minger/article/details/113483663