Using Python and Selenium to dynamically scrape Maoyan movie information and save it to MongoDB

This post is intended only for learning and discussion, not for commercial use.
To use Selenium, you must install a browser driver: download the driver package and unzip it into the directory that contains the Python script.

#!/usr/bin/python
# -*- coding: UTF-8 -*-    
# Author: RuiMing Lin
# DateTime: 2021/01/26 17:09
# Description: Dynamically scrape movies with Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import logging
import pymongo

# Logging setup: INFO level, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Base URL of the board; the page offset is appended to it.
base_url = 'http://maoyan.com/board/4?offset='
# Total number of pages.
TOTAL_PAGE = 10

# MongoDB configuration.
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'maoyan'

# Connect to the server, then select the target database and collection.
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client.get_database(MONGO_DB_NAME)
collection = db.get_collection(MONGO_COLLECTION_NAME)

# Scrape every board page and upsert each movie into MongoDB.
# One Chrome session is shared by all pages (launching a browser per page is
# wasteful), and the try/finally guarantees the driver is shut down even when
# a page times out or an expected element is missing.
browser = webdriver.Chrome()  # launch Chrome through the installed driver
try:
    wait = WebDriverWait(browser, timeout=10, poll_frequency=0.5)  # explicit wait
    for page in range(TOTAL_PAGE):
        # 1. Open the board page; the offset advances by 10 per page.
        page_url = base_url + str(page * 10)
        browser.get(page_url)
        logging.info('scraping %s ...', page_url)
        # 2. Wait until movie entries are present, then parse them.
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'movie-item-info')))
        # find_element(s)(By.X, value) is used because the
        # find_element(s)_by_* helpers were removed in Selenium 4.3; this
        # form also works on Selenium 3.
        for item in browser.find_elements(By.CLASS_NAME, 'movie-item-info'):
            # The title and the detail-page link live on the same <a> element.
            link = item.find_element(By.CLASS_NAME, 'name').find_element(By.TAG_NAME, 'a')
            data_dict = {
                'name': link.get_attribute('title'),
                'url': link.get_attribute('href'),
                'star': item.find_element(By.CLASS_NAME, 'star').text,
                'release_time': item.find_element(By.CLASS_NAME, 'releasetime').text,
            }
            # 3. Save to MongoDB: upsert keyed on the movie name, so re-running
            #    the script updates existing documents instead of adding
            #    duplicates.
            collection.update_one(
                {'name': data_dict['name']},
                {'$set': data_dict},
                upsert=True,
            )
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    browser.quit()

# 4. Inspect the stored data: print every document, then the total count.
for result in collection.find():
    print(result)
# Cursor.count() was removed in PyMongo 4; count_documents({}) with an empty
# filter counts every document in the collection.
print(collection.count_documents({}))

Guess you like

Origin blog.csdn.net/Orange_minger/article/details/113483663