本篇博客仅供学习交流，不得用于商业用途。
要使用 Selenium，必须先安装与浏览器对应的驱动：下载驱动包，解压后放到 Python 的 Scripts 目录即可。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Author: RuiMing Lin
# DateTime: 2021/01/26 17:09
# Description: 使用Selenium动态爬取电影
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import logging
import pymongo
# Log at INFO and above; each record carries a timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Board URL up to (and including) the "offset=" key; the numeric page offset
# is appended to it when a page is requested.
base_url = 'http://maoyan.com/board/4?offset='

# Total number of pages.
TOTAL_PAGE = 10

# MongoDB connection settings.
MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'maoyan'

# Handles for the MongoDB client, the target database and the target
# collection, resolved from the settings above.
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client.get_database(MONGO_DB_NAME)
collection = db.get_collection(MONGO_COLLECTION_NAME)
# Scrape every board page and upsert each movie entry into MongoDB.
#
# A single Chrome instance is reused for all pages instead of launching one
# per page, and it is always shut down in the ``finally`` clause so that a
# timeout or parsing error does not leave a browser/driver process behind.
browser = webdriver.Chrome()
try:
    # Explicit wait: poll every 0.5 s, give up after 10 s.
    wait = WebDriverWait(browser, timeout=10, poll_frequency=0.5)

    for page in range(TOTAL_PAGE):
        # 1. Open the board page; the offset advances by 10 per page.
        page_url = base_url + str(page * 10)
        logging.info('scraping %s ...', page_url)
        browser.get(page_url)

        # 2. Wait until at least one movie entry is present, then parse them.
        wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'movie-item-info'))
        )
        for item in browser.find_elements(By.CLASS_NAME, 'movie-item-info'):
            # The title and the detail-page link live on the same <a> element,
            # so look it up once.
            link = item.find_element(By.CLASS_NAME, 'name').find_element(
                By.TAG_NAME, 'a'
            )
            data_dict = {
                'name': link.get_attribute('title'),
                'url': link.get_attribute('href'),
                'star': item.find_element(By.CLASS_NAME, 'star').text,
                'release_time': item.find_element(
                    By.CLASS_NAME, 'releasetime'
                ).text,
            }

            # 3. Save to MongoDB. The name is the upsert key, so an entry
            # without one is skipped rather than stored under an empty key.
            if not data_dict['name']:
                logging.warning(
                    'skipping entry without a title on %s', page_url
                )
                continue
            collection.update_one(
                {'name': data_dict['name']},
                {'$set': data_dict},
                upsert=True,
            )
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    browser.quit()
# 4. Inspect the stored data: print every document, then the document count.
for result in collection.find():
    print(result)

# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) with an empty filter counts every document in the
# collection.
print(collection.count_documents({}))