Crawling NetEase News Comments with Selenium + PyQuery + MongoDB

Observing the NetEase News comment page shows that the URL does not change when paging through comments, i.e. the comments are loaded dynamically by JavaScript.
We therefore use Selenium to drive a real browser, scrape the rendered comments, and save them to a MongoDB database.
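The script below assumes the selenium, pyquery, and pymongo packages are installed, that a ChromeDriver matching the local Chrome version is on the PATH, and that a MongoDB server is listening on localhost.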

'''
NetEase News comment threads actually contain fewer pages than the page count shown on the site.
'''
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from pyquery import PyQuery as pq
import pymongo

# MongoDB connection settings
MONGO_URL = 'localhost'
MONGO_DB = 'wangyiNews'
MONGO_COLLECTION = 'comments_pinduoduo'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Create the browser driver and an explicit-wait helper
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

# URL of the NetEase News comment page for the article
# "Inside Pinduoduo, the most authentic China lies folded away"
url = 'http://comment.tie.163.com/DODJBOE900018M4D.html'


def search():
    print('Fetching comment page')
    try:
        # Wait until the comment-section footer is present, i.e. the page has fully rendered
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.wrapper .main-bg.clearfix #tie-main .tie-foot .post-tips'))
        )
        html = driver.page_source  # rendered page source
        return html
    except TimeoutException:
        # The page was slow to render; wait again and retry
        return search()

def parse_one_page(html):
    doc = pq(html)
    # Each .trunk element holds one comment
    items = doc('.tie-new .list-bdy .trunk.clearfix').items()
    for item in items:
        comments = {
            'name': item.find('.rgt-col .tie-author.clearfix .author-info .from').text(),
            'ip': item.find('.rgt-col .tie-author.clearfix .author-info .ip').text(),
            # Drop the two-character label at the start of the timestamp and any embedded newlines
            'date': item.find('.rgt-col .tie-author.clearfix .post-time').text()[2:].replace('\n', ''),
            'comment': item.find('.rgt-col .tie-bdy .tie-cnt').text(),
            # Keep only the counts, stripping the "顶" (upvote) and "踩" (downvote) labels
            'support': item.find('.rgt-col .tie-operation.clearfix .rgt .support').text().replace('\n', '').replace('顶', ''),
            'digg': item.find('.rgt-col .tie-operation.clearfix .rgt .digg').text().replace('\n', '').replace('踩', '')
        }
        print(comments)
        save_to_mongo(comments)
    next_page()
    time.sleep(5)

def next_page():
    # Advance to the next page of comments
    try:
        next_btn = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.wrapper .main-bg.clearfix #tie-main .tie-new .list-foot.clearfix .page-bar .m-page .next.z-enable')))
        next_btn.click()
    except TimeoutException:
        # No enabled "next" button, so this is the last page
        return None

def save_to_mongo(comments):
    try:
        if db[MONGO_COLLECTION].insert_one(comments):
            print('Saved to MongoDB')
    except Exception:
        print('Failed to save to MongoDB')

def main():
    driver.get(url)
    time.sleep(5)
    try:
        for i in range(68):  # adjust to the actual number of comment pages observed
            print(i)
            html = search()
            parse_one_page(html)
        # On the last page, NetEase only renders the comments after scrolling to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    finally:
        time.sleep(5)
        driver.quit()  # quit() ends the whole browser session, not just the current window


if __name__ == '__main__':
    main()
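
After a run, it is worth spot-checking what actually landed in the database. Here is a minimal sketch, reusing the connection settings from the script above (count_documents requires pymongo 3.7 or newer):

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['wangyiNews']['comments_pinduoduo']

print(collection.count_documents({}))    # total number of stored comments
for doc in collection.find().limit(3):   # peek at a few records
    print(doc['name'], doc['comment'])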

Reposted from blog.csdn.net/qq_17249717/article/details/81433351