Crawling all of a Weibo user's posts: text, images, and videos

Copyright notice: this is the author's original article and may not be reproduced without permission. https://blog.csdn.net/a19990412/article/details/83415900

Overview

Crawl every post of a single Weibo user (here, the 中大 account), including the text of each post as well as any attached image and video files. The crawler drives Chrome through Selenium, pages through the profile feed, and downloads media files concurrently with gevent.

Code

from gevent import monkey
import gevent

monkey.patch_all(select=False)
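# NOTE: patch_all() runs before selenium and requests are imported so that
# requests' blocking downloads become cooperative and can run concurrently
# in greenlets; select=False presumably leaves the select module unpatched
# to avoid interfering with Selenium's connection to the browser driver.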
import selenium
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import shutil
import requests


def scroll():
    # Returns True once the pagination bar is present, i.e. the current page
    # has finished lazy-loading and no more scrolling is needed.
    try:
        browser.find_element_by_xpath('//div[@class="W_pages"]')
    except Exception:
        return False
    return True


def checkfile(filepath):
    # (Re)create the directory: any existing contents are removed first,
    # so re-running the script starts from a clean folder.
    if os.path.exists(filepath):
        shutil.rmtree(filepath)
    os.mkdir(filepath)


def writeText(text, filepath):
    with open(os.path.join(filepath, 'text.txt'), 'w', encoding='utf-8', errors='ignore') as f:
        f.write(text)


def writeMedia(mediaurl, filepath, name, video=False):
    # Images keep the extension from their URL; videos are saved as .mp4.
    if not video:
        form = os.path.splitext(mediaurl)[-1]
    else:
        form = '.mp4'
    with open(os.path.join(filepath, name + form), 'wb') as f:
        f.write(requests.get(mediaurl).content)


if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.binary_location = r"D:\Software\Chrome\Application\chrome.exe"
    browser = webdriver.Chrome(chrome_options=options)
    url = 'https://weibo.com/u/1892723783?is_all=1'
    browser.get(url)
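    # The long pause below presumably leaves time to log in to weibo.com
    # manually (e.g. by scanning the QR code) before scraping begins.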
    time.sleep(40)
    WB_innerwrap = browser.find_element_by_css_selector('div[class="WB_innerwrap"]')
    # total number of posts (the last counter on the profile page)
    td_last_number = int(WB_innerwrap.find_elements_by_tag_name('td')[-1].find_element_by_class_name("W_f16").text)
    print("Total number of posts:", td_last_number)

    file_path = './weibo'

    checkfile(file_path)
    medialist = []
    MAXLEN = 50
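    # MAXLEN caps how many pending download greenlets pile up before a batch
    # join; 221 is presumably the number of feed pages for this particular
    # account and would need adjusting for other users.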
    for TIMES in range(221):
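        # Keep scrolling until the pagination bar appears, which means the
        # lazy-loaded feed on the current page has been fully rendered.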
        while not scroll():
            browser.execute_script("window.scrollBy(0,10000)")
            time.sleep(5)

        WB_feed = browser.find_element_by_css_selector('div[module-type="feed"]')
        # './/' keeps the search relative to WB_feed instead of the whole document
        feed_content = WB_feed.find_elements_by_xpath('.//div[@node-type="feed_content"]')
        for feed_list_item in feed_content:
            print("#### BEGIN", td_last_number)
            WB_detail = feed_list_item.find_element_by_class_name('WB_detail')
            WB_text = WB_detail.find_element_by_css_selector('div[node-type="feed_list_content"]').text

            # create a directory for this post, named by its (descending) index
            feed_list_item_file_path = os.path.join(file_path, str(td_last_number))
            checkfile(feed_list_item_file_path)

            writeText(WB_text, feed_list_item_file_path)

            td_last_number -= 1
            try:
                WB_media_box = WB_detail.find_element_by_class_name('media_box')
                imgs = WB_media_box.find_elements_by_tag_name('img')
                for i, img in enumerate(imgs):
                    # print('image ', i, img.get_attribute('src'))
                    medialist.append(gevent.spawn(writeMedia, img.get_attribute('src'), feed_list_item_file_path,
                                                  'image ' + str(i)))
                videos = WB_media_box.find_elements_by_tag_name('video')
                for video in videos:
                    # print('video', video.get_attribute('src'))
                    medialist.append(gevent.spawn(writeMedia, video.get_attribute('src'), feed_list_item_file_path,
                                                  'video', True))
            except Exception:
                # this post has no media box (no images or videos attached)
                pass
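            # Once enough download greenlets have accumulated, wait for the
            # whole batch to finish before collecting more.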
            if len(medialist) >= MAXLEN:
                gevent.joinall(medialist)
                medialist = []

        # The last link in the pager is assumed to be the "next page" button.
        W_pages = browser.find_element_by_xpath('//div[@class="W_pages"]')
        W_pages.find_elements_by_tag_name('a')[-1].click()

        if len(medialist):
            gevent.joinall(medialist)
            medialist = []
        else:
            time.sleep(5)
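
After a full run the script leaves a ./weibo directory with one sub-folder per post, numbered downwards from the total post count. Each folder holds the post's text.txt plus whatever media was downloaded; the layout below is only an illustrative sketch of what the code above produces (actual folder names depend on the account's post count):

./weibo/
    3021/
        text.txt
        image 0.jpg
        image 1.jpg
    3020/
        text.txt
        video.mp4
    ...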
