版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/a19990412/article/details/83415900
简述
爬取所有的中大微博的信息。包括文本文件和视频文件，抑或是图片文件。
代码实现
from gevent import monkey
import gevent
monkey.patch_all(select=False)
import selenium
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import shutil
import requests
def scroll():
    """Return True once the pager bar (div.W_pages) is present in the DOM.

    Weibo lazy-loads the feed; the pagination controls only appear after
    the page has been scrolled to the bottom, so this doubles as a
    "page fully loaded" check.

    Uses the module-global ``browser`` WebDriver instance.
    """
    try:
        browser.find_element_by_xpath('//div[@class="W_pages"]')
    except Exception:
        # Narrowed from a bare ``except:`` so that SystemExit and
        # KeyboardInterrupt are no longer swallowed here.
        return False
    return True
def checkfile(filepath):
    """Reset *filepath* to a fresh, empty directory.

    Removes whatever currently occupies the path — a directory tree or a
    stray plain file — and then creates an empty directory there.
    """
    if os.path.isdir(filepath):
        shutil.rmtree(filepath)
    elif os.path.exists(filepath):
        # A plain file at the target path would make os.mkdir() fail
        # (and shutil.rmtree() would raise NotADirectoryError on it).
        os.remove(filepath)
    os.mkdir(filepath)
def writeText(text, filepath):
    """Save *text* as UTF-8 into ``text.txt`` inside directory *filepath*.

    Characters that cannot be encoded are silently dropped
    (``errors='ignore'``).
    """
    target = os.path.join(filepath, 'text.txt')
    with open(target, 'w', encoding='utf-8', errors='ignore') as out:
        out.write(text)
def writeMedia(mediaurl, filepath, name, vedio=False):
    """Download *mediaurl* and save it under *filepath* as *name* + extension.

    For videos (``vedio=True``) the extension is forced to ``.mp4``;
    otherwise it is derived from the URL itself (e.g. ``.jpg``).

    NOTE: the misspelled parameter name ``vedio`` is kept for backward
    compatibility with existing callers.
    """
    if vedio:
        form = '.mp4'
    else:
        form = os.path.splitext(mediaurl)[-1]
    # A timeout keeps one stalled download from hanging its greenlet forever.
    response = requests.get(mediaurl, timeout=60)
    with open(os.path.join(filepath, name + form), 'wb') as f:
        f.write(response.content)
if __name__ == '__main__':
    # Launch Chrome through Selenium with an explicit binary location.
    options = webdriver.ChromeOptions()
    options.binary_location = r"D:\Software\Chrome\Application\chrome.exe"
    browser = webdriver.Chrome(chrome_options=options)
    url = 'https://weibo.com/u/1892723783?is_all=1'
    browser.get(url)
    # Long pause — presumably to allow a manual Weibo login before
    # scraping starts. TODO confirm.
    time.sleep(40)
    WB_innerwrap = browser.find_element_by_css_selector('div[class="WB_innerwrap"]')
    # Total number of posts, read from the last cell of the profile stats bar.
    td_last_number = int(WB_innerwrap.find_elements_by_tag_name('td')[-1].find_element_by_class_name("W_f16").text)
    print("微博总数:", td_last_number)
    file_path = './weibo'
    checkfile(file_path)
    medialist = []  # pending gevent download greenlets, flushed in batches
    MAXLEN = 50  # flush downloads once this many greenlets are queued
    # NOTE(review): 221 looks like a hard-coded page count for this
    # specific account — confirm before reuse.
    for TIMES in range(221):
        # Keep scrolling until the pager appears, i.e. the lazy-loaded
        # feed for this page has fully rendered.
        while not scroll():
            browser.execute_script("window.scrollBy(0,10000)")
            time.sleep(5)
        WB_feed = browser.find_element_by_css_selector('div[module-type="feed"]')
        feed_content = WB_feed.find_elements_by_xpath('//div[@node-type="feed_content"]')
        for feed_list_item in feed_content:
            print("#### BEGIN", td_last_number)
            WB_detail = feed_list_item.find_element_by_class_name('WB_detail')
            WB_text = WB_detail.find_element_by_css_selector('div[node-type="feed_list_content"]').text
            # One folder per post, numbered from the total count downwards.
            feed_list_item_file_path = os.path.join(file_path, str(td_last_number))
            checkfile(feed_list_item_file_path)
            writeText(WB_text, feed_list_item_file_path)
            td_last_number -= 1
            try:
                # Posts without attached media have no 'media_box' and
                # raise here; the bare except skips them.
                WB_media_box = WB_detail.find_element_by_class_name('media_box')
                imgs = WB_media_box.find_elements_by_tag_name('img')
                for i, img in enumerate(imgs):
                    # print('image ', i, img.get_attribute('src'))
                    medialist.append(gevent.spawn(writeMedia, img.get_attribute('src'), feed_list_item_file_path,
                                                  'image ' + str(i)))
                videos = WB_media_box.find_elements_by_tag_name('video')
                for video in videos:
                    # print('video', video.get_attribute('src'))
                    medialist.append(gevent.spawn(writeMedia, video.get_attribute('src'), feed_list_item_file_path,
                                                  'veido', True))
            except:
                pass
            if len(medialist) >= MAXLEN:
                # Wait for the queued downloads to finish in one batch.
                gevent.joinall(medialist)
                medialist = []
        # Advance to the next page: the last <a> in the pager is "next page".
        W_pages = browser.find_element_by_xpath('//div[@class="W_pages"]')
        W_pages.find_elements_by_tag_name('a')[-1].click()
        if len(medialist):
            # Drain any leftover downloads before the next page loads.
            gevent.joinall(medialist)
            medialist = []
        else:
            # Nothing to wait on — give the next page time to load instead.
            time.sleep(5)