General idea:
(1) Use the VideoFileClip class in the moviepy library to read video files and convert them into audio files;
(2) Use the split_on_silence function in the pydub library to split the audio file into multiple audio segments for speech recognition;
(3) Use the Recognizer class in the SpeechRecognition library for speech recognition and write the recognition results into a text file
Notice:
Split the audio file into multiple audio segments, perform speech recognition on each segment, and finally merge all recognition results into a text file.
Brief description of operating environment:
(1)Mac OS 13.3.1
(2)pycharm 2021.1
Overall code:
from moviepy.editor import VideoFileClip
from pathlib import Path
import os
import speech_recognition as sr
from pydub import AudioSegment
import datetime
from pydub.silence import split_on_silence
from tqdm import tqdm
# Path to the folder containing the input .mp4 videos.
video_folder = './folder'

# One shared recognizer is enough; it keeps no per-file state.
r = sr.Recognizer()


def get_large_audio_transcription(path):
    """Split the WAV file at *path* on silence and transcribe each chunk.

    Returns a tuple ``(whole_text, time_lines)`` where ``whole_text`` is the
    list of recognized transcript strings and ``time_lines`` the matching
    list of chunk start times (as datetimes offset from a fixed epoch, so
    only the time-of-day part is meaningful).

    NOTE(review): timestamps are approximate — they accumulate exported
    chunk durations, which include the ``keep_silence`` padding pydub adds
    around every chunk.
    """
    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(
        sound,
        min_silence_len=500,
        silence_thresh=sound.dBFS - 14,
        keep_silence=500,
    )
    # Directory that holds the per-chunk WAV files fed to the recognizer.
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)

    whole_text = []
    time_lines = []
    # Arbitrary fixed epoch: only the .time() portion is written out later.
    start_time = datetime.datetime.fromisoformat('2022-01-01T00:00:00')

    # Process each chunk: export to disk, recognize, collect text + offset.
    for i, audio_chunk in enumerate(chunks, start=1):
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
        text = ""
        try:
            # show_all=True returns {} when nothing was recognized, or a
            # dict with an 'alternative' list of candidate transcripts.
            result = r.recognize_google(audio_listened,
                                        language="zh-CN", show_all=True)
            if result and len(result['alternative']) > 0:
                text = result['alternative'][0]['transcript']
        except sr.UnknownValueError:
            # Chunk contained no recognizable speech — skip it quietly.
            pass
        else:
            if text:
                text = f"{text.capitalize()}."
                whole_text.append(text)
                time_lines.append(start_time)
        start_time += datetime.timedelta(seconds=audio_chunk.duration_seconds)

    return whole_text, time_lines


# Walk the video folder recursively, extract each video's audio track,
# transcribe it, and write "<timestamp> <text>" lines to <name>.txt.
for video_file in tqdm(Path(video_folder).rglob('*.mp4')):
    file_name = video_file.stem
    print(f'Processing video file: {file_name}')
    audio_file = f'{file_name}.wav'
    video_clip = VideoFileClip(str(video_file))
    video_clip.audio.write_audiofile(audio_file)
    video_clip.close()  # release the file handles moviepy keeps open

    # BUG FIX: the original called get_large_audio_transcription() three
    # times per video — once for zip(*...) and twice more to compute
    # tqdm's total — re-running the whole recognition pass each time.
    # Worse, len(list(...)) of the returned 2-tuple is always 2, so the
    # progress total was wrong anyway. Call it once and reuse the result.
    texts, times = get_large_audio_transcription(audio_file)
    with open(f'{file_name}.txt', 'w', encoding='utf-8') as f:
        for text, time in tqdm(zip(texts, times), total=len(texts)):
            f.write(f'{time.time()} {text}\n')

print('All done!')
operation result: