How to extract audio from video and convert audio to text with Python

General idea:

(1) Use the VideoFileClip class in the moviepy library to read each video file and write its audio track out as a separate audio file (a minimal sketch follows this list);

(2) Use the split_on_silence function in the pydub library to split the audio file into multiple audio segments suitable for speech recognition;

(3) Use the Recognizer class in the SpeechRecognition library to perform speech recognition on each segment and write the recognition results to a text file.
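
For step (1), a minimal sketch of the extraction on a single file, assuming a hypothetical video named input.mp4 in the current directory:

from moviepy.editor import VideoFileClip

clip = VideoFileClip('input.mp4')           # open the video file
clip.audio.write_audiofile('input.wav')     # write its audio track to a wav file
clip.close()                                # release the underlying reader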

Notice:

The audio file is split into multiple segments, speech recognition is performed on each segment, and all recognition results are finally merged into a single text file.
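
A minimal sketch of that split-recognize-merge flow, assuming a hypothetical, already-extracted file named input.wav (the overall code below does the same thing, with per-chunk timestamps added):

import speech_recognition as sr
from pydub import AudioSegment
from pydub.silence import split_on_silence

sound = AudioSegment.from_wav('input.wav')
# Treat anything 14 dB quieter than the clip's average loudness, lasting
# 500 ms or more, as silence and cut the audio there
chunks = split_on_silence(sound, min_silence_len=500, silence_thresh=sound.dBFS - 14)

r = sr.Recognizer()
results = []
for i, chunk in enumerate(chunks):
    chunk_name = f'chunk{i}.wav'
    chunk.export(chunk_name, format='wav')   # each segment becomes its own wav file
    with sr.AudioFile(chunk_name) as source:
        audio = r.record(source)             # read the whole segment into memory
    try:
        results.append(r.recognize_google(audio, language='zh-CN'))
    except sr.UnknownValueError:             # nothing recognizable in this segment
        pass

with open('transcript.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(results))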

Brief description of the operating environment:

(1) macOS 13.3.1

(2) PyCharm 2021.1
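
All of the third-party packages used below are available from PyPI under the names moviepy, pydub, SpeechRecognition and tqdm. Note that moviepy and pydub depend on ffmpeg under the hood, and recognize_google sends each audio chunk to Google's Web Speech API, so the recognition step needs an internet connection.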

Overall code:

from moviepy.editor import VideoFileClip
from pathlib import Path
import os
import speech_recognition as sr
from pydub import AudioSegment
import datetime
from pydub.silence import split_on_silence
from tqdm import tqdm

# Path to the folder that contains the video files
video_folder = './folder'

# Initialize the speech recognizer
r = sr.Recognizer()


def get_large_audio_transcription(path):
    """Split a large audio file on silence and transcribe each chunk."""
    sound = AudioSegment.from_wav(path)
    # Cut wherever the audio stays at least 14 dB below the clip's average
    # loudness for 500 ms or more, and keep 500 ms of silence around each chunk
    chunks = split_on_silence(sound, min_silence_len=500, silence_thresh=sound.dBFS - 14, keep_silence=500)
    folder_name = "audio-chunks"
    # Create a directory to store the audio chunks
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = []
    time_lines = []
    # Arbitrary base date; only the elapsed time of day ends up in the transcript
    start_time = datetime.datetime.fromisoformat('2022-01-01T00:00:00')

    # Process each audio chunk
    for i, audio_chunk in enumerate(chunks, start=1):
        # Export the chunk as its own wav file
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")

        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)

        text = ""
        try:
            result = r.recognize_google(audio_listened, language="zh-CN", show_all=True)
            if isinstance(result, dict) and result.get('alternative'):
                text = result['alternative'][0]['transcript']
        except sr.UnknownValueError:
            pass
        else:
            if text:
                text = f"{text.capitalize()}."
                whole_text.append(text)
                time_lines.append(start_time)
        # Advance the running timestamp by this chunk's duration, even when
        # nothing was recognized, so later timestamps stay aligned
        start_time += datetime.timedelta(seconds=audio_chunk.duration_seconds)
    # Return the text and start time of every chunk that was recognized
    return whole_text, time_lines


# Traverse all video files in the video folder
for video_file in tqdm(Path(video_folder).rglob('*.mp4')):
    # File name without its extension
    file_name = video_file.stem
    print(f'Processing video file: {file_name}')

    # Extract the audio track as <file_name>.wav
    audio_file = f'{file_name}.wav'
    video_clip = VideoFileClip(str(video_file))
    video_clip.audio.write_audiofile(audio_file)
    video_clip.close()

    # Run speech recognition with SpeechRecognition and write a timestamped
    # transcript to a text file
    whole_text, time_lines = get_large_audio_transcription(audio_file)
    with open(f'{file_name}.txt', 'w', encoding='utf-8') as f:
        for text, time in tqdm(zip(whole_text, time_lines), total=len(whole_text)):
            f.write(f'{time.time()} {text}\n')

print('All done!')
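
One detail worth calling out: because recognize_google is called with show_all=True, it returns the raw result from Google's Web Speech API rather than a plain string, which is why the code indexes into ['alternative'][0]['transcript']. An illustrative (not verbatim) example of what that result typically looks like:

result = {
    'alternative': [
        {'transcript': 'best candidate text', 'confidence': 0.93},
        {'transcript': 'second candidate text'},
    ],
    'final': True,
}
# When nothing is recognized at all, show_all=True yields an empty result
# instead of raising sr.UnknownValueError, hence the extra check that the
# 'alternative' list is present and non-empty.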

Operation result:

For each video the script prints a "Processing video file: ..." line, writes a <file_name>.txt transcript containing one timestamped line per recognized chunk, and finally prints "All done!".


Origin: blog.csdn.net/qq_23938507/article/details/130398279