Using Python for AI video speech synthesis with accurate lip sync

Implementing such a program requires several deep learning techniques. The basic approach is as follows:

1. Collect synchronized video and audio data for training; the two streams should ideally be aligned frame by frame.

2. Use a deep learning model to detect facial keypoints in each video frame and extract the mouth landmark coordinates.

3. Use speech recognition to convert the audio to text (a minimal sketch of steps 3 and 4 follows this list).

4. Convert the text into a pronunciation (phoneme) sequence and generate the corresponding audio with a speech synthesizer.

5. Use a deep learning model to map the pronunciation sequence to a sequence of mouth landmark coordinates.

6. Composite the mouth landmark sequence back onto the original video.
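
For steps 3 and 4, the following is a minimal sketch using two commonly used libraries, SpeechRecognition for transcription and pyttsx3 for offline speech synthesis; the library choices and file names are assumptions for illustration, and any other ASR/TTS engine could be substituted.

import speech_recognition as sr
import pyttsx3

# Step 3: transcribe an existing audio file (placeholder path)
recognizer = sr.Recognizer()
with sr.AudioFile('input_audio.wav') as source:
    audio_data = recognizer.record(source)
text = recognizer.recognize_google(audio_data)  # uses Google's free web API, needs network access

# Step 4: synthesize new speech for the recognized (or edited) text
engine = pyttsx3.init()
engine.save_to_file(text, 'synth_audio.wav')
engine.runAndWait()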

Here is a simple Python code sample:

import cv2
import dlib
import numpy as np
import librosa
import argparse
import os

# Load dlib's face detector and 68-point facial landmark predictor
predictor = dlib.shape_predictor('shape_predictor_68_face_landmarks.dat')
detector = dlib.get_frontal_face_detector()

# Video handling class
class Video(object):
    def __init__(self, path):
        self.path = path
        self.cap = cv2.VideoCapture(path)
        self.fps = self.cap.get(cv2.CAP_PROP_FPS)
        self.width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))

    def __del__(self):
        if self.cap.isOpened():
            self.cap.release()

    def get_frame(self, idx):
        if idx >= self.total_frames:
            return None
        self.cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = self.cap.read()
        return frame

# Audio handling class
class Audio(object):
    def __init__(self, path):
        self.path = path
        self.y, self.sr = librosa.load(path, sr=22050)

    def get_audio(self, start, end):
        return self.y[int(start*self.sr):int(end*self.sr)]

# Synthesis pipeline
class Synth(object):
    def __init__(self):
        self.frame_size = 256
        self.frame_shift = 128
        self.window = np.hanning(self.frame_size)
        self.n_fft = self.frame_size
        self.sr = 22050  # must match the sample rate used when loading the audio

    def get_spectrogram(self, audio):
        stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.frame_shift, win_length=self.frame_size, window=self.window, center=False)
        spectrogram = np.abs(stft)**2
        return spectrogram

    def get_mel_basis(self, n_mels, fmin, fmax):
        return librosa.filters.mel(sr=self.sr, n_fft=self.n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)

    def get_mel_spectrum(self, spectrogram, mel_basis):
        mel_spectrum = np.dot(mel_basis, spectrogram)
        return mel_spectrum

    def get_audio(self, mel_spectrum, mel_basis):
        # Invert the mel projection, then estimate phase with Griffin-Lim
        inv_mel_basis = np.linalg.pinv(mel_basis)
        magnitude = np.sqrt(np.maximum(1e-10, np.dot(inv_mel_basis, mel_spectrum)))
        return librosa.griffinlim(magnitude, hop_length=self.frame_shift, win_length=self.frame_size, window=self.window)

    def synthesize(self, text):
        # Synthesize speech with a TTS engine
        # (placeholder: `TTS` is not defined in this sample)
        wav_encoder = TTS.synthesize(text)
        audio = np.array(wav_encoder.output.audio).astype(np.float32)
        audio /= np.max(np.abs(audio))
        mel_basis = self.get_mel_basis(...)  # fill in n_mels, fmin, fmax
        spectrogram = self.get_spectrogram(audio)
        mel_spectrum = self.get_mel_spectrum(spectrogram, mel_basis)

        # Map the audio features to mouth landmark coordinates
        # (placeholder: `model` must be a trained audio-to-landmark model)
        predicted_landmarks = model(mel_spectrum)

        # Combine the video and the audio
        video = Video('./path/to/video')
        audio_track = Audio('./path/to/audio')
        for i in range(video.total_frames):
            frame = video.get_frame(i)
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = detector(gray, 0)
            if len(faces) == 1:
                landmarks = predictor(gray, faces[0])
                landmarks = np.array([[p.x, p.y] for p in landmarks.parts()])
                mouth_points = landmarks[48:68]  # dlib points 48-67 outline the mouth
                # Here the detected mouth points would be warped to match the
                # corresponding frame of predicted_landmarks before re-rendering

            # Preview the frame
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    def save(self, output_path):
        # Placeholder: a cv2.VideoWriter opened during synthesize() would be
        # released here, and the audio track muxed in (e.g. with ffmpeg)
        cv2.destroyAllWindows()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--text', type=str, help='text to speak')
    parser.add_argument('--output', type=str, help='output path')
    args = parser.parse_args()
    
    synth = Synth()
    synth.synthesize(args.text)
    synth.save(args.output)
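
Note that save() above is only a stub and never actually writes to output_path. One possible way to finish it, sketched below under the assumption that the processed frames are collected into a list during the synthesis loop, is to write them with cv2.VideoWriter and then mux the synthesized audio in with ffmpeg via subprocess; the helper name save_video and the temporary file name are hypothetical.

import subprocess
import cv2

def save_video(frames, fps, audio_path, output_path):
    # Write the processed frames to a temporary, silent video file
    h, w = frames[0].shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter('video_only.mp4', fourcc, fps, (w, h))
    for frame in frames:
        writer.write(frame)
    writer.release()

    # Mux in the synthesized audio track (requires ffmpeg on the PATH)
    subprocess.run([
        'ffmpeg', '-y', '-i', 'video_only.mp4', '-i', audio_path,
        '-c:v', 'copy', '-c:a', 'aac', '-shortest', output_path
    ], check=True)

With a helper like this wired in, the sample could be run as, for example, python synth_demo.py --text "hello" --output output.mp4 (the script file name is assumed here).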

Some details to note:

1. A model must be trained to map the pronunciation sequence (or audio features) to mouth landmark coordinates (a minimal sketch follows these notes).

2. Keeping the mouth shapes synchronized with the audio can be difficult; poor lighting, changes of camera angle, and timing drift can all introduce deviations.

3. This code is only a starting point; the individual parts need to be adjusted and extended to match the actual situation and requirements.
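
As a rough illustration of note 1, here is a minimal PyTorch sketch of the kind of model that could map mel-spectrogram frames to the 20 dlib mouth landmarks (40 coordinates) per frame; the architecture, dimensions, and dummy tensors are illustrative assumptions rather than the original author's design.

import torch
import torch.nn as nn

class AudioToMouth(nn.Module):
    # Maps a sequence of mel-spectrogram frames to 20 (x, y) mouth landmarks per frame
    def __init__(self, n_mels=80, hidden=256, n_landmarks=20):
        super().__init__()
        self.lstm = nn.LSTM(n_mels, hidden, num_layers=2, batch_first=True)
        self.head = nn.Linear(hidden, n_landmarks * 2)

    def forward(self, mel):            # mel: (batch, time, n_mels)
        out, _ = self.lstm(mel)
        return self.head(out)          # (batch, time, 40)

model = AudioToMouth()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

# One illustrative training step on dummy tensors shaped like real (audio, landmark) pairs
mel_batch = torch.randn(8, 100, 80)        # audio features
landmark_batch = torch.randn(8, 100, 40)   # ground-truth mouth coordinates
pred = model(mel_batch)
loss = criterion(pred, landmark_batch)
optimizer.zero_grad()
loss.backward()
optimizer.step()

In practice the dummy tensors would be replaced with aligned (audio feature, landmark) pairs extracted from the synchronized training data collected in step 1.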
