Andrew Ng Deep Learning Course 5, Week 3: Trigger Word Detection - v1

# -*- coding:utf-8 -*- 
#Author: shenying
#Date: 18-7-19 上午11:39

import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'


import numpy as np
import matplotlib.pyplot as plt
from pydub import AudioSegment
import random
import sys
import glob
import IPython
from IPython.display import Image
from td_utils import *

from keras.callbacks import ModelCheckpoint
from keras.models import Model,load_model,Sequential
from keras.layers import Dense,Activation,Dropout,Input,Masking,TimeDistributed,LSTM,Conv1D
from keras.layers import GRU,Bidirectional,BatchNormalization,Reshape
from keras.optimizers import Adam


# IPython.display.Audio("./raw_data/activates/1.wav")
# IPython.display.Audio('/home/shenying/dl/code/5/raw_data/negatives/4.wav')
# IPython.display.Audio('/home/shenying/dl/code/5/raw_data/backgrounds/1.wav')
# IPython.display.Audio("audio_examples/example_train.wav")
# x = graph_spectrogram("audio_examples/example_train.wav")
# plt.show()
# _, data = wavfile.read("audio_examples/example_train.wav")
# print("Time steps in audio recording before spectrogram", data[:,0].shape)
# print("Time steps in input after spectrogram", x.shape)


# print("background len: " + str(len(backgrounds[0])))
# print("activate[0] len: " + str(len(activates[0])))
# print("activate[1] len: " + str(len(activates[1])))
def get_random_time_segment(segment_ms):
    segment_start=np.random.randint(low=0,high=10000-segment_ms)
    segment_end=segment_start+segment_ms-1
    return (segment_start,segment_end)
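# Checks whether (segment_start, segment_end) overlaps any previously placed segment.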
def is_overlapping(segment_time,previous_segments):
    segment_start,segment_end=segment_time
    overlap=False
    for previous_start,previous_end in previous_segments:
        if segment_start<=previous_end and segment_end>=previous_start:
            overlap=True
    return overlap
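# Overlays audio_clip onto the background at a random position that does not
# overlap any previously inserted segment, and records the chosen segment time.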
def insert_audio_clip(background,audio_clip,previous_segments):
    segment_ms=len(audio_clip)
    segment_time=get_random_time_segment(segment_ms)
    while is_overlapping(segment_time,previous_segments):
        segment_time=get_random_time_segment(segment_ms)
    previous_segments.append(segment_time)
    new_background=background.overlay(audio_clip,position=segment_time[0])
    return new_background,segment_time
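# Sets the 50 label steps following the end of an "activate" segment to 1,
# after converting the segment end from ms to the Ty-step output scale.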
def insert_ones(y,segment_end_ms):
    segment_end_y=int(segment_end_ms*Ty/10000)
    for i in range(segment_end_y+1,segment_end_y+51):
        if i<Ty:
            y[0,i]=1
    return y
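# Synthesizes one training example: overlays random "activate" and negative clips
# onto a quieter background, builds the label vector y, exports train.wav,
# and returns its spectrogram x together with y.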
def create_training_example(background,activates,negatives):
    np.random.seed(18)
    background=background-20
    y=np.zeros((1,Ty))
    previous_segment=[]

    number_of_activates=np.random.randint(0,5)
    random_indices=np.random.randint(len(activates),size=number_of_activates)
    random_activates=[activates[i] for i in random_indices]

    for random_activate in random_activates:
        background,segment_time=insert_audio_clip(background,random_activate,previous_segment)
        segment_start,segment_end=segment_time
        y=insert_ones(y,segment_end)

    plt.plot(y[0,:])
    plt.show()

    number_of_negatives=np.random.randint(0,3)
    random_indices_n=np.random.randint(len(negatives),size=number_of_negatives)
    random_negatives=[negatives[i] for i in random_indices_n]

    for random_negative in random_negatives:
        background, _=insert_audio_clip(background,random_negative,previous_segment)
    background=match_target_amplitude(background,-20.0)
    file_handle=background.export('train'+'.wav',format="wav")
    print('File (train.wav) was saved in your directory.')

    x=graph_spectrogram("train.wav")
    plt.show()

    return x,y
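# Builds the trigger-word model: a 1D conv layer that shortens the time dimension,
# two GRU layers with batch normalization and dropout, and a time-distributed
# sigmoid unit giving a probability for each of the Ty output steps.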
def model(input_shape):
    X_input=Input(shape=input_shape)

    X=Conv1D(filters=256,kernel_size=15,strides=4)(X_input)
    X=BatchNormalization()(X)
    X=Activation('relu')(X)
    X=Dropout(0.8)(X)

    X=GRU(units=128,return_sequences=True)(X)
    X=Dropout(0.8)(X)
    X=BatchNormalization()(X)

    X=GRU(units=128,return_sequences=True)(X)
    X=Dropout(0.8)(X)
    X=BatchNormalization()(X)
    X=Dropout(0.8)(X)

    X=TimeDistributed(Dense(1,activation="sigmoid"))(X)
    model=Model(inputs=X_input,outputs=X)

    return model
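# Computes the spectrogram of an audio file, runs the trained model on it,
# and plots the per-step trigger-word probabilities.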
def detect_triggerword(filename):
    plt.subplot(2,1,1)
    x=graph_spectrogram(filename)
    x=x.swapaxes(0,1)
    x=np.expand_dims(x,axis=0)
    predictions=model.predict(x)

    plt.subplot(2,1,2)
    plt.plot(predictions[0,:,0])
    plt.ylabel('probability')
    plt.savefig("result_pre.png")
    plt.show()
    return predictions
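# Overlays a chime at output steps where the predicted probability exceeds the
# threshold, waiting at least 75 steps between chimes so each detection
# triggers only one chime.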
def chime_on_activate(filename,predictions,threshold):
    audio_clip=AudioSegment.from_wav(filename)
    chime=AudioSegment.from_wav(chime_file)
    Ty=predictions.shape[1]
    consecutive_timesteps=0
    for i in range(Ty):
        consecutive_timesteps+=1
        if predictions[0,i,0]>threshold and consecutive_timesteps>75:
            audio_clip=audio_clip.overlay(chime,position=((i/Ty)*audio_clip.duration_seconds)*1000)
            consecutive_timesteps=0
    audio_clip.export("chime_output_pre_model.wav",format='wav')
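# Pads/trims a recording to exactly 10 seconds, resamples it to 44100 Hz,
# and overwrites the file so it matches the format of the training audio.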
def preprocess_audio(filename):
    padding=AudioSegment.silent(duration=10000)
    segment=AudioSegment.from_wav(filename)[:10000]
    segment=padding.overlay(segment)
    segment=segment.set_frame_rate(44100)
    segment.export(filename,format='wav')

if __name__=="__main__":
    Tx=5511
    n_freq=101
    Ty=1375
    activates,negatives,backgrounds=load_raw_audio()


    # overlap1 = is_overlapping((950, 1430), [(2000, 2550), (260, 949)])
    # overlap2 = is_overlapping((2305, 2950), [(824, 1532), (1900, 2305), (3424, 3656)])
    # print("Overlap 1 = ", overlap1)
    # print("Overlap 2 = ", overlap2)

    # np.random.seed(5)
    # audio_clip, segment_time = insert_audio_clip(backgrounds[0], activates[0], [(3790, 4400)])
    # audio_clip.export("insert_test1.wav", format="wav")
    # print("Segment Time: ", segment_time)
    # IPython.display.Audio("insert_test1.wav")
    # IPython.display.Audio("audio_examples/insert_reference.wav")

    # arr1 = insert_ones(np.zeros((1, Ty)), 9700)
    # plt.plot(insert_ones(arr1, 4251)[0, :])
    # plt.show()
    # print("sanity checks:", arr1[0][1333], arr1[0][634], arr1[0][635])

    x, y = create_training_example(backgrounds[0], activates, negatives)
    # IPython.display.Audio("train.wav")
    # IPython.display.Audio("audio_examples/train_reference.wav")
    plt.plot(y[0,:])
    plt.show()

    # X = np.load("./XY_train/X.npy")
    # Y = np.load("./XY_train/Y.npy")
    #
    # X_dev = np.load("./XY_dev/X_dev.npy")
    # Y_dev = np.load("./XY_dev/Y_dev.npy")
    #
    # model = model(input_shape=(Tx, n_freq))
    # model.summary()

    # print("load model:")
    # model=load_model('/home/shenying/dl/code/5/model3/tr_model.h5')
    # model.summary()
    # opt=Adam(lr=0.0001,beta_1=0.9,beta_2=0.999,decay=0.01)
    # model.compile(loss='binary_crossentropy',optimizer=opt,metrics=['accuracy'])
    # model.fit(X,Y,batch_size=5,epochs=10)

    # loss, acc = model.evaluate(X_dev, Y_dev)
    # print("Dev set accuracy = ", acc)
    chime_file="audio_examples/chime.wav"
    # plt.figure()
    #
    # IPython.display.Audio('/home/shenying/dl/code/5/raw_data/dev/1.wav')
    # IPython.display.Audio('/home/shenying/dl/code/5/raw_data/dev/2.wav')

    # filename = "./raw_data/dev/1.wav"
    # prediction = detect_triggerword(filename)
    # chime_on_activate(filename, prediction, 0.5)
    # IPython.display.Audio("./chime_output.wav")

    # filename = "./raw_data/dev/2.wav"
    # prediction = detect_triggerword(filename)
    # chime_on_activate(filename, prediction, 0.5)
    # IPython.display.Audio("./chime_output.wav")
    # my_filename="audio_examples/my_audio.wav"
    # preprocess_audio(my_filename)
    # prediction=detect_triggerword(my_filename)
    # chime_on_activate(my_filename,prediction,0.5)




Reposted from blog.csdn.net/qq_31119155/article/details/81162998