# -*- coding:utf-8 -*-
#Author: shenying
#Date: 18-7-19 11:39 AM
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import numpy as np
from pydub import AudioSegment
import random
import sys
import os
import glob
import IPython
from IPython.display import Image
from td_utils import *
from keras.callbacks import ModelCheckpoint
from keras.models import Model,load_model,Sequential
from keras.layers import Dense,Activation,Dropout,Input,Masking,TimeDistributed,LSTM,Conv1D
from keras.layers import GRU,Bidirectional,BatchNormalization,Reshape
from keras.optimizers import Adam
# IPython.display.Audio("./raw_data/activates/1.wav")
# IPython.display.Audio('/home/shenying/dl/code/5/raw_data/negatives/4.wav')
# IPython.display.Audio('/home/shenying/dl/code/5/raw_data/backgrounds/1.wav')
# IPython.display.Audio("audio_examples/example_train.wav")
# x = graph_spectrogram("audio_examples/example_train.wav")
# plt.show()
# _, data = wavfile.read("audio_examples/example_train.wav")
# print("Time steps in audio recording before spectrogram", data[:,0].shape)
# print("Time steps in input after spectrogram", x.shape)
# print("background len: " + str(len(backgrounds[0])))
# print("activate[0] len: " + str(len(activates[0])))
# print("activate[1] len: " + str(len(activates[1])))
def get_random_time_segment(segment_ms, background_ms=10000):
    """Pick a random time window of ``segment_ms`` ms inside the background.

    Args:
        segment_ms: Duration of the clip to place, in milliseconds.
        background_ms: Duration of the background the clip must fit in,
            in milliseconds. Defaults to 10000, the value the original
            code hard-coded.

    Returns:
        A ``(segment_start, segment_end)`` tuple in milliseconds, where
        ``segment_end == segment_start + segment_ms - 1``.

    Raises:
        ValueError: If ``segment_ms`` is not smaller than ``background_ms``
            (``np.random.randint`` would have no valid range to draw from).
    """
    if segment_ms >= background_ms:
        raise ValueError(
            "segment of %d ms does not fit in a %d ms background"
            % (segment_ms, background_ms)
        )
    # randint's upper bound is exclusive, so start is in
    # [0, background_ms - segment_ms - 1].
    segment_start = np.random.randint(low=0, high=background_ms - segment_ms)
    segment_end = segment_start + segment_ms - 1
    return (segment_start, segment_end)
def is_overlapping(segment_time, previous_segments):
    """Tell whether a time segment overlaps any previously placed segment.

    Args:
        segment_time: ``(segment_start, segment_end)`` of the candidate.
        previous_segments: Iterable of ``(start, end)`` tuples already in use.

    Returns:
        True if the candidate shares at least one instant with any previous
        segment (end points are inclusive), otherwise False.
    """
    segment_start, segment_end = segment_time
    # Two inclusive intervals intersect when each starts no later than the
    # other ends; any() stops at the first hit.
    return any(
        segment_start <= previous_end and segment_end >= previous_start
        for previous_start, previous_end in previous_segments
    )
def insert_audio_clip(background, audio_clip, previous_segments,
                      max_attempts=10000):
    """Overlay ``audio_clip`` on ``background`` at a random free position.

    Args:
        background: Audio to overlay onto; must provide
            ``overlay(clip, position=...)`` (presumably a pydub
            ``AudioSegment`` -- TODO confirm).
        audio_clip: Clip to insert; ``len(audio_clip)`` is used as its
            duration in milliseconds.
        previous_segments: List of ``(start, end)`` segments already used.
            The chosen segment is appended to this list in place.
        max_attempts: Upper bound on random placements tried before giving
            up. The original code retried forever, which hangs when no free
            slot is left.

    Returns:
        ``(new_background, segment_time)``: the overlaid audio and the
        ``(start, end)`` tuple where the clip was placed.

    Raises:
        RuntimeError: If no non-overlapping position was found within
            ``max_attempts`` tries.
    """
    segment_ms = len(audio_clip)
    segment_time = get_random_time_segment(segment_ms)
    attempts = 1
    while is_overlapping(segment_time, previous_segments):
        if attempts >= max_attempts:
            raise RuntimeError(
                "could not find a free %d ms slot after %d attempts"
                % (segment_ms, max_attempts)
            )
        segment_time = get_random_time_segment(segment_ms)
        attempts += 1
    # Record the slot so later insertions avoid it.
    previous_segments.append(segment_time)
    new_background = background.overlay(audio_clip, position=segment_time[0])
    return new_background, segment_time
def insert_ones(y, segment_end_ms, background_ms=10000):
    """Set the 50 label steps following a segment's end to 1.

    Args:
        y: Label array of shape ``(1, Ty)``; modified in place.
        segment_end_ms: End of the inserted segment, in milliseconds.
        background_ms: Duration of the background that ``y`` spans, in
            milliseconds. Defaults to 10000, the value the original code
            hard-coded.

    Returns:
        The same array ``y``, with indices ``segment_end_y + 1`` through
        ``segment_end_y + 50`` set to 1, clipped at the end of the array.
    """
    # Use the array's own length rather than a module-level ``Ty`` so the
    # function also works when imported (``Ty`` is only set under __main__).
    n_steps = y.shape[1]
    segment_end_y = int(segment_end_ms * n_steps / background_ms)
    # Slice assignment silently clips at the array end, matching the
    # original ``if i < Ty`` guard.
    y[0, segment_end_y + 1:segment_end_y + 51] = 1
    return y
def create_training_example(background, activates, negatives, seed=18):
    """Build one training example by mixing clips into a background.

    Inserts 0-4 random "activate" clips and 0-2 random "negative" clips at
    non-overlapping positions, labels the steps after each activate clip,
    writes the mix to ``train.wav`` and computes its spectrogram.

    Args:
        background: Background audio (presumably a 10 s pydub
            ``AudioSegment`` -- TODO confirm against ``load_raw_audio``).
        activates: Sequence of clips of the trigger word.
        negatives: Sequence of clips of other words.
        seed: Value passed to ``np.random.seed`` before sampling. Defaults
            to 18, which the original code always used; pass ``None`` to
            leave the random state untouched so repeated calls differ.

    Returns:
        ``(x, y)``: ``x`` is whatever ``graph_spectrogram("train.wav")``
        returns; ``y`` is the ``(1, Ty)`` label array.
    """
    if seed is not None:
        np.random.seed(seed)
    # With pydub, subtracting a number lowers the gain by that many dB
    # (assumes ``background`` is an AudioSegment).
    background = background - 20
    y = np.zeros((1, Ty))
    previous_segments = []

    number_of_activates = np.random.randint(0, 5)
    random_indices = np.random.randint(len(activates), size=number_of_activates)
    random_activates = [activates[i] for i in random_indices]
    for random_activate in random_activates:
        background, segment_time = insert_audio_clip(
            background, random_activate, previous_segments)
        segment_end = segment_time[1]
        y = insert_ones(y, segment_end)
        # NOTE(review): this plots (and blocks on) the labels after every
        # inserted clip; kept from the original, looks like debug output.
        plt.plot(y[0, :])
        plt.show()

    number_of_negatives = np.random.randint(0, 3)
    random_indices_n = np.random.randint(len(negatives), size=number_of_negatives)
    random_negatives = [negatives[i] for i in random_indices_n]
    for random_negative in random_negatives:
        # Negatives occupy a slot but produce no positive labels.
        background, _ = insert_audio_clip(
            background, random_negative, previous_segments)

    background = match_target_amplitude(background, -20.0)
    # export() returns the open output file; close it so the handle is not
    # leaked before the file is read back.
    file_handle = background.export("train.wav", format="wav")
    file_handle.close()
    print('File (train.wav) was saved in your directory.')
    x = graph_spectrogram("train.wav")
    plt.show()
    return x, y
def model(input_shape):
    """Build the (uncompiled) Keras trigger-word detection network.

    Layers, in order: Conv1D(256 filters, kernel 15, stride 4) ->
    BatchNorm -> ReLU -> Dropout(0.8) -> GRU(128) -> Dropout(0.8) ->
    BatchNorm -> GRU(128) -> Dropout(0.8) -> BatchNorm -> Dropout(0.8) ->
    time-distributed Dense(1, sigmoid).

    Args:
        input_shape: Shape of one input example, passed to ``Input``
            (the script uses ``(Tx, n_freq)``).

    Returns:
        A ``keras.models.Model`` mapping the input to one sigmoid output
        per remaining time step.
    """
    X_input = Input(shape=input_shape)

    # Convolutional front end; the stride of 4 shortens the time axis.
    X = Conv1D(filters=256, kernel_size=15, strides=4)(X_input)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)
    X = Dropout(0.8)(X)

    # First recurrent layer; return_sequences keeps one output per step.
    X = GRU(units=128, return_sequences=True)(X)
    X = Dropout(0.8)(X)
    X = BatchNormalization()(X)

    # Second recurrent layer.
    X = GRU(units=128, return_sequences=True)(X)
    X = Dropout(0.8)(X)
    X = BatchNormalization()(X)
    X = Dropout(0.8)(X)

    # Per-time-step sigmoid output.
    X = TimeDistributed(Dense(1, activation="sigmoid"))(X)

    # Named ``net`` so the local does not shadow this function's own name.
    net = Model(inputs=X_input, outputs=X)
    return net
def detect_triggerword(filename, trained_model=None):
    """Run the model on a wav file and plot spectrogram plus predictions.

    Draws the spectrogram in the top subplot and the per-step prediction in
    the bottom one, saves the figure to ``result_pre.png`` and shows it.

    Args:
        filename: Path of the wav file to analyse.
        trained_model: Object with a ``predict`` method to use. When
            ``None`` (the default), the module-level name ``model`` is used,
            as in the original code; that only works once ``model`` has
            been rebound to a built/loaded network.

    Returns:
        The array returned by ``predict``; it is indexed as
        ``predictions[0, :, 0]`` for plotting.
    """
    net = model if trained_model is None else trained_model

    plt.subplot(2, 1, 1)
    x = graph_spectrogram(filename)
    # Swap the two spectrogram axes and add a leading batch axis
    # (presumably (n_freq, Tx) -> (1, Tx, n_freq) -- TODO confirm).
    x = x.swapaxes(0, 1)
    x = np.expand_dims(x, axis=0)
    predictions = net.predict(x)

    plt.subplot(2, 1, 2)
    plt.plot(predictions[0, :, 0])
    plt.ylabel('probability')
    plt.savefig("result_pre.png")
    plt.show()
    return predictions
def chime_on_activate(filename, predictions, threshold):
    """Overlay a chime on the audio wherever the trigger word is detected.

    Writes the result to ``chime_output_pre_model.wav``. The chime sound is
    loaded from the module-level ``chime_file`` path.

    Args:
        filename: Path of the wav file the predictions were computed for.
        predictions: Array indexed as ``predictions[0, i, 0]``, one value
            per output time step.
        threshold: A step counts as a detection when its prediction is
            strictly greater than this value.
    """
    audio_clip = AudioSegment.from_wav(filename)
    chime = AudioSegment.from_wav(chime_file)
    # Local name chosen so it does not shadow the module-level ``Ty``.
    n_steps = predictions.shape[1]
    # Steps elapsed since the last chime (or since the start).
    consecutive_timesteps = 0
    for i in range(n_steps):
        consecutive_timesteps += 1
        # Require more than 75 steps since the last chime so one detection
        # does not trigger a burst of chimes.
        if predictions[0, i, 0] > threshold and consecutive_timesteps > 75:
            # Map the output step index to a position in milliseconds.
            audio_clip = audio_clip.overlay(
                chime,
                position=((i / n_steps) * audio_clip.duration_seconds) * 1000)
            consecutive_timesteps = 0
    audio_clip.export("chime_output_pre_model.wav", format='wav')
def preprocess_audio(filename):
    """Normalise a wav file to 10 s at 44100 Hz, overwriting it in place.

    The first 10000 ms of the file are overlaid on 10000 ms of silence, so
    shorter recordings are padded and longer ones are cut; the frame rate
    is then set to 44100 and the result is exported back to ``filename``.

    Args:
        filename: Path of the wav file to read and overwrite.
    """
    padding = AudioSegment.silent(duration=10000)
    segment = AudioSegment.from_wav(filename)[:10000]
    segment = padding.overlay(segment)
    segment = segment.set_frame_rate(44100)
    # NOTE: this replaces the original file.
    segment.export(filename, format='wav')
if __name__ == "__main__":
    # Module-level constants read by the functions above when run as a script.
    Tx = 5511      # passed as the first dimension of the model input_shape
    n_freq = 101   # passed as the second dimension of the model input_shape
    Ty = 1375      # length of the label vector y
    activates, negatives, backgrounds = load_raw_audio()
    # overlap1 = is_overlapping((950, 1430), [(2000, 2550), (260, 949)])
    # overlap2 = is_overlapping((2305, 2950), [(824, 1532), (1900, 2305), (3424, 3656)])
    # print("Overlap 1 = ", overlap1)
    # print("Overlap 2 = ", overlap2)
    # np.random.seed(5)
    # audio_clip, segment_time = insert_audio_clip(backgrounds[0], activates[0], [(3790, 4400)])
    # audio_clip.export("insert_test1.wav", format="wav")
    # print("Segment Time: ", segment_time)
    # IPython.display.Audio("insert_test1.wav")
    # IPython.display.Audio("audio_examples/insert_reference.wav")
    # arr1 = insert_ones(np.zeros((1, Ty)), 9700)
    # plt.plot(insert_ones(arr1, 4251)[0, :])
    # plt.show()
    # print("sanity checks:", arr1[0][1333], arr1[0][634], arr1[0][635])
    x, y = create_training_example(backgrounds[0], activates, negatives)
    # IPython.display.Audio("train.wav")
    # IPython.display.Audio("audio_examples/train_reference.wav")
    # plt.plot must be called; the original subscripted it with [...],
    # which raises TypeError.
    plt.plot(y[0, :])
    plt.show()
    # X = np.load("./XY_train/X.npy")
    # Y = np.load("./XY_train/Y.npy")
    #
    # X_dev = np.load("./XY_dev/X_dev.npy")
    # Y_dev = np.load("./XY_dev/Y_dev.npy")
    #
    # model = model(input_shape=(Tx, n_freq))
    # model.summary()
    # print("load model:")
    # model=load_model('/home/shenying/dl/code/5/model3/tr_model.h5')
    # model1.summary()
    # opt=Adam(lr=0.0001,beta_1=0.9,beta_2=0.999,decay=0.01)
    # model.compile(loss='binary_crossentropy',optimizer=opt,metrics=['accuracy'])
    # model.fit(X,Y,batch_size=5,epochs=10)
    # loss, acc = model.evaluate(X_dev, Y_dev)
    # print("Dev set accuracy = ", acc)
    # Path of the chime sound; read as a global by chime_on_activate.
    chime_file = "audio_examples/chime.wav"
    # plt.figure()
    #
    # IPython.display.Audio('/home/shenying/dl/code/5/raw_data/dev/1.wav')
    # IPython.display.Audio('/home/shenying/dl/code/5/raw_data/dev/2.wav')
    # filename = "./raw_data/dev/1.wav"
    # prediction = detect_triggerword(filename)
    # chime_on_activate(filename, prediction, 0.5)
    # IPython.display.Audio("./chime_output.wav")
    # filename = "./raw_data/dev/2.wav"
    # prediction = detect_triggerword(filename)
    # chime_on_activate(filename, prediction, 0.5)
    # IPython.display.Audio("./chime_output.wav")
    # my_filename="audio_examples/my_audio.wav"
    # preprocess_audio(my_filename)
    # predition=detect_triggerword(my_filename)
    # chime_on_activate(my_filename,predition,0.5)
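# Andrew Ng Deep Learning, course 5 week 3: Trigger word detection - v1
# Reposted from blog.csdn.net/qq_31119155/article/details/81162998
# (Web-page navigation residue followed here: "You may also like",
# "Today's recommendations", "Weekly ranking".)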