TensorFlow case study: simple audio recognition

Preface

The following content is based on the official TensorFlow tutorial: Simple audio recognition: Recognizing keywords.


Download dataset

Download URL: http://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip

You can access the URL and download the file directly from your browser.
After the download is complete, unzip it into the project. Inside the folder you will see 8 subfolders, whose names are the 8 voice commands.
Note: we only need the mini_speech_commands folder; the others are not required.

Load dataset

# Load the training and validation datasets
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory='./data/mini_speech_commands',  # dataset path
    batch_size=64,  # batch size
    validation_split=0.2,  # the validation set is 20% of the data
    seed=0,  # seed for the random split
    # Output sequence length of each sample. The audio clips are 1 second or less at 16kHz.
    # Shorter clips are padded to exactly 1 second (and longer ones trimmed) so they can be batched easily.
    output_sequence_length=16000,
    subset='both'  # return both the training and validation sets
)
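
A quick optional check (not in the original post): printing the element spec shows that each element is a batch of waveforms with a trailing channel dimension plus a batch of integer labels, which is why that dimension is squeezed out later.

# Each element is (audio, labels); the audio still carries a trailing channel dimension
print(train_ds.element_spec)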

Get the categories

# Get the command categories
label_names = np.array(train_ds.class_names)
print("Command categories:", label_names)

The names and their order match the subfolders exactly.

Dimension compression

The documentation says that this dataset contains only mono audio, so the extra channel dimension of the input audio needs to be squeezed out.

  • Mono audio has only one channel. This means that all audio signals are mixed into one channel, without distinction between left and right channels. In mono audio, all sound is played through a single speaker. Mono audio is suitable for most audio applications, such as telephone communications, voice recording, etc.

  • Multichannel (stereo) audio has two channels, a left channel and a right channel. By sending different signals to the left and right channels, a spatial stereo effect can be created. Multichannel audio provides a richer listening experience that better simulates how sound is distributed in real environments. Common applications include music playback, movie sound, game sound effects, etc.

# Remove the trailing channel dimension: (batch, 16000, 1) -> (batch, 16000)
def squeeze(audio, labels):
    audio = tf.squeeze(audio, axis=-1)
    return audio, labels

train_ds = train_ds.map(squeeze, tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, tf.data.AUTOTUNE)

Split the validation set
Dataset.shard() splits a dataset into equal pieces: taking shard 0 and shard 1 of 2 cuts the validation set in half, with one half used as the test set and the other half kept as the validation set.

test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)
for example_audio, example_labels in train_ds.take(1):
  print(example_audio.shape)
  print(example_labels.shape)
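
To make the shard call less mysterious, here is a minimal sketch on a toy dataset (illustrative only, not part of the original post): shard(num_shards, index) keeps every num_shards-th element starting at position index, so the two calls above split the validation set roughly in half.

# Toy example: shard(2, 0) keeps elements at even positions, shard(2, 1) keeps odd positions
ds = tf.data.Dataset.range(10)
print(list(ds.shard(num_shards=2, index=0).as_numpy_iterator()))  # [0, 2, 4, 6, 8]
print(list(ds.shard(num_shards=2, index=1).as_numpy_iterator()))  # [1, 3, 5, 7, 9]

Since val_ds is already batched, shard here distributes whole 64-sample batches rather than individual clips.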

Plot the audio waveforms
This section just lets us visually inspect the audio waveforms; it can be commented out later.

plt.figure(figsize=(8, 5))
rows = 3
cols = 3
n = rows * cols
for i in range(n):
    plt.subplot(rows, cols, i+1)
    audio_signal = example_audio[i]
    plt.plot(audio_signal)
    plt.title(label_names[example_labels[i]])
    plt.yticks(np.arange(-1.2, 1.2, 0.2))
    plt.ylim([-1.1, 1.1])
plt.tight_layout()
plt.show()


Convert waveform to spectrogram

The purpose of converting waveforms into spectrograms is to better analyze and understand audio signals.

A waveform is a time-domain representation that shows how the audio signal changes along the time axis. A spectrogram, in contrast, is a frequency-domain representation that decomposes the audio signal into different frequency components and displays the energy or amplitude of each component over time.

By converting a waveform into a spectrogram, we can more clearly see which frequency components in an audio signal are important for a specific sound or event. This is very helpful for audio processing tasks (such as speech recognition, audio classification, audio segmentation, etc.) as well as audio signal understanding and analysis.

def get_spectrogram(waveform):
  # Convert the waveform to a spectrogram via the short-time Fourier transform (STFT)
  spectrogram = tf.signal.stft(
      waveform, frame_length=255, frame_step=128)
  # Keep only the magnitude, discarding the phase
  spectrogram = tf.abs(spectrogram)
  # Add a trailing channel dimension so the spectrogram can be fed to
  # convolutional layers like an image
  spectrogram = spectrogram[..., tf.newaxis]
  return spectrogram
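
With the values used here, the shapes work out as follows: a 16,000-sample clip framed with frame_length=255 and frame_step=128 yields 1 + (16000 - 255) // 128 = 124 frames, and the default FFT length of 256 gives 256 / 2 + 1 = 129 frequency bins, so each spectrogram has shape (124, 129, 1). A quick sanity check (illustrative only, not part of the original post):

# Verify the spectrogram shape on a dummy 1-second clip
dummy_waveform = tf.zeros([16000])
print(get_spectrogram(dummy_waveform).shape)  # (124, 129, 1)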

Browse data
Print an example tensorized waveform and the shape of the corresponding spectrogram, and play the original audio:

for i in range(3):
    label = label_names[example_labels[i]]
    waveform = example_audio[i]
    spectrogram = get_spectrogram(waveform)

    print('Label:', label)
    print('Waveform shape:', waveform.shape)
    print('Spectrogram shape:', spectrogram.shape)
    print('Audio playback')
    display.display(display.Audio(waveform, rate=16000))

Create a spectrogram dataset from an audio dataset

# Create spectrogram datasets from the audio datasets
def make_spec_ds(ds):
  return ds.map(
      map_func=lambda audio,label: (get_spectrogram(audio), label),
      num_parallel_calls=tf.data.AUTOTUNE)


train_spectrogram_ds = make_spec_ds(train_ds)
val_spectrogram_ds = make_spec_ds(val_ds)
test_spectrogram_ds = make_spec_ds(test_ds)

Reduce read latency when training models

train_spectrogram_ds = train_spectrogram_ds.cache().shuffle(10000).prefetch(tf.data.AUTOTUNE)
val_spectrogram_ds = val_spectrogram_ds.cache().prefetch(tf.data.AUTOTUNE)
test_spectrogram_ds = test_spectrogram_ds.cache().prefetch(tf.data.AUTOTUNE)

Create and train a model using a convolutional neural network

# Grab one batch of spectrograms so the input shape can be read off
for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
    break

# Build the model with a convolutional neural network
input_shape = example_spectrograms.shape[1:]
print('Input shape:', input_shape)
num_labels = len(label_names)
norm_layer = tf.keras.layers.Normalization()  # normalization layer for better training and inference
norm_layer.adapt(data=train_spectrogram_ds.map(
    map_func=lambda spec, label: spec))

model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Resizing(32, 32),
    norm_layer,
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.Conv2D(64, 3, activation='relu'),
    tf.keras.layers.MaxPool2D(),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels),
])

model.summary()

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(),  # optimizer
    loss=tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True),  # loss function
    metrics=['accuracy']  # use accuracy as the evaluation metric
)

# Train the model and record the training history
history = model.fit(
    train_spectrogram_ds,
    validation_data=val_spectrogram_ds,
    epochs=10,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)
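
The history object returned by model.fit records the loss and accuracy per epoch; plotting them gives a quick view of how training went. This is a small sketch added for convenience (it assumes the metric keys produced by metrics=['accuracy']), not part of the original post.

# Plot the training and validation curves recorded during model.fit
metrics = history.history
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.plot(metrics['loss'], label='loss')
plt.plot(metrics['val_loss'], label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(metrics['accuracy'], label='accuracy')
plt.plot(metrics['val_accuracy'], label='val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.show()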

Evaluate performance

model.evaluate(test_spectrogram_ds, return_dict=True)
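
Beyond the single accuracy number, a confusion matrix shows which commands get mistaken for which. The following is a rough sketch along the lines of the official tutorial, using plain matplotlib rather than seaborn; it assumes the test dataset and label_names defined above.

# Collect predictions and true labels over the test set
y_pred = tf.argmax(model.predict(test_spectrogram_ds), axis=1)
y_true = tf.concat([labels for _, labels in test_spectrogram_ds], axis=0)

# Rows are true commands, columns are predicted commands
confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_mtx, cmap='Blues')
plt.xticks(range(len(label_names)), label_names, rotation=45)
plt.yticks(range(len(label_names)), label_names)
plt.xlabel('Prediction')
plt.ylabel('True label')
plt.colorbar()
plt.show()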

Export model

class ExportModel(tf.Module):
    def __init__(self, model):
        self.model = model

        # Accept either a string filename or a batch of waveforms.
        # You could add additional signatures for a single waveform or a ragged batch.
        self.__call__.get_concrete_function(
            x=tf.TensorSpec(shape=(), dtype=tf.string))
        self.__call__.get_concrete_function(
            x=tf.TensorSpec(shape=[None, 16000], dtype=tf.float32))

    @tf.function
    def __call__(self, x):
        # If they pass a string, load the file and decode it.
        if x.dtype == tf.string:
            x = tf.io.read_file(x)
            x, _ = tf.audio.decode_wav(
                x, desired_channels=1, desired_samples=16000,)
            x = tf.squeeze(x, axis=-1)
            x = x[tf.newaxis, :]

        x = get_spectrogram(x)
        result = self.model(x, training=False)

        class_ids = tf.argmax(result, axis=-1)
        class_names = tf.gather(label_names, class_ids)
        return {
            'predictions': result,
            'class_ids': class_ids,
            'class_names': class_names,
        }


export = ExportModel(model)
export(tf.constant('./data/mini_speech_commands/no/012c8314_nohash_0.wav'))

tf.saved_model.save(export, "saved")

The model is saved to the saved directory.
The complete code

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from IPython import display

# Load the training and validation datasets
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory='./data/mini_speech_commands',  # dataset path
    batch_size=64,  # batch size
    validation_split=0.2,  # the validation set is 20% of the data
    seed=0,  # seed for the random split
    # Output sequence length of each sample. The audio clips are 1 second or less at 16kHz.
    # Shorter clips are padded to exactly 1 second (and longer ones trimmed) so they can be batched easily.
    output_sequence_length=16000,
    subset='both'  # return both the training and validation sets
)

# Get the command categories
label_names = np.array(train_ds.class_names)
print("Command categories:", label_names)

# Squeeze the input data (remove the channel dimension)


def squeeze(audio, labels):
    audio = tf.squeeze(audio, axis=-1)
    return audio, labels


train_ds = train_ds.map(squeeze, tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, tf.data.AUTOTUNE)

# Split the validation set into validation and test halves
test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)

for example_audio, example_labels in train_ds.take(1):
    print(example_audio.shape)
    print(example_labels.shape)


# Plot the audio waveforms
# plt.figure(figsize=(8, 5))
# rows = 3
# cols = 3
# n = rows * cols
# for i in range(n):
#     plt.subplot(rows, cols, i+1)
#     audio_signal = example_audio[i]
#     plt.plot(audio_signal)
#     plt.title(label_names[example_labels[i]])
#     plt.yticks(np.arange(-1.2, 1.2, 0.2))
#     plt.ylim([-1.1, 1.1])
# plt.tight_layout()
# plt.show()

# Convert waveforms to spectrograms
def get_spectrogram(waveform):
    spectrogram = tf.signal.stft(
        waveform, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram


# Browse the data
for i in range(3):
    label = label_names[example_labels[i]]
    waveform = example_audio[i]
    spectrogram = get_spectrogram(waveform)

    print('Label:', label)
    print('Waveform shape:', waveform.shape)
    print('Spectrogram shape:', spectrogram.shape)
    print('Audio playback')
    display.display(display.Audio(waveform, rate=16000))

# Create spectrogram datasets from the audio datasets


def make_spec_ds(ds):
    return ds.map(
        map_func=lambda audio, label: (get_spectrogram(audio), label),
        num_parallel_calls=tf.data.AUTOTUNE)


train_spectrogram_ds = make_spec_ds(train_ds)
val_spectrogram_ds = make_spec_ds(val_ds)
test_spectrogram_ds = make_spec_ds(test_ds)

# Examine the spectrograms of different examples in the dataset
for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
    break

# Reduce read latency while training the model
train_spectrogram_ds = train_spectrogram_ds.cache().shuffle(
    10000).prefetch(tf.data.AUTOTUNE)
val_spectrogram_ds = val_spectrogram_ds.cache().prefetch(tf.data.AUTOTUNE)
test_spectrogram_ds = test_spectrogram_ds.cache().prefetch(tf.data.AUTOTUNE)


# Build the model with a convolutional neural network
input_shape = example_spectrograms.shape[1:]
print('Input shape:', input_shape)
num_labels = len(label_names)
norm_layer = tf.keras.layers.Normalization()  # normalization layer for better training and inference
norm_layer.adapt(data=train_spectrogram_ds.map(
    map_func=lambda spec, label: spec))

model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Resizing(32, 32),
    norm_layer,
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.Conv2D(64, 3, activation='relu'),
    tf.keras.layers.MaxPool2D(),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_labels),
])

model.summary()

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(),  # optimizer
    loss=tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True),  # loss function
    metrics=['accuracy']  # use accuracy as the evaluation metric
)

# Train the model and record the training history
history = model.fit(
    train_spectrogram_ds,
    validation_data=val_spectrogram_ds,
    epochs=10,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)

# Evaluate performance
model.evaluate(test_spectrogram_ds, return_dict=True)

# Export the model

class ExportModel(tf.Module):
    def __init__(self, model):
        self.model = model

        self.__call__.get_concrete_function(
            x=tf.TensorSpec(shape=(), dtype=tf.string))
        self.__call__.get_concrete_function(
            x=tf.TensorSpec(shape=[None, 16000], dtype=tf.float32))

    @tf.function
    def __call__(self, x):
        if x.dtype == tf.string:
            x = tf.io.read_file(x)
            x, _ = tf.audio.decode_wav(
                x, desired_channels=1, desired_samples=16000,)
            x = tf.squeeze(x, axis=-1)
            x = x[tf.newaxis, :]

        x = get_spectrogram(x)
        result = self.model(x, training=False)

        class_ids = tf.argmax(result, axis=-1)
        class_names = tf.gather(label_names, class_ids)
        return {
            'predictions': result,
            'class_ids': class_ids,
            'class_names': class_names,
        }


export = ExportModel(model)
export(tf.constant('./data/mini_speech_commands/no/012c8314_nohash_0.wav'))

tf.saved_model.save(export, "saved")

Load the exported model

Use the loaded model to predict a down audio clip:

import tensorflow as tf

# Load the saved model directly from its directory
new_model = tf.saved_model.load("./saved")
res = new_model('./data/mini_speech_commands/down/004ae714_nohash_0.wav')
print("Result:", res)

class_names = ['down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes']
class_index = res['class_ids'].numpy()[0]
class_name = class_names[class_index]
print("类别名称:", class_name)



Origin blog.csdn.net/weixin_41897680/article/details/134138637