Sound classification on the UrbanSound8K dataset, https://urbansounddataset.weebly.com/urbansound8k.html. This dataset contains 8732 labeled sound excerpts (<= 4 s) of urban sounds from 10 classes:
- air_conditioner
- car_horn
- children_playing
- dog_bark
- drilling
- engine_idling
- gun_shot
- jackhammer
- siren
- street_music
The classes are drawn from the urban sound taxonomy. More information can be found in the paper:
Justin Salamon, Christopher Jacoby, and Juan Pablo Bello. 2014. A Dataset and Taxonomy for Urban Sound Research. In Proceedings of the 22nd ACM International Conference on Multimedia (MM '14). Association for Computing Machinery, New York, NY, USA, 1041–1044. DOI: https://doi.org/10.1145/2647868.2655045
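Before writing any PyTorch code it is worth taking a quick look at the metadata file that ships with the dataset. The following is only a small sanity-check sketch (it assumes the archive was extracted to datasets/UrbanSound8K/, the same path used by the scripts below):

import pandas as pd

# hypothetical location; adjust to wherever UrbanSound8K was extracted
metadata = pd.read_csv("datasets/UrbanSound8K/metadata/UrbanSound8K.csv")
print(metadata.shape)                    # (8732, 8): one row per audio excerpt
print(metadata["class"].value_counts())  # number of excerpts per class
print(metadata["fold"].value_counts())   # the 10 predefined folds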
In the sections below we will have:
- dataset definition script
- convolutional neural network definition script
- training script
- prediction script
UrbanSound Dataset
import os

import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio


class UrbanSoundDataset(Dataset):

    def __init__(self,
                 annotations_file,
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples,
                 device):
        # annotations file (the UrbanSound8K.csv metadata)
        self.annotations = pd.read_csv(annotations_file)
        # audio directory
        self.audio_dir = audio_dir
        # gpu or cpu
        self.device = device
        # transformation (e.g. a mel spectrogram), moved to the device
        self.transformation = transformation.to(self.device)
        # target sample rate
        self.target_sample_rate = target_sample_rate
        # number of samples per item
        self.num_samples = num_samples

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        # get the audio file path
        audio_sample_path = self._get_audio_sample_path(index)
        # get the label
        label = self._get_audio_sample_label(index)
        # load the audio
        signal, sr = torchaudio.load(audio_sample_path)
        # move the signal to the device
        signal = signal.to(self.device)
        # resample if necessary (resampling)
        signal = self._resample_if_necessary(signal, sr)
        # convert to a mono signal if necessary (mixing down)
        signal = self._mix_down_if_necessary(signal)
        # cut if necessary (cutting)
        signal = self._cut_if_necessary(signal)
        # pad if necessary (padding)
        signal = self._right_pad_if_necessary(signal)
        # apply the transformation
        signal = self.transformation(signal)
        return signal, label

    def _cut_if_necessary(self, signal):
        """
        Cuts the signal if necessary, so it has at most num_samples samples
        """
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """
        Right-pads the signal with zeros if necessary, so it has exactly num_samples samples
        """
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """
        Resamples the signal to the target sample rate if necessary
        """
        if sr != self.target_sample_rate:
            # move the resampler to the same device as the signal
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """
        Converts the signal to mono if necessary
        """
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """
        Builds the audio file path (column 5 of the metadata is the fold, column 0 the file name)
        """
        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """
        Gets the label of the audio file (column 6 of the metadata is the class ID)
        """
        return self.annotations.iloc[index, 6]

if __name__ == "__main__":
    # path to the annotations file
    ANNOTATIONS_FILE = "datasets/UrbanSound8K/metadata/UrbanSound8K.csv"
    # audio directory
    AUDIO_DIR = "datasets/UrbanSound8K/audio"
    # target sample rate
    SAMPLE_RATE = 22050
    # number of samples per item (one second of audio at 22050 Hz)
    NUM_SAMPLES = 22050

    if torch.cuda.is_available():
        # gpu available
        device = "cuda"
    else:
        # gpu not available
        device = "cpu"
    print(f"Using device {device}")

    # extract a mel spectrogram from each signal
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        # sample rate
        sample_rate=SAMPLE_RATE,
        # FFT size
        n_fft=1024,
        # hop length between FFT windows, usually n_fft / 2
        hop_length=512,
        # number of mel filterbank channels
        n_mels=64
    )

    # instantiate the dataset
    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    print(f"The dataset has {len(usd)} audio files.")
    signal, label = usd[0]
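With NUM_SAMPLES = 22050 and hop_length = 512, the MelSpectrogram transformation produces 22050 // 512 + 1 = 44 time frames, so each item should come back as a (1, 64, 44) tensor: 1 channel, 64 mel bands and 44 frames. A minimal check, appended at the end of the script above:

    print(signal.shape)  # expected: torch.Size([1, 64, 44])
    print(label)         # integer class ID between 0 and 9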
Convolutional Neural Network
import torch
from torch import nn
# pip install torchsummary
from torchsummary import summary


# convolutional neural network for the (1, 64, 44) mel spectrograms
class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        # conv block 1
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # conv block 2
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # conv block 3
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # conv block 4
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # flatten
        self.flatten = nn.Flatten()
        # linear layer: 128 feature maps of size 5 x 4 remain after the four conv blocks
        self.linear = nn.Linear(128 * 5 * 4, 10)
        # softmax over the 10 classes
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        """
        Forward pass of the network
        """
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

if __name__ == "__main__":
    # instantiate the convolutional neural network
    cnn = CNNNetwork()
    # show a summary of the network for a (1, 64, 44) mel spectrogram input
    device = "cuda" if torch.cuda.is_available() else "cpu"
    summary(cnn.to(device), (1, 64, 44), device=device)
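As a quick sanity check of the layer dimensions (a sketch, separate from the script above): after the four conv + max-pool blocks a (1, 64, 44) input is reduced to 128 feature maps of size 5 x 4, which is why the linear layer takes 128 * 5 * 4 = 2560 inputs. A dummy forward pass on the CPU confirms the output shape:

import torch
from cnn import CNNNetwork

cnn = CNNNetwork()
# one fake mel spectrogram: batch of 1, 1 channel, 64 mel bands, 44 time frames
dummy = torch.rand(1, 1, 64, 44)
out = cnn(dummy)
print(out.shape)         # torch.Size([1, 10]), one score per class
print(out.sum().item())  # ~1.0, because the last layer is a softmax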
Training
import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader

from urbansounddataset import UrbanSoundDataset
from cnn import CNNNetwork

BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

ANNOTATIONS_FILE = "datasets/UrbanSound8K/metadata/UrbanSound8K.csv"
AUDIO_DIR = "datasets/UrbanSound8K/audio"
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050


def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader

def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # compute the loss for this batch
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate the error and update the weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")

if __name__ == "__main__":
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")

    # instantiate the mel spectrogram extraction
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    # instantiate the dataset
    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    train_dataloader = create_data_loader(usd, BATCH_SIZE)

    # build the model and move it to the GPU (or CPU)
    cnn = CNNNetwork().to(device)
    print(cnn)

    # initialise the loss function and the optimiser
    # (note: nn.CrossEntropyLoss applies log-softmax internally, so it is
    # usually fed raw logits; here the network already ends in a softmax)
    loss_fn = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(cnn.parameters(),
                                 lr=LEARNING_RATE)

    # train the model
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

    # save the trained model
    torch.save(cnn.state_dict(), "cnnnet.pth")
    print("Trained network saved to cnnnet.pth")
Prediction
import torch
import torchaudio
from cnn import CNNNetwork
from urbansounddataset import UrbanSoundDataset
from train import AUDIO_DIR, ANNOTATIONS_FILE, SAMPLE_RATE, NUM_SAMPLES
class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]

def predict(model, input, target, class_mapping):
    """
    Predicts the class of an audio example
    """
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        # the model returns a (1, 10) tensor; take the index of the highest score
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected

if __name__ == "__main__":
    # load the trained model
    cnn = CNNNetwork()
    # map_location ensures the weights load even on a machine without a GPU
    state_dict = torch.load("cnnnet.pth", map_location="cpu")
    cnn.load_state_dict(state_dict)

    # instantiate the mel spectrogram extraction
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    # load the urban sound dataset
    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu")  # predictions on CPU

    # get one example from the urban sound dataset for prediction
    input, target = usd[0][0], usd[0][1]  # [num_channels, freq, time]
    input.unsqueeze_(0)                   # add a batch dimension -> [1, 1, freq, time]

    # make a prediction
    predicted, expected = predict(cnn, input, target,
                                  class_mapping)
    print(f"Predicted: '{predicted}', Expected: '{expected}'")