Audio Classification with PyTorch

Classification of sounds from the UrbanSound8K dataset: https://urbansounddataset.weebly.com/urbansound8k.html. This dataset contains 8732 labeled sound excerpts (<= 4 s) of urban sounds from 10 classes:

  • air_conditioner
  • car_horn
  • children_playing
  • dog_bark
  • drilling
  • engine_idling
  • gun_shot
  • jackhammer
  • siren
  • street_music

The classes are drawn from the urban sound taxonomy. More information can be found in the paper:

Justin Salamon, Christopher Jacoby, and Juan Pablo Bello. 2014. A Dataset and Taxonomy for Urban Sound Research. In Proceedings of the 22nd ACM International Conference on Multimedia (MM '14). Association for Computing Machinery, New York, NY, USA, 1041–1044. DOI: https://doi.org/10.1145/2647868.2655045
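
Before building anything, it is useful to confirm that the metadata file lists the expected classes. A minimal sketch, assuming the dataset has been extracted to datasets/UrbanSound8K/ (the path used by the scripts below) and that UrbanSound8K.csv has its standard columns (slice_file_name, fold, classID, class):

import pandas as pd

# metadata file shipped with UrbanSound8K
annotations = pd.read_csv("datasets/UrbanSound8K/metadata/UrbanSound8K.csv")

# number of excerpts per class (10 classes, 8732 rows in total)
print(annotations["class"].value_counts())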

The rest of the post walks through the following scripts:

  • dataset definition script
  • convolutional neural network definition script
  • training script
  • prediction script

UrbanSound Dataset

import os

import torch
from torch.utils.data import Dataset
import pandas as pd
import torchaudio


class UrbanSoundDataset(Dataset):

    def __init__(self,
                 annotations_file,
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples,
                 device):
        # annotations file
        self.annotations = pd.read_csv(annotations_file)
        # audio directory
        self.audio_dir = audio_dir
        # gpu or cpu
        self.device = device
        # transformation (moved to the same device as the signals)
        self.transformation = transformation.to(self.device)
        # target sample rate
        self.target_sample_rate = target_sample_rate
        # number of samples per item
        self.num_samples = num_samples

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        # get the audio file path
        audio_sample_path = self._get_audio_sample_path(index)
        # get the label
        label = self._get_audio_sample_label(index)
        # load the audio
        signal, sr = torchaudio.load(audio_sample_path)
        # move the signal to the device
        signal = signal.to(self.device)
        # resample if necessary
        signal = self._resample_if_necessary(signal, sr)
        # mix down to a mono signal if necessary
        signal = self._mix_down_if_necessary(signal)
        # cut if necessary
        signal = self._cut_if_necessary(signal)
        # right-pad if necessary
        signal = self._right_pad_if_necessary(signal)
        # apply the transformation (e.g. mel spectrogram)
        signal = self.transformation(signal)

        return signal, label

    def _cut_if_necessary(self, signal):
        """
        Corta la señal si es necesario, para ajustar el tamaño de la señal
        """        
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """
        Ajusta la señal si es necesario, para ajustar el tamaño de la señal
        """
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """
        Aplica el remuestreo si es necesario
        """

        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """
        Convertir a señal monofonica si es necesario
        """
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """
        Obtiene el path de audio
        """

        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[
            index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """
        Obtiene el label del audio
        """

        return self.annotations.iloc[index, 6]


if __name__ == "__main__":
    # annotations file path
    ANNOTATIONS_FILE = "datasets/UrbanSound8K/metadata/UrbanSound8K.csv"
    # audio directory
    AUDIO_DIR = "datasets/UrbanSound8K/audio"
    # sample rate
    SAMPLE_RATE = 22050
    # number of samples per item (1 second at 22050 Hz)
    NUM_SAMPLES = 22050

    if torch.cuda.is_available():
        # gpu available
        device = "cuda"
    else:
        # gpu not available
        device = "cpu"
    print(f"Using device {device}")


    # mel spectrogram transformation applied to each signal
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        # sample rate
        sample_rate=SAMPLE_RATE,
        # FFT size
        n_fft=1024,
        # hop length, typically n_fft / 2
        hop_length=512,
        # number of mel bands
        n_mels=64
    )

    # instantiate the dataset
    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    print(f"El dataset tiene {len(usd)} archivos de audio.")
    signal, label = usd[0]
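
As a quick sanity check, the item returned above should be a mel spectrogram of shape [1, 64, 44]: one mono channel, 64 mel bands, and floor(22050 / 512) + 1 = 44 time frames. A minimal sketch that simply extends the __main__ block above:

    # expected: torch.Size([1, 64, 44]) -> [channels, n_mels, time_frames]
    print(signal.shape)
    # integer classID between 0 and 9
    print(label)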

Convolutional Neural Network

from torch import nn
# pip install torchsummary
from torchsummary import summary

# convolutional neural network
class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax

        # conv block 1
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # conv block 2
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # conv block 3
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # conv block 4
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        # flatten
        self.flatten = nn.Flatten()
        # linear
        self.linear = nn.Linear(128 * 5 * 4, 10)
        # softmax
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        """
        definicion del proceso de forward en la red neuronal
        """

        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions


if __name__ == "__main__":
    # instantiate the convolutional neural network
    cnn = CNNNetwork()
    # print a summary of the network for a 1 x 64 x 44 mel spectrogram input (requires a GPU)
    summary(cnn.cuda(), (1, 64, 44))
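
Where does 128 * 5 * 4 in the linear layer come from? Each convolution (kernel 3, stride 1, padding 2) grows the 64 x 44 input by 2 in each dimension and each max pooling halves it: 64x44 -> 33x23 -> 17x12 -> 9x7 -> 5x4, so 128 feature maps of size 5 x 4 reach the flatten layer. A minimal sketch to verify the output shape, assuming this file is saved as cnn.py (the training script below imports it under that name):

import torch

from cnn import CNNNetwork

cnn = CNNNetwork()
# one fake mel spectrogram: [batch, channels, n_mels, time_frames]
dummy = torch.rand(1, 1, 64, 44)
# expected: torch.Size([1, 10]) -> one softmax probability per class
print(cnn(dummy).shape)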

Training

import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader

from urbansounddataset import UrbanSoundDataset
from cnn import CNNNetwork


BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

ANNOTATIONS_FILE = "datasets/UrbanSound8K/metadata/UrbanSound8K.csv"
AUDIO_DIR = "datasets/UrbanSound8K/audio"
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050


def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # compute the loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate the error and update the weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")


if __name__ == "__main__":
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")

    
    # instantiate the mel spectrogram transformation
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    # instantiate the dataset
    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    
    train_dataloader = create_data_loader(usd, BATCH_SIZE)

    # build the model and move it to the device
    cnn = CNNNetwork().to(device)
    print(cnn)

    
    # initialise the loss function and the optimiser
    loss_fn = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(cnn.parameters(),
                                 lr=LEARNING_RATE)

    # train the model
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

    # save the trained model
    torch.save(cnn.state_dict(), "cnnnet.pth")
    print("Trained network saved to cnnnet.pth")

Prediction

import torch
import torchaudio

from cnn import CNNNetwork
from urbansounddataset import UrbanSoundDataset
from train import AUDIO_DIR, ANNOTATIONS_FILE, SAMPLE_RATE, NUM_SAMPLES


class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]


def predict(model, input, target, class_mapping):
    """
    Predecir la clase de una secuencia de audio
    """


    model.eval()
    with torch.no_grad():
        predictions = model(input)
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected


if __name__ == "__main__":
    
    # load the trained model
    cnn = CNNNetwork()
    state_dict = torch.load("cnnnet.pth")
    cnn.load_state_dict(state_dict)

   
    # instantiate the mel spectrogram transformation
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    # load the UrbanSound dataset
    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu") # predicciones con CPU

    # get a test example from the UrbanSound dataset for prediction
    input, target = usd[0][0], usd[0][1]  # [num_channels, n_mels, time]
    input.unsqueeze_(0)  # add a batch dimension -> [batch, num_channels, n_mels, time]

    # make a prediction
    predicted, expected = predict(cnn, input, target,
                                  class_mapping)
    print(f"Predicted: '{predicted}', Expected: '{expected}'")

References