# wrapper.py

import warnings, librosa
import numpy as np
from time import time
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from model import train_audio_transforms, AcousticModel, BoundaryDetection

# Seed NumPy's global RNG at import time; align() draws its smoothing noise
# from np.random, so this makes that noise repeatable across fresh runs.
np.random.seed(7)

def preprocess_from_file(audio_file, lyrics_file, word_file=None):
    """Load the audio and the lyrics needed for one alignment run.

    Args:
        audio_file: path handed to ``preprocess_audio``.
        lyrics_file: path handed to ``preprocess_lyrics``.
        word_file: optional word-list path, forwarded to ``preprocess_lyrics``.

    Returns:
        ``(y, words, lyrics_p, idx_word_p, idx_line_p)``: the waveform from
        ``preprocess_audio`` followed by the four values produced by
        ``preprocess_lyrics``. The sample rate reported by the audio loader
        is discarded.
    """
    waveform, _sample_rate = preprocess_audio(audio_file)
    lyric_data = preprocess_lyrics(lyrics_file, word_file)
    words, lyrics_p, idx_word_p, idx_line_p = lyric_data
    return waveform, words, lyrics_p, idx_word_p, idx_line_p

def align(audio, words, lyrics_p, idx_word_p, idx_line_p, method="Baseline", cuda=True):
    """Align lyrics to an audio signal and return per-word boundaries.

    Args:
        audio: waveform array whose axis 1 holds the samples
            (``audio.shape[1]`` is read as the sample count). A sample rate
            of 22050 Hz is hard-coded when converting samples to frames.
        words: word list; returned unchanged alongside the alignment.
        lyrics_p: phoneme-level lyrics, passed through to the ``utils``
            alignment routines.
        idx_word_p: word index data, passed through to the ``utils``
            alignment routines.
        idx_line_p: line index data; only the first element of each entry is
            used, and only when the boundary model is enabled.
        method: ``"Baseline"`` or ``"MTL"``. If the string contains ``"BDR"``
            its last four characters are stripped to obtain the model type
            (e.g. ``"MTL_BDR"`` -> ``"MTL"``) and the boundary detection
            model is used as well.
        cuda: run on GPU when one is available; otherwise fall back to CPU.

    Returns:
        ``(word_align, words)`` where ``word_align`` is whatever
        ``utils.alignment`` / ``utils.alignment_bdr`` returns as its first
        value (indexable per word; ``write_csv`` reads ``[0]`` and ``[1]``).

    Raises:
        ValueError: if the decoded model type is neither ``"Baseline"`` nor
            ``"MTL"``.
    """
    # start timer
    start_time = time()

    # constants
    # NOTE(review): presumably seconds per output frame (hop 256 at 22050 Hz,
    # downsampled x3) — confirm against the model definition.
    resolution = 256 / 22050 * 3
    alpha = 0.8  # weight applied to the log boundary probabilities

    # decode method
    if "BDR" in method:
        model_type = method[:-4]
        bdr_flag = True
    else:
        model_type = method
        bdr_flag = False
    print("Model: {} BDR?: {}".format(model_type, bdr_flag))

    # prepare acoustic model params
    if model_type == "Baseline":
        n_class = 41
    elif model_type == "MTL":
        n_class = (41, 47)
    else:
        # Fail here with a clear message instead of hitting an unbound
        # n_class further down.
        raise ValueError("Invalid model type.")

    hparams = {
        "n_cnn_layers": 1,
        "n_rnn_layers": 3,
        "rnn_dim": 256,
        "n_class": n_class,
        "n_feats": 32,
        "stride": 1,
        "dropout": 0.1
    }

    device = 'cuda' if (cuda and torch.cuda.is_available()) else 'cpu'
    # device is either 'cuda' or 'cpu', so that is what the flag must test.
    on_cuda = (device == 'cuda')

    ac_model = AcousticModel(
        hparams['n_cnn_layers'], hparams['rnn_dim'], hparams['n_class'],
        hparams['n_feats'], hparams['stride'], hparams['dropout']
    ).to(device)

    print("Loading acoustic model from checkpoint...")
    utils.load_model(ac_model, "./checkpoints/checkpoint_{}".format(model_type), cuda=on_cuda)
    ac_model.eval()

    print("Computing phoneme posteriorgram...")

    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        # reshape input, prepare mel
        x = audio.reshape(1, 1, -1)
        x = utils.move_data_to_device(x, device)
        # Drop the leading axis; the second squeeze only has an effect when
        # the remaining axis 1 has length 1.
        x = x.squeeze(0)
        x = x.squeeze(1)
        x = train_audio_transforms.to(device)(x)
        x = nn.utils.rnn.pad_sequence(x, batch_first=True).unsqueeze(1)

        # predict
        all_outputs = ac_model(x)
        if model_type == "MTL":
            # collapse the extra (4th) axis of the MTL output by summation
            all_outputs = torch.sum(all_outputs, dim=3)

        all_outputs = F.log_softmax(all_outputs, dim=2)

    # Unpacking also checks that the output is 3-D: (batch, length, classes).
    _, _, num_classes = all_outputs.shape
    song_pred = all_outputs.detach().cpu().numpy().reshape(-1, num_classes)  # total_length, num_classes
    # Keep only the frames that fall inside the audio duration.
    total_length = int(audio.shape[1] / 22050 // resolution)
    song_pred = song_pred[:total_length, :]

    # smoothing: add tiny random probability mass in the linear domain
    P_noise = np.random.uniform(low=1e-11, high=1e-10, size=song_pred.shape)
    song_pred = np.log(np.exp(song_pred) + P_noise)

    if bdr_flag:
        # boundary model: fixed
        bdr_hparams = {
            "n_cnn_layers": 1,
            "rnn_dim": 32,  # a smaller rnn dim than acoustic model
            "n_class": 1,  # binary classification
            "n_feats": 32,
            "stride": 1,
            "dropout": 0.1,
        }

        bdr_model = BoundaryDetection(
            bdr_hparams['n_cnn_layers'], bdr_hparams['rnn_dim'], bdr_hparams['n_class'],
            bdr_hparams['n_feats'], bdr_hparams['stride'], bdr_hparams['dropout']
        ).to(device)
        print("Loading BDR model from checkpoint...")
        utils.load_model(bdr_model, "./checkpoints/checkpoint_BDR", cuda=on_cuda)
        bdr_model.eval()

        print("Computing boundary probability curve...")
        # get boundary prob curve
        with torch.no_grad():
            bdr_outputs = bdr_model(x).detach().cpu().numpy().reshape(-1)
        # apply log
        bdr_outputs = np.log(bdr_outputs) * alpha

        # first element of each line entry, handed to the aligner as line starts
        line_start = [d[0] for d in idx_line_p]

        # start alignment
        print("Aligning...It might take a few minutes...")
        word_align, score = utils.alignment_bdr(song_pred, lyrics_p, idx_word_p, bdr_outputs, line_start)
    else:
        # start alignment
        print("Aligning...It might take a few minutes...")
        word_align, score = utils.alignment(song_pred, lyrics_p, idx_word_p)

    elapsed = time() - start_time
    print("Alignment Score:\t{}\tTime:\t{}".format(score, elapsed))

    return word_align, words

def preprocess_audio(audio_file, sr=22050):
    """Load an audio file as a mono signal with a leading channel axis.

    Args:
        audio_file: path passed to ``librosa.load``.
        sr: target sample rate passed to ``librosa.load`` (default 22050).

    Returns:
        ``(y, curr_sr)``: the waveform shaped ``(channel, sample)`` and the
        sample rate reported by ``librosa.load``.
    """
    # Silence any warnings raised while loading; the filter is restored on exit.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        waveform, loaded_sr = librosa.load(
            audio_file, sr=sr, mono=True, res_type='kaiser_fast'
        )

    # A 1-D signal gets a channel axis in front: (sample,) -> (1, sample)
    if waveform.ndim == 1:
        waveform = waveform[np.newaxis, :]

    return waveform, loaded_sr

def preprocess_lyrics(lyrics_file, word_file=None):
    """Read and normalise a lyrics file, then build its phoneme data.

    Each line is lower-cased, stripped of every character other than
    ``a-z``, apostrophe, space and ``~``, and has its whitespace collapsed;
    lines that end up empty are dropped.

    Args:
        lyrics_file: path of the raw lyrics text file.
        word_file: optional path of a file with one word per line. When it
            is not given, the words are obtained by splitting the cleaned
            lyrics on whitespace.

    Returns:
        ``(words_lines, lyrics_p, idx_word_p, idx_line_p)`` where the last
        three values come from ``utils.gen_phone_gt``.
    """
    from string import ascii_lowercase

    # Characters that survive the clean-up step.
    allowed = set(ascii_lowercase) | {"'", " ", "~"}

    # process raw
    with open(lyrics_file, 'r') as handle:
        file_lines = handle.read().splitlines()

    raw_lines = []
    for line in file_lines:
        kept = "".join(ch for ch in line.lower() if ch in allowed).strip()
        if kept:
            # collapse runs of whitespace into single spaces
            raw_lines.append(" ".join(kept.split()))

    if word_file:
        with open(word_file) as handle:
            words_lines = handle.read().splitlines()
    else:
        # concat all lines, then split into words
        words_lines = " ".join(raw_lines).split()

    # The second returned value (phoneme data per word) is not needed here.
    lyrics_p, _words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines)

    return words_lines, lyrics_p, idx_word_p, idx_line_p

def write_csv(pred_file, word_align, words):
    """Write one ``start,end,word`` row per aligned word to *pred_file*.

    Args:
        pred_file: output path; the file is overwritten.
        word_align: sequence of per-word entries whose items ``[0]`` and
            ``[1]`` are the start and end positions.
        words: sequence of words, indexed in step with ``word_align``.
    """
    # Scale factor applied to both positions before writing.
    # NOTE(review): presumably seconds per alignment frame — confirm.
    resolution = 256 / 22050 * 3

    with open(pred_file, 'w') as out:
        for idx, span in enumerate(word_align):
            start = span[0] * resolution
            end = span[1] * resolution
            out.write("{},{},{}\n".format(start, end, words[idx]))