I am trying to generate music with a custom voice. This is my first time using and coding with these libraries, and I have no idea why I am getting this error.
Here are my dependencies:
!pip install --upgrade pip
!pip install torch numpy matplotlib scipy
!pip install unidecode inflect librosa
!pip install webrtcvad==2.0.10
!pip install jukebox
!pip install magenta
I am getting this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-38-07f82bd02fd5> in <cell line: 38>()
36
37 # Load Tacotron2 model
---> 38 model = Tacotron2(hparams)
39 model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
40 model.cuda().eval().half()
/content/tacotron2/model.py in __init__(self, hparams)
458 def __init__(self, hparams):
459 super(Tacotron2, self).__init__()
--> 460 self.mask_padding = hparams.mask_padding
461 self.fp16_run = hparams.fp16_run
462 self.n_mel_channels = hparams.n_mel_channels
AttributeError: 'dict' object has no attribute 'mask_padding'
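From the traceback, model.py reads the hyperparameters with attribute access (hparams.mask_padding), while I am passing a plain Python dict, which only supports hparams['mask_padding']. So I am guessing the model wants an object whose keys are attributes; the wrapper below is just my own sketch using types.SimpleNamespace, not something from the repo, and I am not sure it is the right fix:

from types import SimpleNamespace

# A plain dict only allows hparams['mask_padding'];
# unpacking it into a SimpleNamespace exposes each key as an attribute.
hparams = SimpleNamespace(**{
    'mask_padding': True,
    'fp16_run': False,
    'n_mel_channels': 80,
    # ... the rest of the hyperparameters
})

print(hparams.mask_padding)  # prints True via attribute access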
Here is my code:
import sys
import torch
import numpy as np
from scipy.io.wavfile import write
from IPython.display import Audio
import tensorflow as tf
sys.path.append('/content/tacotron2/')
sys.path.append('/content/waveglow/')
from tacotron2.model import Tacotron2
from tacotron2.text import text_to_sequence
from waveglow.denoiser import Denoiser
# Load models
checkpoint_path = "tacotron2/checkpoints/voice.pth"
waveglow_path = "waveglow/checkpoints/waveglow_256channels_universal_v5.pt"
# Create a dictionary for hyperparameters
hparams = {
    'sampling_rate': 22050,
    'n_mel_channels': 80,
    'n_symbols': 256,
    'symbols_embedding_dim': 512,
    'n_frames_per_step': 1,
    'mask_padding': True,
    'fp16_run': False,
    'segment_size': 1024,
    'filter_length': 1024,
    'hop_length': 256,
    'win_length': 1024,
    'n_layers': 6,
    'n_heads': 4,
    'n_units': 512
}
# Load Tacotron2 model
model = Tacotron2(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
model.cuda().eval().half()
# Load WaveGlow model
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval()
denoiser = Denoiser(waveglow)
def text_to_speech(text, model, waveglow, denoiser):
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, _ = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio_denoised = denoiser(audio, strength=0.01)[:, 0]
    return audio_denoised.cpu().numpy()
# Example lyrics
lyrics = "This is a sample lyric to generate the song."
# Generate vocals
vocals = text_to_speech(lyrics, model, waveglow, denoiser)
# Save the generated vocals
vocals_path = "/content/drive/MyDrive/vocals.wav"
write(vocals_path, 22050, vocals)
# Play the generated vocals
print("Generated Vocals:")
display(Audio(vocals_path))
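In case it is relevant, I also wondered whether I should build the hyperparameters with the repo's own helper instead of a hand-written dict; I believe the NVIDIA tacotron2 checkout has an hparams.py with a create_hparams() function, but I am not certain my checkpoint was trained with its defaults:

from hparams import create_hparams  # assumes the standard NVIDIA tacotron2 repo is on sys.path

hparams = create_hparams()       # returns an object that supports attribute access
hparams.sampling_rate = 22050    # individual fields can be overridden like this
model = Tacotron2(hparams)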
I have tried searching, but I could not find a similar issue anywhere online, so I am asking for help here. I am using Google Colab.