diff --git a/inference_sameRes_multiplespeakers.py b/inference_sameRes_multiplespeakers.py
new file mode 100644
index 0000000000000000000000000000000000000000..bab35b9f69a2d37da2d0e064dac090772881af23
--- /dev/null
+++ b/inference_sameRes_multiplespeakers.py
@@ -0,0 +1,153 @@
+###############################################################################
+#
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+###############################################################################
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pylab as plt
+
+import os
+import argparse
+import json
+import sys
+import numpy as np
+import torch
+
+from flowtron import Flowtron
+from data import Data
+from train import update_params
+
+sys.path.insert(0, "tacotron2")
+sys.path.insert(0, "tacotron2/waveglow")
+from glow import WaveGlow
+from scipy.io.wavfile import write
+from denoiser import Denoiser
+
+
+def infer(flowtron_path, waveglow_path, output_dir, text, speaker_id, n_frames,
+          sigma, gate_threshold, seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+
+    # load waveglow in half precision, keeping the invertible 1x1
+    # convolutions in float32 for numerical stability
+    waveglow = torch.load(waveglow_path)['model'].cuda().half()
+    for k in waveglow.convinv:
+        k.float()
+    waveglow.eval()
+    denoiser = Denoiser(waveglow)
+
+    # load flowtron; a checkpoint may hold the full model under the 'model'
+    # key or a bare 'state_dict'
+    model = Flowtron(**model_config).cuda()
+    pretrained_dict = torch.load(flowtron_path, map_location='cpu')
+    if 'model' in pretrained_dict:
+        state_dict = pretrained_dict['model'].state_dict()
+    else:
+        state_dict = pretrained_dict['state_dict']
+    model.load_state_dict(state_dict)
+    model.eval()
+    print("Loaded checkpoint '{}'".format(flowtron_path))
+
+    ignore_keys = ['training_files', 'validation_files']
+    trainset = Data(
+        data_config['training_files'],
+        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))
+
+    text = trainset.get_text(text).cuda()
+    text = text[None]
+
+    # use the same constant residual for every speaker so that only the
+    # speaker embedding changes between outputs; for the stochastic default
+    # use: residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
+    residual = (torch.zeros(1, 80, n_frames) + 0.1).cuda()
+
+    # NOTE: the hard-coded speaker ids below override the --id argument
+    for speaker_id in [14, 34]:
+        speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()
+        print(speaker_vecs.cpu())
+        speaker_vecs = speaker_vecs[None]
+        with torch.no_grad():
+            mels, attentions = model.infer(
+                residual, speaker_vecs, text, gate_threshold=gate_threshold)
+            audio = waveglow.infer(mels.half(), sigma=0.8).float()
+            audio_denoised = denoiser(audio, strength=0.1)[:, 0]
+
+        for k in range(len(attentions)):
+            attention = torch.cat(attentions[k]).cpu().numpy()
+            fig, axes = plt.subplots(1, 2, figsize=(16, 4))
+            axes[0].imshow(mels[0].cpu().numpy(), origin='lower', aspect='auto')
+            axes[1].imshow(attention[:, 0].transpose(), origin='lower', aspect='auto')
+            fig.savefig(os.path.join(
+                output_dir,
+                'sid{}_sigma{}_attnlayer{}.png'.format(str(speaker_id).zfill(2), sigma, k)))
+            plt.close("all")
+
+        print("audio length: {} samples".format(audio.shape[1]))
+
+        audio_denoised = audio_denoised.cpu().numpy()[0]
+        # normalize audio for now
+        audio_denoised = audio_denoised / np.abs(audio_denoised).max()
+        write(os.path.join(output_dir,
+                           'sid{}_sigma{}_denoised.wav'.format(str(speaker_id).zfill(2), sigma)),
+              data_config['sampling_rate'], audio_denoised)
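+
+# Example invocation (paths and the config name are illustrative; note that
+# the script currently synthesizes the hard-coded speaker ids in the loop
+# above rather than the value passed via --id):
+#
+#   python inference_sameRes_multiplespeakers.py -c config.json \
+#       -f models/flowtron.pt -w models/waveglow.pt \
+#       -t "Text to synthesize" -i 0 -o results/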
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str,
+                        help='JSON file for configuration')
+    parser.add_argument('-p', '--params', nargs='+', default=[])
+    parser.add_argument('-f', '--flowtron_path',
+                        help='Path to flowtron state dict', type=str)
+    parser.add_argument('-w', '--waveglow_path',
+                        help='Path to waveglow state dict', type=str)
+    parser.add_argument('-t', '--text', help='Text to synthesize', type=str)
+    parser.add_argument('-i', '--id', help='Speaker id', type=int)
+    parser.add_argument('-n', '--n_frames', help='Number of frames',
+                        default=400, type=int)
+    parser.add_argument('-o', "--output_dir", default="results/")
+    parser.add_argument("-s", "--sigma", default=0.5, type=float)
+    parser.add_argument("-g", "--gate", default=0.5, type=float)
+    parser.add_argument("--seed", default=1234, type=int)
+    args = parser.parse_args()
+
+    # Parse configs. Globals nicer in this case
+    with open(args.config) as f:
+        data = f.read()
+
+    global config
+    config = json.loads(data)
+    update_params(config, args.params)
+
+    data_config = config["data_config"]
+    global model_config
+    model_config = config["model_config"]
+
+    # Make directory if it doesn't exist
+    if not os.path.isdir(args.output_dir):
+        os.makedirs(args.output_dir)
+        os.chmod(args.output_dir, 0o775)
+
+    torch.backends.cudnn.enabled = True
+    torch.backends.cudnn.benchmark = False
+    infer(args.flowtron_path, args.waveglow_path, args.output_dir, args.text,
+          args.id, args.n_frames, args.sigma, args.gate, args.seed)
diff --git a/inference_style_transfer.py b/inference_style_transfer.py
new file mode 100644
index 0000000000000000000000000000000000000000..861ce74569aea985b5574b640c29d3834b003c29
--- /dev/null
+++ b/inference_style_transfer.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# ## Flowtron Style Transfer Demo
+
+# #### Import libraries and setup matplotlib
+
+# In[1]:
+
+
+#get_ipython().run_line_magic('matplotlib', 'inline')
+import matplotlib
+import matplotlib.pylab as plt
+import numpy as np
+import json
+import sys
+import os
+import torch
+from torch.distributions import Normal
+
+from scipy.io.wavfile import write
+from flowtron import Flowtron
+from data import Data
+from train import update_params
+sys.path.insert(0, "tacotron2")
+sys.path.insert(0, "tacotron2/waveglow")
+from denoiser import Denoiser
+
+
+# #### Load Flowtron
+
+# In[2]:
+
+
+config_path = "config_SWARA_ALL_noSIL_noSPK.json"
+params = ["model_config.dummy_speaker_embedding=0",
+          "data_config.p_arpabet=1.0"]
+
+with open(config_path) as f:
+    data = f.read()
+
+config = json.loads(data)
+update_params(config, params)
+
+data_config = config["data_config"]
+model_config = config["model_config"]
+
+
+# In[3]:
+
+
+model_path = "outdir_Flowtron2021_SWARA_ALL_fromMaraTacotron2_noSPK/model_5900000"
+
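+# Checkpoints saved during training typically hold the full model under the
+# 'model' key, while exported checkpoints hold a bare 'state_dict'; the cell
+# below handles both layouts.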
+pretrained_dict = torch.load(model_path, map_location='cpu')
+if 'model' in pretrained_dict:
+    state_dict = pretrained_dict['model'].state_dict()
+else:
+    state_dict = pretrained_dict['state_dict']
+
+model = Flowtron(**model_config)
+model.load_state_dict(state_dict)
+_ = model.eval().cuda()
+
+
+# #### Load WaveGlow
+
+# In[4]:
+
+
+waveglow_path = '../../NVIDIA/0_MODELS/EN/waveglow-models/waveglow_256channels_v4.pt'
+waveglow = torch.load(waveglow_path)['model']
+_ = waveglow.eval().cuda()
+denoiser = Denoiser(waveglow).cuda().eval()
+
+
+# #### Download samples with surprised style and unzip them in the 'data' folder
+# [Surprised samples](https://drive.google.com/file/d/100YJu80Y-k5katrwzzE6rFoEHJ2rLmkc/view?usp=sharing)
+
+# #### Prepare the dataloader
+
+# In[5]:
+
+
+dataset_path = 'filelists/eme_data.txt'
+dataset = Data(
+    dataset_path,
+    **dict((k, v) for k, v in data_config.items()
+           if k not in ['training_files', 'validation_files']))
+
+
+# #### Collect z values
+
+# In[6]:
+
+
+z_values = []
+force_speaker_id = -1
+for i in range(len(dataset)):
+    mel, sid, text, attn_prior = dataset[i]
+    mel, sid, text = mel[None].cuda(), sid.cuda(), text[None].cuda()
+    if force_speaker_id > -1:
+        sid = sid * 0 + force_speaker_id
+    in_lens = torch.LongTensor([text.shape[1]]).cuda()
+    out_lens = torch.LongTensor([mel.shape[2]]).cuda()
+    with torch.no_grad():
+        # the forward pass maps each reference mel to its latent z
+        z = model(mel, sid, text, in_lens, out_lens)[0]
+    z_values.append(z.permute(1, 2, 0))
+
+
+# #### Compute the posterior distribution
+
+# In[7]:
+
+
+lambd = 0.0001
+sigma = 0.5
+n_frames = 300
+aggregation_type = 'batch'
+
+# the posterior mean interpolates between the mean of the collected z values
+# and the zero-mean prior; lambd sets the prior weight relative to the
+# number of reference utterances
+if aggregation_type == 'time_and_batch':
+    z_mean = torch.cat([z.mean(dim=2) for z in z_values])
+    z_mean = torch.mean(z_mean, dim=0)[:, None]
+    ratio = len(z_values) / lambd
+    mu_posterior = (ratio * z_mean / (ratio + 1))
+elif aggregation_type == 'batch':
+    # tile each z along time until it covers n_frames, then truncate
+    for k in range(len(z_values)):
+        expand = z_values[k]
+        while expand.size(2) < n_frames:
+            expand = torch.cat((expand, z_values[k]), 2)
+        z_values[k] = expand[:, :, :n_frames]
+
+    z_mean = torch.mean(torch.cat(z_values, dim=0), dim=0)[None]
+    z_mean = z_mean.flatten()
+    ratio = len(z_values) / float(lambd)
+    mu_posterior = (ratio * z_mean / (ratio + 1)).flatten()
+    mu_posterior = mu_posterior.view(80, -1)
+
+print("n / lambd ratio:", ratio)
+dist = Normal(mu_posterior.cpu(), sigma)
+
+
+# In[8]:
+
+
+z_baseline = torch.FloatTensor(1, 80, n_frames).cuda().normal_() * sigma
+if aggregation_type == 'time_and_batch':
+    z_posterior = dist.sample([n_frames]).permute(2, 1, 0).cuda()
+elif aggregation_type == 'batch':
+    z_posterior = dist.sample().view(1, 80, -1)[..., :n_frames].cuda()
+
+
+# In[9]:
+
+
+text = "De ce e mai râioasă capra, de aia stă cu coada mai sus."
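+# The sentence above is Romanian (a proverb, roughly: "The mangier the goat,
+# the higher it holds its tail"), consistent with the Romanian SWARA data the
+# model above appears to have been trained on.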
+text_encoded = dataset.get_text(text).cuda()[None]
+
+
+# #### Perform inference sampling the posterior and a standard gaussian baseline
+
+# In[10]:
+
+
+speaker = 0
+speaker_id = torch.LongTensor([speaker]).cuda()
+with torch.no_grad():
+    mel_posterior = model.infer(z_posterior, speaker_id, text_encoded)[0]
+    mel_baseline = model.infer(z_baseline, speaker_id, text_encoded)[0]
+
+
+# In[11]:
+
+
+fig, axes = plt.subplots(2, 2, figsize=(16, 6))
+axes[0, 0].imshow(mel_posterior[0].cpu(), aspect='auto', origin='lower', interpolation='none')
+im = axes[0, 1].imshow(z_posterior[0].cpu(), aspect='auto', origin='lower', interpolation='none')
+plt.colorbar(im, ax=axes[0, 1])
+axes[1, 0].imshow(mel_baseline[0].cpu(), aspect='auto', origin='lower', interpolation='none')
+im = axes[1, 1].imshow(z_baseline[0].cpu(), aspect='auto', origin='lower', interpolation='none')
+plt.colorbar(im, ax=axes[1, 1])
+
+
+output_dir = 'results/'
+os.makedirs(output_dir, exist_ok=True)
+# when running outside a notebook, persist the diagnostic figure instead of
+# relying on inline display
+fig.savefig(os.path.join(output_dir, 'style_transfer_mels_and_z.png'))
+
+# #### Posterior sample
+
+with torch.no_grad():
+    #audio = denoiser(waveglow.infer(mel_posterior, sigma=0.8), 0.001)
+    audio = waveglow.infer(mel_posterior, sigma=0.8)
+audio = audio.cpu().numpy()[0]
+# normalize audio for now
+audio = audio / np.abs(audio).max()
+print(audio.shape)
+
+write(os.path.join(output_dir,
+                   'sid{}_sigma{}-posterior.wav'.format(str(speaker).zfill(2), sigma)),
+      data_config['sampling_rate'], audio)
+
+# #### Baseline sample
+
+with torch.no_grad():
+    #audio = denoiser(waveglow.infer(mel_baseline, sigma=0.8), 0.001)
+    audio = waveglow.infer(mel_baseline, sigma=0.8)
+audio = audio.cpu().numpy()[0]
+# normalize audio for now
+audio = audio / np.abs(audio).max()
+print(audio.shape)
+write(os.path.join(output_dir,
+                   'sid{}_sigma{}-baseline.wav'.format(str(speaker).zfill(2), sigma)),
+      data_config['sampling_rate'], audio)
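+
+# Optional, mirroring the commented-out calls above: the WaveGlow denoiser
+# can be applied to either sample to reduce vocoder bias noise. A minimal
+# sketch, reusing the Denoiser instance created earlier (the strength value
+# here is a tunable assumption, not a recommended setting):
+#
+#   audio = denoiser(waveglow.infer(mel_posterior, sigma=0.8), 0.01)[:, 0]
+#   audio = audio.cpu().numpy()[0]
+#   audio = audio / np.abs(audio).max()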