A Lightweight TTS Model Implementation

1. Environment

Python version: 3.9

2. Training Dataset

This implementation uses the LJSpeech dataset. Baidu Netdisk download link: https://pan.baidu.com/s/1DDFmPpHQrTR_NvjAfwX-QA (extraction code: 1234)
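
After downloading and extracting the archive, the training script below expects the dataset under train/LJSpeech-1.1/ with a wavs/ folder and a metadata.csv file. Here is a minimal sanity-check sketch; the path is an assumption taken from train.py further down:

import os

# Hypothetical layout check for the LJSpeech copy used by train.py.
dataset_root = 'train/LJSpeech-1.1/'
assert os.path.isdir(os.path.join(dataset_root, 'wavs')), 'missing wavs/ folder'
assert os.path.isfile(os.path.join(dataset_root, 'metadata.csv')), 'missing metadata.csv'
print('wav files:', len(os.listdir(os.path.join(dataset_root, 'wavs'))))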

3. Install Dependencies

pip install TTS
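
If the installation succeeded, the package and the Glow-TTS config it ships should import cleanly. A quick check (the __version__ attribute and the "glow_tts" model name are assumptions based on recent TTS releases):

# Quick post-install check; TTS.__version__ is assumed to be available.
import TTS
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

print(getattr(TTS, '__version__', 'unknown'))
print(GlowTTSConfig().model)  # expected to print "glow_tts"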

4. Project Structure

The project consists of four source files plus an output directory:

decoder.py — Glow decoder (flow) module
encoder.py — text encoder and duration predictor
glow_tts.py — GlowTTS model assembling the encoder and decoder
train.py — training entry point
train/ — dataset, phoneme cache, and saved checkpoints

5. Code

decoder.py

import torch
from torch import nn

from TTS.tts.layers.generic.normalization import ActNorm
from TTS.tts.layers.glow_tts.glow import CouplingBlock, InvConvNear


def squeeze(x, x_mask=None, num_sqz=2):
    """GlowTTS squeeze operation

    Increase number of channels and reduce number of time steps
    by the same factor.

    Note:
        each 's' is a n-dimensional vector.
        ``[s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]``
    """
    b, c, t = x.size()

    t = (t // num_sqz) * num_sqz
    x = x[:, :, :t]
    x_sqz = x.view(b, c, t // num_sqz, num_sqz)
    x_sqz = x_sqz.permute(0, 3, 1, 2).contiguous().view(b, c * num_sqz, t // num_sqz)

    if x_mask is not None:
        x_mask = x_mask[:, :, num_sqz - 1 :: num_sqz]
    else:
        x_mask = torch.ones(b, 1, t // num_sqz).to(device=x.device, dtype=x.dtype)
    return x_sqz * x_mask, x_mask


def unsqueeze(x, x_mask=None, num_sqz=2):
    """GlowTTS unsqueeze operation (revert the squeeze)

    Note:
        each 's' is a n-dimensional vector.
        ``[[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5, s2, s4, s6]]``
    """
    b, c, t = x.size()

    x_unsqz = x.view(b, num_sqz, c // num_sqz, t)
    x_unsqz = x_unsqz.permute(0, 2, 3, 1).contiguous().view(b, c // num_sqz, t * num_sqz)

    if x_mask is not None:
        x_mask = x_mask.unsqueeze(-1).repeat(1, 1, 1, num_sqz).view(b, 1, t * num_sqz)
    else:
        x_mask = torch.ones(b, 1, t * num_sqz).to(device=x.device, dtype=x.dtype)
    return x_unsqz * x_mask, x_mask


class Decoder(nn.Module):
    """Stack of Glow Decoder Modules.

    ::

        Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze

    Args:
        in_channels (int): channels of input tensor.
        hidden_channels (int): hidden decoder channels.
        kernel_size (int): Coupling block kernel size. (Wavenet filter kernel size.)
        dilation_rate (int): rate to increase dilation by each layer in a decoder block.
        num_flow_blocks (int): number of decoder blocks.
        num_coupling_layers (int): number coupling layers. (number of wavenet layers.)
        dropout_p (float): wavenet dropout rate.
        sigmoid_scale (bool): enable/disable sigmoid scaling in coupling layer.
    """

    def __init__(
        self,
        in_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        num_flow_blocks,
        num_coupling_layers,
        dropout_p=0.0,
        num_splits=4,
        num_squeeze=2,
        sigmoid_scale=False,
        c_in_channels=0,
    ):
        super().__init__()

        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.num_flow_blocks = num_flow_blocks
        self.num_coupling_layers = num_coupling_layers
        self.dropout_p = dropout_p
        self.num_splits = num_splits
        self.num_squeeze = num_squeeze
        self.sigmoid_scale = sigmoid_scale
        self.c_in_channels = c_in_channels

        self.flows = nn.ModuleList()
        for _ in range(num_flow_blocks):
            self.flows.append(ActNorm(channels=in_channels * num_squeeze))
            self.flows.append(InvConvNear(channels=in_channels * num_squeeze, num_splits=num_splits))
            self.flows.append(
                CouplingBlock(
                    in_channels * num_squeeze,
                    hidden_channels,
                    kernel_size=kernel_size,
                    dilation_rate=dilation_rate,
                    num_layers=num_coupling_layers,
                    c_in_channels=c_in_channels,
                    dropout_p=dropout_p,
                    sigmoid_scale=sigmoid_scale,
                )
            )

    def forward(self, x, x_mask, g=None, reverse=False):
        """
        Shapes:
            - x:  :math:`[B, C, T]`
            - x_mask: :math:`[B, 1 ,T]`
            - g: :math:`[B, C]`
        """
        if not reverse:
            flows = self.flows
            logdet_tot = 0
        else:
            flows = reversed(self.flows)
            logdet_tot = None

        if self.num_squeeze > 1:
            x, x_mask = squeeze(x, x_mask, self.num_squeeze)
        for f in flows:
            if not reverse:
                x, logdet = f(x, x_mask, g=g, reverse=reverse)
                logdet_tot += logdet
            else:
                x, logdet = f(x, x_mask, g=g, reverse=reverse)
        if self.num_squeeze > 1:
            x, x_mask = unsqueeze(x, x_mask, self.num_squeeze)
        return x, logdet_tot

    def store_inverse(self):
        for f in self.flows:
            f.store_inverse()
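
To see what the squeeze/unsqueeze pair does, run it on a random tensor: with num_sqz=2 the channel count doubles while the time axis halves, and the round trip recovers the input exactly. This is a standalone sketch, not part of the project files:

import torch
from decoder import squeeze, unsqueeze

# Round-trip check for the squeeze/unsqueeze helpers defined above.
x = torch.randn(1, 80, 100)                 # [B, C, T] with an even T
x_sqz, mask = squeeze(x, num_sqz=2)         # -> [1, 160, 50]
x_rec, _ = unsqueeze(x_sqz, num_sqz=2)      # -> [1, 80, 100]
print(x_sqz.shape, x_rec.shape)
print(torch.allclose(x, x_rec))             # True: the operation is invertible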

encoder.py

import math

import torch
from torch import nn

from TTS.tts.layers.generic.gated_conv import GatedConvBlock
from TTS.tts.layers.generic.res_conv_bn import ResidualConv1dBNBlock
from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.utils.helpers import sequence_mask


class Encoder(nn.Module):
    """Glow-TTS encoder module.

    ::

        embedding -> <prenet> -> encoder_module -> <postnet> --> proj_mean
                                                             |
                                                             |-> proj_var
                                                             |
                                                             |-> concat -> duration_predictor
                                                                    ↑
                                                              speaker_embed

    Args:
        num_chars (int): number of characters.
        out_channels (int): number of output channels.
        hidden_channels (int): encoder's embedding size.
        hidden_channels_ffn (int): transformer's feed-forward channels.
        kernel_size (int): kernel size for conv layers and duration predictor.
        dropout_p (float): dropout rate for any dropout layer.
        mean_only (bool): if True, output only mean values and use constant std.
        use_prenet (bool): if True, use pre-convolutional layers before transformer layers.
        c_in_channels (int): number of channels in conditional input.

    Shapes:
        - input: (B, T, C)

    ::

        suggested encoder params...

        for encoder_type == 'rel_pos_transformer'
            encoder_params={
                'kernel_size': 3,
                'dropout_p': 0.1,
                'num_layers': 6,
                'num_heads': 2,
                'hidden_channels_ffn': 768,  # 4 times the hidden_channels
                'input_length': None
            }

        for encoder_type == 'gated_conv'
            encoder_params={
                'kernel_size': 5,
                'dropout_p': 0.1,
                'num_layers': 9,
            }

        for encoder_type == 'residual_conv_bn'
            encoder_params={
                "kernel_size": 4,
                "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
                "num_conv_blocks": 2,
                "num_res_blocks": 13
            }

        for encoder_type == 'time_depth_separable'
            encoder_params={
                "kernel_size": 5,
                'num_layers': 9,
            }
    """

    def __init__(
        self,
        num_chars,
        out_channels,
        hidden_channels,
        hidden_channels_dp,
        encoder_type,
        encoder_params,
        dropout_p_dp=0.1,
        mean_only=False,
        use_prenet=True,
        c_in_channels=0,
    ):
        super().__init__()
        # class arguments
        self.num_chars = num_chars
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.hidden_channels_dp = hidden_channels_dp
        self.dropout_p_dp = dropout_p_dp
        self.mean_only = mean_only
        self.use_prenet = use_prenet
        self.c_in_channels = c_in_channels
        self.encoder_type = encoder_type
        # embedding layer
        self.emb = nn.Embedding(num_chars, hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
        # init encoder module
        if encoder_type.lower() == "rel_pos_transformer":
            if use_prenet:
                self.prenet = ResidualConv1dLayerNormBlock(
                    hidden_channels, hidden_channels, hidden_channels, kernel_size=5, num_layers=3, dropout_p=0.5
                )
            self.encoder = RelativePositionTransformer(
                hidden_channels, hidden_channels, hidden_channels, **encoder_params
            )
        elif encoder_type.lower() == "gated_conv":
            self.encoder = GatedConvBlock(hidden_channels, **encoder_params)
        elif encoder_type.lower() == "residual_conv_bn":
            if use_prenet:
                self.prenet = nn.Sequential(nn.Conv1d(hidden_channels, hidden_channels, 1), nn.ReLU())
            self.encoder = ResidualConv1dBNBlock(hidden_channels, hidden_channels, hidden_channels, **encoder_params)
            self.postnet = nn.Sequential(
                nn.Conv1d(self.hidden_channels, self.hidden_channels, 1), nn.BatchNorm1d(self.hidden_channels)
            )
        elif encoder_type.lower() == "time_depth_separable":
            if use_prenet:
                self.prenet = ResidualConv1dLayerNormBlock(
                    hidden_channels, hidden_channels, hidden_channels, kernel_size=5, num_layers=3, dropout_p=0.5
                )
            self.encoder = TimeDepthSeparableConvBlock(
                hidden_channels, hidden_channels, hidden_channels, **encoder_params
            )
        else:
            raise ValueError(" [!] Unknown encoder type.")

        # final projection layers
        self.proj_m = nn.Conv1d(hidden_channels, out_channels, 1)
        if not mean_only:
            self.proj_s = nn.Conv1d(hidden_channels, out_channels, 1)
        # duration predictor
        self.duration_predictor = DurationPredictor(
            hidden_channels + c_in_channels, hidden_channels_dp, 3, dropout_p_dp
        )

    def forward(self, x, x_lengths, g=None):
        """
        Shapes:
            - x: :math:`[B, C, T]`
            - x_lengths: :math:`[B]`
            - g (optional): :math:`[B, 1, T]`
        """
        # embedding layer
        # [B ,T, D]
        x = self.emb(x) * math.sqrt(self.hidden_channels)
        # [B, D, T]
        x = torch.transpose(x, 1, -1)
        # compute input sequence mask
        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        # prenet
        if hasattr(self, "prenet") and self.use_prenet:
            x = self.prenet(x, x_mask)
        # encoder
        x = self.encoder(x, x_mask)
        # postnet
        if hasattr(self, "postnet"):
            x = self.postnet(x) * x_mask
        # set duration predictor input
        if g is not None:
            g_exp = g.expand(-1, -1, x.size(-1))
            x_dp = torch.cat([x.detach(), g_exp], 1)
        else:
            x_dp = x.detach()
        # final projection layer
        x_m = self.proj_m(x) * x_mask
        if not self.mean_only:
            x_logs = self.proj_s(x) * x_mask
        else:
            x_logs = torch.zeros_like(x_m)
        # duration predictor
        logw = self.duration_predictor(x_dp, x_mask)
        return x_m, x_logs, logw, x_mask
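
A quick way to check the encoder output shapes is to feed it a random token batch with the rel_pos_transformer parameters suggested in the docstring. The sizes below (192 hidden channels, 80 mel channels) are assumptions matching the GlowTTS defaults, not values fixed by this file:

import torch
from encoder import Encoder

# Standalone shape check for the encoder defined above.
enc = Encoder(
    num_chars=100,
    out_channels=80,
    hidden_channels=192,
    hidden_channels_dp=256,
    encoder_type='rel_pos_transformer',
    encoder_params={'kernel_size': 3, 'dropout_p': 0.1, 'num_layers': 6,
                    'num_heads': 2, 'hidden_channels_ffn': 768, 'input_length': None},
)
x = torch.randint(0, 100, (2, 21))      # [B, T_en] token ids
x_lengths = torch.tensor([21, 13])
o_mean, o_log_scale, logw, x_mask = enc(x, x_lengths)
print(o_mean.shape, o_log_scale.shape, logw.shape, x_mask.shape)
# expected: [2, 80, 21] [2, 80, 21] [2, 1, 21] [2, 1, 21]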

glow_tts.py

import math
from typing import Dict, List, Tuple, Union

import torch
from coqpit import Coqpit
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from torch.nn import functional as F

from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from decoder import Decoder
from encoder import Encoder
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.io import load_fsspec


class GlowTTS(BaseTTS):
    """GlowTTS model.

    Paper::
        https://arxiv.org/abs/2005.11129

    Paper abstract::
        Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate
        mel-spectrograms from text in parallel. Despite the advantage, the parallel TTS models cannot be trained
        without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS,
        a flow-based generative model for parallel TTS that does not require any external aligner. By combining the
        properties of flows and dynamic programming, the proposed model searches for the most probable monotonic
        alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard
        monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows
        enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over
        the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our
        model can be easily extended to a multi-speaker setting.

    Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.

    Examples:
        Init only model layers.

        >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
        >>> from TTS.tts.models.glow_tts import GlowTTS
        >>> config = GlowTTSConfig(num_chars=2)
        >>> model = GlowTTS(config)

        Fully init a model ready for action. All the class attributes and class members
        (e.g Tokenizer, AudioProcessor, etc.). are initialized internally based on config values.

        >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
        >>> from TTS.tts.models.glow_tts import GlowTTS
        >>> config = GlowTTSConfig()
        >>> model = GlowTTS.init_from_config(config, verbose=False)
    """

    def __init__(
        self,
        config: GlowTTSConfig,
        ap: "AudioProcessor" = None,
        tokenizer: "TTSTokenizer" = None,
        speaker_manager: SpeakerManager = None,
    ):
        super().__init__(config, ap, tokenizer, speaker_manager)

        # pass all config fields to `self`
        # for fewer code change
        self.config = config
        for key in config:
            setattr(self, key, config[key])

        self.decoder_output_dim = config.out_channels

        # init multi-speaker layers if necessary
        self.init_multispeaker(config)

        self.run_data_dep_init = config.data_dep_init_steps > 0
        self.encoder = Encoder(
            self.num_chars,
            out_channels=self.out_channels,
            hidden_channels=self.hidden_channels_enc,
            hidden_channels_dp=self.hidden_channels_dp,
            encoder_type=self.encoder_type,
            encoder_params=self.encoder_params,
            mean_only=self.mean_only,
            use_prenet=self.use_encoder_prenet,
            dropout_p_dp=self.dropout_p_dp,
            c_in_channels=self.c_in_channels,
        )

        self.decoder = Decoder(
            self.out_channels,
            self.hidden_channels_dec,
            self.kernel_size_dec,
            self.dilation_rate,
            self.num_flow_blocks_dec,
            self.num_block_layers,
            dropout_p=self.dropout_p_dec,
            num_splits=self.num_splits,
            num_squeeze=self.num_squeeze,
            sigmoid_scale=self.sigmoid_scale,
            c_in_channels=self.c_in_channels,
        )

    def init_multispeaker(self, config: Coqpit):
        """Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding
        vector dimension to the encoder layer channel size. If model uses d-vectors, then it only sets
        speaker embedding vector dimension to the d-vector dimension from the config.

        Args:
            config (Coqpit): Model configuration.
        """
        self.embedded_speaker_dim = 0
        # set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
        if self.speaker_manager is not None:
            self.num_speakers = self.speaker_manager.num_speakers
        # set ultimate speaker embedding size
        if config.use_d_vector_file:
            self.embedded_speaker_dim = (
                config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
            )
            if self.speaker_manager is not None:
                assert (
                    config.d_vector_dim == self.speaker_manager.embedding_dim
                ), " [!] d-vector dimension mismatch b/w config and speaker manager."
        # init speaker embedding layer
        if config.use_speaker_embedding and not config.use_d_vector_file:
            print(" > Init speaker_embedding layer.")
            self.embedded_speaker_dim = self.hidden_channels_enc
            self.emb_g = nn.Embedding(self.num_speakers, self.hidden_channels_enc)
            nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
        # set conditioning dimensions
        self.c_in_channels = self.embedded_speaker_dim

    @staticmethod
    def compute_outputs(attn, o_mean, o_log_scale, x_mask):
        """Compute and format the mode outputs with the given alignment map"""
        y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        # compute total duration with adjustment
        o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask
        return y_mean, y_log_scale, o_attn_dur

    def unlock_act_norm_layers(self):
        """Unlock activation normalization layers for data dependent initialization."""
        for f in self.decoder.flows:
            if getattr(f, "set_ddi", False):
                f.set_ddi(True)

    def lock_act_norm_layers(self):
        """Lock activation normalization layers."""
        for f in self.decoder.flows:
            if getattr(f, "set_ddi", False):
                f.set_ddi(False)

    def _set_speaker_input(self, aux_input: Dict):
        if aux_input is None:
            d_vectors = None
            speaker_ids = None
        else:
            d_vectors = aux_input.get("d_vectors", None)
            speaker_ids = aux_input.get("speaker_ids", None)

        if d_vectors is not None and speaker_ids is not None:
            raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")

        if speaker_ids is not None and not hasattr(self, "emb_g"):
            raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")

        g = speaker_ids if speaker_ids is not None else d_vectors
        return g

    def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]:
        g = self._set_speaker_input(aux_input)
        # speaker embedding
        if g is not None:
            if hasattr(self, "emb_g"):
                # use speaker embedding layer
                if not g.size():  # if is a scalar
                    g = g.unsqueeze(0)  # unsqueeze
                g = F.normalize(self.emb_g(g)).unsqueeze(-1)  # [b, h, 1]
            else:
                # use d-vector
                g = F.normalize(g).unsqueeze(-1)  # [b, h, 1]
        return g

    def forward(
        self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """
        Args:
            x (torch.Tensor):
                Input text sequence ids. :math:`[B, T_en]`

            x_lengths (torch.Tensor):
                Lengths of input text sequences. :math:`[B]`

            y (torch.Tensor):
                Target mel-spectrogram frames. :math:`[B, T_de, C_mel]`

            y_lengths (torch.Tensor):
                Lengths of target mel-spectrogram frames. :math:`[B]`

            aux_input (Dict):
                Auxiliary inputs. `d_vectors` is speaker embedding vectors for a multi-speaker model.
                :math:`[B, D_vec]`. `speaker_ids` is speaker ids for a multi-speaker model using a
                speaker-embedding layer. :math:`B`

        Returns:
            Dict:
                - z: :math:`[B, T_de, C]`
                - logdet: :math:`B`
                - y_mean: :math:`[B, T_de, C]`
                - y_log_scale: :math:`[B, T_de, C]`
                - alignments: :math:`[B, T_en, T_de]`
                - durations_log: :math:`[B, T_en, 1]`
                - total_durations_log: :math:`[B, T_en, 1]`
        """
        # [B, T, C] -> [B, C, T]
        y = y.transpose(1, 2)
        y_max_length = y.size(2)
        # norm speaker embeddings
        g = self._speaker_embedding(aux_input)
        # embedding pass
        o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
        # drop residual frames wrt num_squeeze and set y_lengths.
        y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
        # create masks
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
        # [B, 1, T_en, T_de]
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        # decoder pass
        z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
        # find the alignment path
        with torch.no_grad():
            o_scale = torch.exp(-2 * o_log_scale)
            logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1)  # [b, t, 1]
            logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2))  # [b, t, d] x [b, d, t'] = [b, t, t']
            logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z)  # [b, t, d] x [b, d, t'] = [b, t, t']
            logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1)  # [b, t, 1]
            logp = logp1 + logp2 + logp3 + logp4  # [b, t, t']
            attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
        y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
        attn = attn.squeeze(1).permute(0, 2, 1)
        outputs = {
            "z": z.transpose(1, 2),
            "logdet": logdet,
            "y_mean": y_mean.transpose(1, 2),
            "y_log_scale": y_log_scale.transpose(1, 2),
            "alignments": attn,
            "durations_log": o_dur_log.transpose(1, 2),
            "total_durations_log": o_attn_dur.transpose(1, 2),
        }
        return outputs

    @torch.no_grad()
    def inference_with_MAS(
        self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """
        It's similar to the teacher forcing in Tacotron.
        It was proposed in: https://arxiv.org/abs/2104.05557

        Shapes:
            - x: :math:`[B, T]`
            - x_lenghts: :math:`B`
            - y: :math:`[B, T, C]`
            - y_lengths: :math:`B`
            - g: :math:`[B, C] or B`
        """
        y = y.transpose(1, 2)
        y_max_length = y.size(2)
        # norm speaker embeddings
        g = self._speaker_embedding(aux_input)
        # embedding pass
        o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
        # drop residual frames wrt num_squeeze and set y_lengths.
        y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
        # create masks
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        # decoder pass
        z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
        # find the alignment path between z and encoder output
        o_scale = torch.exp(-2 * o_log_scale)
        logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1)  # [b, t, 1]
        logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2))  # [b, t, d] x [b, d, t'] = [b, t, t']
        logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z)  # [b, t, d] x [b, d, t'] = [b, t, t']
        logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1)  # [b, t, 1]
        logp = logp1 + logp2 + logp3 + logp4  # [b, t, t']
        attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
        y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
        attn = attn.squeeze(1).permute(0, 2, 1)

        # get predicted aligned distribution
        z = y_mean * y_mask

        # reverse the decoder and predict using the aligned distribution
        y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
        outputs = {
            "model_outputs": z.transpose(1, 2),
            "logdet": logdet,
            "y_mean": y_mean.transpose(1, 2),
            "y_log_scale": y_log_scale.transpose(1, 2),
            "alignments": attn,
            "durations_log": o_dur_log.transpose(1, 2),
            "total_durations_log": o_attn_dur.transpose(1, 2),
        }
        return outputs

    @torch.no_grad()
    def decoder_inference(
        self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """
        Shapes:
            - y: :math:`[B, T, C]`
            - y_lengths: :math:`B`
            - g: :math:`[B, C] or B`
        """
        y = y.transpose(1, 2)
        y_max_length = y.size(2)
        g = self._speaker_embedding(aux_input)
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype)
        # decoder pass
        z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
        # reverse decoder and predict
        y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
        outputs = {}
        outputs["model_outputs"] = y.transpose(1, 2)
        outputs["logdet"] = logdet
        return outputs

    @torch.no_grad()
    def inference(
        self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        x_lengths = aux_input["x_lengths"]
        g = self._speaker_embedding(aux_input)
        # embedding pass
        o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
        # compute output durations
        w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale
        w_ceil = torch.clamp_min(torch.ceil(w), 1)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_max_length = None
        # compute masks
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        # compute attention mask
        attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
        y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)

        z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask
        # decoder pass
        y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
        attn = attn.squeeze(1).permute(0, 2, 1)
        outputs = {
            "model_outputs": y.transpose(1, 2),
            "logdet": logdet,
            "y_mean": y_mean.transpose(1, 2),
            "y_log_scale": y_log_scale.transpose(1, 2),
            "alignments": attn,
            "durations_log": o_dur_log.transpose(1, 2),
            "total_durations_log": o_attn_dur.transpose(1, 2),
        }
        return outputs

    def train_step(self, batch: dict, criterion: nn.Module):
        """A single training step. Forward pass and loss computation. Run data-dependent initialization for the
        first `config.data_dep_init_steps` steps.

        Args:
            batch (dict): [description]
            criterion (nn.Module): [description]
        """
        text_input = batch["text_input"]
        text_lengths = batch["text_lengths"]
        mel_input = batch["mel_input"]
        mel_lengths = batch["mel_lengths"]
        d_vectors = batch["d_vectors"]
        speaker_ids = batch["speaker_ids"]

        if self.run_data_dep_init and self.training:
            # compute data-dependent initialization of activation norm layers
            self.unlock_act_norm_layers()
            with torch.no_grad():
                _ = self.forward(
                    text_input,
                    text_lengths,
                    mel_input,
                    mel_lengths,
                    aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
                )
            outputs = None
            loss_dict = None
            self.lock_act_norm_layers()
        else:
            # normal training step
            outputs = self.forward(
                text_input,
                text_lengths,
                mel_input,
                mel_lengths,
                aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
            )

            with autocast(enabled=False):  # avoid mixed_precision in criterion
                loss_dict = criterion(
                    outputs["z"].float(),
                    outputs["y_mean"].float(),
                    outputs["y_log_scale"].float(),
                    outputs["logdet"].float(),
                    mel_lengths,
                    outputs["durations_log"].float(),
                    outputs["total_durations_log"].float(),
                    text_lengths,
                )
        return outputs, loss_dict

    def _create_logs(self, batch, outputs, ap):
        alignments = outputs["alignments"]
        text_input = batch["text_input"][:1] if batch["text_input"] is not None else None
        text_lengths = batch["text_lengths"]
        mel_input = batch["mel_input"]
        d_vectors = batch["d_vectors"][:1] if batch["d_vectors"] is not None else None
        speaker_ids = batch["speaker_ids"][:1] if batch["speaker_ids"] is not None else None

        # model runs reverse flow to predict spectrograms
        pred_outputs = self.inference(
            text_input,
            aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
        )
        model_outputs = pred_outputs["model_outputs"]

        pred_spec = model_outputs[0].data.cpu().numpy()
        gt_spec = mel_input[0].data.cpu().numpy()
        align_img = alignments[0].data.cpu().numpy()

        figures = {
            "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
            "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
            "alignment": plot_alignment(align_img, output_fig=False),
        }

        # Sample audio
        train_audio = ap.inv_melspectrogram(pred_spec.T)
        return figures, {"audio": train_audio}

    def train_log(
        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
    ) -> None:  # pylint: disable=no-self-use
        figures, audios = self._create_logs(batch, outputs, self.ap)
        logger.train_figures(steps, figures)
        logger.train_audios(steps, audios, self.ap.sample_rate)

    @torch.no_grad()
    def eval_step(self, batch: dict, criterion: nn.Module):
        return self.train_step(batch, criterion)

    def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
        figures, audios = self._create_logs(batch, outputs, self.ap)
        logger.eval_figures(steps, figures)
        logger.eval_audios(steps, audios, self.ap.sample_rate)

    @torch.no_grad()
    def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
        """Generic test run for `tts` models used by `Trainer`.

        You can override this for a different behaviour.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
        """
        print(" | > Synthesizing test sentences.")
        test_audios = {}
        test_figures = {}
        test_sentences = self.config.test_sentences
        aux_inputs = self._get_test_aux_input()
        if len(test_sentences) == 0:
            print(" | [!] No test sentences provided.")
        else:
            for idx, sen in enumerate(test_sentences):
                outputs = synthesis(
                    self,
                    sen,
                    self.config,
                    "cuda" in str(next(self.parameters()).device),
                    speaker_id=aux_inputs["speaker_id"],
                    d_vector=aux_inputs["d_vector"],
                    style_wav=aux_inputs["style_wav"],
                    use_griffin_lim=True,
                    do_trim_silence=False,
                )

                test_audios["{}-audio".format(idx)] = outputs["wav"]
                test_figures["{}-prediction".format(idx)] = plot_spectrogram(
                    outputs["outputs"]["model_outputs"], self.ap, output_fig=False
                )
                test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False)
        return test_figures, test_audios

    def preprocess(self, y, y_lengths, y_max_length, attn=None):
        if y_max_length is not None:
            y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze
            y = y[:, :, :y_max_length]
            if attn is not None:
                attn = attn[:, :, :, :y_max_length]
        y_lengths = torch.div(y_lengths, self.num_squeeze, rounding_mode="floor") * self.num_squeeze
        return y, y_lengths, y_max_length, attn

    def store_inverse(self):
        self.decoder.store_inverse()

    def load_checkpoint(
        self, config, checkpoint_path, eval=False
    ):  # pylint: disable=unused-argument, redefined-builtin
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
        self.load_state_dict(state["model"])
        if eval:
            self.eval()
            self.store_inverse()
            assert not self.training

    @staticmethod
    def get_criterion():
        from TTS.tts.layers.losses import GlowTTSLoss  # pylint: disable=import-outside-toplevel

        return GlowTTSLoss()

    def on_train_step_start(self, trainer):
        """Decide on every training step whether to enable/disable data-dependent initialization."""
        self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps

    @staticmethod
    def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
        """Initiate model from config

        Args:
            config (VitsConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
            verbose (bool): If True, print init messages. Defaults to True.
        """
        from TTS.utils.audio import AudioProcessor

        ap = AudioProcessor.init_from_config(config, verbose)
        tokenizer, new_config = TTSTokenizer.init_from_config(config)
        speaker_manager = SpeakerManager.init_from_config(config, samples)
        return GlowTTS(new_config, ap, tokenizer, speaker_manager)
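
Following the class docstring, the model layers can be initialized from a bare config and run in inference mode on random token ids. This sketch only checks that the reverse flow produces mel frames of the expected shape; an untrained model will not produce meaningful audio:

import torch
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from glow_tts import GlowTTS

# Standalone sketch: layers-only init (as in the docstring) plus one inference pass.
config = GlowTTSConfig(num_chars=100)
model = GlowTTS(config)
model.eval()
model.store_inverse()  # cache inverse conv weights for the reverse flow

x = torch.randint(0, 100, (1, 21))  # [B, T_en]
with torch.no_grad():
    outputs = model.inference(x, aux_input={'x_lengths': torch.tensor([21])})
print(outputs['model_outputs'].shape)  # [1, T_de, 80] mel frames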

train.py
 

from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.datasets import load_tts_samples
import os
import numpy as np
import torch
from glow_tts import GlowTTS
from trainer import Trainer, TrainerArgs
from TTS.utils.radam import RAdam
from trainer.torch import NoamLR
from TTS.tts.layers.losses import GlowTTSLoss


def init_config():
    dataset_config = BaseDatasetConfig(
        path='train/LJSpeech-1.1/',
        meta_file_train='metadata.csv',
        formatter='ljspeech'
    )

    config = GlowTTSConfig(
        batch_size=32,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=3,
        text_cleaner='phoneme_cleaners',
        use_phonemes=True,
        phoneme_language='en-us',
        phoneme_cache_path='train/phoneme_cache',
        print_step=25,
        print_eval=False,
        mixed_precision=True,
        output_path='train',
        datasets=[dataset_config],
        save_step=1000,
        data_dep_init_steps=0,
    )

    processor = AudioProcessor.init_from_config(config)
    tokenizer, config = TTSTokenizer.init_from_config(config)

    datas, _ = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.001)

    # sort the samples by audio file size (shortest first)
    lens = [os.path.getsize(i['audio_file']) for i in datas]
    ids = np.argsort(lens)
    datas = [datas[i] for i in ids]

    return config, processor, tokenizer, datas


config, processor, tokenizer, datas = init_config()

out = processor.load_wav('train/LJSpeech-1.1/wavs/LJ001-0108.wav')
print('processor.load_wav=', out, out.shape)

out = tokenizer.text_to_ids(
    'it is obvious that legibility is the first thing to be aimed at in the forms of the letters'
)
print('tokenizer.text_to_ids=', out, len(out))

out = processor.melspectrogram(processor.load_wav('train/LJSpeech-1.1/wavs/LJ001-0108.wav'))
print('processor.melspectrogram=', out.shape)

# inspect the loaded samples
print(len(datas), datas[:2])


def init_model(from_trainer):
    model = GlowTTS(config, processor, tokenizer, speaker_manager=None)
    model.run_data_dep_init = False

    if from_trainer:
        trainer = Trainer(
            args=TrainerArgs(),
            config=config,
            output_path='train',
            model=model,
            train_samples=datas,
            eval_samples=None
        )
        optimizer = trainer.get_optimizer(model, config)
        scheduler = trainer.get_scheduler(model, config, optimizer)
        criterion = trainer.get_criterion(model)
        loader = trainer.get_train_dataloader({}, datas, verbose=True)
    else:
        optimizer = RAdam(model.parameters(), lr=1e-3, betas=[0.9, 0.998], weight_decay=1e-6)
        scheduler = NoamLR(optimizer, warmup_steps=4000)
        criterion = GlowTTSLoss()
        loader = model.get_data_loader(
            config=config, assets={}, is_eval=False, samples=datas, verbose=True, num_gpus=0
        )

    return model, optimizer, scheduler, criterion, loader


model, optimizer, scheduler, criterion, loader = init_model(from_trainer=False)

# parameter count (in units of 10k)
print(sum(i.numel() for i in model.parameters()) / 10000)

# optimizer, scheduler, criterion, loader


def train():
    global model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.train()
    model = model.to(device)

    for epoch in range(config.epochs):
        for i, data in enumerate(loader):
            data = model.format_batch(data)
            for k in data.keys():
                if isinstance(data[k], torch.Tensor):
                    data[k] = data[k].to(device)

            print("#############################################")
            print(data['text_input'].shape)
            print(data['mel_input'].shape)
            print("====================================================")

            _, loss_dict = model.train_step(data, criterion)

            model.zero_grad(set_to_none=True)
            loss_dict['loss'].backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            if i % 2 == 0:
                lr = optimizer.state_dict()['param_groups'][0]['lr']
                print(epoch, i, loss_dict['loss'].item(), lr)

            scheduler.step()

    config.save_json('train/config.json')

    model = model.cpu()
    torch.save({'config': config.to_dict(), 'model': model.state_dict()}, 'train/model.pth')


if __name__ == '__main__':
    train()

train.py is the entry point for training the TTS model; the trained model is saved in the train folder (train/config.json and train/model.pth).
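
Once training has produced train/config.json and train/model.pth, the checkpoint can be reloaded for synthesis. A minimal sketch, assuming the files written by train.py above, Coqpit's load_json, and Coqui's synthesis helper; adjust the text and output handling as needed:

import torch
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.utils.synthesis import synthesis
from glow_tts import GlowTTS

# Minimal synthesis sketch using the artifacts saved by train.py (assumed paths).
config = GlowTTSConfig()
config.load_json('train/config.json')
model = GlowTTS.init_from_config(config)
model.load_state_dict(torch.load('train/model.pth', map_location='cpu')['model'])
model.eval()
model.store_inverse()

outputs = synthesis(
    model,
    'it is obvious that legibility is the first thing to be aimed at.',
    config,
    use_cuda=False,
    use_griffin_lim=True,
    do_trim_silence=False,
)
# outputs['wav'] holds the Griffin-Lim waveform; save it with model.ap.save_wav if desired.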
