
Commit

Merge branch 'dev' into main
erogol committed Feb 10, 2023
2 parents (6cfb590 + 914280a), commit ea5bd7d
Showing 51 changed files with 15 additions and 66 deletions.
2 changes: 1 addition & 1 deletion TTS/VERSION
@@ -1 +1 @@
-0.10.2
+0.11.0
2 changes: 1 addition & 1 deletion TTS/api.py
@@ -102,7 +102,7 @@ def download_model_by_name(self, model_name: str):
return model_path, config_path, vocoder_path, vocoder_config_path

def load_model_by_name(self, model_name: str, gpu: bool = False):
""" Load one of 🐸TTS models by name.
"""Load one of 🐸TTS models by name.
Args:
model_name (str): Model name to load. You can list models by ```tts.models```.
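For context, here is a minimal usage sketch of the TTS.api interface whose docstring this hunk touches. It is a sketch only: the model name and output path are illustrative, and the constructor and tts_to_file call are assumed from this release's api.py rather than shown in the diff.

# Sketch, not the committed code: model name and file path are illustrative;
# the hunk above only shows load_model_by_name and the `tts.models` listing.
from TTS.api import TTS

tts = TTS(model_name="tts_models/en/ljspeech/glow-tts", gpu=False)
print(tts.models)  # list the released 🐸TTS model names, as the docstring suggests
tts.tts_to_file(text="Hello from Coqui TTS.", file_path="hello.wav")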
1 change: 0 additions & 1 deletion TTS/bin/eval_encoder.py
@@ -10,7 +10,6 @@


def compute_encoder_accuracy(dataset_items, encoder_manager):

class_name_key = encoder_manager.encoder_config.class_name_key
map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

1 change: 0 additions & 1 deletion TTS/bin/extract_tts_spectrograms.py
@@ -164,7 +164,6 @@ def extract_spectrograms(
model.eval()
export_metadata = []
for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):

# format data
(
text_input,
1 change: 0 additions & 1 deletion TTS/bin/resample.py
@@ -35,7 +35,6 @@ def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs


if __name__ == "__main__":

parser = argparse.ArgumentParser(
description="""Resample a folder recusively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
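As a hedged illustration of the function named in the hunk header above, resample_files can also be called directly from Python. The directory paths and the n_jobs value below are made up for the example, and n_jobs' default is truncated in the header, so it is passed explicitly.

# Sketch only: paths are illustrative; keyword names follow the signature
# shown in the hunk header above.
from TTS.bin.resample import resample_files

resample_files(
    input_dir="data/wavs",       # folder searched recursively for wav files
    output_sr=22050,             # target sample rate in Hz
    output_dir="data/wavs_22k",  # omit to resample the files in place
    file_ext="wav",
    n_jobs=4,
)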
2 changes: 0 additions & 2 deletions TTS/encoder/utils/generic_utils.py
@@ -14,7 +14,6 @@

class AugmentWAV(object):
def __init__(self, ap, augmentation_config):

self.ap = ap
self.use_additive_noise = False

@@ -67,7 +66,6 @@ def create_augmentation_global_list(self):
self.global_noise_list.append("RIR_AUG")

def additive_noise(self, noise_type, audio):

clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)

noise_list = random.sample(
1 change: 0 additions & 1 deletion TTS/tts/datasets/dataset.py
@@ -411,7 +411,6 @@ def collate_fn(self, batch):

# Puts each data field into a tensor with outer dimension batch size
if isinstance(batch[0], collections.abc.Mapping):

token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])

# sort items with text input length for RNN efficiency
2 changes: 0 additions & 2 deletions TTS/tts/layers/feed_forward/decoder.py
@@ -81,7 +81,6 @@ class RelativePositionTransformerDecoder(nn.Module):
"""

def __init__(self, in_channels, out_channels, hidden_channels, params):

super().__init__()
self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
self.rel_pos_transformer = RelativePositionTransformer(in_channels, out_channels, hidden_channels, **params)
@@ -111,7 +110,6 @@ class FFTransformerDecoder(nn.Module):
"""

def __init__(self, in_channels, out_channels, params):

super().__init__()
self.transformer_block = FFTransformerBlock(in_channels, **params)
self.postnet = nn.Conv1d(in_channels, out_channels, 1)
1 change: 0 additions & 1 deletion TTS/tts/layers/feed_forward/duration_predictor.py
@@ -18,7 +18,6 @@ class DurationPredictor(nn.Module):
"""

def __init__(self, hidden_channels):

super().__init__()

self.layers = nn.ModuleList(
1 change: 0 additions & 1 deletion TTS/tts/layers/generic/res_conv_bn.py
@@ -100,7 +100,6 @@ class ResidualConv1dBNBlock(nn.Module):
def __init__(
self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2
):

super().__init__()
assert len(dilations) == num_res_blocks
self.res_blocks = nn.ModuleList()
1 change: 0 additions & 1 deletion TTS/tts/layers/generic/wavenet.py
@@ -153,7 +153,6 @@ def __init__(
dropout_p=0,
weight_norm=True,
):

super().__init__()
self.wn_blocks = nn.ModuleList()
for idx in range(num_blocks):
2 changes: 0 additions & 2 deletions TTS/tts/layers/glow_tts/transformer.py
@@ -64,7 +64,6 @@ def __init__(
proximal_bias=False,
proximal_init=False,
):

super().__init__()
assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
# class attributes
@@ -272,7 +271,6 @@ class FeedForwardNetwork(nn.Module):
"""

def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0, causal=False):

super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
1 change: 0 additions & 1 deletion TTS/tts/layers/losses.py
@@ -363,7 +363,6 @@ def forward(
alignments_backwards,
input_lens,
):

# decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
# the target should be set acccordingly
postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
1 change: 0 additions & 1 deletion TTS/tts/layers/overflow/common_layers.py
@@ -22,7 +22,6 @@ class Encoder(nn.Module):
"""

def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutions=3):

super().__init__()

self.state_per_phone = state_per_phone
1 change: 0 additions & 1 deletion TTS/tts/layers/overflow/decoder.py
@@ -36,7 +36,6 @@ def __init__(
sigmoid_scale=False,
c_in_channels=0,
):

super().__init__()

self.glow_decoder = GlowDecoder(
2 changes: 0 additions & 2 deletions TTS/tts/layers/overflow/neural_hmm.py
@@ -123,7 +123,6 @@ def forward(self, inputs, inputs_len, mels, mel_lens):
h_memory, c_memory = self._init_lstm_states(batch_size, self.memory_rnn_dim, mels)

for t in range(T_max):

# Process Autoregression
h_memory, c_memory = self._process_ar_timestep(t, ar_inputs, h_memory, c_memory)
# Get mean, std and transition vector from decoder for this timestep
@@ -418,7 +417,6 @@ def sample(self, inputs, input_lens, sampling_temp, max_sampling_time, duration_
output_parameter_values = []
quantile = 1
while True:

memory_input = self.prenet(prenet_input.flatten(1).unsqueeze(0))
# will be 1 while sampling
h_memory, c_memory = self.memory_rnn(memory_input.squeeze(0), (h_memory, c_memory))
1 change: 0 additions & 1 deletion TTS/tts/layers/tacotron/attentions.py
@@ -50,7 +50,6 @@ class GravesAttention(nn.Module):
COEF = 0.3989422917366028 # numpy.sqrt(1/(2*numpy.pi))

def __init__(self, query_dim, K):

super().__init__()
self._mask_value = 1e-8
self.K = K
1 change: 0 additions & 1 deletion TTS/tts/layers/tacotron/capacitron_layers.py
@@ -83,7 +83,6 @@ class ReferenceEncoder(nn.Module):
"""

def __init__(self, num_mel, out_dim):

super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
2 changes: 0 additions & 2 deletions TTS/tts/layers/tacotron/gst_layers.py
@@ -31,7 +31,6 @@ class ReferenceEncoder(nn.Module):
"""

def __init__(self, num_mel, embedding_dim):

super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
@@ -119,7 +118,6 @@ class MultiHeadAttention(nn.Module):
"""

def __init__(self, query_dim, key_dim, num_units, num_heads):

super().__init__()
self.num_units = num_units
self.num_heads = num_heads
3 changes: 1 addition & 2 deletions TTS/tts/layers/tacotron/tacotron.py
@@ -27,7 +27,6 @@ class BatchNormConv1d(nn.Module):
"""

def __init__(self, in_channels, out_channels, kernel_size, stride, padding, activation=None):

super().__init__()
self.padding = padding
self.padder = nn.ConstantPad1d(padding, 0)
@@ -149,7 +148,7 @@ def __init__(
activations += [None]
# setup conv1d projection layers
layer_set = []
-for (in_size, out_size, ac) in zip(out_features, conv_projections, activations):
+for in_size, out_size, ac in zip(out_features, conv_projections, activations):
layer = BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1, padding=[1, 1], activation=ac)
layer_set.append(layer)
self.conv1d_projections = nn.ModuleList(layer_set)
1 change: 0 additions & 1 deletion TTS/tts/layers/vits/transforms.py
@@ -21,7 +21,6 @@ def piecewise_rational_quadratic_transform(
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):

if tails is None:
spline_fn = rational_quadratic_spline
spline_kwargs = {}
1 change: 0 additions & 1 deletion TTS/tts/models/align_tts.py
@@ -109,7 +109,6 @@ def __init__(
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):

super().__init__(config, ap, tokenizer, speaker_manager)
self.speaker_manager = speaker_manager
self.phase = -1
7 changes: 6 additions & 1 deletion TTS/tts/models/base_tacotron.py
@@ -252,7 +252,12 @@ def compute_gst(self, inputs, style_input, speaker_embedding=None):

def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
"""Capacitron Variational Autoencoder"""
-(VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer(
+(
+    VAE_outputs,
+    posterior_distribution,
+    prior_distribution,
+    capacitron_beta,
+) = self.capacitron_vae_layer(
reference_mel_info,
text_info,
speaker_embedding, # pylint: disable=not-callable
1 change: 0 additions & 1 deletion TTS/tts/models/base_tts.py
@@ -357,7 +357,6 @@ def get_data_loader(
def _get_test_aux_input(
self,
) -> Dict:

d_vector = None
if self.config.use_d_vector_file:
d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings]
1 change: 0 additions & 1 deletion TTS/tts/models/glow_tts.py
@@ -63,7 +63,6 @@ def __init__(
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):

super().__init__(config, ap, tokenizer, speaker_manager)

# pass all config fields to `self`
1 change: 0 additions & 1 deletion TTS/tts/models/tacotron.py
@@ -36,7 +36,6 @@ def __init__(
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):

super().__init__(config, ap, tokenizer, speaker_manager)

# pass all config fields to `self`
1 change: 0 additions & 1 deletion TTS/tts/models/tacotron2.py
@@ -50,7 +50,6 @@ def __init__(
tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None,
):

super().__init__(config, ap, tokenizer, speaker_manager)

self.decoder_output_dim = config.out_channels
2 changes: 0 additions & 2 deletions TTS/tts/models/vits.py
@@ -633,7 +633,6 @@ def __init__(
speaker_manager: SpeakerManager = None,
language_manager: LanguageManager = None,
):

super().__init__(config, ap, tokenizer, speaker_manager, language_manager)

self.init_multispeaker(config)
@@ -1280,7 +1279,6 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> T

# compute melspec segment
with autocast(enabled=False):

if self.args.encoder_sample_rate:
spec_segment_size = self.spec_segment_size * int(self.interpolate_factor)
else:
1 change: 0 additions & 1 deletion TTS/tts/utils/text/phonemizers/base.py
@@ -32,7 +32,6 @@ class BasePhonemizer(abc.ABC):
"""

def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):

# ensure the backend is installed on the system
if not self.is_available():
raise RuntimeError("{} not installed on your system".format(self.name())) # pragma: nocover
1 change: 0 additions & 1 deletion TTS/utils/audio/processor.py
@@ -158,7 +158,6 @@ def __init__(
verbose=True,
**_,
):

# setup class attributed
self.sample_rate = sample_rate
self.resample = resample
1 change: 0 additions & 1 deletion TTS/utils/download.py
@@ -43,7 +43,6 @@ def stream_url(
total=url_size,
disable=not progress_bar,
) as pbar:

num_bytes = 0
while True:
chunk = upointer.read(block_size)
2 changes: 0 additions & 2 deletions TTS/utils/radam.py
@@ -31,13 +31,11 @@ def __setstate__(self, state): # pylint: disable=useless-super-delegation
super().__setstate__(state)

def step(self, closure=None):

loss = None
if closure is not None:
loss = closure()

for group in self.param_groups:

for p in group["params"]:
if p.grad is None:
continue
1 change: 0 additions & 1 deletion TTS/utils/samplers.py
@@ -72,7 +72,6 @@ def __init__(
self._num_classes_in_batch = num_classes_in_batch

def __iter__(self):

batch = []
if self._num_classes_in_batch != len(self._samplers):
valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)
2 changes: 0 additions & 2 deletions TTS/utils/synthesizer.py
@@ -212,7 +212,6 @@ def tts(
speaker_embedding = None
speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):

# handle Neon models with single speaker.
if len(self.tts_model.speaker_manager.name_to_id) == 1:
speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
@@ -247,7 +246,6 @@ def tts(
if self.tts_languages_file or (
hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
):

if len(self.tts_model.language_manager.name_to_id) == 1:
language_id = list(self.tts_model.language_manager.name_to_id.values())[0]

1 change: 0 additions & 1 deletion TTS/utils/vad.py
@@ -47,7 +47,6 @@ def get_vad_model_and_utils(use_cuda=False):
def remove_silence(
model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
):

# get the VAD model and utils functions
model, get_speech_timestamps, _, collect_chunks = model_and_utils

1 change: 0 additions & 1 deletion TTS/vocoder/datasets/gan_dataset.py
@@ -118,7 +118,6 @@ def load_item(self, idx):
mel = self.ap.melspectrogram(audio)
audio, mel = self._pad_short_samples(audio, mel)
else:

# load precomputed features
wavpath, feat_path = self.item_list[idx]

1 change: 0 additions & 1 deletion TTS/vocoder/datasets/wavegrad_dataset.py
@@ -30,7 +30,6 @@ def __init__(
use_cache=False,
verbose=False,
):

super().__init__()
self.ap = ap
self.item_list = items
3 changes: 0 additions & 3 deletions TTS/vocoder/datasets/wavernn_dataset.py
@@ -12,7 +12,6 @@ class WaveRNNDataset(Dataset):
def __init__(
self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True
):

super().__init__()
self.ap = ap
self.compute_feat = not isinstance(items[0], (tuple, list))
@@ -52,7 +51,6 @@ def load_item(self, index):
else compute it on the fly
"""
if self.compute_feat:

wavpath = self.item_list[index]
audio = self.ap.load_wav(wavpath)
if self.return_segments:
@@ -74,7 +72,6 @@ def load_item(self, index):
raise RuntimeError("Unknown dataset mode - ", self.mode)

else:

wavpath, feat_path = self.item_list[index]
mel = np.load(feat_path.replace("/quant/", "/mel/"))
