Skip to content

Commit

Permalink
Merge pull request #803 from coqui-ai/dev
Browse files Browse the repository at this point in the history
v0.3.0
  • Loading branch information
erogol committed Sep 13, 2021
2 parents dc2ace3 + f563415 commit 0592a58
Show file tree
Hide file tree
Showing 56 changed files with 24,537 additions and 1,031 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,6 @@ old_configs/*
model_importers/*
model_profiling/*
docs/source/TODO/*
docs/source/models/*
.noseids
.dccache
log.txt
Expand Down
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.DEFAULT_GOAL := help
.PHONY: test system-deps dev-deps deps style lint install help
.PHONY: test system-deps dev-deps deps style lint install help docs

help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
Expand Down Expand Up @@ -45,3 +45,6 @@ deps: ## install 🐸 requirements.

install: ## install 🐸 TTS for development.
pip install -e .[all]

docs: ## build the docs
$(MAKE) -C docs clean && $(MAKE) -C docs html
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)

### End-to-End Models
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
Expand All @@ -82,6 +84,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
- Alignment Network: [paper](https://arxiv.org/abs/2108.10447)

### Speaker Encoder
- GE2E: [paper](https://arxiv.org/abs/1710.10467)
Expand Down
27 changes: 14 additions & 13 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@
"license": "MPL",
"contact": "[email protected]"
},
"speedy-speech": {
"description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip",
"stats_file": null,
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "4581e3d",
"author": "Eren Gölge @erogol",
"license": "TBD",
"contact": "[email protected]"
},
"tacotron2-DCA": {
"description": "",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip",
Expand All @@ -47,15 +57,6 @@
"license": "MPL",
"contact": "[email protected]"
},
"speedy-speech-wn": {
"description": "Speedy Speech model with wavenet decoder.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip",
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
"commit": "77b6145",
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "[email protected]"
},
"vits": {
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--ljspeech--vits.zip",
Expand Down Expand Up @@ -218,11 +219,11 @@
"contact": "[email protected]"
},
"univnet": {
"description": "UnivNet model trained on LJSpeech to complement the TacotronDDC_ph model.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/vocoder_models--en--ljspeech--univnet.zip",
"commit": "3900448",
"description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip",
"commit": "4581e3d",
"author": "Eren @erogol",
"license": "",
"license": "TBD",
"contact": "[email protected]"
}
},
Expand Down
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.2
0.3.0
26 changes: 14 additions & 12 deletions TTS/bin/extract_tts_spectrograms.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from TTS.tts.utils.speakers import get_speaker_manager
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters
from TTS.utils.io import load_fsspec

use_cuda = torch.cuda.is_available()

Expand Down Expand Up @@ -77,14 +76,14 @@ def set_filename(wav_path, out_path):

def format_data(data):
# setup input data
text_input = data['text']
text_lengths = data['text_lengths']
mel_input = data['mel']
mel_lengths = data['mel_lengths']
item_idx = data['item_idxs']
d_vectors = data['d_vectors']
speaker_ids = data['speaker_ids']
attn_mask = data['attns']
text_input = data["text"]
text_lengths = data["text_lengths"]
mel_input = data["mel"]
mel_lengths = data["mel_lengths"]
item_idx = data["item_idxs"]
d_vectors = data["d_vectors"]
speaker_ids = data["speaker_ids"]
attn_mask = data["attns"]
avg_text_length = torch.mean(text_lengths.float())
avg_spec_length = torch.mean(mel_lengths.float())

Expand Down Expand Up @@ -133,7 +132,11 @@ def inference(
elif d_vectors is not None:
speaker_c = d_vectors
outputs = model.inference_with_MAS(
text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids}
text_input,
text_lengths,
mel_input,
mel_lengths,
aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
)
model_output = outputs["model_outputs"]
model_output = model_output.transpose(1, 2).detach().cpu().numpy()
Expand Down Expand Up @@ -239,8 +242,7 @@ def main(args): # pylint: disable=redefined-outer-name
model = setup_model(c)

# restore model
checkpoint = load_fsspec(args.checkpoint_path, map_location="cpu")
model.load_state_dict(checkpoint["model"])
model.load_checkpoint(c, args.checkpoint_path, eval=True)

if use_cuda:
model.cuda()
Expand Down
13 changes: 7 additions & 6 deletions TTS/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def __init__(
# load data for `tts` models
self.data_train, self.data_eval = load_meta_data(self.config.datasets)
elif self.config.feature_path is not None:
# load data for `vocoder`models
# load pre-computed features for `vocoder` models
print(f" > Loading features from: {self.config.feature_path}")
self.data_eval, self.data_train = load_wav_feat_data(
self.config.data_path, self.config.feature_path, self.config.eval_split_size
Expand Down Expand Up @@ -275,7 +275,8 @@ def __init__(
if self.args.continue_path:
if isinstance(self.scheduler, list):
for scheduler in self.scheduler:
scheduler.last_epoch = self.restore_step
if scheduler is not None:
scheduler.last_epoch = self.restore_step
else:
self.scheduler.last_epoch = self.restore_step

Expand Down Expand Up @@ -662,6 +663,7 @@ def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_ti
lrs = {"current_lr": current_lr}

# log run-time stats
loss_dict.update(lrs)
loss_dict.update(
{
"step_time": round(step_time, 4),
Expand Down Expand Up @@ -878,7 +880,7 @@ def _restore_best_loss(self):
"""Restore the best loss from the args.best_path if provided else
from the model (`args.restore_path` or `args.continue_path`) used for resuming the training"""
if self.restore_step != 0 or self.args.best_path:
print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...")
print(f" > Restoring best loss from {os.path.basename(self.args.best_path)} ...")
ch = load_fsspec(self.args.restore_path, map_location="cpu")
if "model_loss" in ch:
self.best_loss = ch["model_loss"]
Expand Down Expand Up @@ -1125,7 +1127,7 @@ def get_last_checkpoint(path: str) -> Tuple[str, str]:
last_model_num = model_num
last_model = file_name

# if there is not checkpoint found above
# if there is no checkpoint found above
# find the checkpoint with the latest
# modification date.
key_file_names = [fn for fn in file_names if key in fn]
Expand All @@ -1144,7 +1146,7 @@ def get_last_checkpoint(path: str) -> Tuple[str, str]:
last_models["checkpoint"] = last_models["best_model"]
elif "best_model" not in last_models: # no best model
# this shouldn't happen, but let's handle it just in case
last_models["best_model"] = None
last_models["best_model"] = last_models["checkpoint"]
# finally check if last best model is more recent than checkpoint
elif last_model_nums["best_model"] > last_model_nums["checkpoint"]:
last_models["checkpoint"] = last_models["best_model"]
Expand Down Expand Up @@ -1180,7 +1182,6 @@ def process_args(args, config=None):
args.restore_path, best_model = get_last_checkpoint(args.continue_path)
if not args.best_path:
args.best_path = best_model

# init config if not already defined
if config is None:
if args.config_path:
Expand Down
47 changes: 38 additions & 9 deletions TTS/tts/configs/fast_pitch_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
from typing import List

from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.fast_pitch import FastPitchArgs
from TTS.tts.models.forward_tts import ForwardTTSArgs


@dataclass
class FastPitchConfig(BaseTTSConfig):
"""Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
"""Configure `ForwardTTS` as FastPitch model.
Example:
Expand All @@ -18,6 +18,10 @@ class FastPitchConfig(BaseTTSConfig):
model (str):
Model name used for selecting the right model at initialization. Defaults to `fast_pitch`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.
Expand All @@ -36,22 +40,43 @@ class FastPitchConfig(BaseTTSConfig):
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `Adam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `Noam`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-3`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set to 0, disables the SSIM loss. Defaults to 1.0.
huber_loss_alpha (float):
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set to 0, disables the duration loss. Defaults to 1.0.
spec_loss_alpha (float):
Expand All @@ -74,8 +99,10 @@ class FastPitchConfig(BaseTTSConfig):
"""

model: str = "fast_pitch"
base_model: str = "forward_tts"

# model specific params
model_args: FastPitchArgs = field(default_factory=FastPitchArgs)
model_args: ForwardTTSArgs = ForwardTTSArgs()

# multi-speaker settings
use_speaker_embedding: bool = False
Expand All @@ -92,11 +119,13 @@ class FastPitchConfig(BaseTTSConfig):
grad_clip: float = 5.0

# loss params
spec_loss_type: str = "mse"
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
dur_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
pitch_loss_alpha: float = 1.0
dur_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
binary_align_loss_alpha: float = 1.0
binary_align_loss_start_step: int = 20000
Expand Down
Loading

0 comments on commit 0592a58

Please sign in to comment.