Skip to content

Commit

Permalink
Merge pull request #3300 from coqui-ai/dev
Browse files Browse the repository at this point in the history
v0.21.1
  • Loading branch information
erogol committed Nov 27, 2023
2 parents 2211ba2 + 11ec9f7 commit 6189e2f
Show file tree
Hide file tree
Showing 21 changed files with 153 additions and 269 deletions.
26 changes: 26 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,32 @@ The following steps are tested on an Ubuntu system.
14. Once things look perfect, we merge it to the ```dev``` branch and make it ready for the next version.
## Development in Docker container
If you prefer working within a Docker container as your development environment, you can do the following:
1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
```bash
$ git clone [email protected]:<your Github name>/TTS.git
$ cd TTS
$ git remote add upstream https://github.com/coqui-ai/TTS.git
```
3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
```
docker build --tag=tts-dev:latest -f .\dockerfiles\Dockerfile.dev .
```
4. Run the container with GPU support:
```
docker run -it --gpus all tts-dev:latest /bin/bash
```
Feel free to ping us at any step you need help using our communication channels.
If you are new to GitHub or open-source contribution, these are good resources.
Expand Down
10 changes: 8 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
FROM ${BASE}

RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip3 install llvmlite --ignore-installed

WORKDIR /root
COPY . /root
# Install Dependencies:
RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
RUN rm -rf /root/.cache/pip

# Copy TTS repository contents:
WORKDIR /root
COPY . /root

RUN make install

ENTRYPOINT ["tts"]
CMD ["--help"]
2 changes: 1 addition & 1 deletion TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
],
"model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
"model_hash": "10f92b55c512af7a8d39d650547a15a7",
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
Expand Down
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.20.6
0.21.1
27 changes: 19 additions & 8 deletions TTS/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from TTS.utils.audio.numpy_transforms import save_wav
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

from TTS.config import load_config

class TTS(nn.Module):
"""TODO: Add voice conversion and Capacitron support."""
Expand Down Expand Up @@ -66,13 +66,12 @@ def __init__(
"""
super().__init__()
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)

self.config = load_config(config_path) if config_path else None
self.synthesizer = None
self.voice_converter = None
self.csapi = None
self.cs_api_model = cs_api_model
self.model_name = ""

if gpu:
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

Expand Down Expand Up @@ -106,7 +105,8 @@ def is_coqui_studio(self):
@property
def is_multi_lingual(self):
# Not sure what sets this to None, but applied a fix to prevent crashing.
if isinstance(self.model_name, str) and "xtts" in self.model_name:
if (isinstance(self.model_name, str) and "xtts" in self.model_name or
self.config and ("xtts" in self.config.model or len(self.config.languages) > 1)):
return True
if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
return self.synthesizer.tts_model.language_manager.num_languages > 1
Expand Down Expand Up @@ -440,7 +440,7 @@ def voice_conversion_to_file(
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
return file_path

def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None):
"""Convert text to speech with voice conversion.
It combines tts with voice conversion to fake voice cloning.
Expand All @@ -457,17 +457,25 @@ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
"""
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav)
self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name)
if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
return wav

def tts_with_vc_to_file(
self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
self,
text: str,
language: str = None,
speaker_wav: str = None,
file_path: str = "output.wav",
speaker: str = None,
):
"""Convert text to speech with voice conversion and save to file.
Expand All @@ -484,6 +492,9 @@ def tts_with_vc_to_file(
Defaults to None.
file_path (str, optional):
Output file path. Defaults to "output.wav".
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
"""
wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
7 changes: 7 additions & 0 deletions TTS/bin/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,13 @@ def main():
print(" > Saving output to ", args.out_path)
return

if args.language_idx is None and args.language is not None:
msg = (
"--language is only supported for Coqui Studio models. "
"Use --language_idx to specify the target language for multilingual models."
)
raise ValueError(msg)

# CASE4: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
Expand Down
23 changes: 18 additions & 5 deletions TTS/bin/train_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@

import torch
from torch.utils.data import DataLoader
from trainer.io import copy_model_files, save_best_model, save_checkpoint
from trainer.torch import NoamLR
from trainer.trainer_utils import get_optimizer

from TTS.encoder.dataset import EncoderDataset
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.encoder.utils.training import init_training
from TTS.encoder.utils.visual import plot_embeddings
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
from TTS.utils.io import copy_model_files
from TTS.utils.samplers import PerfectBatchSampler
from TTS.utils.training import check_update

Expand Down Expand Up @@ -222,7 +222,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,

if global_step % c.save_step == 0:
# save model
save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
save_checkpoint(
c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
)

end_time = time.time()

Expand All @@ -245,7 +247,18 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
flush=True,
)
# save the best checkpoint
best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
best_loss = save_best_model(
eval_loss,
best_loss,
c,
model,
optimizer,
None,
global_step,
epoch,
OUT_PATH,
criterion=criterion.state_dict(),
)
model.train()

return best_loss, global_step
Expand Down Expand Up @@ -276,7 +289,7 @@ def main(args): # pylint: disable=redefined-outer-name

if c.loss == "softmaxproto" and c.model != "speaker_encoder":
c.map_classid_to_classname = map_classid_to_classname
copy_model_files(c, OUT_PATH)
copy_model_files(c, OUT_PATH, new_fields={})

if args.restore_path:
criterion, args.restore_step = model.load_checkpoint(
Expand Down
46 changes: 0 additions & 46 deletions TTS/encoder/utils/generic_utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import datetime
import glob
import os
import random
import re

import numpy as np
from scipy import signal

from TTS.encoder.models.lstm import LSTMSpeakerEncoder
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
from TTS.utils.io import save_fsspec


class AugmentWAV(object):
Expand Down Expand Up @@ -118,11 +115,6 @@ def apply_one(self, audio):
return self.additive_noise(noise_type, audio)


def to_camel(text):
    """Convert a snake_case string to CamelCase (first letter capitalized)."""
    capitalized = text.capitalize()

    def _upper_after_underscore(match):
        # Drop the underscore and uppercase the letter that followed it.
        return match.group(1).upper()

    # (?!^) keeps a leading underscore untouched; only interior "_x" pairs collapse.
    return re.sub(r"(?!^)_([a-zA-Z])", _upper_after_underscore, capitalized)


def setup_encoder_model(config: "Coqpit"):
if config.model_params["model_name"].lower() == "lstm":
model = LSTMSpeakerEncoder(
Expand All @@ -142,41 +134,3 @@ def setup_encoder_model(config: "Coqpit"):
audio_config=config.audio,
)
return model


def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
    """Serialize the encoder training state to ``checkpoint_<step>.pth`` under *out_path*.

    The snapshot bundles the model/criterion state dicts, the optimizer state
    (``None`` when no optimizer is given), the step/epoch counters, the loss
    value, and a human-readable date stamp.
    """
    checkpoint_path = os.path.join(out_path, "checkpoint_{}.pth".format(current_step))
    print(" | | > Checkpoint saving : {}".format(checkpoint_path))

    state = {
        "model": model.state_dict(),
        "optimizer": None if optimizer is None else optimizer.state_dict(),
        "criterion": criterion.state_dict(),
        "step": current_step,
        "epoch": epoch,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    save_fsspec(state, checkpoint_path)


def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
    """Save ``best_model.pth`` to *out_path* when *model_loss* improves on *best_loss*.

    Args:
        model: Encoder model whose ``state_dict`` is serialized.
        optimizer: Optimizer to snapshot; may be ``None``.
        criterion: Loss module whose ``state_dict`` is serialized.
        model_loss (float): Loss of the current evaluation.
        best_loss (float): Best loss seen so far.
        out_path (str): Directory the checkpoint is written to.
        current_step (int): Global training step.
        epoch (int): Current training epoch.

    Returns:
        float: ``model_loss`` if it improved, otherwise the unchanged ``best_loss``.
    """
    if model_loss < best_loss:
        state = {
            "model": model.state_dict(),
            # Fix: guard against a missing optimizer, consistent with save_checkpoint().
            "optimizer": optimizer.state_dict() if optimizer is not None else None,
            "criterion": criterion.state_dict(),
            "step": current_step,
            "epoch": epoch,
            "loss": model_loss,
            "date": datetime.date.today().strftime("%B %d, %Y"),
        }
        best_loss = model_loss
        bestmodel_path = os.path.join(out_path, "best_model.pth")
        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
        save_fsspec(state, bestmodel_path)
    return best_loss
38 changes: 0 additions & 38 deletions TTS/encoder/utils/io.py

This file was deleted.

2 changes: 1 addition & 1 deletion TTS/encoder/utils/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@

from coqpit import Coqpit
from trainer import TrainerArgs, get_last_checkpoint
from trainer.io import copy_model_files
from trainer.logging import logger_factory
from trainer.logging.console_logger import ConsoleLogger

from TTS.config import load_config, register_config
from TTS.tts.utils.text.characters import parse_symbols
from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
from TTS.utils.io import copy_model_files


@dataclass
Expand Down
1 change: 1 addition & 0 deletions TTS/tts/configs/xtts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class XttsConfig(BaseTTSConfig):
"hu",
"ko",
"ja",
"hi",
]
)

Expand Down
3 changes: 3 additions & 0 deletions TTS/tts/layers/xtts/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,9 @@ def preprocess_text(self, txt, lang):
txt = korean_transliterate(txt)
elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu)
elif lang == "hi":
# @manmay will implement this
txt = basic_cleaners(txt)
else:
raise NotImplementedError(f"Language '{lang}' is not supported.")
return txt
Expand Down
12 changes: 4 additions & 8 deletions TTS/tts/utils/text/phonemizers/espeak_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,20 +185,16 @@ def phonemize_espeak(self, text: str, separator: str = "|", tie=False) -> str:
if tie:
args.append("--tie=%s" % tie)

args.append('"' + text + '"')
args.append(text)
# compute phonemes
phonemes = ""
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
logging.debug("line: %s", repr(line))
ph_decoded = line.decode("utf8").strip()
# espeak needs to skip the first two characters of the returned text:
# version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak:
# version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak-ng needs to skip the first character of the returned text:
# "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"

# dealing with the conditions described above
ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
# espeak-ng:
# "p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"

# espeak-ng backend can add language flags that need to be removed:
# "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ."
Expand Down
Loading

0 comments on commit 6189e2f

Please sign in to comment.