Skip to content

Commit

Permalink
Merge pull request #2945 from coqui-ai/dev
Browse files Browse the repository at this point in the history
v0.17.2
  • Loading branch information
erogol committed Sep 14, 2023
2 parents 33b5e87 + 13dd7c4 commit ec5973f
Show file tree
Hide file tree
Showing 35 changed files with 6,315 additions and 40 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)

### End-to-End Models
- ⓍTTS: [blog]()
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418)
- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
Expand Down Expand Up @@ -248,11 +249,11 @@ tts.tts_with_vc_to_file(
```

#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.

Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`

```python
Expand Down
15 changes: 14 additions & 1 deletion TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@
"tts_models": {
"multilingual": {
"multi-dataset": {
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/vocab.json"
],
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CPML",
"contact": "[email protected]",
"tos_required": true
},
"your_tts": {
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
Expand Down Expand Up @@ -881,4 +894,4 @@
}
}
}
}
}
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.16.6
0.17.2
3 changes: 3 additions & 0 deletions TTS/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ def is_coqui_studio(self):

@property
def is_multi_lingual(self):
    """Return True when the loaded model supports more than one language.

    Returns:
        bool: True for XTTS models (multi-lingual by design, but they do not
            expose a ``language_manager``), or when the loaded model's
            language manager reports more than one language.
    """
    # TODO: fix this — derive multilinguality from the model config instead
    # of pattern-matching the model name.
    # `model_name` is None when the model was loaded from a local path
    # (e.g. `model_dir`), so guard before the substring test.
    if self.model_name and "xtts" in self.model_name:
        return True
    if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
        return self.synthesizer.tts_model.language_manager.num_languages > 1
    return False
Expand Down
6 changes: 4 additions & 2 deletions TTS/bin/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ def main():
if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path

device = args.device
if args.use_cuda:
device = "cuda"
Expand Down Expand Up @@ -459,7 +459,9 @@ def main():
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(args.text, speaker_name=args.speaker_idx)
wav = synthesizer.tts(
args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
)

# save the results
print(" > Saving output to {}".format(args.out_path))
Expand Down
6 changes: 6 additions & 0 deletions TTS/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ def register_config(model_name: str) -> Coqpit:
"""
config_class = None
config_name = model_name + "_config"

# TODO: fix this
if model_name == "xtts":
from TTS.tts.configs.xtts_config import XttsConfig

config_class = XttsConfig
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
for path in paths:
try:
Expand Down
90 changes: 90 additions & 0 deletions TTS/tts/configs/xtts_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from dataclasses import dataclass, field
from typing import List

from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig


@dataclass
class XttsConfig(BaseTTSConfig):
    """Defines parameters for XTTS TTS model.

    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.

        model_args (XttsArgs):
            Model architecture arguments. Defaults to `XttsArgs()`.

        audio (XttsAudioConfig):
            Audio processing configuration. Defaults to `XttsAudioConfig()`.

        model_dir (str):
            Path to the folder that has all the XTTS models. Defaults to None.

        languages (List[str]):
            Language codes the model supports. Defaults to the 13 XTTS-v1 languages.

        temperature (float):
            Temperature for the autoregressive model inference. Larger values make predictions more creative sacrificing stability. Defaults to `0.2`.

        length_penalty (float):
            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
            which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
            length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences. Defaults to `1.0`.

        repetition_penalty (float):
            The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.

        top_k (int):
            Number of highest-probability vocabulary tokens kept for top-k filtering during generation. Defaults to `50`.

        top_p (float):
            If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
            Defaults to `0.8`.

        cond_free_k (float):
            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.

        diffusion_temperature (float):
            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
            are the "mean" prediction of the diffusion network and will sound bland and smeared.
            Defaults to `1.0`.

        num_gpt_outputs (int):
            Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
            As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
            Defaults to `16`.

        decoder_iterations (int):
            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
            however. Defaults to `30`.

        decoder_sampler (str):
            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.

    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.

    Example:
        >>> from TTS.tts.configs.xtts_config import XttsConfig
        >>> config = XttsConfig()
    """

    model: str = "xtts"

    # model specific params
    model_args: XttsArgs = field(default_factory=XttsArgs)
    audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
    model_dir: str = None
    languages: List[str] = field(
        default_factory=lambda: ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"]
    )

    # inference params
    temperature: float = 0.2
    length_penalty: float = 1.0
    repetition_penalty: float = 2.0
    top_k: int = 50
    top_p: float = 0.8
    cond_free_k: float = 2.0
    diffusion_temperature: float = 1.0
    num_gpt_outputs: int = 16
    decoder_iterations: int = 30
    decoder_sampler: str = "ddim"
39 changes: 39 additions & 0 deletions TTS/tts/datasets/formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,45 @@
########################


def cml_tts(root_path, meta_file, ignored_speakers=None):
    """Normalizes the CML-TTS meta data file to TTS format
    https://github.com/freds0/CML-TTS-Dataset/

    Args:
        root_path (str): Dataset root directory; audio paths in the metadata
            are resolved relative to it.
        meta_file (str): "|"-separated metadata file (relative to `root_path`)
            with at least `wav_filename` and `transcript` columns; optional
            `client_id` and `emotion_name` columns.
        ignored_speakers (list): `client_id` values to skip. Defaults to None.

    Returns:
        list: One dict per existing audio file with keys `text`, `audio_file`,
            `speaker_name`, `emotion_name` and `root_path`.
    """
    filepath = os.path.join(root_path, meta_file)
    # ensure there are as many columns as in the header for every line
    with open(filepath, "r", encoding="utf8") as f:
        lines = f.readlines()
    if lines:  # an empty file has no header row to use as reference
        num_cols = len(lines[0].split("|"))  # take the header row as reference
        for idx, line in enumerate(lines[1:]):
            if len(line.split("|")) != num_cols:
                # idx counts data rows from 0; +2 converts to the 1-based
                # file line number (header is line 1).
                print(f" > Missing column in line {idx + 2} -> {line.strip()}")
    # load metadata (reuse the already-joined path)
    metadata = pd.read_csv(filepath, sep="|")
    assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
    # When the optional column exists the sentinel stays None and the per-row
    # value is used; otherwise every item gets the constant fallback.
    client_id = None if "client_id" in metadata.columns else "default"
    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
    items = []
    not_found_counter = 0
    for row in metadata.itertuples():
        if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
            continue
        audio_path = os.path.join(root_path, row.wav_filename)
        if not os.path.exists(audio_path):
            not_found_counter += 1
            continue
        items.append(
            {
                "text": row.transcript,
                "audio_file": audio_path,
                "speaker_name": client_id if client_id is not None else row.client_id,
                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
                "root_path": root_path,
            }
        )
    if not_found_counter > 0:
        print(f" | > [!] {not_found_counter} files not found")
    return items


def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
Expand Down
9 changes: 4 additions & 5 deletions TTS/tts/layers/tortoise/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@

from TTS.tts.utils.text.cleaners import english_cleaners

DEFAULT_VOCAB_FILE = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
)


class VoiceBpeTokenizer:
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
def __init__(self, vocab_file=None, vocab_str=None):
self.tokenizer = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
if vocab_str is not None:
self.tokenizer = Tokenizer.from_str(vocab_str)

def preprocess_text(self, txt):
txt = english_cleaners(txt)
Expand Down
Loading

0 comments on commit ec5973f

Please sign in to comment.