Skip to content

Commit

Permalink
Merge pull request #2945 from coqui-ai/dev
Browse files Browse the repository at this point in the history
v0.17.2
  • Loading branch information
erogol committed Sep 14, 2023
2 parents 33b5e87 + 13dd7c4 commit ec5973f
Show file tree
Hide file tree
Showing 35 changed files with 6,315 additions and 40 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)

### End-to-End Models
- ⓍTTS: [blog]()
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418)
- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
Expand Down Expand Up @@ -248,11 +249,11 @@ tts.tts_with_vc_to_file(
```

#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.

Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`

```python
Expand Down
15 changes: 14 additions & 1 deletion TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@
"tts_models": {
"multilingual": {
"multi-dataset": {
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/vocab.json"
],
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CPML",
"contact": "[email protected]",
"tos_required": true
},
"your_tts": {
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
Expand Down Expand Up @@ -881,4 +894,4 @@
}
}
}
}
}
2 changes: 1 addition & 1 deletion TTS/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.16.6
0.17.2
3 changes: 3 additions & 0 deletions TTS/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ def is_coqui_studio(self):

@property
def is_multi_lingual(self):
    """Return True when the loaded model supports more than one language.

    Returns:
        bool: True for XTTS models (multi-lingual by design, but they do not
            expose a ``language_manager``), or when the loaded model's
            language manager reports more than one language.
    """
    # TODO: fix this — derive multilinguality from the model config instead
    # of pattern-matching the model name.
    # `model_name` is None when the model was loaded from a local path
    # (e.g. `model_dir`), so guard before the substring test.
    if self.model_name and "xtts" in self.model_name:
        return True
    if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
        return self.synthesizer.tts_model.language_manager.num_languages > 1
    return False
Expand Down
6 changes: 4 additions & 2 deletions TTS/bin/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ def main():
if args.encoder_path is not None:
encoder_path = args.encoder_path
encoder_config_path = args.encoder_config_path

device = args.device
if args.use_cuda:
device = "cuda"
Expand Down Expand Up @@ -459,7 +459,9 @@ def main():
target_wav=args.target_wav,
)
elif model_dir is not None:
wav = synthesizer.tts(args.text, speaker_name=args.speaker_idx)
wav = synthesizer.tts(
args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
)

# save the results
print(" > Saving output to {}".format(args.out_path))
Expand Down
6 changes: 6 additions & 0 deletions TTS/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ def register_config(model_name: str) -> Coqpit:
"""
config_class = None
config_name = model_name + "_config"

# TODO: fix this
if model_name == "xtts":
from TTS.tts.configs.xtts_config import XttsConfig

config_class = XttsConfig
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
for path in paths:
try:
Expand Down
90 changes: 90 additions & 0 deletions TTS/tts/configs/xtts_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from dataclasses import dataclass, field
from typing import List

from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig


@dataclass
class XttsConfig(BaseTTSConfig):
    """Defines parameters for XTTS TTS model.

    Args:
        model (str):
            Model name. Do not change unless you know what you are doing.

        model_args (XttsArgs):
            Model architecture arguments. Defaults to `XttsArgs()`.

        audio (XttsAudioConfig):
            Audio processing configuration. Defaults to `XttsAudioConfig()`.

        model_dir (str):
            Path to the folder that has all the XTTS models. Defaults to None.

        languages (List[str]):
            Language codes the model supports. Defaults to the 13 XTTS-v1 languages.

        temperature (float):
            Temperature for the autoregressive model inference. Larger values make predictions more creative sacrificing stability. Defaults to `0.2`.

        length_penalty (float):
            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length,
            which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
            length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences. Defaults to `1.0`.

        repetition_penalty (float):
            The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.

        top_k (int):
            Number of highest-probability vocabulary tokens kept for top-k filtering during generation. Defaults to `50`.

        top_p (float):
            If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
            Defaults to `0.8`.

        cond_free_k (float):
            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.

        diffusion_temperature (float):
            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
            are the "mean" prediction of the diffusion network and will sound bland and smeared.
            Defaults to `1.0`.

        num_gpt_outputs (int):
            Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
            As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
            Defaults to `16`.

        decoder_iterations (int):
            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
            however. Defaults to `30`.

        decoder_sampler (str):
            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.

    Note:
        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.

    Example:
        >>> from TTS.tts.configs.xtts_config import XttsConfig
        >>> config = XttsConfig()
    """

    model: str = "xtts"

    # model specific params
    model_args: XttsArgs = field(default_factory=XttsArgs)
    audio: XttsAudioConfig = field(default_factory=XttsAudioConfig)
    model_dir: str = None
    languages: List[str] = field(
        default_factory=lambda: ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"]
    )

    # inference params
    temperature: float = 0.2
    length_penalty: float = 1.0
    repetition_penalty: float = 2.0
    top_k: int = 50
    top_p: float = 0.8
    cond_free_k: float = 2.0
    diffusion_temperature: float = 1.0
    num_gpt_outputs: int = 16
    decoder_iterations: int = 30
    decoder_sampler: str = "ddim"
39 changes: 39 additions & 0 deletions TTS/tts/datasets/formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,45 @@
########################


def cml_tts(root_path, meta_file, ignored_speakers=None):
    """Normalizes the CML-TTS meta data file to TTS format
    https://github.com/freds0/CML-TTS-Dataset/

    Args:
        root_path (str): Dataset root directory; audio paths in the metadata
            are resolved relative to it.
        meta_file (str): "|"-separated metadata file (relative to `root_path`)
            with at least `wav_filename` and `transcript` columns; optional
            `client_id` and `emotion_name` columns.
        ignored_speakers (list): `client_id` values to skip. Defaults to None.

    Returns:
        list: One dict per existing audio file with keys `text`, `audio_file`,
            `speaker_name`, `emotion_name` and `root_path`.
    """
    filepath = os.path.join(root_path, meta_file)
    # ensure there are as many columns as in the header for every line
    with open(filepath, "r", encoding="utf8") as f:
        lines = f.readlines()
    if lines:  # an empty file has no header row to use as reference
        num_cols = len(lines[0].split("|"))  # take the header row as reference
        for idx, line in enumerate(lines[1:]):
            if len(line.split("|")) != num_cols:
                # idx counts data rows from 0; +2 converts to the 1-based
                # file line number (header is line 1).
                print(f" > Missing column in line {idx + 2} -> {line.strip()}")
    # load metadata (reuse the already-joined path)
    metadata = pd.read_csv(filepath, sep="|")
    assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
    # When the optional column exists the sentinel stays None and the per-row
    # value is used; otherwise every item gets the constant fallback.
    client_id = None if "client_id" in metadata.columns else "default"
    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
    items = []
    not_found_counter = 0
    for row in metadata.itertuples():
        if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
            continue
        audio_path = os.path.join(root_path, row.wav_filename)
        if not os.path.exists(audio_path):
            not_found_counter += 1
            continue
        items.append(
            {
                "text": row.transcript,
                "audio_file": audio_path,
                "speaker_name": client_id if client_id is not None else row.client_id,
                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
                "root_path": root_path,
            }
        )
    if not_found_counter > 0:
        print(f" | > [!] {not_found_counter} files not found")
    return items


def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
Expand Down
9 changes: 4 additions & 5 deletions TTS/tts/layers/tortoise/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@

from TTS.tts.utils.text.cleaners import english_cleaners

DEFAULT_VOCAB_FILE = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
)


class VoiceBpeTokenizer:
def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
def __init__(self, vocab_file=None, vocab_str=None):
self.tokenizer = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
if vocab_str is not None:
self.tokenizer = Tokenizer.from_str(vocab_str)

def preprocess_text(self, txt):
txt = english_cleaners(txt)
Expand Down
Loading

0 comments on commit ec5973f

Please sign in to comment.