From 365a5b6fdb73d4aa87e17246fb8921c186148283 Mon Sep 17 00:00:00 2001 From: Nathan Fradet <56734983+Natooz@users.noreply.github.com> Date: Thu, 25 Apr 2024 12:50:06 +0200 Subject: [PATCH] update docs --- README.md | 2 +- .../tokenizer_training/benchmark_training.py | 23 ++++++++++++++----- docs/bases.rst | 12 +++++----- docs/index.rst | 9 ++++---- 4 files changed, 28 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index c2eec7df..180a6794 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Python package to tokenize music files, introduced at the ISMIR 2021 LBDs. [![Code style](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff) MidiTok can tokenize MIDI and abc files, i.e. convert them into sequences of tokens ready to be fed to models such as Transformer, for any generation, transcription or MIR task. -MidiTok features most known [music tokenizations](https://miditok.readthedocs.io/en/latest/tokenizations.html) (e.g. [REMI](https://arxiv.org/abs/2002.00212), [Compound Word](https://arxiv.org/abs/2101.02402)...), and is built around the idea that they all share common parameters and methods. Tokenizers can be trained with [Byte Pair Encoding (BPE)](https://aclanthology.org/2023.emnlp-main.123/) and [Unigram](https://aclanthology.org/P18-1007/), and it offers data augmentation methods. +MidiTok features most known [music tokenizations](https://miditok.readthedocs.io/en/latest/tokenizations.html) (e.g. [REMI](https://arxiv.org/abs/2002.00212), [Compound Word](https://arxiv.org/abs/2101.02402)...), and is built around the idea that they all share common parameters and methods. Tokenizers can be trained with [Byte Pair Encoding (BPE)](https://aclanthology.org/2023.emnlp-main.123/), [Unigram](https://aclanthology.org/P18-1007/) and [WordPiece](https://arxiv.org/abs/1609.08144), and it offers data augmentation methods. MidiTok is integrated with the Hugging Face Hub 🤗! 
Don't hesitate to share your models to the community! diff --git a/benchmarks/tokenizer_training/benchmark_training.py b/benchmarks/tokenizer_training/benchmark_training.py index a1647d55..003dd6f1 100755 --- a/benchmarks/tokenizer_training/benchmark_training.py +++ b/benchmarks/tokenizer_training/benchmark_training.py @@ -37,8 +37,8 @@ # Training TOKENIZATIONS = ["REMI", "TSD", "MIDILike"] MODELS: list[Literal["BPE", "Unigram", "WordPiece"]] = ["BPE", "Unigram", "WordPiece"] -VOCAB_SIZE = 20000 -MAX_NUM_FILES_TRAINING = 20000 +VOCAB_SIZE = 200000 +MAX_NUM_FILES_TRAINING = 50000 # Encoding-decoding BATCH_SIZES = [1, 16, 64, 128] @@ -180,9 +180,18 @@ def seq_len_splits(datasets_params: list[tuple[str, dict, str]]) -> None: df.to_latex(RESULTS_PATH / "seq_split_lengths.txt") -def benchmark_training_time() -> None: - r"""Benchmark BPE encoding, batched and un-batched.""" - indexes = [f"{model} {split}-split" for model in MODELS for split in SPLITS] +def benchmark_training_time(vocab_size: int) -> None: + r""" + Benchmark tokenizer training times. + + :param vocab_size: size of the vocabulary. 
+ """ + indexes = [ + f"{model} {split}-split" + for model in MODELS + for split in SPLITS + if (model, split) != ("Unigram", "no") + ] df_file_path = RESULTS_PATH / "training_time.csv" if df_file_path.is_file(): df = read_csv(df_file_path, index_col=0) @@ -197,6 +206,8 @@ def benchmark_training_time() -> None: col_name = f"{dataset} {tokenization}" for model in MODELS: for split in SPLITS: + if (model, split) == ("Unigram", "no"): + continue index_name = f"{model} {split}-split" # Check measure is not already performed @@ -215,7 +226,7 @@ random.seed(SEED) t0 = time() tokenizer.train( - vocab_size=VOCAB_SIZE, + vocab_size=vocab_size, model=model, files_paths=files_paths, ) diff --git a/docs/bases.rst b/docs/bases.rst index e2aa75e9..e111209b 100644 --- a/docs/bases.rst +++ b/docs/bases.rst @@ -8,14 +8,14 @@ Tokens and vocabulary ------------------------ A token is a distinct element, part of a sequence of tokens. In natural language, a token can be a character, a subword or a word. A sentence can then be tokenized into a sequence of tokens representing the words and punctuation. -For symbolic music, tokens can represent the values of the note attributes (pitch, valocity, duration) or time events. These are the "basic" tokens, that can be compared to the characters in natural language. With :ref:`Byte Pair Encoding (BPE)`, tokens can represent **successions** of these basic tokens. +For symbolic music, tokens can represent the values of the note attributes (pitch, velocity, duration) or time events. These are the "basic" tokens, that can be compared to the characters in natural language. In the vocabulary of trained tokenizers, the tokens can represent **successions** of these basic tokens. A token can take three forms, which we name by convention: * Token (``string``): the form describing it, e.g. *Pitch_50*. -* Id (``int``): an unique associated integer, used as an index. 
-* Byte (``string``): an unique associated byte, used internally for :ref:`Byte Pair Encoding (BPE)`. +* Id (``int``): a unique associated integer, which corresponds to the index of the token in the vocabulary. +* Byte (``string``): a distinct byte, used internally for trained tokenizers (:ref:`Training a tokenizer`). -MidiTok works with :ref:`TokSequence` objects to output token sequences of represented by these three forms. +MidiTok works with :ref:`TokSequence` objects to conveniently represent these three forms. Vocabulary ------------------------ For tokenizations with embedding pooling (e.g. :ref:`CPWord` or :ref:`Octuple`), **With Byte Pair Encoding:** ``tokenizer.vocab`` holds all the basic tokens describing the note and time attributes of music. By analogy with text, these tokens can be seen as unique characters. -After training a tokenizer with :ref:`Byte Pair Encoding (BPE)`, a new vocabulary is built with newly created tokens from pairs of basic tokens. This vocabulary can be accessed with ``tokenizer.vocab_bpe``, and binds tokens as bytes (string) to their associated ids (int). This is the vocabulary of the 🤗tokenizers BPE model. +After :ref:`Training a tokenizer`, a new vocabulary is built with newly created tokens from pairs of basic tokens. This vocabulary can be accessed with ``tokenizer.vocab_bpe``, and binds tokens as bytes (string) to their associated ids (int). This is the vocabulary of the 🤗tokenizers BPE model. TokSequence ------------------------ You can use the :py:func:`miditok.MusicTokenizer.complete_sequence` method to au .. autoclass:: miditok.TokSequence :members: -MIDI Tokenizer +The MusicTokenizer class ------------------------ MidiTok features several MIDI tokenizations, all inheriting from the :class:`miditok.MusicTokenizer` class. 
diff --git a/docs/index.rst b/docs/index.rst index 20b9db45..a14adf00 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,10 +10,9 @@ Welcome to MidiTok's documentation! :width: 600 :alt: -**MidiTok** is a Python package for MIDI file tokenization, presented at the ISMIR 2021 LBDs `(paper) `_. -It converts MIDI files to sequences of tokens ready to be fed to sequential Deep Learning models such as Transformers. - -MidiTok features most known MIDI :ref:`tokenizations`, and is built around the idea that they all share common methods. It properly pre-process MIDI files, and supports :ref:`Byte Pair Encoding (BPE)`. +**MidiTok** is a Python package for MIDI file tokenization, introduced at the ISMIR 2021 LBDs `(paper) `_. +It tokenizes symbolic music files (MIDI, abc), i.e. converts them into sequences of tokens ready to be fed to models such as Transformer, for any generation, transcription or MIR task. +MidiTok features most known MIDI :ref:`tokenizations`, and is built around the idea that they all share common methods. Tokenizers can be trained with BPE, Unigram or WordPiece (:ref:`Training a tokenizer`) and be pushed to and pulled from the Hugging Face hub! `Github repository `_ Installation ==================  pip install miditok -MidiTok uses `symusic `_ to read and write MIDI files, and BPE is backed by `Hugging Face 🤗tokenizers `_ for super fast encoding. +MidiTok uses `symusic `_ to read and write MIDI files, and tokenizer training is backed by the `Hugging Face 🤗tokenizers `_ library for super fast encoding. Citation ==================