Hugging Face hub integration: push and load from the hub (#87)
* Hugging Face hub integration, adding the base files / changes

Co-authored-by: Lucain
Natooz committed Oct 24, 2023
1 parent 8cd6a67 commit 8dddcca
Showing 18 changed files with 270 additions and 99 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -35,6 +35,6 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest --cov=./ --cov-report=xml -n auto
pytest --cov=./ --cov-report=xml -n auto --hf-token ${{ secrets.HF_TOKEN_HUB_TESTS }}
- name: Codecov
uses: codecov/codecov-action@…
@@ -43,6 +43,8 @@
"!pip install accelerate\n",
"!pip install evaluate\n",
"!pip install tqdm\n",
"!pip install scikit-learn\n",
"!pip install tensorboard\n",
"\n",
"!wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip\n",
"!unzip 'maestro-v3.0.0-midi.zip'\n",
@@ -146,7 +148,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -310,7 +312,7 @@
" generation_config=generation_config) # (N,T)\n",
"\n",
" # Saves the generated music, as MIDI files and tokens (json)\n",
" for prompt, continuation in zip(batch, res):\n",
" for prompt, continuation in zip(batch[\"input_ids\"], res):\n",
" generated = continuation[len(prompt):]\n",
" midi = tokenizer.tokens_to_midi([deepcopy(generated.tolist())], time_division=384)\n",
" tokens = [generated, prompt, continuation] # list compr. as seqs of dif. lengths\n",
Binary file added docs/assets/Octuple_TS_Rest/original.png
Binary file added docs/assets/Octuple_TS_Rest/tokenized.png
6 changes: 3 additions & 3 deletions docs/bpe.rst
@@ -40,17 +40,17 @@ BPE example
from copy import deepcopy
tokenizer = REMI() # using defaults parameters (constants.py)
token_paths = list(Path('path', 'to', 'dataset').glob('**/*.json'))
tokens_no_bpe_paths = list(Path('path', 'to', 'dataset').glob('**/*.json'))
# Learns the vocabulary with BPE
tokenizer.learn_bpe(
vocab_size=500,
tokens_paths=list(Path('path', 'to', 'tokens_noBPE').glob("**/*.json")),
tokens_paths=tokens_no_bpe_paths,
out_dir=Path('path', 'to', 'tokens_BPE'),
)
# Opens tokens, apply BPE on them, and decode BPE back
tokens = tokenizer.load_tokens(token_paths[0])
tokens = tokenizer.load_tokens(tokens_no_bpe_paths[0])
tokens = TokSequence(ids=tokens)
tokens_with_bpe = tokenizer.apply_bpe(deepcopy(tokens)) # copy as the method is inplace
tokens_no_bpe = tokenizer.decode_bpe(deepcopy(tokens_with_bpe))
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -14,7 +14,7 @@
project = "MidiTok"
copyright = "2023, Nathan Fradet"
author = "Nathan Fradet"
release = "2.0.7"
release = "2.1.7"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
56 changes: 56 additions & 0 deletions docs/hf_hub.rst
@@ -0,0 +1,56 @@
========================
Hugging Face hub
========================

What is the Hugging Face hub
---------------------------------

The `Hugging Face Hub <https://huggingface.co>`_ is a model and dataset sharing platform widely used in the AI community. It allows you to freely upload, share and download models and datasets, directly from your code. Its interactions rely on an open-source Python package named `huggingface_hub <https://github.com/huggingface/huggingface_hub>`_. As it works seamlessly within the Hugging Face ecosystem, especially the `Transformers <https://huggingface.co/docs/transformers/index>`_ and `Diffusers <https://huggingface.co/docs/diffusers/index>`_ libraries, it has become one of the preferred ways to openly share and download models.

When downloading a Transformer model, you will also need to download its associated tokenizer to be able to "dialog" with it. MidiTok allows you to push and download tokenizers in a similar way to what is done in the Hugging Face Transformers library.

How MidiTok interoperates with the hub
------------------------------------------

Internally, MidiTok relies on the ``huggingface_hub.ModelHubMixin`` component. It implements the same methods commonly used in the Hugging Face ecosystem. Note that:

* :py:func:`miditok.MIDITokenizer.save_pretrained` is equivalent to calling :py:func:`miditok.MIDITokenizer.save_params`;
* :py:func:`miditok.MIDITokenizer.from_pretrained` can be used to load tokenizers whether from the Hugging Face hub or from a file on your local filesystem;
* for :py:func:`miditok.MIDITokenizer.save_pretrained` and :py:func:`miditok.MIDITokenizer.push_to_hub`, you can ignore the ``config`` argument which is meant to be used with models (not applicable for tokenizers);
* you can give a ``filename`` keyword argument with the :py:func:`miditok.MIDITokenizer.save_pretrained` and :py:func:`miditok.MIDITokenizer.from_pretrained` methods to use a specific tokenizer configuration file name, otherwise the default one will be used (``tokenizer.conf``).
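The save/load round trip these methods provide can be sketched with a minimal stand-in class. This is purely illustrative (the class and config keys are hypothetical, and only the local filesystem is covered); the real ``huggingface_hub.ModelHubMixin`` also handles repository creation, uploads and downloads.

```python
import json
import tempfile
from pathlib import Path


class ToyTokenizer:
    """Hypothetical stand-in for a ModelHubMixin-style class (illustrative only)."""

    def __init__(self, config: dict):
        self.config = config

    def save_pretrained(self, save_directory, filename="tokenizer.conf"):
        # Serialize the configuration to a file, analogous to save_params()
        path = Path(save_directory)
        path.mkdir(parents=True, exist_ok=True)
        (path / filename).write_text(json.dumps(self.config))

    @classmethod
    def from_pretrained(cls, pretrained_path, filename="tokenizer.conf"):
        # Only reads the local filesystem here; the real method can also
        # fetch the same file from a repository on the hub
        config = json.loads((Path(pretrained_path) / filename).read_text())
        return cls(config)


# Round trip: save, reload, compare
with tempfile.TemporaryDirectory() as tmp:
    tok = ToyTokenizer({"vocab_size": 500, "use_rests": True})
    tok.save_pretrained(tmp)                  # writes tmp/tokenizer.conf
    tok2 = ToyTokenizer.from_pretrained(tmp)  # reads it back
    assert tok2.config == tok.config
```

The ``filename`` keyword mirrors the behaviour described above: omit it and the default ``tokenizer.conf`` is used.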

.. autofunction:: miditok.MIDITokenizer.from_pretrained
:noindex:

.. autofunction:: miditok.MIDITokenizer.save_pretrained
:noindex:

.. autofunction:: miditok.MIDITokenizer.push_to_hub
:noindex:

Example
------------------------

.. code-block:: python

    from pathlib import Path
    from copy import deepcopy
    from miditok import REMI, TokSequence

    tokenizer = REMI()  # using defaults parameters (constants.py)
    hf_token = "your_hf_token"  # to create on huggingface.co

    # Train the tokenizer with BPE
    tokenizer.learn_bpe(
        vocab_size=500,
        tokens_paths=list(Path('path', 'to', 'tokens').glob("**/*.json")),
        out_dir=Path('path', 'to', 'tokens_BPE'),
    )

    # Push the tokenizer to the HF hub
    tokenizer.push_to_hub("YourUserName/model-name", private=True, token=hf_token)

    # Recreate it from the configuration saved on the hub
    tokenizer2 = REMI.from_pretrained("YourUserName/model-name", token=hf_token)
    assert tokenizer == tokenizer2
1 change: 1 addition & 0 deletions docs/index.rst
@@ -50,6 +50,7 @@ Contents
examples
tokenizations
bpe
hf_hub
pytorch_data
data_augmentation
utils
14 changes: 11 additions & 3 deletions docs/midi_tokenizer.rst
@@ -65,7 +65,15 @@ MidiTok offers to include additional tokens on music information. You can specif
:file: additional_tokens_table.csv
:header-rows: 1

¹: using both time signatures and rests with `CPWord` might result in time alterations, as the time signature is carried by the Bar tokens which are skipped during period of rests.
¹: using both time signatures and rests with `CPWord` might result in time alterations, as the time signature is carried by the Bar tokens, which are skipped during periods of rest. An example is shown below:

.. image:: /assets/Octuple_TS_Rest/original.png
:width: 800
:alt: Original MIDI sample preprocessed / downsampled

.. image:: /assets/Octuple_TS_Rest/tokenized.png
:width: 800
:alt: MIDI sample after being tokenized, the time has been shifted to a bar during the time signature change


Special tokens
@@ -83,9 +91,9 @@ To use special tokens, you must specify them with the ``special_tokens`` argumen


Tokens & TokSequence input / output format
------------------------
--------------------------------------------

Depending on the tokenizer at use, the **format** of the tokens returned by the ``midi_to_tokens`` method may vary, as well as the expected format for the ``tokens_to_midi`` method. The format is given by the ``tokenizer.io_format` property. For any tokenizer, the format is the same for both methods.
Depending on the tokenizer at use, the **format** of the tokens returned by the ``midi_to_tokens`` method may vary, as well as the expected format for the ``tokens_to_midi`` method. The format is given by the ``tokenizer.io_format`` property. For any tokenizer, the format is the same for both methods.

The format is deduced from the ``is_multi_voc`` and ``one_token_stream`` tokenizer properties. **one_token_stream** being True means that the tokenizer will convert a MIDI file into a single stream of tokens for all instrument tracks, otherwise it will convert each track into a distinct token sequence. **is_multi_voc** being True means that each token stream is a list of lists of tokens, of shape ``(T,C)`` for T time steps and C subtokens per time step.
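The four resulting shapes can be illustrated with dummy nested lists (the ids are made up and the shapes are assumed from the property descriptions above, not real MidiTok output):

```python
# T = time steps, C = subtokens per step, I = instrument tracks.

def shape(seq):
    """Nesting shape of a uniformly nested list."""
    dims = []
    while isinstance(seq, list):
        dims.append(len(seq))
        seq = seq[0]
    return tuple(dims)

single_stream = [5, 12, 7, 9]                 # one_token_stream, single voc: (T,)
multi_voc_stream = [[5, 1], [12, 3], [7, 2]]  # one_token_stream, multi voc:  (T, C)
per_track = [[5, 12, 7], [9, 4, 8]]           # per-track, single voc:        (I, T)
per_track_multi_voc = [[[5, 1], [12, 3]],
                       [[9, 2], [4, 0]]]      # per-track, multi voc:         (I, T, C)

print(shape(multi_voc_stream))      # (3, 2)
print(shape(per_track_multi_voc))   # (2, 2, 2)
```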

3 changes: 2 additions & 1 deletion docs/requirements.txt
@@ -5,4 +5,5 @@ tokenizers>=0.13.2
torch
scipy # needed for miditoolkit
matplotlib # needed for miditoolkit
sphinx-rtd-theme
sphinx-rtd-theme
huggingface_hub
4 changes: 2 additions & 2 deletions miditok/__init__.py
@@ -12,11 +12,11 @@
)

from .utils import utils
from .constants import CURRENT_VERSION_PACKAGE
from .constants import CURRENT_MIDITOK_VERSION
from miditok import data_augmentation


__version__ = CURRENT_VERSION_PACKAGE
__version__ = CURRENT_MIDITOK_VERSION


class REMIPlus(REMI):