From 72001e82036e2137661d0498bc40b78a1848dad2 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 3 Apr 2024 09:28:11 -0400 Subject: [PATCH 01/19] change dict update (#2213) --- src/sparseml/export/validators.py | 3 ++- .../transformers/compression/compressors/sparse_bitmask.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sparseml/export/validators.py b/src/sparseml/export/validators.py index 52c9fa05ee0..f513bda21aa 100644 --- a/src/sparseml/export/validators.py +++ b/src/sparseml/export/validators.py @@ -17,8 +17,9 @@ import os.path from collections import OrderedDict from pathlib import Path -from typing import Callable, List, Optional, Union +from typing import Callable, List, Optional from typing import OrderedDict as OrderedDictType +from typing import Union import numpy import onnx diff --git a/src/sparseml/transformers/compression/compressors/sparse_bitmask.py b/src/sparseml/transformers/compression/compressors/sparse_bitmask.py index 63b2912a3b0..1c6f35c7171 100644 --- a/src/sparseml/transformers/compression/compressors/sparse_bitmask.py +++ b/src/sparseml/transformers/compression/compressors/sparse_bitmask.py @@ -70,7 +70,7 @@ def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]: f"found an existing entry for {key}. The existing entry will " "be replaced." ) - compressed_dict |= bitmask_dict + compressed_dict.update(bitmask_dict) return compressed_dict From 8ba1dbac2beead69b3927acd9fb58ea233191570 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Wed, 3 Apr 2024 17:54:58 +0200 Subject: [PATCH 02/19] Update transformer version to <4.40 (#2204) * initial commit * initial commit * fixing tests * Update max transformers version * Update mintransformers version * initial commit * fixing tests 1 * fixing tests 2 * quality * fix bad rebase & quality * Update setup.py --------- Co-authored-by: Michael Goin --- setup.py | 4 +- .../examples/llama2/recipes/llama_recipe.yaml | 2 +- .../transformers/finetune/callbacks.py | 29 +---------- .../transformers/finetune/session_mixin.py | 7 --- src/sparseml/transformers/finetune/trainer.py | 4 -- .../sparsification/modification/base.py | 4 +- .../modification/modifying_bert.py | 22 +++----- .../modification/modifying_distilbert.py | 22 +++++--- .../modification/modifying_llama.py | 51 ++++++++----------- .../modification/modifying_mistral.py | 43 ++++++++++------ .../modification/modifying_opt.py | 46 ++++++++--------- .../sparsification/obcq/README.md | 4 +- .../sparsification/obcq/example_llama.yaml | 2 +- .../sparsification/obcq/example_mistral.yaml | 2 +- .../sparsification/question_answering.py | 7 --- .../transformers/sparsification/trainer.py | 35 +------------ .../finetune/test_quantization.yaml | 2 +- .../transformers/obcq/test_repeats.py | 8 +-- .../sparseml/transformers/obcq/test_tiny.yaml | 2 +- 19 files changed, 111 insertions(+), 185 deletions(-) diff --git a/setup.py b/setup.py index 4af84f957b0..27802de438c 100644 --- a/setup.py +++ b/setup.py @@ -79,8 +79,8 @@ "opencv-python<=4.6.0.66", ] _transformers_deps = _pytorch_deps + [ - "transformers<4.35.0", - "datasets<=2.14.6", + "transformers<4.40", + "datasets<2.19", "dvc", "scikit-learn", "seqeval", diff --git a/src/sparseml/experimental/sparsegpt/examples/llama2/recipes/llama_recipe.yaml b/src/sparseml/experimental/sparsegpt/examples/llama2/recipes/llama_recipe.yaml index 41513e49946..3056735e040 100644 --- a/src/sparseml/experimental/sparsegpt/examples/llama2/recipes/llama_recipe.yaml +++ 
b/src/sparseml/experimental/sparsegpt/examples/llama2/recipes/llama_recipe.yaml @@ -11,7 +11,7 @@ quantization_modifiers: ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU - model.layers.0.mlp.down_proj - model.layers.1.mlp.down_proj - model.layers.2.mlp.down_proj diff --git a/src/sparseml/transformers/finetune/callbacks.py b/src/sparseml/transformers/finetune/callbacks.py index 1c483b5d99f..196240a2b8f 100644 --- a/src/sparseml/transformers/finetune/callbacks.py +++ b/src/sparseml/transformers/finetune/callbacks.py @@ -109,15 +109,6 @@ def __init__(self, trainer, *args, **kwargs): self.on_begin_called = False self.quant_start_epoch = math.inf - def check_disable(self, epoch: float, force: bool = False): - """ - If needed due to active quantization, disable FP16 training - """ - if ( - force or hasattr(self.trainer, "scaler") and self.trainer.scaler._enabled - ) and self.qat_active(): - self.disable_amp(epoch) - def qat_active(self) -> bool: """ :return: True if a quantization modifier is active in the current session @@ -125,23 +116,6 @@ def qat_active(self) -> bool: session = session_manager.active_session() return session.state.model.qat_active() - def disable_amp(self, epoch: float): - """ - Disable FP16 training - - :param epoch: epoch to disable from - """ - if not self.on_begin_called: - # disable if training loops haven't started so we don't load - # the empty scaler state dict and instead disable it from the start - self.trainer.use_cuda_amp = False - - if hasattr(self.trainer, "scaler"): - self.trainer.scaler._enabled = False - - self.quant_start_epoch = epoch - _LOGGER.info(f"entering QAT phase at epoch {epoch}, disabling FP16 training") - def on_epoch_begin( self, args: TrainingArguments, @@ -150,8 +124,7 @@ def on_epoch_begin( **kwargs, ): """ - Event called at the beginning of an epoch. Disables FP16 training. + Event called at the beginning of an epoch. 
""" super().on_epoch_begin(args, state, control, **kwargs) self.on_begin_called = True - self.check_disable(state.epoch) diff --git a/src/sparseml/transformers/finetune/session_mixin.py b/src/sparseml/transformers/finetune/session_mixin.py index 72d18d98a9b..3971b1c0a02 100644 --- a/src/sparseml/transformers/finetune/session_mixin.py +++ b/src/sparseml/transformers/finetune/session_mixin.py @@ -363,7 +363,6 @@ def train(self, *args, stage: Optional[str] = None, **kwargs): """ checkpoint, epoch = self._calculate_checkpoint_info(kwargs) self.initialize_session(epoch=epoch, checkpoint=checkpoint, stage=stage) - self.callback_disable_fp16.check_disable(epoch, force=True) self.accelerator.wait_for_everyone() output = super().train(*args, **kwargs) self.accelerator.wait_for_everyone() @@ -393,13 +392,7 @@ def evaluate(self, *args, **kwargs): """ self.initialize_structure() - # Always evaluate w/ fp32 to be closer to DeepSparse - use_cuda_amp = self.use_cuda_amp - if not self.args.fp16_full_eval and not self.args.bf16_full_eval: - self.use_cuda_amp = False - output = super().evaluate(*args, **kwargs) - self.use_cuda_amp = use_cuda_amp self.finalize_session() return output diff --git a/src/sparseml/transformers/finetune/trainer.py b/src/sparseml/transformers/finetune/trainer.py index cf920e1feb6..36a850f251b 100644 --- a/src/sparseml/transformers/finetune/trainer.py +++ b/src/sparseml/transformers/finetune/trainer.py @@ -91,10 +91,6 @@ def save_optimizer_and_scheduler(self, output_dir: Optional[str] = None): os.path.join(output_dir, "scheduler.pt"), ) reissue_pt_warnings(caught_warnings) - if self.use_cuda_amp: - torch.save( - self.scaler.state_dict(), os.path.join(output_dir, "scaler.pt") - ) def _save_checkpoint(self, model, trial, metrics=None): # Call into the save checkpoint by HF Transformers, which saves the diff --git a/src/sparseml/transformers/sparsification/modification/base.py b/src/sparseml/transformers/sparsification/modification/base.py index 946e6499851..6d9435b8b8b 100644 --- a/src/sparseml/transformers/sparsification/modification/base.py +++ b/src/sparseml/transformers/sparsification/modification/base.py @@ -23,8 +23,8 @@ __all__ = ["check_transformers_version"] -_TRANSFORMERS_MIN_VERSION = "4.34.1" -_TRANSFORMERS_MAX_VERSION = "4.35.0" +_TRANSFORMERS_MIN_VERSION = "4.39.0" +_TRANSFORMERS_MAX_VERSION = "4.39.2" def check_transformers_version( diff --git a/src/sparseml/transformers/sparsification/modification/modifying_bert.py b/src/sparseml/transformers/sparsification/modification/modifying_bert.py index d53046abd03..20e2e8ded4e 100644 --- a/src/sparseml/transformers/sparsification/modification/modifying_bert.py +++ b/src/sparseml/transformers/sparsification/modification/modifying_bert.py @@ -17,6 +17,7 @@ context of SparseML """ + import logging import math from typing import Optional, Tuple @@ -122,22 +123,16 @@ def forward( use_cache = past_key_value is not None if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) - # of all cross attention key/value_states. - # Further calls to cross_attention - # layer can then reuse all cross-attention + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. # noqa + # Further calls to cross_attention layer can then reuse all cross-attention # key/value_states (first "if" case) - # if uni-directional self-attention - # (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. 
- # Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to - # current projected key/value_states (third "elif" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of # noqa + # all previous decoder key/value_states. Further calls to uni-directional self-attention # noqa + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # noqa # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_layer, value_layer) - # Take the dot product between "query" and "key" - # to get the raw attention scores. + # Take the dot product between "query" and "key" to get the raw attention scores. # noqa # ==== SparseML MODIFICATION ==== attention_scores = self.attention_scores_matmul( query_layer, key_layer.transpose(-1, -2) @@ -189,8 +184,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is - # (precomputed for all layers in BertModel forward() function) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) # noqa attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. diff --git a/src/sparseml/transformers/sparsification/modification/modifying_distilbert.py b/src/sparseml/transformers/sparsification/modification/modifying_distilbert.py index 0312f5c6bac..c37da2cbdd0 100644 --- a/src/sparseml/transformers/sparsification/modification/modifying_distilbert.py +++ b/src/sparseml/transformers/sparsification/modification/modifying_distilbert.py @@ -23,7 +23,10 @@ import torch from torch import nn -from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention +from transformers.models.distilbert.modeling_distilbert import ( + DistilBertFlashAttention2, + MultiHeadSelfAttention, +) from sparseml.pytorch.utils.helpers import swap_modules from sparseml.transformers.sparsification.modification.modification_objects import ( @@ -45,6 +48,9 @@ def modify(model: nn.Module) -> nn.Module: 1. Replaces the MultiHeadSelfAttention modules with MultiHeadSelfAttentionWithQuantizableMatmuls modules + Note: This function will not alter any of the alternatives + to the MultiHeadSelfAttention module such as DistilBertFlashAttention2 + :param model: the original DistilBert model :return: the modified DistilBert model """ @@ -53,6 +59,11 @@ def modify(model: nn.Module) -> nn.Module: swap_modules( model, name, MultiHeadSelfAttentionWithQuantizableMatmuls(submodule) ) + if isinstance(submodule, DistilBertFlashAttention2): + _LOGGER.debug( + f"The model contains {submodule.__class__.__name__} " + "module, which will not be modified" + ) return model @@ -92,15 +103,12 @@ def forward( mask: torch.tensor(bs, seq_length) Returns: - weights: torch.tensor(bs, n_heads, seq_length, seq_length) - Attention weights context: torch.tensor(bs, - seq_length, dim) Contextualized layer. - Optional: only if `output_attentions=True` + weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs, # noqa + seq_length, dim) Contextualized layer. 
Optional: only if `output_attentions=True` # noqa """ bs, q_length, dim = query.size() k_length = key.size(1) - # assert dim == self.dim, f'Dimensions do not match: - # {dim} input vs {self.dim} configured' + # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' # noqa # assert key.size() == value.size() dim_per_head = self.dim // self.n_heads diff --git a/src/sparseml/transformers/sparsification/modification/modifying_llama.py b/src/sparseml/transformers/sparsification/modification/modifying_llama.py index ae5998ed69f..6c89469f524 100644 --- a/src/sparseml/transformers/sparsification/modification/modifying_llama.py +++ b/src/sparseml/transformers/sparsification/modification/modifying_llama.py @@ -25,8 +25,10 @@ import torch.nn.functional as F from torch import nn from transformers.models.llama.modeling_llama import ( + Cache, LlamaAttention, LlamaFlashAttention2, + LlamaSdpaAttention, apply_rotary_pos_emb, repeat_kv, ) @@ -54,6 +56,7 @@ def modify(model: nn.Module) -> nn.Module: Note: This function will not alter any of the alternatives to the LlamaAttention module such as LlamaFlashAttention2 + or LlamaSdpaAttention :param model: the original LLaMa model :return: the modified LLaMa model @@ -61,7 +64,7 @@ def modify(model: nn.Module) -> nn.Module: for name, submodule in model.named_modules(): if isinstance(submodule, LlamaAttention): swap_modules(model, name, LlamaAttentionWithQuantizableMatmuls(submodule)) - elif isinstance(submodule, LlamaFlashAttention2): + elif isinstance(submodule, (LlamaSdpaAttention, LlamaFlashAttention2)): _LOGGER.debug( f"The model contains {submodule.__class__.__name__} " "module, which will not be modified" @@ -121,10 +124,11 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -171,20 +175,18 @@ def forward( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + past_key_value = getattr(self, "past_key_value", past_key_value) + cos, sin = self.rotary_emb(value_states, position_ids) query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids + query_states, key_states, cos, sin ) if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None + # sin and cos are specific to RoPE models; cache_position needed for the static cache # noqa + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) @@ -195,33 +197,24 @@ def forward( ) / math.sqrt(self.head_dim) # 
============================== - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size " - f"{(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask # upcast attention to fp32 attn_weights = nn.functional.softmax( attn_weights, dim=-1, dtype=torch.float32 ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) # ==== SparseML MODIFICATION ==== attn_output = self.attn_output_matmul(attn_weights, value_states) # =============================== if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" # noqa f" {attn_output.size()}" ) diff --git a/src/sparseml/transformers/sparsification/modification/modifying_mistral.py b/src/sparseml/transformers/sparsification/modification/modifying_mistral.py index 2c206cce091..28d9d7f109f 100644 --- a/src/sparseml/transformers/sparsification/modification/modifying_mistral.py +++ b/src/sparseml/transformers/sparsification/modification/modifying_mistral.py @@ -18,13 +18,16 @@ """ import logging import math +import warnings from typing import Optional, Tuple import torch from torch import nn from transformers.models.mistral.modeling_mistral import ( + Cache, MistralAttention, MistralFlashAttention2, + MistralSdpaAttention, apply_rotary_pos_emb, repeat_kv, ) @@ -52,6 +55,7 @@ def modify(model: torch.nn.Module) -> torch.nn.Module: Note: This function will not alter any of the alternatives to the MistralAttention module such as MistralFlashAttention2 + or MistralSdpaAttention :param model: the original Mistral model :return: the modified Mistral model @@ -59,7 +63,7 @@ def modify(model: torch.nn.Module) -> torch.nn.Module: for name, submodule in model.named_modules(): if isinstance(submodule, MistralAttention): swap_modules(model, name, MistralAttentionWithQuantizableMatmuls(submodule)) - if isinstance(submodule, MistralFlashAttention2): + if isinstance(submodule, (MistralSdpaAttention, MistralFlashAttention2)): _LOGGER.debug( f"The model contains {submodule.__class__.__name__} " "module, which will not be modified" @@ -112,11 +116,15 @@ def forward( hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[torch.Tensor] = None, + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" # noqa + ) bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) @@ -135,18 +143,23 @@ def forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " # noqa + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " # noqa + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb( query_states, key_states, cos, sin, position_ids ) if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) @@ -160,16 +173,14 @@ def forward( if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( - f"Attention weights should be of size " - f"{(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" # noqa f" {attn_weights.size()}" ) if attention_mask is not None: if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): raise ValueError( - f"Attention mask should be of size " - f"{(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" # noqa ) attn_weights = attn_weights + attention_mask @@ -178,14 +189,16 @@ def forward( attn_weights = nn.functional.softmax( attn_weights, dim=-1, dtype=torch.float32 ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) # ==== SparseML MODIFICATION ==== attn_output = self.attn_output_matmul(attn_weights, value_states) # =============================== if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size " - f"{(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" # noqa f" {attn_output.size()}" ) diff --git a/src/sparseml/transformers/sparsification/modification/modifying_opt.py b/src/sparseml/transformers/sparsification/modification/modifying_opt.py index fad36f05f96..4d2fd58c4f2 100644 --- a/src/sparseml/transformers/sparsification/modification/modifying_opt.py +++ b/src/sparseml/transformers/sparsification/modification/modifying_opt.py @@ -22,7 +22,7 @@ import torch from torch import nn -from transformers.models.opt.modeling_opt import OPTAttention +from transformers.models.opt.modeling_opt import OPTAttention, OptFlashAttention2 from sparseml.pytorch.utils.helpers import swap_modules from sparseml.transformers.sparsification.modification.modification_objects import ( @@ -45,12 +45,20 @@ def modify(model: nn.Module) -> nn.Module: 1. 
Replaces the OPTAttention modules with OPTAttentionWithQuantizableMatmuls modules + Note: This function will not alter any of the alternatives + to the OPTAttention module such as OptFlashAttention2 + :param model: the original LLaMa model :return: the modified LLaMa model """ for name, submodule in model.named_modules(): if isinstance(submodule, OPTAttention): swap_modules(model, name, OPTAttentionWithQuantizableMatmuls(submodule)) + elif isinstance(submodule, OptFlashAttention2): + _LOGGER.debug( + f"The model contains {submodule.__class__.__name__} " + "module, which will not be modified" + ) return model @@ -141,19 +149,13 @@ def forward( value_states = self._shape(self.v_proj(hidden_states), -1, bsz) if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) - # of all cross attention key/value_states. - # Further calls to cross_attention layer - # can then reuse all cross-attention + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. # noqa + # Further calls to cross_attention layer can then reuse all cross-attention # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) - # save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. - # Further calls to uni-directional self-attention - # can concat previous decoder key/value_states - # to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention - # `past_key_value` is always `None` + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of # noqa + # all previous decoder key/value_states. Further calls to uni-directional self-attention # noqa + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) # noqa + # if encoder bi-directional self-attention `past_key_value` is always `None` past_key_value = (key_states, value_states) proj_shape = (bsz * self.num_heads, -1, self.head_dim) @@ -168,16 +170,14 @@ def forward( if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( - f"Attention weights should be of size " - f"{(bsz * self.num_heads, tgt_len, src_len)}, but is" + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" # noqa f" {attn_weights.size()}" ) if attention_mask is not None: if attention_mask.size() != (bsz, 1, tgt_len, src_len): raise ValueError( - f"Attention mask should be of size " - f"{(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" # noqa ) attn_weights = ( attn_weights.view(bsz, self.num_heads, tgt_len, src_len) @@ -191,8 +191,7 @@ def forward( ) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - # upcast to fp32 if the weights are in fp16. - # Please see https://github.com/huggingface/transformers/pull/17437 + # upcast to fp32 if the weights are in fp16. 
Please see https://github.com/huggingface/transformers/pull/17437 # noqa if attn_weights.dtype == torch.float16: attn_weights = nn.functional.softmax( attn_weights, dim=-1, dtype=torch.float32 @@ -203,8 +202,7 @@ def forward( if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): raise ValueError( - f"Head mask for a single layer " - f"should be of size {(self.num_heads,)}, but is" + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" # noqa f" {layer_head_mask.size()}" ) attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view( @@ -236,16 +234,14 @@ def forward( if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( - f"`attn_output` should be of size " - f"{(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" # noqa f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) - # Use the `embed_dim` from the config (stored in the class) - # rather than `hidden_state` because `attn_output` can be + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be # noqa # partitioned aross GPUs when using tensor-parallelism. attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) diff --git a/src/sparseml/transformers/sparsification/obcq/README.md b/src/sparseml/transformers/sparsification/obcq/README.md index b1d2ee6c34b..28f686f5afd 100644 --- a/src/sparseml/transformers/sparsification/obcq/README.md +++ b/src/sparseml/transformers/sparsification/obcq/README.md @@ -194,7 +194,7 @@ test_stage: # These operations don't make sense to quantize - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU # Skip quantizing the BMMs - QuantizableMatMul # Skip quantizing the layers with the most sensitive activations @@ -242,7 +242,7 @@ test_stage: # These operations don't make sense to quantize - MistralRotaryEmbedding - MistralRMSNorm - - SiLUActivation + - SiLU # Skip quantizing the layers with the most sensitive activations - model.layers.1.mlp.down_proj - model.layers.31.mlp.down_proj diff --git a/src/sparseml/transformers/sparsification/obcq/example_llama.yaml b/src/sparseml/transformers/sparsification/obcq/example_llama.yaml index db22f39ad0e..da265bf7d27 100644 --- a/src/sparseml/transformers/sparsification/obcq/example_llama.yaml +++ b/src/sparseml/transformers/sparsification/obcq/example_llama.yaml @@ -10,7 +10,7 @@ test_stage: ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU - model.layers.0.mlp.down_proj - model.layers.1.mlp.down_proj - model.layers.2.mlp.down_proj diff --git a/src/sparseml/transformers/sparsification/obcq/example_mistral.yaml b/src/sparseml/transformers/sparsification/obcq/example_mistral.yaml index ba9c4124c1b..7800c9b9b09 100644 --- a/src/sparseml/transformers/sparsification/obcq/example_mistral.yaml +++ b/src/sparseml/transformers/sparsification/obcq/example_mistral.yaml @@ -4,7 +4,7 @@ test_stage: ignore: - MistralRotaryEmbedding - MistralRMSNorm - - SiLUActivation + - SiLU - model.layers.1.mlp.down_proj - model.layers.31.mlp.down_proj - model.layers.30.mlp.down_proj diff --git a/src/sparseml/transformers/sparsification/question_answering.py b/src/sparseml/transformers/sparsification/question_answering.py index a681122b5d0..ea933b92705 100644 --- 
a/src/sparseml/transformers/sparsification/question_answering.py +++ b/src/sparseml/transformers/sparsification/question_answering.py @@ -79,11 +79,6 @@ def evaluate( eval_dataloader = self.get_eval_dataloader(eval_dataset) eval_examples = self.eval_examples if eval_examples is None else eval_examples - # Always evaluate w/ fp32 to be closer to DeepSparse - use_cuda_amp = self.use_cuda_amp - if not self.args.fp16_full_eval and not self.args.bf16_full_eval: - self.use_cuda_amp = False - # Temporarily disable metric computation, we will do it in the loop here. compute_metrics = self.compute_metrics self.compute_metrics = None @@ -129,8 +124,6 @@ def evaluate( self.args, self.state, self.control, metrics ) - self.use_cuda_amp = use_cuda_amp - return metrics def predict( diff --git a/src/sparseml/transformers/sparsification/trainer.py b/src/sparseml/transformers/sparsification/trainer.py index bc45bec6d97..035c0215e1f 100644 --- a/src/sparseml/transformers/sparsification/trainer.py +++ b/src/sparseml/transformers/sparsification/trainer.py @@ -35,7 +35,7 @@ from transformers.integrations import TensorBoardCallback from transformers.trainer_callback import TrainerState from transformers.trainer_pt_utils import reissue_pt_warnings -from transformers.trainer_utils import ShardedDDPOption, get_last_checkpoint +from transformers.trainer_utils import get_last_checkpoint from sparseml.pytorch.model_load.helpers import log_model_load from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer @@ -787,7 +787,6 @@ def train(self, *args, **kwargs): """ checkpoint, epoch = self._generate_apply_manager_params(kwargs) applied = self.apply_manager(epoch=epoch, checkpoint=checkpoint) - self.callback_disable_fp16.check_disable(epoch, force=True) output = None if not self.one_shot: output = super().train(*args, **kwargs) @@ -811,13 +810,7 @@ def evaluate(self, *args, **kwargs): """ applied = self.apply_manager(epoch=math.inf, checkpoint=None) - # Always evaluate w/ fp32 to be closer to DeepSparse - use_cuda_amp = self.use_cuda_amp - if not self.args.fp16_full_eval and not self.args.bf16_full_eval: - self.use_cuda_amp = False - output = super().evaluate(*args, **kwargs) - self.use_cuda_amp = use_cuda_amp if applied: self.finalize_manager() @@ -894,9 +887,6 @@ def save_optimizer_and_scheduler(self, output_dir: Optional[str] = None): if output_dir is None: output_dir = self.args.output_dir - if self.sharded_ddp == ShardedDDPOption.SIMPLE and self.optimizer is not None: - self.optimizer.consolidate_state_dict() - if self.is_world_process_zero(): if self.optimizer is not None: torch.save( @@ -910,10 +900,6 @@ def save_optimizer_and_scheduler(self, output_dir: Optional[str] = None): os.path.join(output_dir, "scheduler.pt"), ) reissue_pt_warnings(caught_warnings) - if self.use_cuda_amp: - torch.save( - self.scaler.state_dict(), os.path.join(output_dir, "scaler.pt") - ) def _load_optimizer_and_scheduler(self, checkpoint): """ @@ -1027,12 +1013,6 @@ def __init__(self, trainer: RecipeManagerTrainerInterface, *args, **kwargs): self.on_begin_called = False self.quant_start_epoch = math.inf - def check_disable(self, epoch: float, force: bool = False): - if ( - force or hasattr(self.trainer, "scaler") and self.trainer.scaler._enabled - ) and self.qat_active(epoch): - self.disable_amp(epoch) - def qat_active(self, epoch: float) -> bool: manager_q_active = arch_manager_q_active = False if self.trainer.manager: @@ -1043,18 +1023,6 @@ def qat_active(self, epoch: float) -> bool: ) return manager_q_active or 
arch_manager_q_active - def disable_amp(self, epoch: float): - if not self.on_begin_called: - # disable if training loops haven't started so we don't load - # the empty scaler state dict and instead disable it from the start - self.trainer.use_cuda_amp = False - - if hasattr(self.trainer, "scaler"): - self.trainer.scaler._enabled = False - - self.quant_start_epoch = epoch - _LOGGER.info(f"entering QAT phase at epoch {epoch}, disabling FP16 training") - def on_epoch_begin( self, args: TrainingArguments, @@ -1067,7 +1035,6 @@ def on_epoch_begin( """ super().on_epoch_begin(args, state, control, **kwargs) self.on_begin_called = True - self.check_disable(state.epoch) if state.epoch > self.quant_start_epoch: _LOGGER.info(self.trainer.model) diff --git a/tests/sparseml/transformers/finetune/test_quantization.yaml b/tests/sparseml/transformers/finetune/test_quantization.yaml index 825074e227d..89381c31006 100644 --- a/tests/sparseml/transformers/finetune/test_quantization.yaml +++ b/tests/sparseml/transformers/finetune/test_quantization.yaml @@ -4,7 +4,7 @@ test_stage: ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU - model.layers.0.mlp.down_proj - model.layers.1.mlp.down_proj - model.layers.2.mlp.down_proj diff --git a/tests/sparseml/transformers/obcq/test_repeats.py b/tests/sparseml/transformers/obcq/test_repeats.py index 93cd7667841..d4b2d2ee5a0 100644 --- a/tests/sparseml/transformers/obcq/test_repeats.py +++ b/tests/sparseml/transformers/obcq/test_repeats.py @@ -97,7 +97,7 @@ def test_fail_on_repeated_quant(tmp_path): ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU scheme_overrides: Embedding: input_activations: null @@ -110,7 +110,7 @@ def test_fail_on_repeated_quant(tmp_path): ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU - Embedding """ @@ -152,7 +152,7 @@ def test_separate_quants_allowed(tmp_path): ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU - Linear scheme_overrides: Embedding: @@ -166,7 +166,7 @@ def test_separate_quants_allowed(tmp_path): ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU - Embedding - MatMulLeftInput_QK - MatMulRightInput_QK diff --git a/tests/sparseml/transformers/obcq/test_tiny.yaml b/tests/sparseml/transformers/obcq/test_tiny.yaml index 7949b454d90..422baf87580 100644 --- a/tests/sparseml/transformers/obcq/test_tiny.yaml +++ b/tests/sparseml/transformers/obcq/test_tiny.yaml @@ -10,7 +10,7 @@ test_stage: ignore: - LlamaRotaryEmbedding - LlamaRMSNorm - - SiLUActivation + - SiLU - model.layers.0.mlp.down_proj - model.layers.1.mlp.down_proj - model.layers.2.mlp.down_proj From 3b813b6de50411df21330ff634f55bb975cc2023 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Wed, 3 Apr 2024 12:15:17 -0400 Subject: [PATCH 03/19] [Quantization] Add alias mapping for target types (fixes SiLU name change issue) (#2214) --- .../modifiers/quantization/utils/quantize.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/sparseml/modifiers/quantization/utils/quantize.py b/src/sparseml/modifiers/quantization/utils/quantize.py index 123ad4fd722..3b6d17cab65 100644 --- a/src/sparseml/modifiers/quantization/utils/quantize.py +++ b/src/sparseml/modifiers/quantization/utils/quantize.py @@ -46,6 +46,7 @@ __all__ = [ + "LAYER_NAME_ALIASES", "convert_module_qat_from_schemes", "is_qat_helper_module", "is_quantizable_module", @@ -59,6 +60,9 @@ ] +LAYER_NAME_ALIASES: Dict[str, List[str]] = {"SiLU": ["SiLUActivation"]} + + 
def is_qat_helper_module(module: Module) -> bool: """ :param module: module to check @@ -172,7 +176,10 @@ def set_quantization_schemes( submodule_scheme = ( scheme if override_key is None else scheme_overrides[override_key] ) - is_module_type_override = override_key == submodule.__class__.__name__ + is_module_type_override = ( + override_key == submodule.__class__.__name__ + or submodule.__class__.__name__ in LAYER_NAME_ALIASES.get(override_key, []) + ) if getattr(submodule, "wrap_qat", False): # wrap_qat overrides default scheme behavior @@ -404,14 +411,20 @@ def _match_submodule_name_or_type( # 1. match module type name # 2. match the submodule prefix (longest first) submodule_match = "" + submodule_type = submodule.__class__.__name__ for name_or_type in names_or_types: name_to_compare = submodule_name[:] name_to_compare = fix_fsdp_module_name(name_to_compare) if name_to_compare.startswith("module."): name_to_compare = name_to_compare[7:] - if name_or_type == submodule.__class__.__name__: + if name_or_type == submodule_type: # type match, return type name return name_or_type + if submodule_type in LAYER_NAME_ALIASES.get(name_or_type, []): + # submodule type is aliased to a target type in the recipe + # return type in recipe so it can be matched to its target + # scheme + return name_or_type if name_to_compare.startswith(name_or_type) and ( len(name_or_type) > len(submodule_match) ): @@ -474,8 +487,11 @@ def _get_unmatched_types_or_names(types_or_names): name_to_compare = fix_fsdp_module_name(name_to_compare) if name_to_compare.startswith("module."): name_to_compare = name_to_compare[7:] - if name_to_compare.startswith(type_or_name) or ( - submodule.__class__.__name__ == type_or_name + if ( + name_to_compare.startswith(type_or_name) + or (submodule.__class__.__name__ == type_or_name) + or type_or_name + in LAYER_NAME_ALIASES.get(submodule.__class__.__name__, []) ): matched = True break From 5ac1e15d0aa164f64d811c1a4e36fd4f818d7267 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 4 Apr 2024 07:59:10 -0400 Subject: [PATCH 04/19] Support for Decompressing Models from HF Hub (#2212) --- .../compression/compressors/base.py | 9 ++-- .../compression/utils/safetensors_load.py | 51 ++++++++++++++++++- .../sparsification/sparse_model.py | 10 +++- 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/sparseml/transformers/compression/compressors/base.py b/src/sparseml/transformers/compression/compressors/base.py index b1570e0336b..2a1a37d9196 100644 --- a/src/sparseml/transformers/compression/compressors/base.py +++ b/src/sparseml/transformers/compression/compressors/base.py @@ -70,15 +70,14 @@ def replace_layer(param_name: str, data: Tensor, model: Module): model_device = operator.attrgetter(param_name)(model).device set_layer(param_name, Parameter(data.to(model_device)), model) - def overwrite_weights(self, pretrained_model_name_or_path: str, model: Module): + def overwrite_weights(self, model_path: str, model: Module): """ - Overwrites the weights in model with weights decompressed from - pretrained_model_name_or_path + Overwrites the weights in model with weights decompressed from model_path - :param pretrained_model_name_or_path: path to compressed weights + :param model_path: path to compressed weights :param model: pytorch model to load decompressed weights into """ - dense_gen = self.decompress(pretrained_model_name_or_path) + dense_gen = self.decompress(model_path) for name, data in tqdm(dense_gen, desc="Decompressing model"): ModelCompressor.replace_layer(name, data, 
model) setattr(model, SPARSITY_CONFIG_NAME, self.config) diff --git a/src/sparseml/transformers/compression/utils/safetensors_load.py b/src/sparseml/transformers/compression/utils/safetensors_load.py index 7defda7e7f6..4d71482a8e9 100644 --- a/src/sparseml/transformers/compression/utils/safetensors_load.py +++ b/src/sparseml/transformers/compression/utils/safetensors_load.py @@ -16,12 +16,13 @@ import os import re import struct -from typing import Dict, List +from typing import Dict, List, Optional -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, cached_file __all__ = [ + "get_safetensors_folder", "get_safetensors_header", "match_param_name", "merge_names", @@ -30,6 +31,48 @@ ] +def get_safetensors_folder( + pretrained_model_name_or_path: str, cache_dir: Optional[str] = None +) -> str: + """ + Given a Hugging Face stub or a local path, return the folder containing the + safetensors weight files + + :param pretrained_model_name_or_path: local path to model or HF stub + :param cache_dir: optional cache dir to search through, if none is specified the + model will be searched for in the default TRANSFORMERS_CACHE + :return: local folder containing model data + """ + if os.path.exists(pretrained_model_name_or_path): + # argument is a path to a local folder + return pretrained_model_name_or_path + + safetensors_path = cached_file( + pretrained_model_name_or_path, + SAFE_WEIGHTS_NAME, + cache_dir=cache_dir, + _raise_exceptions_for_missing_entries=False, + ) + index_path = cached_file( + pretrained_model_name_or_path, + SAFE_WEIGHTS_INDEX_NAME, + cache_dir=cache_dir, + _raise_exceptions_for_missing_entries=False, + ) + if safetensors_path is not None: + # found a single cached safetensors file + return os.path.split(safetensors_path)[0] + if index_path is not None: + # found a cached safetensors weight index file + return os.path.split(index_path)[0] + + # model weights could not be found locally or cached from HF Hub + raise ValueError( + "Could not locate safetensors weight or index file from " + f"{pretrained_model_name_or_path}." 
+ ) + + def get_safetensors_header(safetensors_path: str) -> Dict[str, str]: """ Extracts the metadata from a safetensors file as JSON @@ -105,6 +148,10 @@ def get_weight_mappings(model_path: str) -> Dict[str, str]: with open(index_path, "r", encoding="utf-8") as f: index = json.load(f) header = index["weight_map"] + else: + raise ValueError( + f"Could not find a safetensors weight or index file at {model_path}" + ) # convert weight locations to full paths for key, value in header.items(): diff --git a/src/sparseml/transformers/sparsification/sparse_model.py b/src/sparseml/transformers/sparsification/sparse_model.py index 4a639fcb4d6..260795f13da 100644 --- a/src/sparseml/transformers/sparsification/sparse_model.py +++ b/src/sparseml/transformers/sparsification/sparse_model.py @@ -35,6 +35,7 @@ log_model_load, ) from sparseml.transformers.compression.utils import ( + get_safetensors_folder, infer_compressor_from_model_config, modify_save_pretrained, ) @@ -128,9 +129,14 @@ def skip(*args, **kwargs): # If model is compressed on disk, decompress and load the weights if compressor is not None: - compressor.overwrite_weights( - pretrained_model_name_or_path=pretrained_model_name_or_path, model=model + # if we loaded from a HF stub, find the cached model + model_path = get_safetensors_folder( + pretrained_model_name_or_path, cache_dir=kwargs.get("cache_dir", None) ) + + # decompress weights + compressor.overwrite_weights(model_path=model_path, model=model) + recipe = resolve_recipe(recipe=recipe, model_path=pretrained_model_name_or_path) if recipe: apply_recipe_structure_to_model( From b9ac6a19ec7112f5181ca9c41fcbe1dddab84727 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Thu, 4 Apr 2024 11:29:22 -0400 Subject: [PATCH 05/19] Fix small typo (#2209) --- .../transformers/sparsification/modification/modifying_opt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparseml/transformers/sparsification/modification/modifying_opt.py b/src/sparseml/transformers/sparsification/modification/modifying_opt.py index 4d2fd58c4f2..373f6fbd467 100644 --- a/src/sparseml/transformers/sparsification/modification/modifying_opt.py +++ b/src/sparseml/transformers/sparsification/modification/modifying_opt.py @@ -48,8 +48,8 @@ def modify(model: nn.Module) -> nn.Module: Note: This function will not alter any of the alternatives to the OPTAttention module such as OptFlashAttention2 - :param model: the original LLaMa model - :return: the modified LLaMa model + :param model: the original OPT model + :return: the modified OPT model """ for name, submodule in model.named_modules(): if isinstance(submodule, OPTAttention): From e5d2fc414511e51d37370f747d92b0e990df24dd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 4 Apr 2024 08:29:43 -0700 Subject: [PATCH 06/19] Remove setuptools restriction from setup.py (#2207) --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 27802de438c..233618fc418 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ _PACKAGE_NAME = "sparseml-nightly" _deps = [ - "setuptools<=59.5.0", "pyyaml>=5.0.0", "numpy>=1.17.0", "matplotlib>=3.0.0", From fd0a7793c2a1b27d7300f40a5f8d1e9a58f9a34d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 5 Apr 2024 11:13:37 -0700 Subject: [PATCH 07/19] Reorder transformers recipe download logs (#2225) * Reorder transformers recipe download logs * Format --- src/sparseml/transformers/utils/helpers.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git 
a/src/sparseml/transformers/utils/helpers.py b/src/sparseml/transformers/utils/helpers.py index 1e488c501b1..ac5a870d263 100644 --- a/src/sparseml/transformers/utils/helpers.py +++ b/src/sparseml/transformers/utils/helpers.py @@ -362,11 +362,8 @@ def recipe_from_huggingface_model_id( f"{HUGGINGFACE_CO_URL_HOME}" ) try: - _LOGGER.info( - f"Found recipe: {recipe_name} for model id: " - f"{model_path}. Downloading..." - ) recipe = hf_hub_download(repo_id=model_path, filename=recipe_name) + _LOGGER.info(f"Found recipe: {recipe_name} for model id: {model_path}.") except Exception as e: _LOGGER.info( f"Unable to to find recipe {recipe_name} " From 88196d5754d721b7124b7741f7dd440d05cfda57 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Fri, 5 Apr 2024 18:34:09 -0400 Subject: [PATCH 08/19] [BugFix] Model State Reload with Quantized Stubs in SparseAutoModelForCausalLM (#2226) * Fix bug for loading models from hf hub * Update to download only relevant files and not the whole model repo * Add py files to relevant suffixes --- .../sparsification/sparse_model.py | 16 +-- src/sparseml/transformers/utils/helpers.py | 98 ++++++++++++++++++- 2 files changed, 101 insertions(+), 13 deletions(-) diff --git a/src/sparseml/transformers/sparsification/sparse_model.py b/src/sparseml/transformers/sparsification/sparse_model.py index 260795f13da..88f90de65d9 100644 --- a/src/sparseml/transformers/sparsification/sparse_model.py +++ b/src/sparseml/transformers/sparsification/sparse_model.py @@ -40,9 +40,7 @@ modify_save_pretrained, ) from sparseml.transformers.sparsification.modification import modify_model -from sparseml.transformers.utils.helpers import resolve_recipe -from sparseml.utils import download_zoo_training_dir -from sparseml.utils.fsdp.context import main_process_first_context +from sparseml.transformers.utils.helpers import download_model_directory, resolve_recipe __all__ = ["SparseAutoModel", "SparseAutoModelForCausalLM", "get_shared_tokenizer_src"] @@ -101,15 +99,9 @@ def skip(*args, **kwargs): else pretrained_model_name_or_path ) - if pretrained_model_name_or_path.startswith("zoo:"): - _LOGGER.debug( - "Passed zoo stub to SparseAutoModelForCausalLM object. " - "Loading model from SparseZoo training files..." 
- ) - with main_process_first_context(): - pretrained_model_name_or_path = download_zoo_training_dir( - zoo_stub=pretrained_model_name_or_path - ) + pretrained_model_name_or_path = download_model_directory( + pretrained_model_name_or_path, **kwargs + ) # determine compression format, if any, from the model config compressor = infer_compressor_from_model_config(pretrained_model_name_or_path) diff --git a/src/sparseml/transformers/utils/helpers.py b/src/sparseml/transformers/utils/helpers.py index ac5a870d263..07463d355e0 100644 --- a/src/sparseml/transformers/utils/helpers.py +++ b/src/sparseml/transformers/utils/helpers.py @@ -34,8 +34,10 @@ from transformers.trainer_utils import get_last_checkpoint from transformers.utils import PaddingStrategy -from huggingface_hub import HUGGINGFACE_CO_URL_HOME, hf_hub_download +from huggingface_hub import HUGGINGFACE_CO_URL_HOME, HfFileSystem, hf_hub_download from sparseml.export.helpers import ONNX_MODEL_NAME +from sparseml.utils import download_zoo_training_dir +from sparseml.utils.fsdp.context import main_process_first_context from sparsezoo import Model, setup_model @@ -52,6 +54,8 @@ "ALL_TASK_NAMES", "create_fake_dataloader", "POSSIBLE_TOKENIZER_FILES", + "download_repo_from_huggingface_hub", + "download_model_directory", ] @@ -92,6 +96,7 @@ class TaskNames(Enum): "special_tokens_map.json", "tokenizer_config.json", } +RELEVANT_HF_SUFFIXES = ["json", "md", "bin", "safetensors", "yaml", "yml", "py"] def remove_past_key_value_support_from_config(config: AutoConfig) -> AutoConfig: @@ -553,3 +558,94 @@ def fetch_recipe_path(target: str): recipe_path = hf_hub_download(repo_id=target, filename=DEFAULT_RECIPE_NAME) return recipe_path + + +def download_repo_from_huggingface_hub(repo_id, **kwargs): + """ + Download relevant model files from the Hugging Face Hub + using the huggingface_hub.hf_hub_download function + + Note(s): + - Does not download the entire repo, only the relevant files + for the model, such as the model weights, tokenizer files, etc. 
+ - Does not re-download files that already exist locally, unless + the force_download flag is set to True + + :pre-condition: the repo_id must be a valid Hugging Face Hub repo id + :param repo_id: the repo id to download + :param kwargs: additional keyword arguments to pass to hf_hub_download + """ + hf_filesystem = HfFileSystem() + files = hf_filesystem.ls(repo_id) + + if not files: + raise ValueError(f"Could not find any files in HF repo {repo_id}") + + # All file(s) from hf_filesystem have "name" key + # Extract the file names from the files + relevant_file_names = ( + Path(file["name"]).name + for file in files + if any(file["name"].endswith(suffix) for suffix in RELEVANT_HF_SUFFIXES) + ) + + hub_kwargs_names = ( + "subfolder", + "repo_type", + "revision", + "library_name", + "library_version", + "cache_dir", + "local_dir", + "local_dir_use_symlinks", + "user_agent", + "force_download", + "force_filename", + "proxies", + "etag_timeout", + "resume_download", + "token", + "local_files_only", + "headers", + "legacy_cache_layout", + "endpoint", + ) + hub_kwargs = {name: kwargs[name] for name in hub_kwargs_names if name in kwargs} + + for file_name in relevant_file_names: + last_file = hf_hub_download(repo_id=repo_id, filename=file_name, **hub_kwargs) + + # parent directory of the last file is the model directory + return str(Path(last_file).parent.resolve().absolute()) + + +def download_model_directory(pretrained_model_name_or_path: str, **kwargs): + """ + Download the model directory from the HF hub or SparseZoo if the model + is not found locally + + :param pretrained_model_name_or_path: the name of or path to the model to load + can be a SparseZoo/HuggingFace model stub + :param kwargs: additional keyword arguments to pass to the download function + :return: the path to the downloaded model directory + """ + pretrained_model_path: Path = Path(pretrained_model_name_or_path) + + if pretrained_model_path.exists(): + _LOGGER.debug( + "Model directory already exists locally.", + ) + return pretrained_model_name_or_path + + with main_process_first_context(): + if pretrained_model_name_or_path.startswith("zoo:"): + _LOGGER.debug( + "Passed zoo stub to SparseAutoModelForCausalLM object. " + "Loading model from SparseZoo training files..." 
+ ) + return download_zoo_training_dir(zoo_stub=pretrained_model_name_or_path) + + _LOGGER.debug("Downloading model from HuggingFace Hub.") + return download_repo_from_huggingface_hub( + repo_id=pretrained_model_name_or_path, **kwargs + ) From 5aae81b30f26a6faa1f9eb34ffabf083ad885275 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Sat, 6 Apr 2024 05:02:35 +0200 Subject: [PATCH 09/19] initial commit (#2223) --- .../sparsification/modification/base.py | 4 +- .../transformers/test_recipe_compatibility.py | 62 +++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 tests/sparseml/transformers/test_recipe_compatibility.py diff --git a/src/sparseml/transformers/sparsification/modification/base.py b/src/sparseml/transformers/sparsification/modification/base.py index 6d9435b8b8b..72d9cc5954a 100644 --- a/src/sparseml/transformers/sparsification/modification/base.py +++ b/src/sparseml/transformers/sparsification/modification/base.py @@ -24,7 +24,7 @@ __all__ = ["check_transformers_version"] _TRANSFORMERS_MIN_VERSION = "4.39.0" -_TRANSFORMERS_MAX_VERSION = "4.39.2" +_TRANSFORMERS_MAX_VERSION = "4.39.3" def check_transformers_version( @@ -56,7 +56,7 @@ def check_transformers_version( _LOGGER.warning( "Attempting to modify the transformers model to support " "the SparseML-specific functionalities. However, the detected " - f"transformers version ({current_version}) does not fall within the" + f"transformers version ({current_version}) does not fall within the " f"supported version range ({min_version} - {max_version}). " "This may lead to unexpected behavior. Please ensure that the " "correct transformers version is installed." diff --git a/tests/sparseml/transformers/test_recipe_compatibility.py b/tests/sparseml/transformers/test_recipe_compatibility.py new file mode 100644 index 00000000000..6e4f13b812c --- /dev/null +++ b/tests/sparseml/transformers/test_recipe_compatibility.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import shutil + +import pytest + +import sparseml.core.session as session_manager +from huggingface_hub import snapshot_download +from sparseml.transformers import SparseAutoModelForCausalLM + + +@pytest.fixture +def model_path(tmp_path): + yield snapshot_download("stas/tiny-random-llama-2", local_dir=tmp_path) + shutil.rmtree(tmp_path) + + +@pytest.fixture +def recipe(): + return """test_stage: + obcq_modifiers: + QuantizationModifier: + ignore: + - LlamaRotaryEmbedding + - LlamaRMSNorm + - {silu_activation} + scheme_overrides: + Embedding: + input_activations: null + weights: + num_bits: 8 + symmetric: false""" + + +def test_silu_alias_same_output(recipe, model_path): + model_ = SparseAutoModelForCausalLM.from_pretrained( + model_path, recipe=recipe.format(silu_activation="SiLU") + ) + session_manager.create_session() + session_manager.active_session().reset() + model = SparseAutoModelForCausalLM.from_pretrained( + model_path, recipe=recipe.format(silu_activation="SiLUActivation") + ) + + dummy_input = model.dummy_inputs + + out = model(**dummy_input) + out_ = model_(**dummy_input) + + out.logits.allclose(out_.logits) From d636d35de8353ab91aad8ee73adef72b68801399 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 8 Apr 2024 10:46:58 -0400 Subject: [PATCH 10/19] ClearML integration (#2197) * clearml * lint * include in loggers * add step * fix torch dep --- setup.py | 1 + src/sparseml/pytorch/utils/logger.py | 105 ++++++++++++++++++++ tests/sparseml/pytorch/utils/test_logger.py | 8 +- tests/sparseml/test_clear_ml.py | 63 ++++++++++++ 4 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 tests/sparseml/test_clear_ml.py diff --git a/setup.py b/setup.py index 233618fc418..5c52df4c8fe 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,7 @@ "GPUtil>=1.4.0", "protobuf>=3.12.2,<=3.20.3", "click>=7.1.2,!=8.0.0", # latest version < 8.0 + blocked version with reported bug + "clearml==1.14.4", ] _nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}~={version_nm_deps}"] _deepsparse_deps = [ diff --git a/src/sparseml/pytorch/utils/logger.py b/src/sparseml/pytorch/utils/logger.py index 0e9a5bc0ff6..82d3fc79845 100644 --- a/src/sparseml/pytorch/utils/logger.py +++ b/src/sparseml/pytorch/utils/logger.py @@ -45,11 +45,21 @@ wandb = None wandb_err = err + +try: + from clearml import Task + + clearml_err = None +except Exception as err: + clearml = None + clearml_err = err + from sparseml.utils import ALL_TOKEN, create_dirs __all__ = [ "BaseLogger", + "ClearMLLogger", "LambdaLogger", "PythonLogger", "TensorBoardLogger", @@ -628,6 +638,101 @@ def save( return True +class ClearMLLogger(LambdaLogger): + @staticmethod + def available() -> bool: + """ + :return: True if wandb is available and installed, False, otherwise + """ + return not clearml_err + + def __init__( + self, + name: str = "clearml", + enabled: bool = True, + project_name: str = "sparseml", + task_name: str = "", + ): + if task_name == "": + now = datetime.now() + task_name = now.strftime("%d-%m-%Y_%H.%M.%S") + + self.task = Task.init(project_name=project_name, task_name=task_name) + + super().__init__( + lambda_func=self.log_scalar, + name=name, + enabled=enabled, + ) + + def log_hyperparams( + self, + params: Dict, + level: Optional[int] = None, + ) -> bool: + """ + :param params: Each key-value pair in the dictionary is the name of the + hyper parameter and it's corresponding value. + :return: True if logged, False otherwise. 
+        """
+        if not self.enabled:
+            return False
+
+        self.task.connect(params)
+        return True
+
+    def log_scalar(
+        self,
+        tag: str,
+        value: float,
+        step: Optional[int] = None,
+        wall_time: Optional[float] = None,
+        level: Optional[int] = None,
+    ) -> bool:
+        """
+        :param tag: identifying tag to log the value with
+        :param value: value to save
+        :param step: global step for when the value was taken
+        :param wall_time: global wall time for when the value was taken,
+            defaults to time.time()
+        :param level: optional logging level to associate with the value
+        :return: True if logged, False otherwise.
+        """
+        logger = self.task.get_logger()
+        # each level's series is superimposed on the same plot, keyed by title
+        logger.report_scalar(
+            title=tag, series=tag if level is None else str(level), value=value, iteration=step
+        )
+        return True
+
+    def log_scalars(
+        self,
+        tag: str,
+        values: Dict[str, float],
+        step: Optional[int] = None,
+        wall_time: Optional[float] = None,
+        level: Optional[int] = None,
+    ) -> bool:
+        """
+        :param tag: identifying tag to log the values with
+        :param values: values to save
+        :param step: global step for when the values were taken
+        :param wall_time: global wall time for when the values were taken,
+            defaults to time.time()
+        :param level: optional logging level to associate with the values
+        :return: True if logged, False otherwise.
+        """
+        for k, v in values.items():
+            self.log_scalar(
+                tag=f"{tag}.{k}",
+                value=v,
+                step=step,
+                wall_time=wall_time,
+                level=level,
+            )
+        return True
+
+
 class SparsificationGroupLogger(BaseLogger):
     """
     Modifier logger that handles outputting values to other supported systems.
diff --git a/tests/sparseml/pytorch/utils/test_logger.py b/tests/sparseml/pytorch/utils/test_logger.py
index 7cceeff3017..82510aea47a 100644
--- a/tests/sparseml/pytorch/utils/test_logger.py
+++ b/tests/sparseml/pytorch/utils/test_logger.py
@@ -20,6 +20,7 @@
 import pytest
 
 from sparseml.pytorch.utils import (
+    ClearMLLogger,
     LambdaLogger,
     LoggerManager,
     PythonLogger,
@@ -45,6 +46,7 @@
         or True
     ),
     *([WANDBLogger()] if WANDBLogger.available() else []),
+    *([ClearMLLogger()] if ClearMLLogger.available() else []),
     SparsificationGroupLogger(
         lambda_func=lambda tag, value, values, step, wall_time, level: logging.info(
             f"{tag}, {value}, {values}, {step}, {wall_time}, {level}"
@@ -79,12 +81,12 @@ def test_log_scalar(self, logger):
 
     def test_log_scalars(self, logger):
         logger.log_scalars("test-scalars-tag", {"scalar1": 0.0, "scalar2": 1.0})
-        logger.log_scalars("test-scalars-tag", {"scalar1": 0.0, "scalar2": 1.0}, 1)
+        logger.log_scalars("test-scalars-tag2", {"scalar1": 0.0, "scalar2": 1.0}, 1)
         logger.log_scalars(
-            "test-scalars-tag", {"scalar1": 0.0, "scalar2": 1.0}, 2, time.time() - 1
+            "test-scalars-tag3", {"scalar1": 0.0, "scalar2": 1.0}, 2, time.time() - 1
         )
         logger.log_scalars(
-            "test-scalars-tag",
+            "test-scalars-tag4",
             {"scalar1": 0.0, "scalar2": 1.0},
             2,
             time.time() - 1,
diff --git a/tests/sparseml/test_clear_ml.py b/tests/sparseml/test_clear_ml.py
new file mode 100644
index 00000000000..987d15a15fe
--- /dev/null
+++ b/tests/sparseml/test_clear_ml.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+from clearml import Task
+from sparseml.transformers import apply
+from sparseml.utils import is_package_available
+
+
+is_torch_available = is_package_available("torch")
+if is_torch_available:
+    import torch
+
+    torch_err = None
+else:
+    torch = object
+    torch_err = ModuleNotFoundError(
+        "`torch` is not installed, use `pip install torch` to run this ClearML test"
+    )
+
+
+def test_oneshot_and_finetune(tmp_path: Path):
+    recipe_str = "tests/sparseml/transformers/finetune/test_alternate_recipe.yaml"
+    model = "Xenova/llama2.c-stories15M"
+    device = "cuda:0"
+    if is_torch_available and not torch.cuda.is_available():
+        device = "cpu"
+    dataset = "wikitext"
+    dataset_config_name = "wikitext-2-raw-v1"
+    concatenate_data = True
+    run_stages = True
+    output_dir = tmp_path
+    max_steps = 50
+    splits = {"train": "train[:50%]", "calibration": "train[50%:60%]"}
+
+    # ClearML automatically captures and logs default entries without
+    # explicitly calling the logger. Logs are accessible at https://app.clear.ml/
+    Task.init(project_name="test", task_name="test_oneshot_and_finetune")
+
+    apply(
+        model=model,
+        dataset=dataset,
+        dataset_config_name=dataset_config_name,
+        run_stages=run_stages,
+        output_dir=output_dir,
+        recipe=recipe_str,
+        max_steps=max_steps,
+        concatenate_data=concatenate_data,
+        splits=splits,
+        oneshot_device=device,
+    )

From 55698e3596a6d89164ac50c70f919f460cec4fca Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Tue, 9 Apr 2024 09:58:25 -0400
Subject: [PATCH 11/19] Remove usage of SparseAutoModel.log_model_load (#2232)

---
 ...ment-analysis-python-custom-teacher-rottentomatoes.ipynb | 4 +---
 .../docs-sentiment-analysis-python-sst2.ipynb               | 6 +-----
 ...ext-classification-python-custom-teacher-tweeteval.ipynb | 5 +----
 .../docs-text-classification-python-qqp.ipynb               | 4 +---
 .../docs-text-classification-python-sick.ipynb              | 1 -
 .../docs-token-classification-python-conll2003.ipynb        | 2 +-
 ...cs-token-classification-python-custom-teacher-wnut.ipynb | 3 +--
 7 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb
index 0eeb728df72..8fd73c5b3d1 100644
--- a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb
+++ b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-custom-teacher-rottentomatoes.ipynb
@@ -364,13 +364,11 @@
 "model_kwargs = {\"config\": model_config}\n",
 "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n",
 "model = AutoModelForSequenceClassification.from_pretrained(model_path,**model_kwargs,)\n",
-"SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n",
 "\n",
 "# initialize teacher using familiar HF AutoModel\n",
 "teacher_kwargs = {\"config\": 
teacher_config}\n", "teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n", - "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path,**teacher_kwargs,)\n", - "SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed)" + "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path,**teacher_kwargs,)" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb index a734dc09fe6..0f34fa42328 100644 --- a/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb +++ b/integrations/huggingface-transformers/tutorials/sentiment-analysis/docs-sentiment-analysis-python-sst2.ipynb @@ -440,11 +440,7 @@ "\n", "teacher_kwargs = {'config':teacher_config}\n", "teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n", - "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n", - "\n", - "# optional - prints metrics about sparsity profiles of the models\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed)\n", - "SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed)" + "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb index 510fd548551..271dcad9f01 100644 --- a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb +++ b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-custom-teacher-tweeteval.ipynb @@ -515,10 +515,7 @@ "# initialize model using familiar HF AutoModel\n", "model_kwargs = {\"config\": config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", - "model = AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs,)\n", - "\n", - "# prints metrics on sparsity profile\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed)" + "model = AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs,)" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb index 15831a3309a..ada42932630 100644 --- a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb +++ b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-qqp.ipynb @@ -375,13 +375,11 @@ "model_kwargs = {\"config\": model_config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs,)\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n", "\n", "# 
initialize teacher using familiar HF AutoModel\n", "teacher_kwargs = {\"config\": teacher_config}\n", "teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n", - "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n", - "SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed) # prints metrics on sparsity profile" + "teacher = AutoModelForSequenceClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n" ] }, { diff --git a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb index 03ec4a54d16..f086165632a 100644 --- a/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb +++ b/integrations/huggingface-transformers/tutorials/text-classification/docs-text-classification-python-sick.ipynb @@ -361,7 +361,6 @@ "model_kwargs = {\"config\": config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_path,**model_kwargs,)\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n", "\n", "# FYI: there is a factory function called SparseAutoModel that does the same as above\n", "# model, teacher = SparseAutoModel.text_classification_from_pretrained_distil(\n", diff --git a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb index a32705402b3..676747fa06e 100644 --- a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb +++ b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-conll2003.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{"id":"kSNEB-3orJ9C"},"source":["# **Token Classification: Sparse Transfer Learning with the Python API**\n","\n","In this example, you will fine-tune a 90% pruned BERT model onto the Conll2003 NER dataset using SparseML's Hugging Face Integration.\n","\n","### **Sparse Transfer Learning Overview**\n","\n","Sparse Transfer Learning is very similiar to typical fine-tuning you are used to when training models. However, with Sparse Transfer Learning, we start the training process from a pre-sparsified checkpoint and maintain the sparsity structure while the fine tuning occurs. \n","\n","At the end, you will have a sparse model trained on your dataset, ready to be deployed with DeepSparse for GPU-class performance on CPUs!\n","\n","### **Pre-Sparsified BERT**\n","SparseZoo, Neural Magic's open source repository of pre-sparsified models, contains a 90% pruned version of BERT, which has been sparsified on the upstream Wikipedia and BookCorpus datasets with the\n","masked language modeling objective. [Check out the model card](https://sparsezoo.neuralmagic.com/models/nlp%2Fmasked_language_modeling%2Fobert-base%2Fpytorch%2Fhuggingface%2Fwikipedia_bookcorpus%2Fpruned90-none). 
We will use this model as the starting point for the transfer learning process.\n","\n","\n","**Let's dive in!**"]},{"cell_type":"markdown","metadata":{"id":"Y0WybTbssU0g"},"source":["## **Installation**\n","\n","Install SparseML via `pip`.\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"collapsed":true,"id":"AkR1u2_NnXqY"},"outputs":[],"source":["!pip install sparseml[transformers]"]},{"cell_type":"markdown","metadata":{"id":"_jY0SKdXFGO3"},"source":["If you are running on Google Colab, restart the runtime after this step."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XXj0S5Jdq2M-"},"outputs":[],"source":["import sparseml\n","from sparsezoo import Model\n","from sparseml.transformers.utils import SparseAutoModel\n","from sparseml.transformers.sparsification import Trainer, TrainingArguments\n","import numpy as np\n","from transformers import (\n"," AutoModelForTokenClassification,\n"," AutoConfig, \n"," AutoTokenizer,\n"," EvalPrediction,\n"," DataCollatorForTokenClassification,\n"," PreTrainedTokenizerFast\n",")\n","from datasets import ClassLabel, load_dataset, load_metric"]},{"cell_type":"markdown","metadata":{"id":"A6GwDnLL2Zn_"},"source":["## **Step 1: Load a Dataset**\n","\n","SparseML is integrated with Hugging Face, so we can use the `datasets` class to load datasets from the Hugging Face hub or from local files. \n","\n","[Conll2003 Dataset Card](https://huggingface.co/datasets/conll2003)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"CkvbT1i9p87z"},"outputs":[],"source":["# load dataset from HF hub\n","dataset = load_dataset(\"conll2003\")\n","\n","# alternatively, load from JSONL file\n","data_files = {}\n","dataset[\"train\"].to_json(\"conll2003-train.json\")\n","dataset[\"validation\"].to_json(\"conll2003-validation.json\")\n","data_files[\"train\"] = \"conll2003-train.json\"\n","data_files[\"validation\"] = \"conll2003-validation.json\"\n","dataset_from_json = load_dataset('json', data_files=data_files)"]},{"cell_type":"markdown","metadata":{"id":"IiFcAKt82qSh"},"source":["We can see the input is `tokens` which is a list of words and the labels are `ner_tags` which are a list of integers corresponding to a tag type for each word."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":725,"status":"ok","timestamp":1677766765729,"user":{"displayName":"Robert 
Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"kc8DQY2HyUWy","outputId":"366d295f-ff33-4409-cba0-1b30963b674f"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\"id\":\"0\",\"tokens\":[\"EU\",\"rejects\",\"German\",\"call\",\"to\",\"boycott\",\"British\",\"lamb\",\".\"],\"pos_tags\":[22,42,16,21,35,37,16,21,7],\"chunk_tags\":[11,21,11,12,21,22,11,12,0],\"ner_tags\":[3,0,7,0,0,0,7,0,0]}\n","{\"id\":\"1\",\"tokens\":[\"Peter\",\"Blackburn\"],\"pos_tags\":[22,22],\"chunk_tags\":[11,12],\"ner_tags\":[1,2]}\n","{\"id\":\"2\",\"tokens\":[\"BRUSSELS\",\"1996-08-22\"],\"pos_tags\":[22,11],\"chunk_tags\":[11,12],\"ner_tags\":[5,0]}\n","{\"id\":\"3\",\"tokens\":[\"The\",\"European\",\"Commission\",\"said\",\"on\",\"Thursday\",\"it\",\"disagreed\",\"with\",\"German\",\"advice\",\"to\",\"consumers\",\"to\",\"shun\",\"British\",\"lamb\",\"until\",\"scientists\",\"determine\",\"whether\",\"mad\",\"cow\",\"disease\",\"can\",\"be\",\"transmitted\",\"to\",\"sheep\",\".\"],\"pos_tags\":[12,22,22,38,15,22,28,38,15,16,21,35,24,35,37,16,21,15,24,41,15,16,21,21,20,37,40,35,21,7],\"chunk_tags\":[11,12,12,21,13,11,11,21,13,11,12,13,11,21,22,11,12,17,11,21,17,11,12,12,21,22,22,13,11,0],\"ner_tags\":[0,3,4,0,0,0,0,0,0,7,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}\n","{\"id\":\"4\",\"tokens\":[\"Germany\",\"'s\",\"representative\",\"to\",\"the\",\"European\",\"Union\",\"'s\",\"veterinary\",\"committee\",\"Werner\",\"Zwingmann\",\"said\",\"on\",\"Wednesday\",\"consumers\",\"should\",\"buy\",\"sheepmeat\",\"from\",\"countries\",\"other\",\"than\",\"Britain\",\"until\",\"the\",\"scientific\",\"advice\",\"was\",\"clearer\",\".\"],\"pos_tags\":[22,27,21,35,12,22,22,27,16,21,22,22,38,15,22,24,20,37,21,15,24,16,15,22,15,12,16,21,38,17,7],\"chunk_tags\":[11,11,12,13,11,12,12,11,12,12,12,12,21,13,11,12,21,22,11,13,11,1,13,11,17,11,12,12,21,1,0],\"ner_tags\":[5,0,0,0,0,3,4,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0]}\n"]}],"source":["!head conll2003-train.json --lines=5"]},{"cell_type":"markdown","metadata":{"id":"1urGaah73OUm"},"source":["## **Step 2: Setup Evaluation Metric**\n","\n","Token classification predicts a category for every word in the input sentence. We can use the [seqeval metric](https://huggingface.co/spaces/evaluate-metric/seqeval) to evaluate the tag-level precision and recall of the pipeline. 
\n","\n","The seqeval metric needs to be passed tags rather than tag indexes, so we need to create a mapping between the indexes and the tags so that we can pass the tags to the seqeval metric.\n","\n","The Conll2003 named-entity-recognition tags map to the following classes:\n","\n","```\n","{\n"," 'O': 0, \n"," 'B-PER': 1, \n"," 'I-PER': 2, \n"," 'B-ORG': 3, \n"," 'I-ORG': 4, \n"," 'B-LOC': 5, \n"," 'I-LOC': 6, \n"," 'B-MISC': 7, \n"," 'I-MISC': 8\n","}\n","```"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7ti52fgQqdSU"},"outputs":[],"source":["# label mapping\n","LABEL_MAP = {\n"," 0: 'O', \n"," 1: 'B-PER', \n"," 2: 'I-PER', \n"," 3: 'B-ORG', \n"," 4: 'I-ORG', \n"," 5: 'B-LOC', \n"," 6: 'I-LOC', \n"," 7: 'B-MISC', \n"," 8: 'I-MISC'\n","}\n","\n","# other configs\n","INPUT_COL = \"tokens\"\n","LABEL_COL = \"ner_tags\"\n","NUM_LABELS = len(LABEL_MAP)\n","SPECIAL_TOKEN_ID = -100"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ZZUOmaW1u7C1"},"outputs":[],"source":["# load evaluation metric\n","metric = load_metric(\"seqeval\")\n","\n","# setup metrics function\n","def compute_metrics(p: EvalPrediction):\n"," predictions, labels = p\n"," predictions = np.argmax(predictions, axis=2)\n"," \n"," # Remove ignored index (special tokens) and convert indexed tags to labels\n"," true_predictions = [\n"," [LABEL_MAP[pred] for (pred, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," true_labels = [\n"," [LABEL_MAP[lab] for (_, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," \n"," # example: results = metrics.compute(predictions=[\"0\", \"B-group\", \"0\"], true_labels=[\"0\", \"B-org\", \"I-org\"])\n"," # we used the LABEL to convert the tags (which are integers) into the corresponding LABEL\n"," # seqeval should be passed the actual labels\n"," results = metric.compute(predictions=true_predictions, references=true_labels)\n"," return {\n"," \"precision\": results[\"overall_precision\"],\n"," \"recall\": results[\"overall_recall\"],\n"," \"f1\": results[\"overall_f1\"],\n"," \"accuracy\": results[\"overall_accuracy\"],\n"," }"]},{"cell_type":"markdown","metadata":{"id":"1GEhYi53HoAH"},"source":["## **Step 3: Download Files for Sparse Transfer Learning**\n","\n","First, we need to select a sparse checkpoint to begin the training process. In this case, we will fine-tune a 90% pruned version of BERT onto the Conll2003 NER dataset. This model is available in SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\n","```\n","\n","Next, we need to create a sparsification recipe for usage in the training process. Recipes are YAML files that encode the sparsity related algorithms and parameters to be applied by SparseML. For Sparse Transfer Learning, we need to use a recipe that instructs SparseML to maintain sparsity during the training process and to apply quantization over the final few epochs. \n","\n","In the case of Conll2003, there is a transfer learning recipe available in the SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\n","```\n","\n","Finally, SparseML has the optional ability to apply model distillation from a teacher model during the transfer learning process to boost accuracy. 
In this case, we will use a dense version of BERT trained on the Conll2003 dataset which is hosted in SparseZoo. This model is identified by the following stub:\n","\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\n","```"]},{"cell_type":"markdown","metadata":{"id":"U_iyuuB4Wq7N"},"source":["Use the `sparsezoo` python client to download the models and recipe using their SparseZoo stubs."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ykg8fEN2Q5o_"},"outputs":[],"source":["# downloads 90% pruned upstream BERT trained on MLM objective (pruned90)\n","model_stub = \"zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\" \n","model_path = Model(model_stub, download_path=\"./model\").training.path\n","\n","# downloads dense BERT trained on CONLL2003 (base_none)\n","teacher_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\"\n","teacher_path = Model(teacher_stub, download_path=\"./teacher\").training.path\n","\n","# download pruned quantized transfer recipe for CONLL2003 (pruned90_quant)\n","transfer_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\"\n","recipe_path = Model(transfer_stub, download_path=\"./transfer_recipe\").recipes.default.path"]},{"cell_type":"markdown","metadata":{"id":"RLe8iEWxV_zz"},"source":["We can see that the upstream model (trained on Wikipedia BookCorpus) and configuration files have been downloaded to the local directory."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":25,"status":"ok","timestamp":1677766834654,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"0NTVj1kPRSCW","outputId":"cfdf5ff4-9b8a-4f4d-a1b0-d9a9fa1dec19"},"outputs":[{"name":"stdout","output_type":"stream","text":["all_results.json special_tokens_map.json trainer_state.json vocab.txt\n","config.json tokenizer_config.json training_args.bin\n","pytorch_model.bin tokenizer.json train_results.json\n"]}],"source":["%ls ./model/training"]},{"cell_type":"markdown","metadata":{"id":"orjvrvdCWEUi"},"source":["We can see that a transfer learning recipe has been downloaded. 
The `ConstantPruningModifier` instructs SparseML to maintain the sparsity structure of the network as the model trains and the `QuantizationModifier` instructs SparseML to run Quantization Aware Training at the end of training."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eUYg-7eBRT5f"},"outputs":[],"source":["%cat ./transfer_recipe/recipe/recipe_original.md"]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"0824QZuAqdVY"},"source":["#### **Inspecting the Recipe**\n","\n","Here is the transfer learning recipe:\n","\n","```yaml\n","version: 1.1.0\n","\n","# General Variables\n","num_epochs: 13\n","init_lr: 1.5e-4 \n","final_lr: 0\n","\n","qat_start_epoch: 8.0\n","observer_epoch: 12.0\n","quantize_embeddings: 1\n","\n","distill_hardness: 1.0\n","distill_temperature: 2.0\n","\n","# Modifiers:\n","\n","training_modifiers:\n"," - !EpochRangeModifier\n"," end_epoch: eval(num_epochs)\n"," start_epoch: 0.0\n","\n"," - !LearningRateFunctionModifier\n"," start_epoch: 0\n"," end_epoch: eval(num_epochs)\n"," lr_func: linear\n"," init_lr: eval(init_lr)\n"," final_lr: eval(final_lr)\n"," \n","quantization_modifiers:\n"," - !QuantizationModifier\n"," start_epoch: eval(qat_start_epoch)\n"," disable_quantization_observer_epoch: eval(observer_epoch)\n"," freeze_bn_stats_epoch: eval(observer_epoch)\n"," quantize_embeddings: eval(quantize_embeddings)\n"," quantize_linear_activations: 0\n"," exclude_module_types: ['LayerNorm']\n"," submodules:\n"," - bert.embeddings\n"," - bert.encoder\n"," - classifier\n","\n","distillation_modifiers:\n"," - !DistillationModifier\n"," hardness: eval(distill_hardness)\n"," temperature: eval(distill_temperature)\n"," distill_output_keys: [logits]\n","\n","constant_modifiers:\n"," - !ConstantPruningModifier\n"," start_epoch: 0.0\n"," params: __ALL_PRUNABLE__\n","```\n","\n","\n","The `Modifiers` in the transfer learning recipe are the important items that encode how SparseML should modify the training process for Sparse Transfer Learning:\n","- `ConstantPruningModifier` tells SparseML to pin weights at 0 over all epochs, maintaining the sparsity structure of the network\n","- `QuantizationModifier` tells SparseML to quanitze the weights with quantization aware training over the last 5 epochs\n","- `DistillationModifier` tells SparseML how to apply distillation during the trainign process, targeting the logits\n","\n","Below, SparseML's `Trainer` will parses the modifiers and updates the training process to implement the algorithms specified here."]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"FStnDScEKoMX"},"source":["## **Step 4: Setup Hugging Face Model Objects**\n","\n","Next, we will set up the Hugging Face `tokenizer`, `config`, and `model`. \n","\n","These are all native Hugging Face objects, so check out the Hugging Face docs for more details on `AutoModel`, `AutoConfig`, and `AutoTokenizer` as needed. 
\n","\n","We instantiate these classes by passing the local path to the directory containing the `pytorch_model.bin`, `tokenizer.json`, and `config.json` files from the SparseZoo download."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"dhN1oGcTQ9RE"},"outputs":[],"source":["# shared tokenizer between teacher and student\n","tokenizer = AutoTokenizer.from_pretrained(model_path)\n","assert(isinstance(tokenizer, PreTrainedTokenizerFast))\n","\n","# setup configs\n","model_config = AutoConfig.from_pretrained(model_path, num_labels=NUM_LABELS)\n","teacher_config = AutoConfig.from_pretrained(teacher_path, num_labels=NUM_LABELS)\n","\n","# initialize model using familiar HF AutoModel\n","model_kwargs = {\"config\": model_config}\n","model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n","model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)\n","model.config.id2label = LABEL_MAP\n","\n","# initialize teacher using familiar HF AutoModel\n","teacher_kwargs = {\"config\": teacher_config}\n","teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n","teacher = AutoModelForTokenClassification.from_pretrained(teacher_path, **teacher_kwargs,)\n","\n","# optional - prints metrics about sparsity profiles of the models\n","SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile\n","SparseAutoModel.log_model_load(teacher, teacher_path, \"teacher\", t_delayed) # prints metrics on sparsity profile"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":336,"status":"ok","timestamp":1677767281068,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"xogGex9gsZ-8","outputId":"cb5fb0b2-60fb-40cf-b886-ca439ad08252"},"outputs":[{"name":"stdout","output_type":"stream","text":["{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}\n","{'B-LOC': 5, 'B-MISC': 7, 'B-ORG': 3, 'B-PER': 1, 'I-LOC': 6, 'I-MISC': 8, 'I-ORG': 4, 'I-PER': 2, 'O': 0}\n"]}],"source":["model.config.id2label = LABEL_MAP\n","model.config.label2id = {LABEL_MAP[id]: id for id in LABEL_MAP.keys()}\n","\n","print(model.config.id2label)\n","print(teacher.config.id2label)\n","\n","print(model.config.label2id)\n","print(teacher.config.label2id)"]},{"cell_type":"markdown","metadata":{"id":"K1JSDkCdMghS"},"source":["## **Step 5: Tokenize Dataset**\n","\n","Run the tokenizer on the dataset. \n","\n","In this function, we handle the case where an individual word is tokenized into multiple tokens. In particular, we set the `label_id = SPECIAL_TOKEN_ID` for each token besides the first token in a word. 
\n","\n","When evaluating the accuracy with `compute_metrics` (defined above), we filter out tokens with `SPECIAL_TOKEN_ID`, such that each word counts only once in the precision and recall calculations."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2EUuFSTzRAvp"},"outputs":[],"source":["MAX_LEN = 128\n","\n","def preprocess_fn(examples):\n"," tokenized_inputs = tokenizer(\n"," examples[INPUT_COL], \n"," padding=\"max_length\", \n"," max_length=min(tokenizer.model_max_length, MAX_LEN), \n"," truncation=True,\n"," is_split_into_words=True # the texts in our dataset are lists of words (with a label for each word)\n"," )\n"," \n"," labels = []\n"," for i, label in enumerate(examples[LABEL_COL]):\n"," word_ids = tokenized_inputs.word_ids(batch_index=i)\n"," previous_word_idx = None\n"," label_ids = []\n"," for word_idx in word_ids:\n"," # Special tokens have a word id that is None. We set the label to SPECIAL_TOKEN_ID\n"," # so they are automatically ignored in the loss function.\n"," if word_idx is None:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n","\n"," # We set the label for the first token of each word.\n"," elif word_idx != previous_word_idx:\n"," label_ids.append(label[word_idx])\n","\n"," # We will not label the other tokens of a word, so set to SPECIAL_TOKEN_ID\n"," else:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n"," previous_word_idx = word_idx\n","\n"," labels.append(label_ids)\n","\n"," tokenized_inputs[\"labels\"] = labels\n"," return tokenized_inputs\n","\n","# tokenize the dataset\n","tokenized_dataset = dataset_from_json.map(\n"," preprocess_fn,\n"," batched=True,\n"," desc=\"Running tokenizer on dataset\"\n",")"]},{"cell_type":"markdown","metadata":{"id":"19mnPsKHN_y1"},"source":["## **Step 6: Run Training**\n","\n","SparseML has a custom `Trainer` class that inherits from the [Hugging Face `Trainer` Class](https://huggingface.co/docs/transformers/main_classes/trainer). As such, the SparseML `Trainer` has all of the existing functionality of the HF trainer. However, in addition, we can supply a `recipe` and (optionally) a `teacher`. \n","\n","\n","As we saw above, the `recipe` encodes the sparsity related algorithms and hyperparameters of the training process in a YAML file. 
The SparseML `Trainer` parses the `recipe` and adjusts the training workflow to apply the algorithms in the recipe.\n","\n","The `teacher` is an optional argument that instructs SparseML to apply model distillation to support the training process."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"34IXj1n6RCgQ"},"outputs":[],"source":["# run with subset of dataset so we can complete in 15 minutes\n","MAX_SAMPLES = 2000\n","if MAX_SAMPLES is not None:\n"," train_dataset = tokenized_dataset[\"train\"].select(range(MAX_SAMPLES))\n","else:\n"," train_dataset = tokenized_dataset[\"train\"]\n","eval_dataset = tokenized_dataset[\"validation\"]\n","\n","# setup trainer arguments\n","training_args = TrainingArguments(\n"," output_dir=\"./training_output\",\n"," do_train=True,\n"," do_eval=True,\n"," resume_from_checkpoint=False,\n"," evaluation_strategy=\"epoch\",\n"," save_strategy=\"epoch\",\n"," logging_strategy=\"epoch\",\n"," save_total_limit=1,\n"," per_device_train_batch_size=32,\n"," per_device_eval_batch_size=32,\n"," fp16=True)\n","\n","# initialize trainer\n","trainer = Trainer(\n"," model=model,\n"," model_state_path=model_path,\n"," recipe=recipe_path,\n"," teacher=teacher,\n"," metadata_args=[\"per_device_train_batch_size\",\"per_device_eval_batch_size\",\"fp16\"],\n"," args=training_args,\n"," train_dataset=train_dataset,\n"," eval_dataset=eval_dataset,\n"," tokenizer=tokenizer,\n"," data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None),\n"," compute_metrics=compute_metrics)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"LLBgAdqyRDro"},"outputs":[],"source":["%rm -rf training_output\n","train_result = trainer.train(resume_from_checkpoint=False)\n","trainer.save_model()\n","trainer.save_state()\n","trainer.save_optimizer_and_scheduler(training_args.output_dir)"]},{"cell_type":"markdown","metadata":{"id":"2vgxbUDKqdVZ"},"source":["## **Step 7: Export To ONNX**\n","\n","Run the following to export the model to ONNX. The script creates a `deployment` folder containing ONNX file and the necessary configuration files (e.g. 
`tokenizer.json`) for deployment with DeepSparse."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-rhWjiHBeR7M"},"outputs":[],"source":["!sparseml.transformers.export_onnx \\\n"," --model_path training_output \\\n"," --task token_classification"]},{"cell_type":"markdown","metadata":{"id":"a72xHJ5594C4"},"source":["## **Deploy with DeepSparse**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-XubpXohO_8A"},"outputs":[],"source":["%pip install deepsparse"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"m_USM8mCPETg"},"outputs":[],"source":["from deepsparse import Pipeline\n","\n","pipeline = Pipeline.create(\"token_classification\", model_path=\"./deployment\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":22,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"Bncg7Xx5ONqB","outputId":"b753f175-289c-4fd1-bc4f-468060349f0b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9966669082641602, index=1, word='japan', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7956981062889099, index=8, word='world', start=23, end=28, is_grouped=False),\n"," TokenClassificationResult(entity='I-MISC', score=0.9346566796302795, index=9, word='cup', start=29, end=32, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.4572566747665405, index=19, word='fifa', start=73, end=77, is_grouped=False)]]\n"]}],"source":["from pprint import pprint\n","prediction = pipeline(\"Japan, co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA, are favourites to regain their title here.\")\n","pprint(prediction.predictions)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"4zqbsVpoSZ-R","outputId":"0032570b-6637-47b8-ad19-b25d834a944b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9878184795379639, index=1, word='china', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7045027613639832, index=18, word='u', start=93, end=94, is_grouped=False),\n"," TokenClassificationResult(entity='I-LOC', score=0.31070953607559204, index=19, word='##zbek', start=94, end=98, is_grouped=False),\n"," TokenClassificationResult(entity='B-PER', score=0.9934289455413818, index=21, word='igor', start=107, end=111, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9966109395027161, index=22, word='sh', start=112, end=114, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9972546696662903, index=23, word='##k', start=114, end=115, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9971543550491333, index=24, word='##vy', start=115, end=117, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9964032173156738, index=25, word='##rin', start=117, end=120, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.8585354089736938, index=44, word='chinese', start=205, end=212, is_grouped=False)]]\n"]}],"source":["prediction = pipeline(\"China controlled most of the match and saw several 
chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net.\")\n","pprint(prediction.predictions)"]}],"metadata":{"accelerator":"GPU","colab":{"provenance":[{"file_id":"1NzTgvXgE5e17JdD1BKXJu-ABvE2-5gKj","timestamp":1677770924764},{"file_id":"1nCs9zm2goooiw0gfU6S4ACRiBiPCxst9","timestamp":1677449350236},{"file_id":"1cXfeYQ_ZbnJRoQsaYOIDR2N7YP--mMiL","timestamp":1677358343826},{"file_id":"1Zawa0sifXr2wIl9tbF7ySJ7xYY0dtTzI","timestamp":1677345946788}]},"gpuClass":"standard","kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.6"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"markdown","metadata":{"id":"kSNEB-3orJ9C"},"source":["# **Token Classification: Sparse Transfer Learning with the Python API**\n","\n","In this example, you will fine-tune a 90% pruned BERT model onto the Conll2003 NER dataset using SparseML's Hugging Face Integration.\n","\n","### **Sparse Transfer Learning Overview**\n","\n","Sparse Transfer Learning is very similiar to typical fine-tuning you are used to when training models. However, with Sparse Transfer Learning, we start the training process from a pre-sparsified checkpoint and maintain the sparsity structure while the fine tuning occurs. \n","\n","At the end, you will have a sparse model trained on your dataset, ready to be deployed with DeepSparse for GPU-class performance on CPUs!\n","\n","### **Pre-Sparsified BERT**\n","SparseZoo, Neural Magic's open source repository of pre-sparsified models, contains a 90% pruned version of BERT, which has been sparsified on the upstream Wikipedia and BookCorpus datasets with the\n","masked language modeling objective. [Check out the model card](https://sparsezoo.neuralmagic.com/models/nlp%2Fmasked_language_modeling%2Fobert-base%2Fpytorch%2Fhuggingface%2Fwikipedia_bookcorpus%2Fpruned90-none). 
We will use this model as the starting point for the transfer learning process.\n","\n","\n","**Let's dive in!**"]},{"cell_type":"markdown","metadata":{"id":"Y0WybTbssU0g"},"source":["## **Installation**\n","\n","Install SparseML via `pip`.\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"collapsed":true,"id":"AkR1u2_NnXqY"},"outputs":[],"source":["!pip install sparseml[transformers]"]},{"cell_type":"markdown","metadata":{"id":"_jY0SKdXFGO3"},"source":["If you are running on Google Colab, restart the runtime after this step."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XXj0S5Jdq2M-"},"outputs":[],"source":["import sparseml\n","from sparsezoo import Model\n","from sparseml.transformers.utils import SparseAutoModel\n","from sparseml.transformers.sparsification import Trainer, TrainingArguments\n","import numpy as np\n","from transformers import (\n"," AutoModelForTokenClassification,\n"," AutoConfig, \n"," AutoTokenizer,\n"," EvalPrediction,\n"," DataCollatorForTokenClassification,\n"," PreTrainedTokenizerFast\n",")\n","from datasets import ClassLabel, load_dataset, load_metric"]},{"cell_type":"markdown","metadata":{"id":"A6GwDnLL2Zn_"},"source":["## **Step 1: Load a Dataset**\n","\n","SparseML is integrated with Hugging Face, so we can use the `datasets` class to load datasets from the Hugging Face hub or from local files. \n","\n","[Conll2003 Dataset Card](https://huggingface.co/datasets/conll2003)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"CkvbT1i9p87z"},"outputs":[],"source":["# load dataset from HF hub\n","dataset = load_dataset(\"conll2003\")\n","\n","# alternatively, load from JSONL file\n","data_files = {}\n","dataset[\"train\"].to_json(\"conll2003-train.json\")\n","dataset[\"validation\"].to_json(\"conll2003-validation.json\")\n","data_files[\"train\"] = \"conll2003-train.json\"\n","data_files[\"validation\"] = \"conll2003-validation.json\"\n","dataset_from_json = load_dataset('json', data_files=data_files)"]},{"cell_type":"markdown","metadata":{"id":"IiFcAKt82qSh"},"source":["We can see the input is `tokens` which is a list of words and the labels are `ner_tags` which are a list of integers corresponding to a tag type for each word."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":725,"status":"ok","timestamp":1677766765729,"user":{"displayName":"Robert 
Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"kc8DQY2HyUWy","outputId":"366d295f-ff33-4409-cba0-1b30963b674f"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\"id\":\"0\",\"tokens\":[\"EU\",\"rejects\",\"German\",\"call\",\"to\",\"boycott\",\"British\",\"lamb\",\".\"],\"pos_tags\":[22,42,16,21,35,37,16,21,7],\"chunk_tags\":[11,21,11,12,21,22,11,12,0],\"ner_tags\":[3,0,7,0,0,0,7,0,0]}\n","{\"id\":\"1\",\"tokens\":[\"Peter\",\"Blackburn\"],\"pos_tags\":[22,22],\"chunk_tags\":[11,12],\"ner_tags\":[1,2]}\n","{\"id\":\"2\",\"tokens\":[\"BRUSSELS\",\"1996-08-22\"],\"pos_tags\":[22,11],\"chunk_tags\":[11,12],\"ner_tags\":[5,0]}\n","{\"id\":\"3\",\"tokens\":[\"The\",\"European\",\"Commission\",\"said\",\"on\",\"Thursday\",\"it\",\"disagreed\",\"with\",\"German\",\"advice\",\"to\",\"consumers\",\"to\",\"shun\",\"British\",\"lamb\",\"until\",\"scientists\",\"determine\",\"whether\",\"mad\",\"cow\",\"disease\",\"can\",\"be\",\"transmitted\",\"to\",\"sheep\",\".\"],\"pos_tags\":[12,22,22,38,15,22,28,38,15,16,21,35,24,35,37,16,21,15,24,41,15,16,21,21,20,37,40,35,21,7],\"chunk_tags\":[11,12,12,21,13,11,11,21,13,11,12,13,11,21,22,11,12,17,11,21,17,11,12,12,21,22,22,13,11,0],\"ner_tags\":[0,3,4,0,0,0,0,0,0,7,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0]}\n","{\"id\":\"4\",\"tokens\":[\"Germany\",\"'s\",\"representative\",\"to\",\"the\",\"European\",\"Union\",\"'s\",\"veterinary\",\"committee\",\"Werner\",\"Zwingmann\",\"said\",\"on\",\"Wednesday\",\"consumers\",\"should\",\"buy\",\"sheepmeat\",\"from\",\"countries\",\"other\",\"than\",\"Britain\",\"until\",\"the\",\"scientific\",\"advice\",\"was\",\"clearer\",\".\"],\"pos_tags\":[22,27,21,35,12,22,22,27,16,21,22,22,38,15,22,24,20,37,21,15,24,16,15,22,15,12,16,21,38,17,7],\"chunk_tags\":[11,11,12,13,11,12,12,11,12,12,12,12,21,13,11,12,21,22,11,13,11,1,13,11,17,11,12,12,21,1,0],\"ner_tags\":[5,0,0,0,0,3,4,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0]}\n"]}],"source":["!head conll2003-train.json --lines=5"]},{"cell_type":"markdown","metadata":{"id":"1urGaah73OUm"},"source":["## **Step 2: Setup Evaluation Metric**\n","\n","Token classification predicts a category for every word in the input sentence. We can use the [seqeval metric](https://huggingface.co/spaces/evaluate-metric/seqeval) to evaluate the tag-level precision and recall of the pipeline. 
\n","\n","The seqeval metric needs to be passed tags rather than tag indexes, so we need to create a mapping between the indexes and the tags so that we can pass the tags to the seqeval metric.\n","\n","The Conll2003 named-entity-recognition tags map to the following classes:\n","\n","```\n","{\n"," 'O': 0, \n"," 'B-PER': 1, \n"," 'I-PER': 2, \n"," 'B-ORG': 3, \n"," 'I-ORG': 4, \n"," 'B-LOC': 5, \n"," 'I-LOC': 6, \n"," 'B-MISC': 7, \n"," 'I-MISC': 8\n","}\n","```"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7ti52fgQqdSU"},"outputs":[],"source":["# label mapping\n","LABEL_MAP = {\n"," 0: 'O', \n"," 1: 'B-PER', \n"," 2: 'I-PER', \n"," 3: 'B-ORG', \n"," 4: 'I-ORG', \n"," 5: 'B-LOC', \n"," 6: 'I-LOC', \n"," 7: 'B-MISC', \n"," 8: 'I-MISC'\n","}\n","\n","# other configs\n","INPUT_COL = \"tokens\"\n","LABEL_COL = \"ner_tags\"\n","NUM_LABELS = len(LABEL_MAP)\n","SPECIAL_TOKEN_ID = -100"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ZZUOmaW1u7C1"},"outputs":[],"source":["# load evaluation metric\n","metric = load_metric(\"seqeval\")\n","\n","# setup metrics function\n","def compute_metrics(p: EvalPrediction):\n"," predictions, labels = p\n"," predictions = np.argmax(predictions, axis=2)\n"," \n"," # Remove ignored index (special tokens) and convert indexed tags to labels\n"," true_predictions = [\n"," [LABEL_MAP[pred] for (pred, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," true_labels = [\n"," [LABEL_MAP[lab] for (_, lab) in zip(prediction, label) if lab != SPECIAL_TOKEN_ID]\n"," for prediction, label in zip(predictions, labels)\n"," ]\n"," \n"," # example: results = metrics.compute(predictions=[\"0\", \"B-group\", \"0\"], true_labels=[\"0\", \"B-org\", \"I-org\"])\n"," # we used the LABEL to convert the tags (which are integers) into the corresponding LABEL\n"," # seqeval should be passed the actual labels\n"," results = metric.compute(predictions=true_predictions, references=true_labels)\n"," return {\n"," \"precision\": results[\"overall_precision\"],\n"," \"recall\": results[\"overall_recall\"],\n"," \"f1\": results[\"overall_f1\"],\n"," \"accuracy\": results[\"overall_accuracy\"],\n"," }"]},{"cell_type":"markdown","metadata":{"id":"1GEhYi53HoAH"},"source":["## **Step 3: Download Files for Sparse Transfer Learning**\n","\n","First, we need to select a sparse checkpoint to begin the training process. In this case, we will fine-tune a 90% pruned version of BERT onto the Conll2003 NER dataset. This model is available in SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\n","```\n","\n","Next, we need to create a sparsification recipe for usage in the training process. Recipes are YAML files that encode the sparsity related algorithms and parameters to be applied by SparseML. For Sparse Transfer Learning, we need to use a recipe that instructs SparseML to maintain sparsity during the training process and to apply quantization over the final few epochs. \n","\n","In the case of Conll2003, there is a transfer learning recipe available in the SparseZoo, identified by the following stub:\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\n","```\n","\n","Finally, SparseML has the optional ability to apply model distillation from a teacher model during the transfer learning process to boost accuracy. 
In this case, we will use a dense version of BERT trained on the Conll2003 dataset which is hosted in SparseZoo. This model is identified by the following stub:\n","\n","```\n","zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\n","```"]},{"cell_type":"markdown","metadata":{"id":"U_iyuuB4Wq7N"},"source":["Use the `sparsezoo` python client to download the models and recipe using their SparseZoo stubs."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ykg8fEN2Q5o_"},"outputs":[],"source":["# downloads 90% pruned upstream BERT trained on MLM objective (pruned90)\n","model_stub = \"zoo:nlp/masked_language_modeling/obert-base/pytorch/huggingface/wikipedia_bookcorpus/pruned90-none\" \n","model_path = Model(model_stub, download_path=\"./model\").training.path\n","\n","# downloads dense BERT trained on CONLL2003 (base_none)\n","teacher_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/base-none\"\n","teacher_path = Model(teacher_stub, download_path=\"./teacher\").training.path\n","\n","# download pruned quantized transfer recipe for CONLL2003 (pruned90_quant)\n","transfer_stub = \"zoo:nlp/token_classification/obert-base/pytorch/huggingface/conll2003/pruned90_quant-none\"\n","recipe_path = Model(transfer_stub, download_path=\"./transfer_recipe\").recipes.default.path"]},{"cell_type":"markdown","metadata":{"id":"RLe8iEWxV_zz"},"source":["We can see that the upstream model (trained on Wikipedia BookCorpus) and configuration files have been downloaded to the local directory."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":25,"status":"ok","timestamp":1677766834654,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"0NTVj1kPRSCW","outputId":"cfdf5ff4-9b8a-4f4d-a1b0-d9a9fa1dec19"},"outputs":[{"name":"stdout","output_type":"stream","text":["all_results.json special_tokens_map.json trainer_state.json vocab.txt\n","config.json tokenizer_config.json training_args.bin\n","pytorch_model.bin tokenizer.json train_results.json\n"]}],"source":["%ls ./model/training"]},{"cell_type":"markdown","metadata":{"id":"orjvrvdCWEUi"},"source":["We can see that a transfer learning recipe has been downloaded. 
The `ConstantPruningModifier` instructs SparseML to maintain the sparsity structure of the network as the model trains and the `QuantizationModifier` instructs SparseML to run Quantization Aware Training at the end of training."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eUYg-7eBRT5f"},"outputs":[],"source":["%cat ./transfer_recipe/recipe/recipe_original.md"]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"0824QZuAqdVY"},"source":["#### **Inspecting the Recipe**\n","\n","Here is the transfer learning recipe:\n","\n","```yaml\n","version: 1.1.0\n","\n","# General Variables\n","num_epochs: 13\n","init_lr: 1.5e-4 \n","final_lr: 0\n","\n","qat_start_epoch: 8.0\n","observer_epoch: 12.0\n","quantize_embeddings: 1\n","\n","distill_hardness: 1.0\n","distill_temperature: 2.0\n","\n","# Modifiers:\n","\n","training_modifiers:\n"," - !EpochRangeModifier\n"," end_epoch: eval(num_epochs)\n"," start_epoch: 0.0\n","\n"," - !LearningRateFunctionModifier\n"," start_epoch: 0\n"," end_epoch: eval(num_epochs)\n"," lr_func: linear\n"," init_lr: eval(init_lr)\n"," final_lr: eval(final_lr)\n"," \n","quantization_modifiers:\n"," - !QuantizationModifier\n"," start_epoch: eval(qat_start_epoch)\n"," disable_quantization_observer_epoch: eval(observer_epoch)\n"," freeze_bn_stats_epoch: eval(observer_epoch)\n"," quantize_embeddings: eval(quantize_embeddings)\n"," quantize_linear_activations: 0\n"," exclude_module_types: ['LayerNorm']\n"," submodules:\n"," - bert.embeddings\n"," - bert.encoder\n"," - classifier\n","\n","distillation_modifiers:\n"," - !DistillationModifier\n"," hardness: eval(distill_hardness)\n"," temperature: eval(distill_temperature)\n"," distill_output_keys: [logits]\n","\n","constant_modifiers:\n"," - !ConstantPruningModifier\n"," start_epoch: 0.0\n"," params: __ALL_PRUNABLE__\n","```\n","\n","\n","The `Modifiers` in the transfer learning recipe are the important items that encode how SparseML should modify the training process for Sparse Transfer Learning:\n","- `ConstantPruningModifier` tells SparseML to pin weights at 0 over all epochs, maintaining the sparsity structure of the network\n","- `QuantizationModifier` tells SparseML to quanitze the weights with quantization aware training over the last 5 epochs\n","- `DistillationModifier` tells SparseML how to apply distillation during the trainign process, targeting the logits\n","\n","Below, SparseML's `Trainer` will parses the modifiers and updates the training process to implement the algorithms specified here."]},{"attachments":{},"cell_type":"markdown","metadata":{"id":"FStnDScEKoMX"},"source":["## **Step 4: Setup Hugging Face Model Objects**\n","\n","Next, we will set up the Hugging Face `tokenizer`, `config`, and `model`. \n","\n","These are all native Hugging Face objects, so check out the Hugging Face docs for more details on `AutoModel`, `AutoConfig`, and `AutoTokenizer` as needed. 
\n","\n","We instantiate these classes by passing the local path to the directory containing the `pytorch_model.bin`, `tokenizer.json`, and `config.json` files from the SparseZoo download."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"dhN1oGcTQ9RE"},"outputs":[],"source":["# shared tokenizer between teacher and student\n","tokenizer = AutoTokenizer.from_pretrained(model_path)\n","assert(isinstance(tokenizer, PreTrainedTokenizerFast))\n","\n","# setup configs\n","model_config = AutoConfig.from_pretrained(model_path, num_labels=NUM_LABELS)\n","teacher_config = AutoConfig.from_pretrained(teacher_path, num_labels=NUM_LABELS)\n","\n","# initialize model using familiar HF AutoModel\n","model_kwargs = {\"config\": model_config}\n","model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n","model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)\n","model.config.id2label = LABEL_MAP\n","\n","# initialize teacher using familiar HF AutoModel\n","teacher_kwargs = {\"config\": teacher_config}\n","teacher_kwargs[\"state_dict\"], t_delayed = SparseAutoModel._loadable_state_dict(teacher_path)\n","teacher = AutoModelForTokenClassification.from_pretrained(teacher_path, **teacher_kwargs,)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":336,"status":"ok","timestamp":1677767281068,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"xogGex9gsZ-8","outputId":"cb5fb0b2-60fb-40cf-b886-ca439ad08252"},"outputs":[{"name":"stdout","output_type":"stream","text":["{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n","{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}\n","{'B-LOC': 5, 'B-MISC': 7, 'B-ORG': 3, 'B-PER': 1, 'I-LOC': 6, 'I-MISC': 8, 'I-ORG': 4, 'I-PER': 2, 'O': 0}\n"]}],"source":["model.config.id2label = LABEL_MAP\n","model.config.label2id = {LABEL_MAP[id]: id for id in LABEL_MAP.keys()}\n","\n","print(model.config.id2label)\n","print(teacher.config.id2label)\n","\n","print(model.config.label2id)\n","print(teacher.config.label2id)"]},{"cell_type":"markdown","metadata":{"id":"K1JSDkCdMghS"},"source":["## **Step 5: Tokenize Dataset**\n","\n","Run the tokenizer on the dataset. \n","\n","In this function, we handle the case where an individual word is tokenized into multiple tokens. In particular, we set the `label_id = SPECIAL_TOKEN_ID` for each token besides the first token in a word. 
\n","\n","When evaluating the accuracy with `compute_metrics` (defined above), we filter out tokens with `SPECIAL_TOKEN_ID`, such that each word counts only once in the precision and recall calculations."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2EUuFSTzRAvp"},"outputs":[],"source":["MAX_LEN = 128\n","\n","def preprocess_fn(examples):\n"," tokenized_inputs = tokenizer(\n"," examples[INPUT_COL], \n"," padding=\"max_length\", \n"," max_length=min(tokenizer.model_max_length, MAX_LEN), \n"," truncation=True,\n"," is_split_into_words=True # the texts in our dataset are lists of words (with a label for each word)\n"," )\n"," \n"," labels = []\n"," for i, label in enumerate(examples[LABEL_COL]):\n"," word_ids = tokenized_inputs.word_ids(batch_index=i)\n"," previous_word_idx = None\n"," label_ids = []\n"," for word_idx in word_ids:\n"," # Special tokens have a word id that is None. We set the label to SPECIAL_TOKEN_ID\n"," # so they are automatically ignored in the loss function.\n"," if word_idx is None:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n","\n"," # We set the label for the first token of each word.\n"," elif word_idx != previous_word_idx:\n"," label_ids.append(label[word_idx])\n","\n"," # We will not label the other tokens of a word, so set to SPECIAL_TOKEN_ID\n"," else:\n"," label_ids.append(SPECIAL_TOKEN_ID)\n"," previous_word_idx = word_idx\n","\n"," labels.append(label_ids)\n","\n"," tokenized_inputs[\"labels\"] = labels\n"," return tokenized_inputs\n","\n","# tokenize the dataset\n","tokenized_dataset = dataset_from_json.map(\n"," preprocess_fn,\n"," batched=True,\n"," desc=\"Running tokenizer on dataset\"\n",")"]},{"cell_type":"markdown","metadata":{"id":"19mnPsKHN_y1"},"source":["## **Step 6: Run Training**\n","\n","SparseML has a custom `Trainer` class that inherits from the [Hugging Face `Trainer` Class](https://huggingface.co/docs/transformers/main_classes/trainer). As such, the SparseML `Trainer` has all of the existing functionality of the HF trainer. However, in addition, we can supply a `recipe` and (optionally) a `teacher`. \n","\n","\n","As we saw above, the `recipe` encodes the sparsity related algorithms and hyperparameters of the training process in a YAML file. 
The SparseML `Trainer` parses the `recipe` and adjusts the training workflow to apply the algorithms in the recipe.\n","\n","The `teacher` is an optional argument that instructs SparseML to apply model distillation to support the training process."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"34IXj1n6RCgQ"},"outputs":[],"source":["# run with subset of dataset so we can complete in 15 minutes\n","MAX_SAMPLES = 2000\n","if MAX_SAMPLES is not None:\n"," train_dataset = tokenized_dataset[\"train\"].select(range(MAX_SAMPLES))\n","else:\n"," train_dataset = tokenized_dataset[\"train\"]\n","eval_dataset = tokenized_dataset[\"validation\"]\n","\n","# setup trainer arguments\n","training_args = TrainingArguments(\n"," output_dir=\"./training_output\",\n"," do_train=True,\n"," do_eval=True,\n"," resume_from_checkpoint=False,\n"," evaluation_strategy=\"epoch\",\n"," save_strategy=\"epoch\",\n"," logging_strategy=\"epoch\",\n"," save_total_limit=1,\n"," per_device_train_batch_size=32,\n"," per_device_eval_batch_size=32,\n"," fp16=True)\n","\n","# initialize trainer\n","trainer = Trainer(\n"," model=model,\n"," model_state_path=model_path,\n"," recipe=recipe_path,\n"," teacher=teacher,\n"," metadata_args=[\"per_device_train_batch_size\",\"per_device_eval_batch_size\",\"fp16\"],\n"," args=training_args,\n"," train_dataset=train_dataset,\n"," eval_dataset=eval_dataset,\n"," tokenizer=tokenizer,\n"," data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None),\n"," compute_metrics=compute_metrics)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"LLBgAdqyRDro"},"outputs":[],"source":["%rm -rf training_output\n","train_result = trainer.train(resume_from_checkpoint=False)\n","trainer.save_model()\n","trainer.save_state()\n","trainer.save_optimizer_and_scheduler(training_args.output_dir)"]},{"cell_type":"markdown","metadata":{"id":"2vgxbUDKqdVZ"},"source":["## **Step 7: Export To ONNX**\n","\n","Run the following to export the model to ONNX. The script creates a `deployment` folder containing ONNX file and the necessary configuration files (e.g. 
`tokenizer.json`) for deployment with DeepSparse."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-rhWjiHBeR7M"},"outputs":[],"source":["!sparseml.transformers.export_onnx \\\n"," --model_path training_output \\\n"," --task token_classification"]},{"cell_type":"markdown","metadata":{"id":"a72xHJ5594C4"},"source":["## **Deploy with DeepSparse**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"-XubpXohO_8A"},"outputs":[],"source":["%pip install deepsparse"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"m_USM8mCPETg"},"outputs":[],"source":["from deepsparse import Pipeline\n","\n","pipeline = Pipeline.create(\"token_classification\", model_path=\"./deployment\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":22,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"Bncg7Xx5ONqB","outputId":"b753f175-289c-4fd1-bc4f-468060349f0b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9966669082641602, index=1, word='japan', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7956981062889099, index=8, word='world', start=23, end=28, is_grouped=False),\n"," TokenClassificationResult(entity='I-MISC', score=0.9346566796302795, index=9, word='cup', start=29, end=32, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.4572566747665405, index=19, word='fifa', start=73, end=77, is_grouped=False)]]\n"]}],"source":["from pprint import pprint\n","prediction = pipeline(\"Japan, co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA, are favourites to regain their title here.\")\n","pprint(prediction.predictions)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6,"status":"ok","timestamp":1677768836760,"user":{"displayName":"Robert Shaw","userId":"06782962127877519905"},"user_tz":300},"id":"4zqbsVpoSZ-R","outputId":"0032570b-6637-47b8-ad19-b25d834a944b"},"outputs":[{"name":"stdout","output_type":"stream","text":["[[TokenClassificationResult(entity='B-LOC', score=0.9878184795379639, index=1, word='china', start=0, end=5, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.7045027613639832, index=18, word='u', start=93, end=94, is_grouped=False),\n"," TokenClassificationResult(entity='I-LOC', score=0.31070953607559204, index=19, word='##zbek', start=94, end=98, is_grouped=False),\n"," TokenClassificationResult(entity='B-PER', score=0.9934289455413818, index=21, word='igor', start=107, end=111, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9966109395027161, index=22, word='sh', start=112, end=114, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9972546696662903, index=23, word='##k', start=114, end=115, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9971543550491333, index=24, word='##vy', start=115, end=117, is_grouped=False),\n"," TokenClassificationResult(entity='I-PER', score=0.9964032173156738, index=25, word='##rin', start=117, end=120, is_grouped=False),\n"," TokenClassificationResult(entity='B-MISC', score=0.8585354089736938, index=44, word='chinese', start=205, end=212, is_grouped=False)]]\n"]}],"source":["prediction = pipeline(\"China controlled most of the match and saw several 
chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net.\")\n","pprint(prediction.predictions)"]}],"metadata":{"accelerator":"GPU","colab":{"provenance":[{"file_id":"1NzTgvXgE5e17JdD1BKXJu-ABvE2-5gKj","timestamp":1677770924764},{"file_id":"1nCs9zm2goooiw0gfU6S4ACRiBiPCxst9","timestamp":1677449350236},{"file_id":"1cXfeYQ_ZbnJRoQsaYOIDR2N7YP--mMiL","timestamp":1677358343826},{"file_id":"1Zawa0sifXr2wIl9tbF7ySJ7xYY0dtTzI","timestamp":1677345946788}]},"gpuClass":"standard","kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.6"},"vscode":{"interpreter":{"hash":"b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"}}},"nbformat":4,"nbformat_minor":0} diff --git a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb index 45370c6af56..b3152185af8 100644 --- a/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb +++ b/integrations/huggingface-transformers/tutorials/token-classification/docs-token-classification-python-custom-teacher-wnut.ipynb @@ -587,8 +587,7 @@ "# initialize model\n", "model_kwargs = {\"config\": config}\n", "model_kwargs[\"state_dict\"], s_delayed = SparseAutoModel._loadable_state_dict(model_path)\n", - "model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)\n", - "SparseAutoModel.log_model_load(model, model_path, \"student\", s_delayed) # prints metrics on sparsity profile" + "model = AutoModelForTokenClassification.from_pretrained(model_path, **model_kwargs,)" ] }, { From 13418248f80fd6c78c51ff02db10dd3d4e5ae647 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 9 Apr 2024 14:15:14 -0400 Subject: [PATCH 12/19] [GHA] Update workflows to use new runners, push to new PyPi internal server (#2217) * add updated workflows which use new runners, push to new internal pypi server and break jobs into individual workflows * update github actions to use main * update pypi push to use main --- .github/workflows/build-container.yml | 74 ++++-------- .../workflows/build-wheel-and-container.yml | 77 ++++++++++++ .github/workflows/build-wheel.yml | 113 ++++++++++-------- .../workflows/test-wheel-push-to-internal.yml | 68 +++++++++++ tests/sparseml/test_base.py | 16 ++- 5 files changed, 241 insertions(+), 107 deletions(-) create mode 100644 .github/workflows/build-wheel-and-container.yml create mode 100644 .github/workflows/test-wheel-push-to-internal.yml diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index fc81e513e30..9eda86ae0d0 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -1,80 +1,56 @@ -name: Build Docker Container +name: Build Container with wheel and push to GCR on: - pull_request: - types: [opened, synchronize, reopened] - branches: - - main - - 'release/[0-9]+.[0-9]+' - push: - branches: - - 'main' - release: - types: [created, published] - schedule: - - cron: '0 2 * * *' - -# TODO: docker containers created 
through a release cut vs PR to the release branch -# will be pushed to different locations (i.e one will be sparseml the other will be test-sparseml). -# These containers rely on the new internal pypi server being enabled. Once enabled, -# this workflow can be expanded to make this distinction. -env: - RELEASE: ${{ github.event_name =='release' || (startsWith(github.base_ref, 'release/') && github.event_name == 'pull_request')}} - DEV: ${{ github.base_ref == 'main' && github.event_name == 'pull_request'}} - NAME: ${{ github.event.number }} - -permissions: - contents: read - packages: write + workflow_call: + inputs: + build-label: + description: "requested runner label" + type: string + dev: + type: string + required: true + release: + type: string + required: true + name: + type: string jobs: build-container: - name: Build sparseml container - runs-on: ubuntu-20.04 + runs-on: ${{ inputs.build-label }} steps: - name: Checkout code uses: actions/checkout@v3 with: fetch-depth: 1 + - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v2 with: buildkitd-flags: --debug + - name: Get current date id: date - run: echo "::set-output name=date::$(date +'%Y%m%d')" + run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT + - name: Get the current version - if: ${{ env.RELEASE == 'true' }} + if: ${{ inputs.release == 'true' }} id: version - run: echo "::set-output name=version::$(echo ${{ github.base_ref }} | cut -c 9-15)" + run: echo "version=$(echo ${{ github.base_ref }} | cut -c 9-15)" >> $GITHUB_OUTPUT + - name: Login to Github Packages uses: docker/login-action@v2 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Build Dev Docker Container - if: ${{ env.DEV == 'true' }} + if: ${{ inputs.dev == 'true' }} uses: docker/build-push-action@v4 with: context: ./docker/containers/docker_dev build-args: | BRANCH=${{github.head_ref}} push: true - tags: ghcr.io/neuralmagic/sparseml-dev:${{ env.NAME }} - - name: Build Release Docker Container - if: ${{ env.RELEASE == 'true' }} - uses: docker/build-push-action@v4 - with: - context: ./docker/containers/docker_release - build-args: | - VERSION=${{ steps.version.outputs.version }} - push: true - tags: ghcr.io/neuralmagic/test-sparseml:latest, ghcr.io/neuralmagic/test-sparseml:${{ steps.version.outputs.version }} - - name: Build Nightly Docker Container - if: ${{ env.DEV == 'false' && env.RELEASE == 'false'}} - uses: docker/build-push-action@v4 - with: - context: ./docker/containers/docker_nightly - push: true - tags: ghcr.io/neuralmagic/test-sparseml-nightly:latest, ghcr.io/neuralmagic/test-sparseml-nightly:${{ steps.date.outputs.date }} + tags: ghcr.io/neuralmagic/sparseml-dev:${{ inputs.name }} \ No newline at end of file diff --git a/.github/workflows/build-wheel-and-container.yml b/.github/workflows/build-wheel-and-container.yml new file mode 100644 index 00000000000..05ec8a57d72 --- /dev/null +++ b/.github/workflows/build-wheel-and-container.yml @@ -0,0 +1,77 @@ +name: Build PyPi Wheel and Docker Container +on: + pull_request: + types: [opened, synchronize, reopened] + branches: + - main + - 'release/[0-9]+.[0-9]+' + push: + branches: + - 'release/[0-9]+.[0-9]+' + - main + release: + types: [created, published] + schedule: + - cron: '0 0 * * *' + +permissions: + id-token: write + contents: read + packages: write + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +# if not dev or release, will create a nightly build +env: + 
PRODUCTION: ${{ github.event_name == 'schedule' || github.event_name == 'release'}} + RELEASE: ${{ github.event_name =='release' || startsWith(github.base_ref, 'release/') }} + DEV: ${{ github.base_ref == 'main' && github.event_name == 'pull_request'}} + +jobs: + set-outputs: + runs-on: ubuntu-latest + outputs: + dev: ${{ steps.set-outputs.outputs.dev }} + release: ${{ steps.set-outputs.outputs.release }} + steps: + - name: Set variables for workflows + id: set-outputs + run: | + echo "dev=$DEV" >> $GITHUB_OUTPUT + echo "release=$RELEASE" >> $GITHUB_OUTPUT + + build-wheel-and-push: + needs: set-outputs + uses: ./.github/workflows/build-wheel.yml + with: + build-label: ubuntu-20.04 + dev: ${{ needs.set-outputs.outputs.dev }} + release: ${{ needs.set-outputs.outputs.release }} + name: ${{ github.event.number }} + filename: dist/*.whl + bucket_name: nm-actions-test + python: '3.10' + secrets: inherit + + test-wheel-and-push-internal: + needs: build-wheel-and-push + uses: ./.github/workflows/test-wheel-push-to-internal.yml + with: + build-label: aws-avx2-64G + whl: ${{ needs.build-wheel-and-push.outputs.wheel }} + python: '3.10' + secrets: inherit + + # TODO: add nightly and release container build steps once wheel build push + # to production is automated. Removed until then. + build-container-and-push: + needs: [set-outputs, test-wheel-and-push-internal] + uses: ./.github/workflows/build-container.yml + with: + build-label: aws-avx2-64G + dev: ${{ needs.set-outputs.outputs.dev }} + release: ${{ needs.set-outputs.outputs.release }} + name: ${{ github.event.number }} + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml index 292d874dd2f..6eb656b4eaf 100644 --- a/.github/workflows/build-wheel.yml +++ b/.github/workflows/build-wheel.yml @@ -1,58 +1,67 @@ -name: Build PyPi Wheel +name: Build Wheel and Push to s3 on: - pull_request: - types: [opened, synchronize, reopened] - branches: - - main - - 'release/[0-9]+.[0-9]+' - push: - branches: - - main - release: - types: [created, published] - schedule: - - cron: '0 0 * * *' - -permissions: - id-token: write - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -# if not dev or release, will create a nightly build -# everything is pushed to internal unless created through a nightly scheduled cron job which creates the build or -# missing release tag workflow/needs to be added in -env: - INTERNAL: ${{ github.event_name != 'schedule' && github.event_name != 'release'}} - RELEASE: ${{ github.event_name =='release' || (startsWith(github.base_ref, 'release/') && github.event_name == 'pull_request')}} - DEV: ${{ github.base_ref == 'main' && github.event_name == 'pull_request'}} - NAME: ${{ github.event.number }} + workflow_call: + inputs: + build-label: + description: "requested runner label" + type: string + required: true + dev: + type: string + required: true + release: + type: string + required: true + name: + type: string + filename: + type: string + required: true + bucket_name: + type: string + required: true + python: + type: string + outputs: + wheel: + value: ${{ jobs.build-wheel-and-push.outputs.wheel }} jobs: - build_and_push: - runs-on: ubuntu-latest + build-wheel-and-push: + runs-on: ${{ inputs.build-label }} outputs: wheel: ${{ steps.push-wheel.outputs.wheel }} steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Login to s3 - uses: 
aws-actions/configure-aws-credentials@v2 - with: - role-to-assume: ${{ secrets.AWS_WEBIDENTITY_FOR_GITHUB_ACTIONS }} - aws-region: us-east-1 - - name: Build PyPi Wheel - id: build-wheel - uses: neuralmagic/nm-actions/actions/pypi_build@main - with: - dev: $DEV - release: $RELEASE - name: $NAME - - name: Push to s3 bucket - id: push-wheel - uses: neuralmagic/nm-actions/actions/s3_push@main - with: - filename: dist/*.whl - internal: $INTERNAL + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python }} + + - name: Checkout code + uses: actions/checkout@v3 + + - name: Login to s3 + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: ${{ secrets.AWS_WEBIDENTITY_FOR_GITHUB_ACTIONS }} + aws-region: us-east-1 + + - name: Set Env + run: | + pip3 install virtualenv + virtualenv venv + source venv/bin/activate + + - name: Build PyPi Wheel + id: build-wheel + uses: neuralmagic/nm-actions/actions/pypi_build@main + with: + dev: ${{ inputs.dev }} + release: ${{ inputs.release }} + name: ${{ inputs.name }} + + - name: Push to s3 bucket + id: push-wheel + uses: neuralmagic/nm-actions/actions/s3_push@main + with: + filename: ${{ inputs.filename }} + bucket_name: ${{ inputs.bucket_name }} \ No newline at end of file diff --git a/.github/workflows/test-wheel-push-to-internal.yml b/.github/workflows/test-wheel-push-to-internal.yml new file mode 100644 index 00000000000..1b4f52dc701 --- /dev/null +++ b/.github/workflows/test-wheel-push-to-internal.yml @@ -0,0 +1,68 @@ +name: Test Wheel and Push to Internal PyPi +on: + workflow_call: + inputs: + build-label: + description: "requested runner label" + type: string + required: true + whl: + type: string + required: true + python: + type: string + +jobs: + test-wheel-and-push-internal: + runs-on: ${{ inputs.build-label }} + steps: + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python }} + + - name: Login to s3 + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: ${{ secrets.AWS_WEBIDENTITY_FOR_GITHUB_ACTIONS }} + aws-region: us-east-1 + + - name: Make directory for wheel + run: | + mkdir dist_s3 + + - name: Pull from s3 + uses: neuralmagic/nm-actions/actions/s3_pull@main + with: + filename: ${{ inputs.whl }} + dst: dist_s3 + + - name: Set Env + run: | + pip3 install virtualenv + virtualenv venv + source venv/bin/activate + + - name: Fetch name of whl + run: | + echo "FILENAME=$(echo dist_s3/*.whl)" >> $GITHUB_ENV + + - name: Push to internal pypi + uses: neuralmagic/nm-actions/actions/nm-upload-whl@main + with: + server: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }} + username: ${{ secrets.NM_PRIVATE_PYPI_USER }} + password: ${{ secrets.NM_PRIVATE_PYPI_AUTH }} + whl: ./$FILENAME + port: 8080 + + - name: Install whl + run: | + pip3 install $FILENAME[dev] + + - name: Checkout code + uses: actions/checkout@v3 + + - name: Remove src files and run tests + run: | + rm -rf src + make test \ No newline at end of file diff --git a/tests/sparseml/test_base.py b/tests/sparseml/test_base.py index 6804f9e49dd..02166a6b828 100644 --- a/tests/sparseml/test_base.py +++ b/tests/sparseml/test_base.py @@ -70,7 +70,9 @@ def test_execute_in_sparseml_framework(): def test_get_version(): version = get_version( - "sparseml", raise_on_error=True, alternate_package_names=["sparseml-nightly"] + "sparseml", + raise_on_error=True, + alternate_package_names=["sparseml-nightly", "sparseml-dev"], ) assert version == __version__ @@ -81,32 +83,34 @@ def test_get_version(): def test_check_version(): - assert 
check_version("sparseml", alternate_package_names=["sparseml-nightly"]) + assert check_version( + "sparseml", alternate_package_names=["sparseml-nightly", "sparseml-dev"] + ) assert not check_version( "sparseml", min_version="10.0.0", raise_on_error=False, - alternate_package_names=["sparseml-nightly"], + alternate_package_names=["sparseml-nightly", "sparseml-dev"], ) with pytest.raises(ImportError): check_version( "sparseml", min_version="10.0.0", - alternate_package_names=["sparseml-nightly"], + alternate_package_names=["sparseml-nightly", "sparseml-dev"], ) assert not check_version( "sparseml", max_version="0.0.1", raise_on_error=False, - alternate_package_names=["sparseml-nightly"], + alternate_package_names=["sparseml-nightly", "sparseml-dev"], ) with pytest.raises(ImportError): check_version( "sparseml", max_version="0.0.1", - alternate_package_names=["sparseml-nightly"], + alternate_package_names=["sparseml-nightly", "sparseml-dev"], ) assert not check_version("unknown", raise_on_error=False) From 501e7098d3c0ed48cdd2eb52d71ac9422dd6e2eb Mon Sep 17 00:00:00 2001 From: Jeannie Finks <74554921+jeanniefinks@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:32:44 -0400 Subject: [PATCH 13/19] Update README.md Updating outdated Slack link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1b7cc28fda9..b15ec4f30fe 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ limitations under the License. Documentation - + @@ -226,7 +226,7 @@ We appreciate contributions to the code, examples, integrations, and documentati ### Join -For user help or questions about SparseML, sign up or log in to our [**Neural Magic Community Slack**](https://join.slack.com/t/discuss-neuralmagic/shared_invite/zt-q1a1cnvo-YBoICSIw3L1dmQpjBeDurQ). We are growing the community member by member and happy to see you there. Bugs, feature requests, or additional questions can also be posted to our [GitHub Issue Queue.](https://github.com/neuralmagic/sparseml/issues) +For user help or questions about SparseML, sign up or log in to our [**Neural Magic Community Slack**](https://neuralmagic.com/community/). We are growing the community member by member and happy to see you there. Bugs, feature requests, or additional questions can also be posted to our [GitHub Issue Queue.](https://github.com/neuralmagic/sparseml/issues) You can get the latest news, webinar and event invites, research papers, and other ML Performance tidbits by [subscribing](https://neuralmagic.com/subscribe/) to the Neural Magic community. 
From e9a686660b3513c587d6c2acd9fbcb97f402f42a Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 10 Apr 2024 10:41:02 -0400 Subject: [PATCH 14/19] Support for Passing in Tokenized Data to One-Shot (#2202) * initial commit * quality * fixing unit tests --- .../transformers/finetune/data/data_args.py | 6 +++ .../finetune/data/data_helpers.py | 16 ++++-- src/sparseml/transformers/finetune/runner.py | 15 ++++-- .../transformers/finetune/text_generation.py | 5 +- .../finetune/data/test_dataset_loading.py | 53 +++++++++++++++++-- 5 files changed, 76 insertions(+), 19 deletions(-) diff --git a/src/sparseml/transformers/finetune/data/data_args.py b/src/sparseml/transformers/finetune/data/data_args.py index c332ac65bb7..9517a19e4de 100644 --- a/src/sparseml/transformers/finetune/data/data_args.py +++ b/src/sparseml/transformers/finetune/data/data_args.py @@ -118,6 +118,12 @@ class DataTrainingArguments(CustomDataTrainingArguments): default=512, metadata={"help": "Number of samples to use for one-shot calibration"}, ) + shuffle_calibration_samples: Optional[bool] = field( + default=True, + metadata={ + "help": "whether to shuffle the dataset before selecting calibration data" + }, + ) streaming: Optional[bool] = field( default=False, metadata={"help": "True to stream data from a cloud dataset"}, diff --git a/src/sparseml/transformers/finetune/data/data_helpers.py b/src/sparseml/transformers/finetune/data/data_helpers.py index 243f4085023..8fa8eb9bca3 100644 --- a/src/sparseml/transformers/finetune/data/data_helpers.py +++ b/src/sparseml/transformers/finetune/data/data_helpers.py @@ -18,7 +18,7 @@ import torch from datasets import Dataset, load_dataset -from torch.utils.data import DataLoader, RandomSampler +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from transformers.data import default_data_collator @@ -36,6 +36,7 @@ def format_calibration_data( tokenized_dataset: Dataset, num_calibration_samples: Optional[int] = None, + do_shuffle: bool = True, collate_fn: Callable = default_data_collator, accelerator: Optional[Any] = None, ) -> List[torch.Tensor]: @@ -45,6 +46,8 @@ def format_calibration_data( :param tokenized_dataset: dataset to convert to dataloader :param num_calibration_samples: number of data samples to convert + :param do_shuffle: whether to shuffle the dataset before selecting calibration + samples, true by default :param collate_fn: optional custom collate function, or use default :param accelerator: optional accelerator for if preparing in FSDP mode :return: list of trimmed calibration data tensors @@ -58,17 +61,20 @@ def format_calibration_data( f"the provided dataset only has {safe_calibration_samples}. 
" ) - shuffled_calibration = tokenized_dataset.shuffle() - shuffled_calibration = shuffled_calibration.select(range(safe_calibration_samples)) + if do_shuffle: + tokenized_dataset = tokenized_dataset.shuffle() + tokenized_calibration = tokenized_dataset.select(range(safe_calibration_samples)) dataloader_params = { "batch_size": 1, - "sampler": RandomSampler(shuffled_calibration), + "sampler": RandomSampler(tokenized_calibration) + if do_shuffle + else SequentialSampler(tokenized_calibration), "collate_fn": collate_fn, "pin_memory": True, } - calib_dataloader = DataLoader(shuffled_calibration, **dataloader_params) + calib_dataloader = DataLoader(tokenized_calibration, **dataloader_params) if accelerator: calib_dataloader = accelerator.prepare(calib_dataloader) diff --git a/src/sparseml/transformers/finetune/runner.py b/src/sparseml/transformers/finetune/runner.py index e970e3b7264..df1aa0ca967 100644 --- a/src/sparseml/transformers/finetune/runner.py +++ b/src/sparseml/transformers/finetune/runner.py @@ -19,7 +19,6 @@ from typing import List, Optional import torch -from torch.nn import Module from torch.utils.data import Dataset from transformers import AutoTokenizer @@ -72,7 +71,6 @@ def __init__( data_args: "DataTrainingArguments", model_args: "ModelArguments", training_args: "TrainingArguments", - model: Module, ): self._data_args = data_args self._model_args = model_args @@ -121,9 +119,15 @@ def _get_split_name(inp_str): tokenizer=tokenizer, ) - raw_dataset = dataset_manager.get_raw_dataset(self._model_args.cache_dir) - tokenized_dataset = dataset_manager.tokenize_and_process(raw_dataset) - tokenized_datasets[split_name] = tokenized_dataset + dataset = self._data_args.dataset + if hasattr(dataset, "column_names") and "input_ids" in dataset.column_names: + # dataset is already tokenized + tokenized_datasets[split_name] = dataset + else: + # dataset needs to be tokenized + raw_dataset = dataset_manager.get_raw_dataset() + tokenized_dataset = dataset_manager.tokenize_and_process(raw_dataset) + tokenized_datasets[split_name] = tokenized_dataset self.datasets = make_dataset_splits( tokenized_datasets, @@ -154,6 +158,7 @@ def one_shot(self, stage: Optional[str] = None): calib_data = format_calibration_data( tokenized_dataset=self.get_dataset_split("calibration"), num_calibration_samples=self._data_args.num_calibration_samples, + do_shuffle=self._data_args.shuffle_calibration_samples, accelerator=self.trainer.accelerator, ) diff --git a/src/sparseml/transformers/finetune/text_generation.py b/src/sparseml/transformers/finetune/text_generation.py index 6005c26f034..a25778aa5fa 100644 --- a/src/sparseml/transformers/finetune/text_generation.py +++ b/src/sparseml/transformers/finetune/text_generation.py @@ -319,10 +319,7 @@ def main( # Load datasets stage_runner = StageRunner( - model_args=model_args, - data_args=data_args, - training_args=training_args, - model=model, + model_args=model_args, data_args=data_args, training_args=training_args ) stage_runner.populate_datasets(tokenizer=tokenizer) train_dataset = stage_runner.get_dataset_split("train") diff --git a/tests/sparseml/transformers/finetune/data/test_dataset_loading.py b/tests/sparseml/transformers/finetune/data/test_dataset_loading.py index 6493689416f..cd2c230b581 100644 --- a/tests/sparseml/transformers/finetune/data/test_dataset_loading.py +++ b/tests/sparseml/transformers/finetune/data/test_dataset_loading.py @@ -14,10 +14,12 @@ # limitations under the License. 
import pytest -from datasets import IterableDataset +import torch +from datasets import IterableDataset, load_dataset from sparseml.transformers.finetune.data import TextGenerationDataset from sparseml.transformers.finetune.data.data_args import DataTrainingArguments +from sparseml.transformers.finetune.data.data_helpers import format_calibration_data from sparseml.transformers.finetune.model_args import ModelArguments from sparseml.transformers.finetune.runner import StageRunner from sparseml.transformers.finetune.training_args import TrainingArguments @@ -229,13 +231,54 @@ def test_split_loading(split_def, tiny_llama_tokenizer): training_args = TrainingArguments(do_train=True, output_dir="dummy") model_args = ModelArguments(model=None) stage_runner = StageRunner( - model_args=model_args, - data_args=data_args, - training_args=training_args, - model=None, + model_args=model_args, data_args=data_args, training_args=training_args ) stage_runner.populate_datasets(tokenizer=tiny_llama_tokenizer) train_dataset = stage_runner.get_dataset_split("train") assert train_dataset is not None assert isinstance(train_dataset[0], dict) + + +def test_load_tokenized_data(tiny_llama_tokenizer): + dataset = load_dataset("garage-bAInd/Open-Platypus")["train"] + NUM_CALIB_SAMPS = 256 + MAX_SEQ_LEN = 512 + dataset = dataset.shuffle(seed=42).select(range(NUM_CALIB_SAMPS)) + + def preprocess(sample): + concat_text = "INPUT: " + sample.get("input", "") + concat_text += "INSTRUCTIONS: " + sample.get("instruction", "") + concat_text += "OUTPUT: " + sample.get("output", "") + + return tiny_llama_tokenizer( + concat_text, padding=False, max_length=MAX_SEQ_LEN, truncation=True + ) + + tokenized_dataset = dataset.map( + preprocess, remove_columns=["input", "output", "instruction", "data_source"] + ) + stage_runner = StageRunner( + model_args=None, + data_args=DataTrainingArguments( + dataset=tokenized_dataset, shuffle_calibration_samples=False + ), + training_args=TrainingArguments(do_oneshot=True), + ) + stage_runner.populate_datasets(tokenizer=None) + calib_dataset = stage_runner.get_dataset_split("calibration") + assert len(calib_dataset) == NUM_CALIB_SAMPS + data_cols = calib_dataset.column_names + assert len(data_cols) == 2 + assert "input_ids" in data_cols and "attention_mask" in data_cols + + # confirm turning shuffle off works + calib_dataloader = format_calibration_data( + tokenized_dataset=calib_dataset, + num_calibration_samples=NUM_CALIB_SAMPS, + do_shuffle=stage_runner._data_args.shuffle_calibration_samples, + ) + assert len(calib_dataloader) == NUM_CALIB_SAMPS + dataloader_sample = next(iter(calib_dataloader))["input_ids"] + diff = dataloader_sample - torch.Tensor(calib_dataset[0]["input_ids"]) + assert torch.sum(diff) == 0 From 2de5c92d7ca70b4bc00f5e13b0caff5976ce7ee7 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 10 Apr 2024 10:41:39 -0400 Subject: [PATCH 15/19] Fix Sparsity Logs on FSDP Model Save (#2203) * fix for reporting sparsity of FSDP models * docstrings * quality --- src/sparseml/pytorch/utils/sparsification.py | 22 +++++-- .../transformers/compression/config/base.py | 32 +++++++--- .../compression/utils/compress_save.py | 8 +-- .../transformers/finetune/session_mixin.py | 61 +++++++++++-------- src/sparseml/utils/fsdp/helpers.py | 19 +++--- 5 files changed, 92 insertions(+), 50 deletions(-) diff --git a/src/sparseml/pytorch/utils/sparsification.py b/src/sparseml/pytorch/utils/sparsification.py index 72ede1c6be4..f22750c85c6 100644 --- a/src/sparseml/pytorch/utils/sparsification.py +++ 
b/src/sparseml/pytorch/utils/sparsification.py @@ -26,6 +26,7 @@ Iterable, Iterator, List, + Optional, Tuple, Union, ) @@ -57,13 +58,22 @@ class ModuleSparsificationInfo: and quantization :param module: torch Module to analyze + :param state_dict: optional state_dict to analyze in place of the torch model. This + is used when analyzing an FSDP model, where the full weights may not be accessible """ - def __init__(self, module: Module): + def __init__( + self, module: Module, state_dict: Optional[Dict[str, torch.Tensor]] = None + ): self.module = module - self.trainable_params = list( - filter(lambda param: param.requires_grad, self.module.parameters()) - ) + self.state_dict = state_dict + + if self.state_dict is not None: + self.trainable_params = [param for _, param in state_dict.items()] + else: + self.trainable_params = list( + filter(lambda param: param.requires_grad, self.module.parameters()) + ) def __str__(self): return json.dumps( @@ -124,7 +134,9 @@ def params_prunable_sparse(self) -> int: """ return sum( round(tensor_sparsity(layer.weight).item() * torch.numel(layer.weight)) - for (name, layer) in get_prunable_layers(self.module) + for (name, layer) in tqdm( + get_prunable_layers(self.module), desc="Calculating model sparsity" + ) ) @property diff --git a/src/sparseml/transformers/compression/config/base.py b/src/sparseml/transformers/compression/config/base.py index a642676c5a4..071a8718f5a 100644 --- a/src/sparseml/transformers/compression/config/base.py +++ b/src/sparseml/transformers/compression/config/base.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Dict, Optional from pydantic import BaseModel +from torch import Tensor from torch.nn import Module import sparseml.core.session as session_manager @@ -40,14 +41,19 @@ class CompressionConfig(RegistryMixin, BaseModel): sparsity_structure: Optional[str] = "unstructured" @staticmethod - def infer_global_sparsity(model: Module) -> float: + def infer_global_sparsity( + model: Module, state_dict: Optional[Dict[str, Tensor]] = None + ) -> float: """ Calculates the global percentage of sparse zero weights in the model :param model: pytorch model to infer sparsity of + :param state_dict: optional state_dict to replace that in model, used for + gathering global FSDP model info :return: global sparsity of model """ - info = ModuleSparsificationInfo(model) + + info = ModuleSparsificationInfo(model, state_dict=state_dict) global_sparsity = info.params_sparse_percent return global_sparsity @@ -75,17 +81,23 @@ def infer_sparsity_structure() -> str: @staticmethod def infer_config_from_model( - model: Module, compress: bool = False + model: Module, + state_dict: Optional[Dict[str, Tensor]] = None, + compress: bool = False, ) -> Optional["CompressionConfig"]: """ Determines compression type and informational parameters for a given model :param model: pytorch model to calculate sparsity config for + :param state_dict: optional state_dict to replace that in model, used for + gathering global FSDP model info :param compress: whether or not to compress the model on disk :return: compression config inferred from the model """ - global_sparsity = CompressionConfig.infer_global_sparsity(model) + global_sparsity = CompressionConfig.infer_global_sparsity( + model, state_dict=state_dict + ) if global_sparsity < 0.05: return None @@ -102,11 +114,17 @@ def infer_config_from_model( sparsity_structure=sparsity_structure, ) - def 
fill_config_details(self, model: Module): + def fill_config_details( + self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None + ): """ Fills in informational sparsity parameters from a given model :param model: pytorch model to infer config parameters from + :param state_dict: optional state_dict to replace that in model, used for + gathering global FSDP model info """ - self.global_sparsity = CompressionConfig.infer_global_sparsity(model) + self.global_sparsity = CompressionConfig.infer_global_sparsity( + model, state_dict=state_dict + ) self.sparsity_structure = CompressionConfig.infer_sparsity_structure() diff --git a/src/sparseml/transformers/compression/utils/compress_save.py b/src/sparseml/transformers/compression/utils/compress_save.py index 3013449b67d..96315fd1685 100644 --- a/src/sparseml/transformers/compression/utils/compress_save.py +++ b/src/sparseml/transformers/compression/utils/compress_save.py @@ -74,6 +74,8 @@ def save_pretrained_wrapper( :param kwargs: additional kwargs to pass on to model.save_pretrained """ model = model_ref() + # state_dict gets passed in as a kwarg for FSDP models + state_dict = kwargs.get("state_dict", None) if qat_active(model): _LOGGER.info( @@ -86,7 +88,7 @@ def save_pretrained_wrapper( ) if sparsity_config is not None: - sparsity_config.fill_config_details(model) + sparsity_config.fill_config_details(model, state_dict=state_dict) elif not skip_compression_stats: # try to infer a sparsity config from the model if none is provided _LOGGER.info( @@ -96,7 +98,7 @@ def save_pretrained_wrapper( "skip_compression_stats=True" ) sparsity_config = CompressionConfig.infer_config_from_model( - model, compress=save_compressed + model, state_dict=state_dict, compress=save_compressed ) if sparsity_config is None: @@ -111,8 +113,6 @@ def save_pretrained_wrapper( sparsity_config.format, config=sparsity_config ) - # state_dict gets passed in as a kwarg for FSDP models - state_dict = kwargs.get("state_dict", None) if state_dict is None: state_dict = model.state_dict() diff --git a/src/sparseml/transformers/finetune/session_mixin.py b/src/sparseml/transformers/finetune/session_mixin.py index 3971b1c0a02..a696fc02a6c 100644 --- a/src/sparseml/transformers/finetune/session_mixin.py +++ b/src/sparseml/transformers/finetune/session_mixin.py @@ -371,11 +371,7 @@ def train(self, *args, stage: Optional[str] = None, **kwargs): self.accelerator.wait_for_everyone() # log model sparsity - with summon_full_params_context(self.model, offload_to_cpu=True): - if self.accelerator.is_main_process: - if not qat_active(self.model): - self.log_model_sparsification() - + self.maybe_log_model_sparsification() self.accelerator.wait_for_everyone() return output @@ -433,11 +429,7 @@ def one_shot(self, calib_data: DataLoader, stage: Optional[str] = None): ) # log model sparsity - with summon_full_params_context(self.model, offload_to_cpu=True): - if self.accelerator.is_main_process: - if not qat_active(self.model): - self.log_model_sparsification() - + self.maybe_log_model_sparsification() self.accelerator.wait_for_everyone() def save_model( @@ -479,17 +471,36 @@ def save_model( if not self.recipe: return - # save recipe, will contain modifiers from the model's original recipe as well - # as those added from self.recipe - recipe_path = os.path.join(output_dir, RECIPE_FILE_NAME) - session = session_manager.active_session() - recipe_yaml_str = session.get_serialized_recipe() - with open(recipe_path, "w") as fp: - fp.write(recipe_yaml_str) + if self.accelerator.is_main_process: + 
# save recipe, will contain modifiers from the model's original recipe as + # well as those added from self.recipe + recipe_path = os.path.join(output_dir, RECIPE_FILE_NAME) + session = session_manager.active_session() + recipe_yaml_str = session.get_serialized_recipe() + with open(recipe_path, "w") as fp: + fp.write(recipe_yaml_str) + + _LOGGER.info(f"Saved SparseML recipe with model state to {recipe_path}") - _LOGGER.info(f"Saved SparseML recipe with model state to {recipe_path}") self.accelerator.wait_for_everyone() + def maybe_log_model_sparsification(self): + """ + Log info on model sparsity and quantization if possible. Only print logs on the + main process, and avoid logging for quantized FSDP models + """ + with summon_full_params_context(self.model, offload_to_cpu=True): + # offload to avoid OOM errors + if not self.accelerator.is_main_process: + # only calculate stats rank0 GPU + return + if self.is_fsdp_enabled and qat_active(self.model): + # due to state dict changes we can't log sparsity info with quantized + # models in FSDP + return + + self.log_model_sparsification() + def log_model_sparsification(self): """ Log the current model sparsification info including pruned and quantized states @@ -499,18 +510,16 @@ def log_model_sparsification(self): _LOGGER.info( f"Sparsification info for {self.model_state_path}: " f"{sparsification_info.params_total} total params. " - f"Of those there are {sparsification_info.params_prunable_total} prunable " + ) + _LOGGER.info( + f"There are {sparsification_info.params_prunable_total} prunable " f"params which have {sparsification_info.params_prunable_sparse_percent} " "avg sparsity." ) - model_type = ( - "sparse" - if sparsification_info.params_prunable_sparse_percent > 5 - else "dense" - ) _LOGGER.info( - f"{model_type} model detected, " - f"all sparsification info: {sparsification_info}" + f"There are {sparsification_info.params_quantizable} quantizable " + f"params, with a quantization percentage of " + f"{sparsification_info.params_quantized_percent}." 
) def _prepare_model_for_fsdp(self): diff --git a/src/sparseml/utils/fsdp/helpers.py b/src/sparseml/utils/fsdp/helpers.py index d2def7fef39..2a0606b6cd6 100644 --- a/src/sparseml/utils/fsdp/helpers.py +++ b/src/sparseml/utils/fsdp/helpers.py @@ -163,14 +163,17 @@ def save_pretrained_fsdp( ): state_dict = accelerator.get_state_dict(model, unwrap=False) - accelerator.unwrap_model(model).save_pretrained( - output_dir, - is_main_process=accelerator.is_main_process, - save_function=accelerator.save, - state_dict=state_dict, - save_compressed=save_compressed, - safe_serialization=save_safetensors, - ) + if accelerator.is_main_process: + accelerator.unwrap_model(model).save_pretrained( + output_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + state_dict=state_dict, + save_compressed=save_compressed, + safe_serialization=save_safetensors, + ) + + accelerator.wait_for_everyone() def get_fsdp_parent(layer_name: str, model: Module) -> Optional[Module]: From 8de19e3da7ad6be0a3c8e798942ba409776c62ed Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 10 Apr 2024 10:41:51 -0400 Subject: [PATCH 16/19] Refactor Initialized Model Export (#2224) * initial commit * cleaning up interface * style * fixing unit tests --------- Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> --- src/sparseml/export/export.py | 26 ++++++++++++++----- src/sparseml/export/helpers.py | 12 ++++++++- src/sparseml/integration_helper_functions.py | 24 ++++++++++++++++- .../pytorch/torch_to_onnx_exporter.py | 3 ++- .../integration_helper_functions.py | 4 +-- src/sparseml/transformers/utils/helpers.py | 16 ------------ .../test_generative_transformers.py | 23 ++++------------ 7 files changed, 63 insertions(+), 45 deletions(-) diff --git a/src/sparseml/export/export.py b/src/sparseml/export/export.py index 05469f2a792..920feef0c2e 100644 --- a/src/sparseml/export/export.py +++ b/src/sparseml/export/export.py @@ -90,6 +90,7 @@ def export( source_path: Union[Path, str] = None, target_path: Union[Path, str, None] = None, model: Optional["torch.nn.Module"] = None, # noqa F401 + tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa F401 onnx_model_name: str = ONNX_MODEL_NAME, deployment_target: str = "deepsparse", opset: Optional[int] = None, @@ -134,11 +135,9 @@ def export( will default to source_path :param model: The PyTorch model to export. If provided, the source_path should be set to None to avoid potential confusion and entaglement - of sources. This means that, the full - export logic will not be enforced (e.g. the final deployment directory - will not be complete, it will not be possible to run validate_structure - method or apply some optimizations that require complete deployment - directory structure) + of sources + :param tokenizer: An optional tokenizer to export if passing in a source through + the model argument. This argument takes no effect if a source_path is provided :param onnx_model_name: The name of the exported model. Defaults to ONNX_MODEL_NAME. 
:param deployment_target: The deployment target to export @@ -184,6 +183,7 @@ def export( from sparseml.export.validators import validate_structure as validate_structure_ from sparseml.integration_helper_functions import ( IntegrationHelperFunctions, + remove_past_key_value_support_from_config, resolve_integration, ) from sparseml.pytorch.opset import TORCH_DEFAULT_ONNX_OPSET @@ -206,8 +206,18 @@ def export( source_path = process_source_path(source_path) if target_path is None: target_path = source_path + if tokenizer is not None: + _LOGGER.warning( + "Passed a tokenizer is not supported when exporting from ", + "a source path. The tokenizer will be ignored. ", + ) + + if model is not None and hasattr(model, "config"): + model.config = remove_past_key_value_support_from_config(model.config) - integration = resolve_integration(source_path, integration) + integration = resolve_integration( + source_path=source_path, source_model=model, integration=integration + ) _LOGGER.info(f"Starting export for {integration} model...") if target_path is None: @@ -262,6 +272,8 @@ def export( session_manager.active_session().reset() _LOGGER.info("Creating data loader for the export...") + if tokenizer is not None: + loaded_model_kwargs["tokenizer"] = tokenizer data_loader, loaded_data_loader_kwargs = helper_functions.create_data_loader( model=model, task=task, @@ -323,6 +335,8 @@ def export( deployment_folder_dir = create_deployment_folder( source_path=source_path, + source_config=getattr(model, "config", None), + source_tokenizer=tokenizer, target_path=target_path, deployment_directory_name=deployment_directory_name, deployment_directory_files_mandatory=helper_functions.deployment_directory_files_mandatory, # noqa: E501 diff --git a/src/sparseml/export/helpers.py b/src/sparseml/export/helpers.py index 4a07a5886d2..5c067f7b24c 100644 --- a/src/sparseml/export/helpers.py +++ b/src/sparseml/export/helpers.py @@ -115,6 +115,8 @@ def create_deployment_folder( target_path: Union[Path, str], deployment_directory_files_mandatory: List[str], source_path: Union[Path, str, None] = None, + source_config: Optional["PreTrainedConfig"] = None, # noqa F401 + source_tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa F401 deployment_directory_files_optional: Optional[List[str]] = None, deployment_directory_name: str = "deployment", onnx_model_name: Optional[str] = None, @@ -135,6 +137,8 @@ def create_deployment_folder( The files will be copied to target_path/deployment_directory_name. :param source_path: The path to the source folder (where the original model files are stored) + :param source_config: Optional Hugging Face config to copy to deployment dir + :param source_tokenizer: Optional Hugging Face tokenizer to copy to deployment dir :param deployment_directory_files_mandatory: The mandatory list of files to copy to the deployment directory. If the file is an ONNX model (or ONNX data file), the file will be copied from target_path. 
@@ -161,10 +165,16 @@ def create_deployment_folder( deployment_folder_dir=deployment_folder_dir, onnx_model_name=onnx_model_name, ) + if source_path is None: + # exporting an instantiated model + if source_config is not None: + source_config.save_pretrained(deployment_folder_dir) + if source_tokenizer is not None: + source_tokenizer.save_pretrained(deployment_folder_dir) return deployment_folder_dir - # copy the relevant files from source_path + # exporting from a source path, copy the relevant files to deployment directory for file_name in deployment_directory_files_mandatory: copy_mandatory_deployment_files( file_name, source_path, target_path, onnx_model_name, deployment_folder_dir diff --git a/src/sparseml/integration_helper_functions.py b/src/sparseml/integration_helper_functions.py index dcd7ea922ef..cf8cbcdd690 100644 --- a/src/sparseml/integration_helper_functions.py +++ b/src/sparseml/integration_helper_functions.py @@ -37,16 +37,18 @@ class Integrations(Enum): def resolve_integration( source_path: Union[Path, str, None] = None, + source_model: Optional["PreTrainedModel"] = None, # noqa F401 integration: Optional[str] = None, ) -> str: """ Resolve the integration to use. - If integration is not provided, attempt to infer it from the source_path. + If integration is not provided, attempt to infer it from the source_path or model. Once the integration is resolved, perform the hot import to register the integration helper functions. :param source_path: The path to the PyTorch model to export. + :param source_model: An instantiated model to export :param integration: Optional name of the integration to use. If not provided, will attempt to infer it from the source_path. :return: The name of the integration to use for exporting the model. @@ -54,6 +56,10 @@ def resolve_integration( integration = integration or _infer_integration_from_source_path(source_path) + # attempt to infer transformers based on model attribute + if source_model is not None and hasattr(source_model, "config_class"): + integration = Integrations.transformers.value + if integration == Integrations.image_classification.value: import sparseml.pytorch.image_classification.integration_helper_functions # noqa F401 @@ -72,6 +78,22 @@ def resolve_integration( ) +def remove_past_key_value_support_from_config(config): + """ + Modify config of the causal language model so that it turns off the + past key value support. This means that the model initialized from + this config will not take past key values as input and will not output + past key values. 
+ """ + # not take past_key_values as input + config.is_decoder = True + # whether to use past key values an input + config.use_past = False + # whether to output past key values + config.use_cache = False + return config + + def _infer_integration_from_source_path( source_path: Union[Path, str, None] = None ) -> Optional[str]: diff --git a/src/sparseml/pytorch/torch_to_onnx_exporter.py b/src/sparseml/pytorch/torch_to_onnx_exporter.py index 7e24f7cc3d7..608b5833900 100644 --- a/src/sparseml/pytorch/torch_to_onnx_exporter.py +++ b/src/sparseml/pytorch/torch_to_onnx_exporter.py @@ -129,7 +129,8 @@ def remove_leftover_files(self): torch_onnx_export_transform, _TorchOnnxExport ), "Expected the first transform from self.transform to be _TorchOnnxExport" for file in torch_onnx_export_transform.leftover_files: - os.remove(file) + if os.path.exists(file): + os.remove(file) class _TorchOnnxExport(BaseTransform): diff --git a/src/sparseml/transformers/integration_helper_functions.py b/src/sparseml/transformers/integration_helper_functions.py index 31f3a4505a3..e6f997b2b36 100644 --- a/src/sparseml/transformers/integration_helper_functions.py +++ b/src/sparseml/transformers/integration_helper_functions.py @@ -24,6 +24,7 @@ from sparseml.integration_helper_functions import ( IntegrationHelperFunctions, Integrations, + remove_past_key_value_support_from_config, ) from sparseml.transformers.finetune.data.data_helpers import format_calibration_data from sparseml.transformers.utils.helpers import ( @@ -34,7 +35,6 @@ OPTIONAL_DEPLOYMENT_FILES, TaskNames, create_fake_dataloader, - remove_past_key_value_support_from_config, resolve_sequence_length, ) from sparseml.transformers.utils.initializers import ( @@ -115,7 +115,7 @@ def create_data_loader( data_args: Optional[Dict[str, Any]] = None, config: Optional["AutoConfig"] = None, # noqa F821 source_path: Optional[str] = None, - sequence_length: Optional[int] = None, + sequence_length: int = 384, tokenizer: Optional["AutoTokenizer"] = None, # noqa F821 dataset_with_labels: bool = False, **kwargs, diff --git a/src/sparseml/transformers/utils/helpers.py b/src/sparseml/transformers/utils/helpers.py index 07463d355e0..944d0bd32ff 100644 --- a/src/sparseml/transformers/utils/helpers.py +++ b/src/sparseml/transformers/utils/helpers.py @@ -99,22 +99,6 @@ class TaskNames(Enum): RELEVANT_HF_SUFFIXES = ["json", "md", "bin", "safetensors", "yaml", "yml", "py"] -def remove_past_key_value_support_from_config(config: AutoConfig) -> AutoConfig: - """ - Modify config of the causal language model so that it turns off the - past key value support. This means that the model initialized from - this config will not take past key values as input and will not output - past key values. 
- """ - # not take past_key_values as input - config.is_decoder = True - # whether to use past key values an input - config.use_past = False - # whether to output past key values - config.use_cache = False - return config - - def is_transformer_model(source_path: Union[Path, str]) -> bool: """ :param source_path: The path to the model diff --git a/tests/sparseml/export/transformers/test_generative_transformers.py b/tests/sparseml/export/transformers/test_generative_transformers.py index 79be506e6d6..7f5f7d1c844 100644 --- a/tests/sparseml/export/transformers/test_generative_transformers.py +++ b/tests/sparseml/export/transformers/test_generative_transformers.py @@ -23,10 +23,7 @@ from huggingface_hub import snapshot_download from sparseml import export -from sparseml.transformers import SparseAutoConfig, SparseAutoModelForCausalLM -from sparseml.transformers.utils.helpers import ( - remove_past_key_value_support_from_config, -) +from sparseml.transformers import SparseAutoModelForCausalLM, SparseAutoTokenizer @pytest.mark.parametrize( @@ -49,21 +46,11 @@ def test_export_initialized_model_no_source_path(self, setup): # export the transformer model, that is being passed to the # `export` API directly as an object source_path, target_path, task = setup - config = remove_past_key_value_support_from_config( - SparseAutoConfig.from_pretrained(source_path) - ) export( - model=SparseAutoModelForCausalLM.from_pretrained( - source_path, config=config - ), + model=SparseAutoModelForCausalLM.from_pretrained(source_path), + tokenizer=SparseAutoTokenizer.from_pretrained(source_path), target_path=target_path, - integration="transformers", sequence_length=384, - # we need to disable applying kv cache injection - # because the script does not have access to the - # config.json (we are not creating a full deployment - # directory during the export) - graph_optimizations="none", task=task, validate_correctness=True, num_export_samples=2, @@ -73,11 +60,11 @@ def test_export_initialized_model_no_source_path(self, setup): ) assert (target_path / "deployment" / "model.onnx").exists() assert not (target_path / "deployment" / "model.data").exists() - # assert that kv cache injection has not been applied + # check if kv cache injection has been applied onnx_model = onnx.load( str(target_path / "deployment" / "model.onnx"), load_external_data=False ) - assert not any( + assert any( inp.name == "past_key_values.0.key" for inp in onnx_model.graph.input ) From 038ad808c7abc5f1d1d714a6c12596b72bb66b05 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 10 Apr 2024 11:02:08 -0400 Subject: [PATCH 17/19] Update OneShot Test Pathways (#2228) * update oneshot tests to use new pathways * typo * update structure arg --------- Co-authored-by: Dipika Sikka --- .../finetune/test_alternate_recipe.yaml | 3 +- ...sparsity.yaml => additional_sparsity.yaml} | 3 +- tests/sparseml/transformers/obcq/quant.yaml | 40 ++++++++ .../{test_tiny.yaml => quant_and_sparse.yaml} | 3 +- .../{test_tiny_w_head.yaml => sparse.yaml} | 3 +- tests/sparseml/transformers/obcq/test_obcq.py | 97 ++++++------------- .../transformers/obcq/test_repeats.py | 93 ++++++++++-------- .../transformers/obcq/test_tiny2.yaml | 3 +- 8 files changed, 122 insertions(+), 123 deletions(-) rename tests/sparseml/transformers/obcq/{test_additional_sparsity.yaml => additional_sparsity.yaml} (89%) create mode 100644 tests/sparseml/transformers/obcq/quant.yaml rename tests/sparseml/transformers/obcq/{test_tiny.yaml => quant_and_sparse.yaml} (97%) rename 
tests/sparseml/transformers/obcq/{test_tiny_w_head.yaml => sparse.yaml} (89%) diff --git a/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml b/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml index f49f56351d3..411d6a41fed 100644 --- a/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml +++ b/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml @@ -6,8 +6,7 @@ test_oneshot_stage: sequential_update: False quantize: False percdamp: 0.01 - prunen: 0 - prunem: 0 + mask_structure: "0:0" targets: [ "model.layers.0" ] diff --git a/tests/sparseml/transformers/obcq/test_additional_sparsity.yaml b/tests/sparseml/transformers/obcq/additional_sparsity.yaml similarity index 89% rename from tests/sparseml/transformers/obcq/test_additional_sparsity.yaml rename to tests/sparseml/transformers/obcq/additional_sparsity.yaml index 4615625675f..19d479e8666 100644 --- a/tests/sparseml/transformers/obcq/test_additional_sparsity.yaml +++ b/tests/sparseml/transformers/obcq/additional_sparsity.yaml @@ -6,8 +6,7 @@ test_stage: sequential_update: True quantize: False percdamp: 0.01 - prunen: 0 - prunem: 0 + mask_structure: "0:0" targets: [ "model.layers.0" ] diff --git a/tests/sparseml/transformers/obcq/quant.yaml b/tests/sparseml/transformers/obcq/quant.yaml new file mode 100644 index 00000000000..d229cba2923 --- /dev/null +++ b/tests/sparseml/transformers/obcq/quant.yaml @@ -0,0 +1,40 @@ +test_stage: + obcq_modifiers: + SmoothQuantModifier: + smoothing_strength: 0.5 + mappings: [ + [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"], + [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"] + ] + QuantizationModifier: + ignore: + - LlamaRotaryEmbedding + - LlamaRMSNorm + - SiLU + - model.layers.0.mlp.down_proj + - model.layers.1.mlp.down_proj + - model.layers.2.mlp.down_proj + - model.layers.3.mlp.down_proj + - model.layers.4.mlp.down_proj + - model.layers.5.mlp.down_proj + scheme_overrides: + Embedding: + input_activations: null + weights: + num_bits: 8 + symmetric: False + SparseGPTModifier: + sparsity: 0.0 + block_size: 128 + sequential_update: False + quantize: True + percdamp: 0.01 + mask_structure: "0:0" + targets: [ + "model.layers.0", + "model.layers.1", + "model.layers.2", + "model.layers.3", + "model.layers.4", + "model.layers.5" + ] \ No newline at end of file diff --git a/tests/sparseml/transformers/obcq/test_tiny.yaml b/tests/sparseml/transformers/obcq/quant_and_sparse.yaml similarity index 97% rename from tests/sparseml/transformers/obcq/test_tiny.yaml rename to tests/sparseml/transformers/obcq/quant_and_sparse.yaml index 422baf87580..ddaf20b854f 100644 --- a/tests/sparseml/transformers/obcq/test_tiny.yaml +++ b/tests/sparseml/transformers/obcq/quant_and_sparse.yaml @@ -30,8 +30,7 @@ test_stage: sequential_update: False quantize: True percdamp: 0.01 - prunen: 0 - prunem: 0 + mask_structure: "0:0" targets: [ "model.layers.0", "model.layers.1", diff --git a/tests/sparseml/transformers/obcq/test_tiny_w_head.yaml b/tests/sparseml/transformers/obcq/sparse.yaml similarity index 89% rename from tests/sparseml/transformers/obcq/test_tiny_w_head.yaml rename to tests/sparseml/transformers/obcq/sparse.yaml index a5debe7e25e..3b03ff95f7e 100644 --- a/tests/sparseml/transformers/obcq/test_tiny_w_head.yaml +++ b/tests/sparseml/transformers/obcq/sparse.yaml @@ -6,8 +6,7 @@ test_stage: sequential_update: False quantize: False percdamp: 0.01 - prunen: 0 - prunem: 0 + mask_structure: "0:0" targets: [ "model.layers.0", 
"model.layers.1", diff --git a/tests/sparseml/transformers/obcq/test_obcq.py b/tests/sparseml/transformers/obcq/test_obcq.py index 12cbb5e6240..f61ac1c2567 100644 --- a/tests/sparseml/transformers/obcq/test_obcq.py +++ b/tests/sparseml/transformers/obcq/test_obcq.py @@ -16,96 +16,48 @@ import pytest import torch -from transformers import AutoTokenizer from sparseml.core import ModifiableModel from sparseml.core.framework import Framework from sparseml.core.state import State from sparseml.modifiers.obcq import SparseGPTModifier from sparseml.modifiers.obcq.pytorch import SparseGPTModifierPyTorch -from sparseml.modifiers.obcq.utils.helpers import ppl_eval_general +from sparseml.pytorch.model_load.helpers import get_session_model from sparseml.pytorch.utils.helpers import tensor_sparsity -from sparseml.transformers import SparseAutoModelForCausalLM -from sparseml.transformers.finetune.data import TextGenerationDataset -from sparseml.transformers.finetune.data.data_args import DataTrainingArguments -from sparseml.transformers.finetune.data.data_helpers import format_calibration_data -from sparseml.transformers.sparsification.obcq.obcq import one_shot -from sparseml.transformers.sparsification.obcq.utils.helpers import llama_forward -from sparseml.transformers.utils.helpers import resolve_sequence_length -from sparseml.transformers.utils.initializers import ( - initialize_config, - initialize_sparse_model, -) +from sparseml.transformers import SparseAutoModelForCausalLM, oneshot @pytest.mark.parametrize( "recipe_file_path", [ - "tests/sparseml/transformers/obcq/test_tiny.yaml", - "tests/sparseml/transformers/obcq/test_tiny2.yaml", - "tests/sparseml/transformers/obcq/test_tiny_w_head.yaml", + "tests/sparseml/transformers/obcq/sparse.yaml", + "tests/sparseml/transformers/obcq/quant.yaml", + "tests/sparseml/transformers/obcq/quant_and_sparse.yaml", ], ) def test_obcq_tinystories(recipe_file_path): tiny_model_path = "Xenova/llama2.c-stories15M" device = "cuda:0" - num_samples = 64 - dataset = "open_platypus" if not torch.cuda.is_available(): device = "cpu" - config = initialize_config(model_path=tiny_model_path) - # test recipe with 50% sparsity, quantization and smoothquant - tiny_model = one_shot( - model_path=tiny_model_path, - dataset=dataset, - num_samples=num_samples, - device=device, - recipe_file=recipe_file_path, - ) - - data_args = DataTrainingArguments( - dataset=dataset, - max_seq_length=resolve_sequence_length(config), - num_calibration_samples=num_samples, - concatenate_data=False, + oneshot( + model=tiny_model_path, + dataset="open_platypus", + oneshot_device=device, + recipe=recipe_file_path, + max_seq_length=128, + num_calibration_samples=64, pad_to_max_length=False, ) - tokenizer = AutoTokenizer.from_pretrained( - tiny_model_path, use_fast=True, trust_remote_code=True - ) - dataset_manager = TextGenerationDataset.load_from_registry( - dataset, data_args=data_args, split="train", tokenizer=tokenizer - ) - raw_dataset = dataset_manager.get_raw_dataset() - tokenized_dataset = dataset_manager.tokenize_and_process(raw_dataset) - test_data = format_calibration_data( - tokenized_dataset=tokenized_dataset, num_calibration_samples=num_samples - ) - test_data = [d["input_ids"] for d in test_data] - perplexity = ppl_eval_general( - llama_forward, tiny_model, test_data, device, max_samples_per_iteration=8 - ) - - # we aren't expecting good results from this tiny model, but this should catch any - # egregious errors with the OBCQ algorithm - assert perplexity < 10000.0 - def 
test_lm_head_target(): tiny_model_path = "Xenova/llama2.c-stories15M" device = "cuda:0" if not torch.cuda.is_available(): device = "cpu" - - config = initialize_config(model_path=tiny_model_path) - model = initialize_sparse_model( - model_path=tiny_model_path, - device=device, - task="text-generation", - config=config, - ) + model = SparseAutoModelForCausalLM.from_pretrained(tiny_model_path) kwargs = { "sparsity": 0.5, @@ -140,25 +92,30 @@ def test_lm_head_target(): def test_sparsities(): tiny_model_path = "Xenova/llama2.c-stories15M" - lm_head_recipe = "tests/sparseml/transformers/obcq/test_tiny_w_head.yaml" + recipe = "tests/sparseml/transformers/obcq/sparse.yaml" device = "cuda:0" if not torch.cuda.is_available(): device = "cpu" # test recipe with 50% sparsity, quantization and smoothquant - tiny_model = one_shot( - model_path=tiny_model_path, + oneshot( + model=tiny_model_path, dataset="open_platypus", - num_samples=64, - device=device, - recipe_file=lm_head_recipe, + oneshot_device=device, + recipe=recipe, + max_seq_length=128, + num_calibration_samples=64, + pad_to_max_length=False, + clear_sparse_session=False, ) - lm_head_sparsity = tensor_sparsity(tiny_model.lm_head.weight) + model = get_session_model() + + lm_head_sparsity = tensor_sparsity(model.lm_head.weight) assert math.isclose(lm_head_sparsity.item(), 0.3, rel_tol=1e-4) - layer_1_sparse = tensor_sparsity(tiny_model.model.layers[1].self_attn.k_proj.weight) + layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight) assert math.isclose(layer_1_sparse.item(), 0.3, rel_tol=1e-4) - layer_2_dense = tensor_sparsity(tiny_model.model.layers[2].self_attn.k_proj.weight) + layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight) assert math.isclose(layer_2_dense.item(), 0.0, rel_tol=1e-4) diff --git a/tests/sparseml/transformers/obcq/test_repeats.py b/tests/sparseml/transformers/obcq/test_repeats.py index d4b2d2ee5a0..f7267ac3d4d 100644 --- a/tests/sparseml/transformers/obcq/test_repeats.py +++ b/tests/sparseml/transformers/obcq/test_repeats.py @@ -19,8 +19,9 @@ import yaml import sparseml.core.session as session_manager +from sparseml.pytorch.model_load.helpers import get_session_model from sparseml.pytorch.utils.helpers import tensor_sparsity -from sparseml.transformers.sparsification.obcq.obcq import one_shot +from sparseml.transformers import oneshot from sparseml.utils.pytorch import qat_active @@ -32,22 +33,23 @@ def test_consecutive_runs(tmp_path): tiny_model_path = "Xenova/llama2.c-stories15M" - first_recipe = "tests/sparseml/transformers/obcq/test_tiny.yaml" - second_recipe = "tests/sparseml/transformers/obcq/test_additional_sparsity.yaml" + first_recipe = "tests/sparseml/transformers/obcq/quant_and_sparse.yaml" + second_recipe = "tests/sparseml/transformers/obcq/additional_sparsity.yaml" device = "cuda:0" if not torch.cuda.is_available(): device = "cpu" # test recipe with 50% sparsity, quantization and smoothquant - first_tiny_model = one_shot( - model_path=tiny_model_path, + oneshot( + model=tiny_model_path, dataset="open_platypus", - num_samples=16, - device=device, - recipe_file=first_recipe, - deploy_dir=tmp_path / "test1", - do_save=True, + num_calibration_samples=16, + recipe=first_recipe, + output_dir=tmp_path / "test1", + oneshot_device=device, + clear_sparse_session=False, ) + first_tiny_model = get_session_model() layer_0_sparse = tensor_sparsity( first_tiny_model.model.layers[0].self_attn.k_proj.module.weight ) @@ -61,15 +63,17 @@ def test_consecutive_runs(tmp_path): 
session.reset() # reload saved model and up sparsity to 0.7 - second_tiny_model = one_shot( - model_path=tmp_path / "test1" / "obcq_deployment", + oneshot( + model=tmp_path / "test1", dataset="open_platypus", - num_samples=16, - device=device, - recipe_file=second_recipe, - deploy_dir=tmp_path / "test2", - do_save=True, + num_calibration_samples=16, + recipe=second_recipe, + output_dir=tmp_path / "test2", + oneshot_device=device, + clear_sparse_session=False, ) + + second_tiny_model = get_session_model() layer_0_sparse = tensor_sparsity( second_tiny_model.model.layers[0].self_attn.k_proj.module.weight ) @@ -81,7 +85,7 @@ def test_consecutive_runs(tmp_path): stages = [stage.group for stage in session_recipe.stages] assert len(stages) == 2 - recipe_path = tmp_path / "test2" / "obcq_deployment" / "recipe.yaml" + recipe_path = tmp_path / "test2" / "recipe.yaml" recipe_data = yaml.safe_load(recipe_path.read_text()) stage_keys = recipe_data.keys() assert len(stage_keys) == 2 @@ -119,14 +123,14 @@ def test_fail_on_repeated_quant(tmp_path): if not torch.cuda.is_available(): device = "cpu" - one_shot( - model_path=tiny_model_path, + oneshot( + model=tiny_model_path, dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=first_recipe_str, - deploy_dir=tmp_path, - do_save=True, + num_calibration_samples=4, + oneshot_device=device, + recipe=first_recipe_str, + output_dir=tmp_path / "test", + clear_sparse_session=False, ) session = session_manager.active_session() @@ -135,12 +139,12 @@ def test_fail_on_repeated_quant(tmp_path): # When trying to re-quantize with the second recipe, we should error out # to avoid nested quantizations with pytest.raises(RuntimeError): - one_shot( - model_path=tmp_path / "obcq_deployment", + oneshot( + model=tmp_path / "test", dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=second_recipe_str, + num_calibration_samples=4, + oneshot_device=device, + recipe=second_recipe_str, ) @@ -182,17 +186,17 @@ def test_separate_quants_allowed(tmp_path): if not torch.cuda.is_available(): device = "cpu" - first_model = one_shot( - model_path=tiny_model_path, + oneshot( + model=tiny_model_path, dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=first_recipe_str, - deploy_dir=tmp_path, - do_save=True, + num_calibration_samples=16, + recipe=first_recipe_str, + output_dir=tmp_path / "test1", + oneshot_device=device, + clear_sparse_session=False, ) - # only embedding quantized after first recipe + first_model = get_session_model() assert not isinstance( first_model.model.layers[0].mlp.down_proj, torch_quantization.QuantWrapper ) @@ -202,14 +206,17 @@ def test_separate_quants_allowed(tmp_path): # When trying to re-quantize with the second recipe, we should error out # to avoid nested quantizations - second_model = one_shot( - model_path=tmp_path / "obcq_deployment", + oneshot( + model=tmp_path / "test1", dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=second_recipe_str, + num_calibration_samples=16, + recipe=second_recipe_str, + output_dir=tmp_path / "test2", + oneshot_device=device, + clear_sparse_session=False, ) + second_model = get_session_model() # linear and embeddings should be quantized now assert isinstance( second_model.model.layers[0].mlp.down_proj, torch_quantization.QuantWrapper diff --git a/tests/sparseml/transformers/obcq/test_tiny2.yaml b/tests/sparseml/transformers/obcq/test_tiny2.yaml index ca3c9e8b4c9..f513b7e0c4f 100644 --- a/tests/sparseml/transformers/obcq/test_tiny2.yaml +++ 
b/tests/sparseml/transformers/obcq/test_tiny2.yaml @@ -6,8 +6,7 @@ test_stage: sequential_update: False quantize: False percdamp: 0.01 - prunen: 0 - prunem: 0 + mask_structure: "0:0" targets: [ "model.layers.0", "model.layers.1", From a7778bbb7a5b69044660f011d6cf1f895f1876ac Mon Sep 17 00:00:00 2001 From: Jeannie Finks <74554921+jeanniefinks@users.noreply.github.com> Date: Wed, 10 Apr 2024 13:07:47 -0400 Subject: [PATCH 18/19] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0b8934b61c1..140ae6176e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -77,7 +77,7 @@ For documentation edits, include: ## Question or Problem -Sign up or log in to our [**Neural Magic Community Slack**](https://join.slack.com/t/discuss-neuralmagic/shared_invite/zt-q1a1cnvo-YBoICSIw3L1dmQpjBeDurQ). We are growing the community member by member and happy to see you there. Don’t forget to search through existing discussions to avoid duplication! Thanks! +Sign up or log in to our [**Neural Magic Community Slack**](https://neuralmagic.com/community/). We are growing the community member by member and happy to see you there. Don’t forget to search through existing discussions to avoid duplication! Thanks! ## Developing SparseML From 45b6533b3e64d62ea7020746b3a3636bb4ae75d7 Mon Sep 17 00:00:00 2001 From: Rob Greenberg <100797996+rgreenberg1@users.noreply.github.com> Date: Thu, 11 Apr 2024 07:38:06 -0400 Subject: [PATCH 19/19] Updated logo in README.md (#2238) --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b15ec4f30fe..e55d2b6145b 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. --> -

<h1><img alt="tool icon" src="..." />&nbsp;SparseML</h1>
+<h1>
+  <img alt="tool icon" src="..." />
+  &nbsp;SparseML
+</h1>
 
 Libraries for applying sparsification recipes to neural networks with a few lines of code, enabling faster and smaller models