import torch
- from typing import Dict, Callable, Any, Tuple, Optional, Union
+ from typing import Tuple, Optional, Union
+ import torchao.ops
from collections import defaultdict
import functools
import math
logger = logging.getLogger(__name__)

from torchao.float8.inference import Float8MMConfig
- aten = torch.ops.aten
+

###############################
# Base Layout Tensor Subclass #
@@ -489,6 +490,16 @@ class Float8LayoutType(LayoutType):
    mm_config: Optional[Float8MMConfig] = None


+ @dataclass(frozen=True)
+ class MarlinSparseLayoutType(LayoutType):
+
+     # Inject 2:4 sparsity
+     def pre_process(self, input: torch.Tensor) -> torch.Tensor:
+         from torchao.sparsity.marlin import inject_24  # avoid circular import
+         w_24, _ = inject_24(input, *input.shape)
+         return w_24
+
+
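For orientation, a minimal usage sketch of how this layout type would be selected from user code. The `quantize_` / `int4_weight_only` entry points, the `layout_type` keyword, and the `torchao.dtypes` re-export are assumptions based on the rest of torchao and are not part of this diff:

    import torch
    from torchao.quantization import quantize_, int4_weight_only
    from torchao.dtypes import MarlinSparseLayoutType

    # Hypothetical fp16 CUDA model; the sparse marlin linear dispatch below requires float16 activations,
    # and 4096 x 4096 satisfies the divisibility constraints checked in from_plain.
    model = torch.nn.Sequential(torch.nn.Linear(4096, 4096)).half().cuda()

    # pre_process injects 2:4 sparsity, then the weight is packed into the sparse_marlin_24 layout.
    quantize_(model, int4_weight_only(layout_type=MarlinSparseLayoutType()))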
@register_layout_cls(PlainLayoutType)
class PlainAQTLayout(AQTLayout):
    """
@@ -642,6 +653,153 @@ def from_plain(
        return cls(int_data_compressed, scale, zero_point, layout_type)


+ @register_layout_cls(MarlinSparseLayoutType)
+ class MarlinSparseAQTLayout(AQTLayout):
+     """
+     Layout storage class for the sparse_marlin_24 layout of an affine quantized tensor.
+
+     Can be used with 4-bit and 8-bit quantization.
+
+     Original Marlin documentation and information:
+     https://github.com/IST-DASLab/marlin/tree/master
+
+     Sparse Marlin documentation and information:
+     https://github.com/IST-DASLab/Sparse-Marlin?tab=readme-ov-file
+
+     fields:
+         original_shape (torch.Size): the original shape of the tensor, used to unpack it back to that shape
+         group_size (int): the group size used to pack the tensor
+         num_bits (int): the number of bits used to quantize the tensor
+     """
+
+     implements = classmethod(_implements)
+     __torch_dispatch__ = classmethod(_dispatch__torch_dispatch__)
+     __torch_function__ = classmethod(_dispatch__torch_function__)
+
+     def __new__(
+         cls,
+         int_data: torch.Tensor,
+         scale: torch.Tensor,
+         zero_point: torch.Tensor,
+         meta: torch.Tensor,
+         layout_type: LayoutType,
+         original_shape: torch.Size,
+         group_size: int,
+         num_bits: int,
+     ):
+         kwargs = {}
+         kwargs["device"] = int_data.device
+         kwargs["layout"] = (
+             kwargs.get("layout") if kwargs.get("layout", False) else int_data.layout
+         )
+         kwargs["dtype"] = int_data.dtype
+         kwargs["requires_grad"] = False
+         shape = int_data.shape
+         return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
+
+     def __init__(
+         self,
+         int_data: torch.Tensor,
+         scale: torch.Tensor,
+         zero_point: torch.Tensor,
+         meta: torch.Tensor,
+         layout_type: LayoutType,
+         original_shape: torch.Size,
+         group_size: int,
+         num_bits: int,
+     ):
+         self.int_data = int_data
+         self.scale = scale
+         self.zero_point = zero_point
+         self.meta = meta
+         self.layout_type = layout_type
+         self.original_shape = original_shape
+         self.group_size = group_size
+         self.num_bits = num_bits
+
+     def get_plain(self):
+         from torchao.sparsity.marlin import unpack_from_marlin_24  # avoid circular import
+         int_data_expanded, scales_expanded = unpack_from_marlin_24(
+             self.int_data,
+             self.scale,
+             self.meta,
+             self.original_shape,
+             self.group_size,
+             self.num_bits,
+         )
+         return int_data_expanded, scales_expanded, self.zero_point
+
+     @classmethod
+     def from_plain(
+         cls,
+         int_data: torch.Tensor,
+         scale: torch.Tensor,
+         zero_point: torch.Tensor,
+         layout_type: LayoutType,
+     ):
+         from torchao.sparsity.marlin import pack_to_marlin_24, const  # avoid circular import
+         assert isinstance(layout_type, MarlinSparseLayoutType)
+
+         # Linear layers are (in_features, out_features) but the int_data that reaches this point
+         # is (out_features, in_features). We need to transpose it to match the shape expected by the marlin code.
+         # NOTE(reviewers): Please check if this is what I should do.
+         q_w_24 = int_data.t()
+         scale = scale.reshape(-1, q_w_24.shape[1])
+
+         if q_w_24.dtype != torch.int32:
+             raise ValueError("Only `torch.int32` weights are supported.")
+
+         in_features, out_features = q_w_24.shape
+         if in_features % 128 != 0 or out_features % 256 != 0:
+             raise ValueError(
+                 "`in_features` must be divisible by 128 and `out_features` by 256."
+             )
+
+         # NOTE: The current marlin 2:4 kernel supports both 4-bit and 8-bit quantization, but fp8
+         # will require a bit more work to get our current quantization flow to work with it.
+         # Check the link for a reference: https://github.com/neuralmagic/nm-vllm/tree/main
+         num_bits = 4 if torch.max(q_w_24) < 16 else -1
+         if num_bits not in [4]:
+             raise ValueError(
+                 f"Only {[4]} bits are supported, got {num_bits}."
+             )
+
+         group_size = in_features // scale.shape[0]
+         if group_size == 0:
+             group_size = in_features
+         assert group_size <= in_features, "Group size must be less than or equal to in_features."
+
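+         # A quick worked example of the inference above (illustrative numbers only): with
+         # in_features = 4096 and a reshaped scale of shape (32, out_features), group_size
+         # is 4096 // 32 = 128; a single scale row (per-channel quantization) yields
+         # group_size = in_features.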
+         if group_size not in const.SUPPORTED_GROUP_SIZES:
+             raise ValueError(
+                 f"Only {const.SUPPORTED_GROUP_SIZES} group sizes are supported, got {group_size}."
+             )
+
+         # Compress quantized weight to marlin 2:4 format
+         marlin_24_q_w_comp, marlin_24_s, meta = pack_to_marlin_24(q_w_24, scale, num_bits, group_size)
+
+         return cls(
+             marlin_24_q_w_comp, marlin_24_s, zero_point,
+             meta, layout_type, q_w_24.shape,
+             group_size, num_bits
+         )
+
+     def get_layout_type(self) -> LayoutType:
+         return self.layout_type
+
+     def _apply_fn_to_data(self, fn):
+         self.int_data = fn(self.int_data)
+         self.scale = fn(self.scale)
+         self.zero_point = fn(self.zero_point)
+         self.meta = fn(self.meta)
+         return self
+
+
+ # Marlin Sparse op dispatch registration
+ @MarlinSparseAQTLayout.implements(aten.detach.default)
+ def marlin_sparse_detach(func, types, args, kwargs):
+     return return_and_correct_aliasing(func, args, kwargs, args[0]._apply_fn_to_data(torch.detach))
+
+
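Other AQT layout classes in this file register a few more aten ops (for example clone) through the same `implements` hook; a sketch of the analogous registration for this layout, shown only as an illustration of the pattern and not part of this diff:

    @MarlinSparseAQTLayout.implements(aten.clone.default)
    def marlin_sparse_clone(func, types, args, kwargs):
        return return_and_correct_aliasing(func, args, kwargs, args[0]._apply_fn_to_data(torch.clone))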
@register_layout_cls(Float8LayoutType)
class Float8AQTLayout(AQTLayout):
    """
@@ -758,7 +916,7 @@ def __repr__(self):
                f"scale={scale},\n"
                f"transposed={self.transposed}, "
                f"layout_type={layout_type})")
-
+

@register_layout_cls(TensorCoreTiledLayoutType)
class TensorCoreTiledAQTLayout(AQTLayout):
@@ -941,6 +1099,7 @@ def _aqt_is_uint4(aqt):
        aqt.quant_max is None or aqt.quant_max == 15
    )

+

implements = AffineQuantizedTensor.implements

# following are a list of (dispatch_condition, implementation) functions that takes the following args:
@@ -1219,6 +1378,58 @@ def _linear_fp_act_fp8_weight_impl(
    ).reshape(out_shape)


+ def _linear_fp_act_int4_weight_sparse_marlin_check(input_tensor, weight_tensor, bias):
+     return (
+         _aqt_is_uint4(weight_tensor) and
+         input_tensor.dtype == torch.float16 and
+         len(weight_tensor.shape) == 2 and
+         weight_tensor.zero_point_domain == ZeroPointDomain.INT and
+         isinstance(weight_tensor.layout_type, MarlinSparseLayoutType)
+     )
+
+ def _linear_fp_act_int4_weight_sparse_marlin_impl(input_tensor, weight_tensor, bias):
+     from torchao.sparsity.marlin import marlin_24_workspace, const
+
+     sparse_w_int4 = weight_tensor.layout_tensor.int_data
+     scale = weight_tensor.layout_tensor.scale
+     meta = weight_tensor.layout_tensor.meta
+     original_shape = weight_tensor.layout_tensor.original_shape
+     num_bits = weight_tensor.layout_tensor.num_bits
+
+     # Saves the batch size for reshaping back to the original shape after the matmul.
+     # Folds the batch into the row dimension so the input becomes (m, k), where
+     # m = batch * seq_len and k = in_features.
+     # NOTE(reviewers): Please check if I am handling the batch size correctly.
+     batch_size = -1
+     if input_tensor.dim() == 3:
+         batch_size = input_tensor.size(0)
+         input_tensor = input_tensor.reshape(-1, input_tensor.shape[-1]).contiguous()
+
+     size_m = input_tensor.shape[0]
+     size_n = original_shape[1]
+     size_k = input_tensor.shape[1]
+     workspace_24 = marlin_24_workspace(original_shape[1])
+
+     # Pad input_tensor dim 1 to a multiple of the marlin tile size (16)
+     if size_k % const.TILE != 0:
+         pad_size = find_multiple(size_k, const.TILE)
+         input_tensor = torch.nn.functional.pad(input_tensor, (0, pad_size - size_k))
+         size_k = pad_size
+
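+     # For illustration (hypothetical numbers): with const.TILE == 16, an activation with
+     # size_k = 4100 would be right-padded to find_multiple(4100, 16) == 4112 so the
+     # marlin_24_gemm tiling constraint is met; a size_k already divisible by 16 is untouched.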
+     out = torchao.ops.marlin_24_gemm(
+         input_tensor, sparse_w_int4, meta, scale,
+         workspace_24, num_bits, size_m, size_n, size_k
+     )
+     torch.cuda.synchronize()
+
+     # Reshape back to original shape
+     if batch_size != -1:
+         out = out.reshape(batch_size, -1, out.shape[-1])
+
+     if bias is not None:
+         out += bias.to(out.dtype)
+     return out
+
+
def _register_aqt_quantized_linear_dispatches():
    for dispatch_condition, impl in [
        (_linear_int8_act_int8_weight_check, _linear_int8_act_int8_weight_impl),
@@ -1227,6 +1438,7 @@ def _register_aqt_quantized_linear_dispatches():
        (_linear_bf16_act_uint4_weight_check, _linear_bf16_act_uint4_weight_impl),
        (_linear_fp_act_int8_weight_check, _linear_fp_act_int8_weight_impl),
        (_linear_f16_act_fpx_weight_check, _linear_f16_act_fpx_weight_impl),
+         (_linear_fp_act_int4_weight_sparse_marlin_check, _linear_fp_act_int4_weight_sparse_marlin_impl),
    ]:
        register_aqt_quantized_linear_dispatch(dispatch_condition, impl)