@@ -10,8 +10,9 @@
     run_tests,
 )
 from torch.testing._internal.optests import opcheck
-from torchao.utils import is_fbcode, TORCH_VERSION_AT_LEAST_2_5
+from torchao.utils import is_fbcode, TORCH_VERSION_AT_LEAST_2_5, compute_max_diff
 from torchao.prototype.quant_llm import from_scaled_tc_fpx
+from torchao.sparsity.marlin import marlin_24_workspace, pack_to_marlin_24, inject_24
 import pytest
 
 if is_fbcode():
@@ -22,12 +23,6 @@
 except RuntimeError:
     pytest.skip("torchao.ops not available")
 
-from torchao.sparsity.utils import mask_creator
-from torchao.sparsity.marlin import (
-    pack_to_sparse_marlin_24,
-    marlin_24_mm,
-    fp16_to_int4_marlin_format
-)
 from torchao.quantization.utils import (
     get_groupwise_affine_qparams,
     groupwise_affine_dequantize_tensor_from_qparams,
@@ -309,139 +304,117 @@ def test_dequantize_tensor_core_tiled_layout_op(shape, inner_k_tiles, group_size
     )
 
 
-class SparseMarlin24(TestCase):
-    TILES = 16
+MARLIN_24_K_CHUNKS = [128]
+MARLIN_24_N_CHUNKS = [512]
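+# Each tuple is (m_factor, n_factor, k_factor); the test builds size_m = m_factor,
+# size_k = k_chunk * k_factor, size_n = n_chunk * n_factor.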
+MNK_FACTORS = [
+    (1, 1, 1),
+    (1, 4, 8),
+    (1, 7, 5),
+    (13, 17, 67),
+    (26, 37, 13),
+    (67, 13, 11),
+]
+MARLIN_24_SUPPORTED_NUM_BITS = [4, 8]
+MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128]
 
-    def _op_check(self, inputs, sparse_w_int4, meta, scales, workspace, thread_k, thread_m, sms=-1, max_par=16):
-        out = torch.empty((inputs.size(0), scales.size(1)), dtype=inputs.dtype, device=inputs.device)
+MARLIN_TEST_PARAMS = list(itertools.product(
+    MARLIN_24_K_CHUNKS, MARLIN_24_N_CHUNKS, MARLIN_24_SUPPORTED_NUM_BITS,
+    MARLIN_24_SUPPORTED_GROUP_SIZES, MNK_FACTORS
+))
 
-        prob_n = inputs.size(0)
-        prob_m = out.size(1)
-        prob_k = inputs.size(1)
-        group_size = -1 if scales.size(0) == 1 else int(prob_k / 2 / scales.size(0))
-        device = torch.cuda.current_device()
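+# Reference symmetric quantizer used by test_marlin_24 below; returns the
+# dequantized fp16 reference weight, the quantized integer weight, and the per-group scales.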
+def _symmetric_quantize_with_ref(w: torch.Tensor, num_bits: int, group_size: int):
+    orig_device = w.device
+    size_k, size_n = w.shape
 
-        test_utils = ["test_schema", "test_autograd_registration", "test_faketensor", "test_aot_dispatch_dynamic"]
-        opcheck(
-            torch.ops.torchao.marlin_24_mm,
-            (
-                inputs, sparse_w_int4, meta, out, scales, prob_m, prob_n, prob_k,
-                workspace, group_size, device, thread_k, thread_m, sms, max_par
-            ),
-            test_utils=test_utils,
-        )
-
-    def _gen_values(self, m, n, k, group_size):
-        maxq = 2**4 - 1
-        inputs = torch.randn((n, k), dtype=torch.half, device="cuda")
-        w = torch.randn((m, k), dtype=torch.half, device="cuda")
-
-        w = w.t()
-        if group_size != -1:
-            w = w.reshape((-1, group_size, m))
-            w = w.permute(1, 0, 2)
-            w = w.reshape((group_size, -1))
+    assert w.is_floating_point(), "w must be float"
 
-        scales = torch.max(torch.abs(w), 0, keepdim=True)[0]
-        scales *= 2 / maxq
+    if group_size == -1:
+        group_size = size_k
+    assert group_size <= size_k
 
-        w = torch.round(w / scales).int()
-        w += (maxq + 1) // 2
-        w = torch.clamp(w, 0, maxq)
+    max_q_val = 2**num_bits - 1
+    half_q_val = (max_q_val + 1) // 2
 
-        w_fp16 = (w - (maxq + 1) // 2).half() * scales
-        scales = scales.reshape((-1, m)).contiguous()
+    # Reshape to [groupsize, -1]
+    if group_size < size_k:
+        w = w.reshape((-1, group_size, size_n))
+        w = w.permute(1, 0, 2)
+        w = w.reshape((group_size, -1))
 
-        if group_size != -1:
+    # Compute scale for each group
+    s = torch.max(torch.abs(w), 0, keepdim=True)[0]
+    s *= 2 / max_q_val # 2 => symmetric
 
-            def reshape(w):
-                w = w.reshape((group_size, -1, m))
-                w = w.permute(1, 0, 2)
-                w = w.reshape((k, m)).contiguous()
-                return w
+    # Quantize
+    q_w = torch.round(w / s).int()
+    q_w += half_q_val
+    q_w = torch.clamp(q_w, 0, max_q_val)
 
-            w_fp16 = reshape(w_fp16)
-            w = reshape(w)
-
-        mask = mask_creator(w.T).cuda().bool()
-        sparse_w_fp16_ref = (mask * w_fp16.T).T
+    # Compute ref (dequantized)
+    w_ref = (q_w - half_q_val).half() * s
 
-        return inputs, sparse_w_fp16_ref, w_fp16, scales
+    # Restore original shapes
+    if group_size < size_k:
 
-    def _run_problem(self, m, n, k, thread_k, thread_m, group_size=-1):
-        inputs, sparse_w_fp16_ref, w_fp16, scales = self._gen_values(m, n, k, group_size)
-        out_ref = torch.matmul(inputs, sparse_w_fp16_ref)
+        def reshape_w(w):
+            w = w.reshape((group_size, -1, size_n))
+            w = w.permute(1, 0, 2)
+            w = w.reshape((size_k, size_n)).contiguous()
+            return w
 
-        # If no groupsize is provided, we assume it is the same as the in_features of the weights
-        # https://github.com/IST-DASLab/Sparse-Marlin/blob/c2ffa2395a3ada26c8cb7f910a5ec65bd3ce288a/marlin/__init__.py#L290
-        if group_size == -1:
-            group_size = k
+        q_w = reshape_w(q_w)
+        w_ref = reshape_w(w_ref)
 
-        w_int4, scales = fp16_to_int4_marlin_format(w_fp16, scales, group_size)
-        sparse_w_int4, scales, meta = pack_to_sparse_marlin_24(w_int4, scales, self.TILES)
+    s = s.reshape((-1, size_n)).contiguous()
 
-        workspace = torch.zeros(m // 128 * 16, device="cuda", dtype=torch.int32)
-        out = marlin_24_mm(inputs, sparse_w_int4, meta, scales, workspace, thread_k, thread_m, -1)
-        torch.cuda.synchronize()
+    return (
+        w_ref.to(device=orig_device),
+        q_w.to(device=orig_device),
+        s.to(device=orig_device),
+    )
 
-        self.assertLess(
-            torch.mean(torch.abs(out - out_ref)) / torch.mean(torch.abs(out_ref)), 0.002
-        )
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.parametrize("k_chunk, n_chunk, num_bits, group_size, mnk_factors", MARLIN_TEST_PARAMS, ids=str)
+def test_marlin_24(k_chunk, n_chunk, num_bits, group_size, mnk_factors):
+    m_factor, n_factor, k_factor = mnk_factors
 
-        # TODO(diogo): Enable this check once I understand how to make `out` mutable
-        # self._op_check(inputs, sparse_w_int4, meta, scales, workspace, thread_k, thread_m)
+    size_m = m_factor
+    size_k = k_chunk * k_factor
+    size_n = n_chunk * n_factor
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_correctness(self):
-        self._run_problem(256, 16, 256, 128, 128, -1)
-        self._run_problem(21504, 16, 4096, 64, 256, 128)
+    a_input = torch.randn((size_m, size_k), dtype=torch.float16, device="cuda")
+    b_weight = torch.rand((size_k, size_n), dtype=torch.float16, device="cuda")
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_tiles(self):
-        for m in [1, 2, 4, 8, 12, 16, 32, 64]:
-            for thread_k, thread_n in [(64, 256), (128, 128)]:
-                if m > 16 and thread_k == 128:
-                    continue
-                self._run_problem(2 * 256, m, 1024, thread_k, thread_n)
+    # Inject 2:4 sparsity
+    w_24, _ = inject_24(b_weight, size_k, size_n)
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_k_stages_divisibility(self):
-        for k in [3 * 64 + 64 * 4 * 2 + 64 * i for i in range(1, 4)]:
-            self._run_problem(2 * 256, 16, k, 64, 256)
+    # Symmetric quantize
+    w_24_ref, q_w_24, scale = _symmetric_quantize_with_ref(w_24, num_bits, group_size)
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_very_few_stages(self):
-        for k in [64, 128, 192]:
-            self._run_problem(3 * 256, 16, k, 64, 256)
+    # Obtains reference output
+    output_ref = torch.matmul(a_input, w_24_ref)
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_llama_shapes(self):
-        MODELS = {
-            " 7B": [(4096, 3 * 4096), (4096, 4096), (4096, 2 * 10752), (10752, 4096)],
-            "13B": [(5120, 3 * 5120), (5120, 5120), (5120, 2 * 13568), (13568, 5120)],
-            "33B": [(6656, 3 * 6656), (6656, 6656), (6656, 2 * 17664), (17664, 6656)],
-            "70B": [(8192, 3 * 8192), (8192, 8192), (8192, 2 * 21760), (21760, 8192)],
-        }
-
-        try:
-            for _, layers in MODELS.items():
-                for layer in layers:
-                    for thread_k, thread_m in [(128, 128)]:
-                        for batch in [16]:
-                            print(layer[1], batch, layer[0])
-                            self._run_problem(layer[1], batch, layer[0], thread_k, thread_m)
-        # If someone runs this on a GPU with less than 24GB of memory, it will run out of memory
-        # but we don't want to fail the test
-        except torch.OutOfMemoryError:
-            pass
+    # Packs to marlin 2:4
+    marlin_24_q_w_comp, marlin_24_scale, meta = pack_to_marlin_24(q_w_24, scale, num_bits, group_size)
+    workspace_24 = marlin_24_workspace(size_n)
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_groups(self):
-        for m in [16]:
-            for groupsize in [128]:
-                for n, k in [(256, 512), (256, 1024), (256 * 128, 1024)]:
-                    for thread_shape in [(128, 128), (64, 256)]:
-                        self._run_problem(n, m, k, *thread_shape, groupsize)
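+    # The trailing three marlin_24_gemm arguments are the GEMM sizes: size_m, size_n, size_k.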
+    fn_inputs = (
+        a_input, marlin_24_q_w_comp, meta, marlin_24_scale, workspace_24,
+        num_bits, a_input.shape[0], b_weight.shape[1], a_input.shape[1],
+    )
+    output = torchao.ops.marlin_24_gemm(*fn_inputs)
+    torch.cuda.synchronize()
+
+    max_diff = compute_max_diff(output, output_ref)
+    assert max_diff < 0.04
+
+    # Performs opcheck
+    test_utils = ["test_schema", "test_autograd_registration", "test_faketensor"]
+    opcheck(
+        torch.ops.torchao.marlin_24_gemm,
+        fn_inputs,
+        test_utils=test_utils,
+    )
 
 
 if __name__ == "__main__":