
Commit e63e578

jwfromm authored and facebook-github-bot committed
Fix templates for FP8 Rowwise Slow Accumulation (#4037)
Summary: X-link: facebookresearch/FBGEMM#1122. It turns out there are a few tile / cluster configurations for FP8 Rowwise Matmul that work fine for fast accumulation but produce bad outputs when used for slow accumulation. Specifically, tile sizes of [128, 256, 128] seem to be problematic. This would not affect any production use cases, since slow accumulation is only used for debugging. Differential Revision: D73805710
1 parent eeee38e commit e63e578

File tree

2 files changed: +27 -11 lines

fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise.cu

+3 -3

@@ -87,7 +87,7 @@ at::Tensor dispatch_fp8_rowwise_kernel(
     } else if (N <= 2048) {
       return f8f8bf16_rowwise_64_128_128_1_1_1_f_f(
           XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
-    } else if (N <= 4096) {
+    } else if (N <= 4096 || use_fast_accum == false) {
       return f8f8bf16_rowwise_64_256_128_1_1_1_f_f(
           XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
     } else {
@@ -98,7 +98,7 @@ at::Tensor dispatch_fp8_rowwise_kernel(
     if (N <= 1024) {
       return f8f8bf16_rowwise_64_128_128_1_1_1_f_f(
           XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
-    } else if (N <= 2048) {
+    } else if (N <= 2048 || use_fast_accum == false) {
       return f8f8bf16_rowwise_64_256_128_1_1_1_f_f(
           XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
     } else {
@@ -109,7 +109,7 @@ at::Tensor dispatch_fp8_rowwise_kernel(
     if (M <= 2048 && N <= 1024) {
       return f8f8bf16_rowwise_64_256_128_2_1_1_f_f(
           XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
-    } else if (K <= 4096) {
+    } else if (K <= 4096 || use_fast_accum == false) {
      return f8f8bf16_rowwise_128_128_128_2_1_1_t_f(
           XQ, WQ, x_scale, w_scale, use_fast_accum, bias, output);
     } else if (M > 8192 && N > 8192) {
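For context, a rough way to exercise the path these hunks change is to call the rowwise op directly with slow accumulation at a shape large enough to reach the patched branches. The sketch below is illustrative only: the exact op schema (torch.ops.fbgemm.f8f8bf16_rowwise with a use_fast_accum keyword), the quantize_fp8_row import path, the shape, and the tolerances are assumptions on my part, not something stated in this commit.

import torch
# Import path assumed from FBGEMM's experimental Triton GEMM helpers.
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row

# Shape chosen so the dispatcher lands in one of the large-N branches patched above
# (shape choice is an assumption; exact routing depends on the heuristics in the diff).
M, N, K = 2048, 8192, 256
x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") * 0.1
w = torch.randn(N, K, dtype=torch.bfloat16, device="cuda") * 0.01

# Rowwise FP8 quantization of both operands.
xq, x_scale = quantize_fp8_row(x)
wq, w_scale = quantize_fp8_row(w)

# bf16 reference result.
z_ref = (x @ w.T).to(torch.bfloat16)

# NOTE: op name/schema assumed (f8f8bf16_rowwise with a use_fast_accum kwarg);
# adjust to the installed FBGEMM build. With this fix, the slow-accumulation
# (debugging) path should track the reference as closely as the fast path does.
z_fast = torch.ops.fbgemm.f8f8bf16_rowwise(xq, wq, x_scale, w_scale, use_fast_accum=True)
z_slow = torch.ops.fbgemm.f8f8bf16_rowwise(xq, wq, x_scale, w_scale, use_fast_accum=False)
torch.testing.assert_close(z_fast, z_ref, atol=9.0e-2, rtol=9.0e-2)
torch.testing.assert_close(z_slow, z_ref, atol=9.0e-2, rtol=9.0e-2)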

fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py

+24 -8

@@ -227,7 +227,7 @@ def test_f8f8bf16(self, kernel: str, use_fast_accum: bool) -> None:
    @given(
        B_T=st.sampled_from([0, 2048, 4096]),
        D=st.sampled_from([128, 256]),
-        HD_L=st.sampled_from([256, 512]),
+        HD_L=st.sampled_from([256, 512, 4096, 8192]),
        Mode=st.sampled_from(
            ["rowwise", "blockwise"]
            + (["tensorwise_broadcast", "tensorwise"] if torch.version.cuda else [])
@@ -236,6 +236,7 @@ def test_f8f8bf16(self, kernel: str, use_fast_accum: bool) -> None:
        Bias=st.sampled_from([True, False]),
        CudaGraph=st.sampled_from([True, False]),
        UseTriton=st.sampled_from([False] + ([True] if torch.version.cuda else [])),
+        UseFastAccum=st.booleans(),
        InputMultiDim=st.booleans(),
    )
    def test_quantize_fp8_matmul(
@@ -248,8 +249,13 @@ def test_quantize_fp8_matmul(
        Bias: bool,
        CudaGraph: bool,
        UseTriton: bool,
+        UseFastAccum: bool,
        InputMultiDim: bool,
    ) -> None:
+        # Fast accumulation is only supported on Nvidia.
+        if torch.version.hip:
+            UseFastAccum = False
+        # Setup input shapes.
        if InputMultiDim and not torch.version.hip:
            x = torch.randn(size=(3, B_T, D), dtype=torch.bfloat16, device="cuda") * 0.1
        else:
@@ -285,12 +291,16 @@ def test_quantize_fp8_matmul(
            if CudaGraph:
                g = torch.cuda.CUDAGraph()
                with torch.cuda.graph(g):
-                    zq = torch.ops.fbgemm.f8f8bf16_tensorwise(xq, wq, x_scale * w_scale)
+                    zq = torch.ops.fbgemm.f8f8bf16_tensorwise(
+                        xq, wq, x_scale * w_scale, use_fast_accum=UseFastAccum
+                    )
                    if bias is not None:
                        zq += bias
                g.replay()
            else:
-                zq = torch.ops.fbgemm.f8f8bf16_tensorwise(xq, wq, x_scale * w_scale)
+                zq = torch.ops.fbgemm.f8f8bf16_tensorwise(
+                    xq, wq, x_scale * w_scale, use_fast_accum=UseFastAccum
+                )
                if bias is not None:
                    zq += bias
        elif Mode == "rowwise":
@@ -299,7 +309,9 @@ def test_quantize_fp8_matmul(
                xq, x_scale = quantize_fp8_row(x)
                wq, w_scale = quantize_fp8_row(w)
                if UseTriton and torch.version.cuda:
-                    zq = matmul_fp8_row(xq, wq, x_scale, w_scale)
+                    zq = matmul_fp8_row(
+                        xq, wq, x_scale, w_scale, fp8_fast_accum=UseFastAccum
+                    )
                g = torch.cuda.CUDAGraph()
                with torch.cuda.graph(g):
                    if torch.version.cuda:
@@ -321,6 +333,7 @@ def test_quantize_fp8_matmul(
                            x_scale,
                            w_scale,
                            bias=bias if torch.version.cuda else None,
+                            use_fast_accum=UseFastAccum,
                        )
                        # Bias fusion not yet supported on AMD.
                        if bias is not None and torch.version.hip:
@@ -336,7 +349,9 @@ def test_quantize_fp8_matmul(
                xq, x_scale = quantize_fp8_row(x)
                wq, w_scale = quantize_fp8_row(w)
                if UseTriton and torch.version.cuda:
-                    zq = matmul_fp8_row(xq, wq, x_scale, w_scale)
+                    zq = matmul_fp8_row(
+                        xq, wq, x_scale, w_scale, fp8_fast_accum=UseFastAccum
+                    )
                    if bias is not None:
                        zq += bias
                else:
@@ -346,6 +361,7 @@ def test_quantize_fp8_matmul(
                        x_scale,
                        w_scale,
                        bias=bias if torch.version.cuda else None,
+                        use_fast_accum=UseFastAccum,
                    )
                    # Bias fusion not yet supported on AMD.
                    if bias is not None and torch.version.hip:
@@ -369,7 +385,7 @@ def test_quantize_fp8_matmul(
                        block_m,
                        block_n,
                        block_k,
-                        fp8_fast_accum=True,
+                        fp8_fast_accum=UseFastAccum,
                    )
                else:
                    zq = torch.ops.fbgemm.f8f8bf16_blockwise(
@@ -393,7 +409,7 @@ def test_quantize_fp8_matmul(
                        block_m,
                        block_n,
                        block_k,
-                        fp8_fast_accum=True,
+                        fp8_fast_accum=UseFastAccum,
                    )
                else:
                    zq = torch.ops.fbgemm.f8f8bf16_blockwise(
@@ -416,7 +432,7 @@ def test_quantize_fp8_matmul(
                        block_m,
                        block_n,
                        block_k,
-                        fp8_fast_accum=True,
+                        fp8_fast_accum=UseFastAccum,
                    )
                else:
                    zq = torch.ops.fbgemm.f8f8bf16_blockwise(
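For the Triton rowwise path, the test threads the same flag through matmul_fp8_row as fp8_fast_accum. A minimal standalone sketch of that path, outside the Hypothesis harness, might look like the following; the import path, the output-dtype handling, and the tolerances are my assumptions, not taken from this diff.

import torch
# Import path assumed; matmul_fp8_row and quantize_fp8_row are the helpers used by the test above.
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
    matmul_fp8_row,
    quantize_fp8_row,
)

x = torch.randn(2048, 256, dtype=torch.bfloat16, device="cuda") * 0.1
w = torch.randn(4096, 256, dtype=torch.bfloat16, device="cuda") * 0.01
xq, x_scale = quantize_fp8_row(x)
wq, w_scale = quantize_fp8_row(w)

# bf16 reference result.
z_ref = (x @ w.T).to(torch.bfloat16)
# Run with slow accumulation, the configuration this commit adds coverage for.
z_slow = matmul_fp8_row(xq, wq, x_scale, w_scale, fp8_fast_accum=False)
# Cast before comparing, since the kernel's output dtype may differ from bf16.
torch.testing.assert_close(z_slow.to(torch.bfloat16), z_ref, atol=9.0e-2, rtol=9.0e-2)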
