# pyre-unsafe

import functools
+ import inspect
import logging

from typing import Optional

    for num_ctas in [1]
]

- _NV_WS_CONFIGS = [
-     triton.Config(
-         {
-             "BLOCK_SIZE_M": block_size_m,
-             "BLOCK_SIZE_N": block_size_n,
-             "BLOCK_SIZE_K": block_size_k,
-             "NUM_CONSUMER_GROUPS": max(1, num_consumer_groups),
-             "USE_TMA_LOAD_ON_SCALES": use_tma_load_on_scales,
-             "USE_TMA_STORE": use_tma_store,
-         },
-         num_stages=num_stages,
-         num_warps=num_warps,
-         num_ctas=num_ctas,
-         num_consumer_groups=num_consumer_groups,
-         num_buffers_warp_spec=num_stages,
-     )
-     for block_size_m in [64, 128, 256]
-     for block_size_n in [64, 128, 256]
-     for block_size_k in [64, 128, 256]
-     for num_stages in [2, 3, 4]
-     for num_warps in [4, 8, 16]
-     # TODO(shikaili): Resolve LLVM error.
-     for num_ctas in [1]
-     for num_consumer_groups in [0, 2]
-     for use_tma_load_on_scales in [True, False]
-     # TODO(shikaili): Resolve compatibility with ws.
-     for use_tma_store in [False]
- ]
+ _HAS_WS_SUPPORT = None
+
+
+ def _check_ws_support():
+     if not hasattr(tl, "async_task"):
+         return False
+     config_signature = inspect.signature(triton.Config).parameters
+     if (
+         "num_consumer_groups" not in config_signature
+         or "num_buffers_warp_spec" not in config_signature
+     ):
+         return False
+     if not utils.HAS_TMA_DESC:
+         return False
+     return True
+
+
+ def _set_ws_support():
+     global _HAS_WS_SUPPORT
+     if _HAS_WS_SUPPORT is None:
+         _HAS_WS_SUPPORT = _check_ws_support()
+
+
+ _set_ws_support()
+
+ if _HAS_WS_SUPPORT:
+     _NV_WS_CONFIGS = [
+         triton.Config(
+             {
+                 "BLOCK_SIZE_M": block_size_m,
+                 "BLOCK_SIZE_N": block_size_n,
+                 "BLOCK_SIZE_K": block_size_k,
+                 "NUM_CONSUMER_GROUPS": max(1, num_consumer_groups),
+                 "USE_TMA_LOAD_ON_SCALES": use_tma_load_on_scales,
+                 "USE_TMA_STORE": use_tma_store,
+             },
+             num_stages=num_stages,
+             num_warps=num_warps,
+             num_ctas=num_ctas,
+             num_consumer_groups=num_consumer_groups,
+             num_buffers_warp_spec=num_stages,
+         )
+         for block_size_m in [64, 128, 256]
+         for block_size_n in [64, 128, 256]
+         for block_size_k in [64, 128, 256]
+         for num_stages in [2, 3, 4]
+         for num_warps in [4, 8, 16]
+         # TODO(shikaili): Resolve LLVM error.
+         for num_ctas in [1]
+         for num_consumer_groups in [0, 2]
+         for use_tma_load_on_scales in [True, False]
+         # TODO(shikaili): Resolve compatibility with ws.
+         for use_tma_store in [False]
+     ]
+ else:
+     _NV_WS_CONFIGS = _NV_CONFIGS
+
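The new guard probes the installed Triton build for warp-specialization hooks instead of assuming them: it checks for `tl.async_task`, for the `num_consumer_groups` / `num_buffers_warp_spec` keywords on `triton.Config`, and for TMA descriptor support, and only then builds the warp-specialized autotune grid. The core trick is keyword introspection via `inspect.signature`; a minimal standalone sketch of that detection pattern (the `make_config` callable below is a hypothetical stand-in, not part of this diff):

import inspect

def accepts_kwargs(fn, *names: str) -> bool:
    # True only if `fn` declares every requested keyword parameter.
    params = inspect.signature(fn).parameters
    return all(name in params for name in names)

# Hypothetical stand-in for a Config class from a build without warp-spec support:
def make_config(kwargs, num_stages=3, num_warps=4, num_ctas=1):
    return (kwargs, num_stages, num_warps, num_ctas)

print(accepts_kwargs(make_config, "num_ctas"))                                       # True
print(accepts_kwargs(make_config, "num_consumer_groups", "num_buffers_warp_spec"))   # False -> fall back to _NV_CONFIGS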

_AMD_CONFIGS = [
    triton.Config(
@@ -880,15 +910,16 @@ def _fbgemm_grouped_gemm_fp8_rowwise_ws(


def _grouped_gemm(
+     *,
    x: torch.Tensor,
    w: torch.Tensor,
    m_sizes: torch.Tensor,
-     x_scale: Optional[torch.Tensor] = None,
-     w_scale: Optional[torch.Tensor] = None,
-     use_fast_accum: bool = False,
-     use_warp_specialization: bool = False,
-     output_tensor: Optional[torch.Tensor] = None,
-     scatter_add_indices: Optional[torch.Tensor] = None,
+     x_scale: Optional[torch.Tensor],
+     w_scale: Optional[torch.Tensor],
+     use_fast_accum: bool,
+     use_warp_specialization: bool,
+     output_tensor: Optional[torch.Tensor],
+     scatter_add_indices: Optional[torch.Tensor],
) -> torch.Tensor:

    USE_TMA_LOAD = not torch.version.hip
@@ -902,12 +933,19 @@ def _grouped_gemm(
        USE_TMA_STORE = False
        logging.warning("TMA store is disabled as there is no TMA descriptor support!")

+     # TODO(shikaili): Check the readiness of WS on ROCm side in Meta's Triton.
    if use_warp_specialization and torch.version.hip:
        logging.warning(
            "Warp specialization is disabled as it is not supported on ROCm."
        )
        use_warp_specialization = False

+     if use_warp_specialization and not _HAS_WS_SUPPORT:
+         logging.warning(
+             "Warp specialization is disabled as the Triton build in the current environment doesn't have such support. Please build from https://github.com/facebookexperimental/triton/tree/ws-3.2.x to enable it for best performance on Nvidia's SM90 GPUs."
+         )
+         use_warp_specialization = False
+
    if use_warp_specialization:
        assert utils.HAS_TMA_DESC
        USE_TMA_STORE = True  # Tuning decision
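The two guards give ROCm builds and Triton builds without warp-specialization support the same silent downgrade before a kernel is selected. A small sketch of that decision chain in isolation (the `resolve_warp_specialization` helper is illustrative, not a function in this file):

import logging

def resolve_warp_specialization(requested: bool, is_hip: bool, has_ws_support: bool) -> bool:
    # Mirrors the guard chain above: ROCm first, then a Triton build without
    # warp-specialization hooks; either condition downgrades the request.
    if requested and is_hip:
        logging.warning("Warp specialization is disabled: not supported on ROCm.")
        return False
    if requested and not has_ws_support:
        logging.warning("Warp specialization is disabled: Triton build lacks support.")
        return False
    return requested

assert resolve_warp_specialization(True, is_hip=False, has_ws_support=True) is True
assert resolve_warp_specialization(True, is_hip=True, has_ws_support=True) is False
assert resolve_warp_specialization(True, is_hip=False, has_ws_support=False) is False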
@@ -1063,14 +1101,16 @@ def grouped_gemm(
    m_sizes: torch.Tensor,
    use_fast_accum: bool = True,
    *,
-     _use_warp_specialization: bool = False,
+     _use_warp_specialization: bool = True,
    _output_tensor: Optional[torch.Tensor] = None,
    _scatter_add_indices: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    return _grouped_gemm(
-         x,
-         w,
-         m_sizes,
+         x=x,
+         w=w,
+         m_sizes=m_sizes,
+         x_scale=None,
+         w_scale=None,
        use_fast_accum=use_fast_accum,
        use_warp_specialization=_use_warp_specialization,
        output_tensor=_output_tensor,
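With `_use_warp_specialization` now defaulting to `True`, callers of the public API get the warp-specialized path automatically when `_HAS_WS_SUPPORT` holds and fall back transparently otherwise. A hedged usage sketch (the commented-out import path, group count, shapes, and bf16 dtype are illustrative assumptions, not requirements stated in this diff):

import torch

# Assumed import location; adjust to wherever this module lives in your tree.
# from fbgemm_gpu.experimental.gemm.triton_gemm.grouped_gemm import grouped_gemm

G, M, N, K = 4, 128, 256, 512
x = torch.randn(G * M, K, device="cuda", dtype=torch.bfloat16)
w = torch.randn(G * N, K, device="cuda", dtype=torch.bfloat16)
m_sizes = torch.full((G,), M, device="cuda", dtype=torch.int32)

out = grouped_gemm(x, w, m_sizes)  # warp specialization on by default
out = grouped_gemm(x, w, m_sizes, _use_warp_specialization=False)  # explicit opt-out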
@@ -1086,16 +1126,16 @@ def grouped_gemm_fp8_rowwise(
    w_scale: torch.Tensor,
    use_fast_accum: bool = True,
    *,
-     _use_warp_specialization: bool = False,
+     _use_warp_specialization: bool = True,
    _output_tensor: Optional[torch.Tensor] = None,
    _scatter_add_indices: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    return _grouped_gemm(
-         x,
-         w,
-         m_sizes,
-         x_scale,
-         w_scale,
+         x=x,
+         w=w,
+         m_sizes=m_sizes,
+         x_scale=x_scale,
+         w_scale=w_scale,
        use_fast_accum=use_fast_accum,
        use_warp_specialization=_use_warp_specialization,
        output_tensor=_output_tensor,
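The FP8 rowwise wrapper changes the same way, except the per-row scales are now forwarded by keyword as well. A hedged sketch of a call (scale shapes and the FP8 dtype follow common rowwise-quantization conventions and are assumptions, not guarantees from this diff):

import torch

G, M, N, K = 4, 128, 256, 512
x = torch.randn(G * M, K, device="cuda").to(torch.float8_e4m3fn)
w = torch.randn(G * N, K, device="cuda").to(torch.float8_e4m3fn)
m_sizes = torch.full((G,), M, device="cuda", dtype=torch.int32)
x_scale = torch.rand(G * M, device="cuda", dtype=torch.float32)  # assumed per-row scales for x
w_scale = torch.rand(G * N, device="cuda", dtype=torch.float32)  # assumed per-row scales for w

out = grouped_gemm_fp8_rowwise(x, w, m_sizes, x_scale, w_scale)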