4
4
# This source code is licensed under the BSD-style license found in the
5
5
# LICENSE file in the root directory of this source tree.
6
6
7
+ import argparse
7
8
import itertools
8
- from typing import List , Tuple
9
+ from typing import List , Optional , Tuple
9
10
10
11
import torch
11
12
import triton # noqa: F401
12
13
from fbgemm_gpu .experimental .gen_ai .moe import (
14
+ combine_shuffling ,
13
15
gather_along_first_dim ,
14
16
gather_scale_dense_tokens ,
15
17
gather_scale_quant_dense_tokens ,
16
18
index_shuffling ,
17
19
scatter_add_along_first_dim ,
20
+ split_shuffling ,
18
21
)
19
22
from triton .testing import do_bench , do_bench_cudagraph
20
23
24
# Device used for every benchmark tensor, resolved once at import time via the
# PyTorch accelerator API (returns the current accelerator, e.g. cuda/xpu;
# NOTE(review): may be None on a CPU-only build — confirm benchmarks are
# only run with an accelerator present).
_ACCELERATOR_TAG = torch.accelerator.current_accelerator()
25
+
21
26
22
27
def bench_gather_along_first_dim (M : int , N : int , K : int ) -> None :
23
- src = torch .randn ([M , K ], device = "cuda" , dtype = torch .bfloat16 ).abs ()
28
+ src = torch .randn ([M , K ], device = _ACCELERATOR_TAG , dtype = torch .bfloat16 ).abs ()
24
29
if M == N :
25
- indices = torch .randperm (N , device = "cuda" , dtype = torch .int32 )
30
+ indices = torch .randperm (N , device = _ACCELERATOR_TAG , dtype = torch .int32 )
26
31
else :
27
- indices = torch .randint (0 , M , [N ], device = "cuda" , dtype = torch .int32 )
32
+ indices = torch .randint (0 , M , [N ], device = _ACCELERATOR_TAG , dtype = torch .int32 )
28
33
29
34
def fn ():
30
35
return gather_along_first_dim (src , indices )
@@ -51,12 +56,14 @@ def ref_fn():
51
56
52
57
53
58
def bench_scatter_add_along_first_dim (M : int , N : int , K : int ) -> None :
54
- src = torch .randn ([M , K ], device = "cuda" , dtype = torch .bfloat16 ).abs ()
55
- dst = torch .randn ([N , K ], device = "cuda" , dtype = torch .bfloat16 ).abs ()
59
+ src = torch .randn ([M , K ], device = _ACCELERATOR_TAG , dtype = torch .bfloat16 ).abs ()
60
+ dst = torch .randn ([N , K ], device = _ACCELERATOR_TAG , dtype = torch .bfloat16 ).abs ()
56
61
if M == N :
57
- indices_1d = torch .randperm (N , device = "cuda" , dtype = torch .int64 )
62
+ indices_1d = torch .randperm (N , device = _ACCELERATOR_TAG , dtype = torch .int64 )
58
63
else :
59
- indices_1d = torch .randint (0 , N , [M ], device = "cuda" , dtype = torch .int64 )
64
+ indices_1d = torch .randint (
65
+ 0 , N , [M ], device = _ACCELERATOR_TAG , dtype = torch .int64
66
+ )
60
67
61
68
indices_2d = indices_1d .to (torch .int64 ).unsqueeze (1 ).expand (- 1 , K )
62
69
@@ -88,10 +95,10 @@ def ref_fn():
88
95
89
96
90
97
def bench_gather_scale_dense_tokens (E : int , T : int , D : int , quantize : bool ):
91
- x = torch .randn ((T , D ), dtype = torch .bfloat16 , device = "cuda" ).abs ()
92
- expert_indices = torch .randint (0 , E , (T ,), device = "cuda" )
93
- token_indices = torch .randperm (T , device = "cuda" )
94
- scores = torch .rand ((E , T ), dtype = torch .bfloat16 , device = "cuda" )
98
+ x = torch .randn ((T , D ), dtype = torch .bfloat16 , device = _ACCELERATOR_TAG ).abs ()
99
+ expert_indices = torch .randint (0 , E , (T ,), device = _ACCELERATOR_TAG )
100
+ token_indices = torch .randperm (T , device = _ACCELERATOR_TAG )
101
+ scores = torch .rand ((E , T ), dtype = torch .bfloat16 , device = _ACCELERATOR_TAG )
95
102
96
103
def torch_fn ():
97
104
shuffled_x = torch .index_select (x , dim = 0 , index = token_indices )
@@ -134,12 +141,13 @@ def triton_fn():
134
141
)
135
142
136
143
137
- def bench_top1_index_shuffling (num_tokens : int , num_experts : int ) -> None :
144
+ def bench_top1_index_shuffling (T : int , E : int ) -> None :
138
145
torch .manual_seed (0 )
139
146
147
+ num_rotating_buffers = max (2 , triton .cdiv (1024 * 1024 * 1024 , T * E * 2 ))
140
148
scores_list : List [torch .Tensor ] = [
141
- torch .randn (num_tokens , num_experts , device = "cuda" , dtype = torch .bfloat16 )
142
- for i in range (100 )
149
+ torch .randn (T , E , device = _ACCELERATOR_TAG , dtype = torch .bfloat16 )
150
+ for i in range (num_rotating_buffers )
143
151
]
144
152
145
153
def fn () -> Tuple [torch .Tensor , torch .Tensor , torch .Tensor ]:
@@ -152,39 +160,171 @@ def ref_fn() -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
152
160
expert_indices , _ = torch .sort (selected_expert_indices , dim = 0 )
153
161
_ = (
154
162
expert_indices [:, None ]
155
- == torch .arange (num_experts , device = expert_indices .device )[None , :]
163
+ == torch .arange (E , device = expert_indices .device )[None , :]
156
164
).sum (dim = 0 )
157
165
158
- fbgemm_time = do_bench_cudagraph (fn ) * 1e3 / 100
159
- torch_time = do_bench_cudagraph (ref_fn ) * 1e3 / 100
166
+ fbgemm_time = do_bench_cudagraph (fn ) * 1e3 / num_rotating_buffers
167
+ torch_time = do_bench_cudagraph (ref_fn ) * 1e3 / num_rotating_buffers
160
168
print (
161
- f"Benchmark index_shuffling, num_tokens={ num_tokens :4} , num_experts={ num_experts :4} , "
169
+ f"Benchmark index_shuffling, num_tokens={ T :4} , num_experts={ E :4} , "
162
170
f"fbgemm_time={ fbgemm_time :7.3f} us, torch_time={ torch_time :7.3f} us"
163
171
)
164
172
165
173
166
- def main ():
174
def bench_combine_or_split_shuffling(
    T: int,
    # EP is the expert-parallelism degree (call sites pass 2/16 and the body
    # computes E // EP), so it is an int, not the `bool` the original annotated.
    D: int,
    E: int,
    EP: int,
    is_padded: bool,
    is_balanced: bool,
    is_combine_shuffling: bool,
) -> None:
    """Benchmark the combine_shuffling or split_shuffling kernel.

    Args:
        T: Number of tokens per rank.
        D: Hidden dimension of each token.
        E: Total number of experts; must be divisible by EP.
        EP: Expert-parallelism degree (number of EP ranks).
        is_padded: True models the graph/allgather layout (EP * T input tokens
            spread over all E experts, benchmarking a window of local experts);
            False models the eager/all2all layout (T tokens over the E // EP
            local experts).
        is_balanced: When False, tokens are skewed from the first benchmarked
            expert onto the last one instead of being spread evenly.
        is_combine_shuffling: Select combine_shuffling (True) or
            split_shuffling (False).

    Prints the measured kernel time and effective bandwidth.
    """
    torch.manual_seed(0)

    assert E % EP == 0
    if is_padded:
        # graph. allgather
        input_num_tokens: int = EP * T
        input_num_experts: int = E
        output_num_experts: int = E // EP
        start_expert_index: int = 1
        end_expert_index: int = 1 + output_num_experts
    else:
        # eager. all2all
        input_num_tokens: int = T
        input_num_experts: int = E // EP
        output_num_experts: int = E // EP
        start_expert_index: int = 0
        end_expert_index: int = output_num_experts

    tokens = torch.randn(
        input_num_tokens, D, device=_ACCELERATOR_TAG, dtype=torch.bfloat16
    )

    # Skip shapes whose tokens cannot be spread evenly over (EP, experts).
    # BUGFIX: the original chained comparison
    #   `input_num_tokens < (EP * input_num_experts) != 0`
    # only caught the too-few-tokens case (the `!= 0` half is always true),
    # letting non-divisible counts fall through to a failing sum assert below.
    if input_num_tokens % (EP * input_num_experts) != 0:
        return

    input_num_tokens_per_expert: int = input_num_tokens // (EP * input_num_experts)
    token_counts: torch.Tensor = (
        torch.ones(
            [EP, input_num_experts],
            dtype=torch.int32,
            device=_ACCELERATOR_TAG,
        )
        * input_num_tokens_per_expert
    )
    if not is_balanced:
        # Move the first benchmarked expert's tokens onto the last one so the
        # per-expert distribution is maximally skewed while the total holds.
        for i in range(EP):
            token_counts[i, start_expert_index] -= input_num_tokens_per_expert
            token_counts[i, end_expert_index - 1] += input_num_tokens_per_expert

    assert token_counts.sum().item() == input_num_tokens

    # Rotate through ~1GiB worth of input buffers so repeated runs inside the
    # CUDA graph do not benefit from cache reuse.
    num_rotating_buffers = triton.cdiv(1024 * 1024 * 1024, tokens.numel() * 2)
    token_list: List[torch.Tensor] = [
        tokens.clone() for _ in range(num_rotating_buffers)
    ]
    token_count_list: List[torch.Tensor] = [
        token_counts.clone() for _ in range(num_rotating_buffers)
    ]

    # Outputs are intentionally discarded; only kernel time matters here.
    def fn() -> None:
        for tokens, token_counts in zip(token_list, token_count_list):
            if is_combine_shuffling:
                combine_shuffling(
                    tokens,
                    token_counts,
                    expert_start=start_expert_index,
                    expert_end=end_expert_index,
                    is_balanced=is_balanced,
                )
            else:
                split_shuffling(
                    tokens,
                    token_counts,
                    expert_start=start_expert_index,
                    expert_end=end_expert_index,
                    is_balanced=is_balanced,
                )

    # Warm up (compiles/caches the kernels) before graph capture.
    fn()

    # Count only the tokens that fall inside the benchmarked expert window.
    output_num_tokens = 0
    for per_rank_counts in token_counts.tolist():
        for expert_index, per_expert_counts in enumerate(per_rank_counts):
            if expert_index >= start_expert_index and expert_index < end_expert_index:
                output_num_tokens += per_expert_counts

    # Each moved token is read once and written once in bf16 (2 bytes each).
    mem_bytes = output_num_tokens * D * 2 * 2
    fbgemm_time = do_bench_cudagraph(fn) * 1e3 / num_rotating_buffers
    fbgemm_bw = mem_bytes * 1e-9 / (fbgemm_time * 1e-6)

    print(
        f"Benchmark {'combine_shuffling' if is_combine_shuffling else 'split_shuffling'}, "
        f"num_tokens={T:4}, dim={D:4}, num_experts={E:4}, expert_parallelism={EP:4}, output_num_tokens={output_num_tokens:4}, "
        f"{is_balanced=}, {is_padded=}, "
        f"fbgemm_time={fbgemm_time:7.3f}us, fbgemm_bw={fbgemm_bw:8.3f}GBytes/s."
    )
269
+
270
+
271
def main(kernels: Optional[str]) -> None:
    """Run the MoE kernel benchmark suite.

    Args:
        kernels: Comma-separated kernel names to benchmark, or None to run
            every available kernel.
    """
    selected = None if kernels is None else kernels.split(",")

    def should_bench_kernel(fn) -> bool:
        # Benchmark a kernel only if it was imported successfully and either
        # no filter was given or its name was explicitly requested.
        if fn is None:
            return False
        return selected is None or fn.__name__ in selected

    Es = [16, 128]
    Ts = [1, 128, 2048, 4096, 8192, 16384]
    Ds = [5120]

    # Gather/Scatter
    if should_bench_kernel(gather_scale_dense_tokens):
        for E, T, D in itertools.product(Es, Ts, Ds):
            bench_gather_scale_dense_tokens(E, T, D, quantize=False)

    if should_bench_kernel(gather_scale_quant_dense_tokens):
        for E, T, D in itertools.product(Es, Ts, Ds):
            bench_gather_scale_dense_tokens(E, T, D, quantize=True)

    if should_bench_kernel(gather_along_first_dim):
        for T, D in itertools.product(Ts, Ds):
            bench_gather_along_first_dim(T, T, D)

    if should_bench_kernel(scatter_add_along_first_dim):
        for T, D in itertools.product(Ts, Ds):
            bench_scatter_add_along_first_dim(T, T, D)

    # Shuffling
    if should_bench_kernel(index_shuffling):
        for T, E in itertools.product(Ts, Es):
            bench_top1_index_shuffling(T, E)

    EPs = [2, 16]
    Ts = [32, 128, 2048, 4096, 8192, 16384]
    padded = [True, False]
    balanced = [True, False]

    if should_bench_kernel(combine_shuffling):
        for T, D, E, EP, p, b in itertools.product(Ts, Ds, Es, EPs, padded, balanced):
            bench_combine_or_split_shuffling(
                T, D, E, EP, p, b, is_combine_shuffling=True
            )

    if should_bench_kernel(split_shuffling):
        for T, D, E, EP, p, b in itertools.product(Ts, Ds, Es, EPs, padded, balanced):
            bench_combine_or_split_shuffling(
                T, D, E, EP, p, b, is_combine_shuffling=False
            )
187
320
188
321
189
322
if __name__ == "__main__":
    # CLI entry point: forward the optional kernel-name filter to main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--kernels",
        default=None,
        help="Comma separated list of kernels to benchmark. Defaults to all kernels.",
    )
    main(parser.parse_args().kernels)
0 commit comments