Report TBE data configuration with EEG-based indices estimation (pytorch#4018)

gchalump · facebook-github-bot · commit 5982c6401e68 · 2025-04-24T16:20:13.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1106 - Split out a separate method in the TBEBenchmarkParamsReporter class that extracts the TBE data configuration parameters from the SplitTableBatchedEmbeddingBagsCodegen object and returns them as a TBEDataConfig. - Track whether a report has already been written (has_reported) so that report_once suppresses any subsequent reports. - Add a unit test to verify the extracted TBEDataConfig. Differential Revision: D73450767
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/stats/bench_params_reporter.py b/fbgemm_gpu/fbgemm_gpu/tbe/stats/bench_params_reporter.py
@@ -45,6 +45,7 @@ def __init__(
     ) -> None:
         self.report_interval = report_interval
         self.report_once = report_once
+        self.has_reported = False
 
         default_bucket = "/tmp" if open_source else "tlparse_reports"
         bucket = (
@@ -149,7 +150,9 @@ def report_stats(
             per_sample_weights (Optional[Tensor]): Input per
                 sample weights
         """
-        if embedding_op.iter.item() % self.report_interval == 0:
+        if embedding_op.iter.item() % self.report_interval == 0 and (
+            not self.report_once or (self.report_once and not self.has_reported)
+        ):
             # Extract TBE config
             config = self.extract_params(
                 embedding_op, indices, offsets, per_sample_weights
@@ -160,3 +163,5 @@ def report_stats(
                 f"tbe-{embedding_op.uuid}-config-estimation-{embedding_op.iter.item()}.json",
                 io.BytesIO(config.json(format=True).encode()),
             )
+
+            self.has_reported = True
diff --git a/fbgemm_gpu/test/tbe/stats/tbe_bench_params_reporter_test.py b/fbgemm_gpu/test/tbe/stats/tbe_bench_params_reporter_test.py
@@ -8,17 +8,14 @@
 # pyre-strict
 
 import unittest
-from unittest.mock import MagicMock, patch
 
-import torch
-from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType, SparseType
-from fbgemm_gpu.split_table_batched_embeddings_ops_common import (
-    EmbeddingLocation,
-    PoolingMode,
-)
+import hypothesis.strategies as st
 
+import torch
+from fbgemm_gpu.split_table_batched_embeddings_ops_common import EmbeddingLocation
 from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
     ComputeDevice,
+    get_available_compute_device,
     SplitTableBatchedEmbeddingBagsCodegen,
 )
 from fbgemm_gpu.tbe.bench import (
@@ -29,83 +26,90 @@
 )
 from fbgemm_gpu.tbe.stats import TBEBenchmarkParamsReporter
 from fbgemm_gpu.tbe.utils import get_device
+from hypothesis import given, settings
 
 
 class TestTBEBenchmarkParamsReporter(unittest.TestCase):
-    @patch("fbgemm_gpu.utils.FileStore")  # Mock FileStore
+    # pyre-ignore[56]
+    @given(
+        T=st.integers(1, 10),
+        E=st.integers(100, 10000),
+        D=st.sampled_from([32, 64, 128, 256]),
+        L=st.integers(1, 10),
+        B=st.integers(20, 100),
+    )
+    @settings(max_examples=1, deadline=None)
     def test_report_stats(
         self,
-        mock_filestore: MagicMock,  # Mock FileStore
+        T: int,
+        E: int,
+        D: int,
+        L: int,
+        B: int,
     ) -> None:
+        """Test that the reporter extracts a TBEDataConfig from the embedding operation and request that matches the original configuration."""
 
+        # Generate a TBEDataConfig
         tbeconfig = TBEDataConfig(
-            T=2,
-            E=1024,
-            D=32,
-            mixed_dim=True,
+            T=T,
+            E=E,
+            D=D,
+            mixed_dim=False,
             weighted=False,
-            batch_params=BatchParams(B=512),
+            batch_params=BatchParams(B=B),
             indices_params=IndicesParams(
                 heavy_hitters=torch.tensor([]),
                 zipf_q=0.1,
                 zipf_s=0.1,
                 index_dtype=torch.int64,
                 offset_dtype=torch.int64,
             ),
-            pooling_params=PoolingParams(L=2),
-            use_cpu=True,
+            pooling_params=PoolingParams(L=L),
+            use_cpu=get_available_compute_device() == ComputeDevice.CPU,
         )
 
-        embedding_location = EmbeddingLocation.HOST
+        embedding_location = (
+            EmbeddingLocation.DEVICE
+            if torch.cuda.is_available()
+            else EmbeddingLocation.HOST
+        )
 
+        # Generate the embedding dimension list
         _, Ds = tbeconfig.generate_embedding_dims()
+
+        # Generate the embedding operation
         embedding_op = SplitTableBatchedEmbeddingBagsCodegen(
             [
                 (
                     tbeconfig.E,
                     D,
                     embedding_location,
-                    ComputeDevice.CPU,
+                    ComputeDevice.CUDA if get_device() == "cuda" else ComputeDevice.CPU,
                 )
                 for D in Ds
             ],
-            optimizer=OptimType.EXACT_ROWWISE_ADAGRAD,
-            learning_rate=0.01,
-            weights_precision=SparseType.FP32,
-            pooling_mode=PoolingMode.SUM,
-            output_dtype=SparseType.FP32,
         )
 
         embedding_op = embedding_op.to(get_device())
 
-        requests = tbeconfig.generate_requests(1)
-
         # Initialize the reporter
         reporter = TBEBenchmarkParamsReporter(report_interval=1)
-        # Set the mock filestore as the reporter's filestore
-        reporter.filestore = mock_filestore
 
-        request = requests[0]
+        # Generate indices and offsets
+        request = tbeconfig.generate_requests(1)[0]
+
-        # Call the report_stats method
+        # Extract the TBE data configuration via extract_params
         extracted_config = reporter.extract_params(
             embedding_op=embedding_op,
             indices=request.indices,
             offsets=request.offsets,
         )
 
-        reporter.report_stats(
-            embedding_op=embedding_op,
-            indices=request.indices,
-            offsets=request.offsets,
-        )
-
-        # TODO: This is not working because need more details in initial config
-        # Assert that the reconstructed configuration matches the original
-        # assert (
-        #     extracted_config == tbeconfig
-        # ), "Extracted configuration does not match the original TBEDataConfig"
-
-        # Check if the write method was called on the FileStore
         assert (
-            reporter.filestore.write.assert_called_once
-        ), "FileStore.write() was not called"
+            extracted_config.T == tbeconfig.T
+            and extracted_config.E == tbeconfig.E
+            and extracted_config.D == tbeconfig.D
+            and extracted_config.pooling_params.L == tbeconfig.pooling_params.L
+            and extracted_config.batch_params.B == tbeconfig.batch_params.B
+        ), "Extracted config does not match the original TBEDataConfig"
+        # Attempt to reconstruct TBEDataConfig from extracted_json_config