
Commit 9e1ab29

Update on "scale_grads with foreach + compile"
Scaling gradients with foreach ops, and also compiling the scaling step. tokens_per_second improves from 9 to 9.8 (max value over the first 10 iterations). It helps tokens_per_second overall; for Llama4 the parameters are on CPU, so the wins here are modest, but they should be larger when the parameters are on GPU.

```
tune run --nproc_per_node 8 \
  full_finetune_distributed \
  --config recipes/configs/llama4/scout_17B_16E_full.yaml
```

PS: compilation on the current repo fails when setting skip_rope_interval=4; it has to be tested with skip_rope_interval=None.

[ghstack-poisoned]
2 parents b0a279c + bd7584b commit 9e1ab29
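For context, the idea is to replace a per-parameter Python loop over gradients with a single foreach-style in-place multiply, which can then be wrapped in torch.compile. Below is a hypothetical before/after sketch of that idea; the function names are illustrative, not torchtune APIs, and the foreach version assumes all gradients live on the same device as the scaler.

```python
# Hypothetical sketch of the optimization described in the commit message.
import torch
from torch import nn


def scale_grads_loop(model: nn.Module, scaler: torch.Tensor) -> None:
    # Baseline: one small in-place multiply per parameter (one kernel launch each).
    for p in model.parameters():
        if p.grad is not None:
            p.grad.mul_(scaler)


def scale_grads_foreach(model: nn.Module, scaler: torch.Tensor) -> None:
    # Foreach version: a single fused multiply over the whole gradient list.
    # Assumes all gradients are on the same device as `scaler`.
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    if grads:
        torch._foreach_mul_(grads, scaler)


# The scaling step can additionally be wrapped in torch.compile, which is what
# the new `scale_grads` compile flag in the recipe toggles.
scale_grads_compiled = torch.compile(scale_grads_foreach)
```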


2 files changed: +3 −1 lines changed


recipes/full_finetune_distributed.py

+1-1
@@ -318,7 +318,7 @@ def setup(self, cfg: DictConfig) -> None:
         self._compile_model = compile.get("model", True)
         self._compile_loss = compile.get("loss", True)
         self._compile_optimizer_step = compile.get("optimizer_step", False)
-        self._compile_scale_grads = compile_components.get("scale_grads", True)
+        self._compile_scale_grads = compile.get("scale_grads", True)

         # This indirection is needed to apply torch.compile to scale_grads step.
         self._grad_scaler = training.scale_grads_
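The indirection through `self._grad_scaler` lets the recipe later swap in a compiled version of the scaling step when the flag is enabled. A hedged sketch of how that wiring could look (attribute names mirror the diff above; `self`, `torch`, and `training` are assumed to be in scope in the recipe, and the actual recipe code may differ):

```python
# Hypothetical wiring, not the exact recipe code: compile the scaling step once
# if the `scale_grads` compile flag is set, then call it every optimizer step.
if self._compile_scale_grads:
    self._grad_scaler = torch.compile(training.scale_grads_)
else:
    self._grad_scaler = training.scale_grads_
```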

torchtune/training/_grad_scaler.py

+2
@@ -11,8 +11,10 @@
 from torch import nn, Tensor
 from torch.nn.utils.clip_grad import _no_grad, _tensor_or_tensors
 from torch.utils._foreach_utils import _device_has_foreach_support, _has_foreach_support
+from torchtune.utils._logging import deprecated


+@deprecated(msg="Please use `scale_grads_` instead.")
 def scale_grads(model: nn.Module, scaler: torch.Tensor) -> None:
     """
     Utility to scale the gradients of a model.
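The imports in this file suggest the new `scale_grads_` follows the same device/dtype-grouping pattern as `torch.nn.utils.clip_grad_norm_`. Below is a hedged sketch of that pattern, not the actual torchtune implementation; `scale_grads_sketch` is an illustrative name, and the helpers used are the private utilities already imported in the diff above.

```python
# Sketch of a foreach-based, in-place gradient scaler that groups gradients by
# (device, dtype), mirroring torch.nn.utils.clip_grad_norm_. Illustrative only.
import torch
from torch import nn
from torch.utils._foreach_utils import (
    _device_has_foreach_support,
    _group_tensors_by_device_and_dtype,
    _has_foreach_support,
)


@torch.no_grad()
def scale_grads_sketch(model: nn.Module, scaler: torch.Tensor) -> None:
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    if not grads:
        return
    # Group gradients so each (device, dtype) bucket is scaled with a single
    # fused foreach kernel.
    grouped = _group_tensors_by_device_and_dtype([grads])
    for (device, _), ([device_grads], _) in grouped.items():
        if _has_foreach_support(device_grads, device) or _device_has_foreach_support(device):
            torch._foreach_mul_(device_grads, scaler.to(device))
        else:
            # Fallback for devices without foreach support: scale one by one.
            for g in device_grads:
                g.mul_(scaler.to(device))
```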
