
Commit 1be43b6

LoRA + Llama4 (#2583)
1 parent 9a88c16 commit 1be43b6

20 files changed (+1347 −68 lines)

README.md (+1 −1)

@@ -10,7 +10,7 @@
 [**Overview**](#overview-) | [**Installation**](#installation-%EF%B8%8F) | [**Get Started**](#get-started-) | [**Documentation**](https://pytorch.org/torchtune/main/index.html) | [**Community**](#community-) | [**Citing torchtune**](#citing-torchtune-) | [**License**](#license)
 
 ### 📣 Recent updates 📣
-* *April 2025*: **Llama4** is now available in torchtune! Try out our full finetuning configs [here](recipes/configs/llama4) (LoRA coming soon!)
+* *April 2025*: **Llama4** is now available in torchtune! Try out our full and LoRA finetuning configs [here](recipes/configs/llama4)
 * *February 2025*: Multi-node training is officially [open for business in torchtune](https://pytorch.org/torchtune/main/tutorials/multinode.html)! Full finetune on multiple nodes to take advantage of larger batch sizes and models.
 * *December 2024*: torchtune now supports **Llama 3.3 70B**! Try it out by following our installation instructions [here](#installation-%EF%B8%8F), then run any of the configs [here](recipes/configs/llama3_3).
 * *November 2024*: torchtune has released [v0.4.0](https://github.com/pytorch/torchtune/releases/tag/v0.4.0) which includes stable support for exciting features like activation offloading and multimodal QLoRA
recipes/configs/llama4/scout_17B_16E_lora.yaml (new file, +94 lines)

# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Llama4 17Bx16E MoE model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Llama-4-Scout-17B-16E-Instruct
#
# To launch on 8 devices, run the following command from root:
# tune run --nproc_per_node 8 lora_finetune_distributed --config llama4/scout_17B_16E_lora
#
# You can add specific overrides through the command line. For example, to use a larger batch size:
# tune run --nproc_per_node 8 lora_finetune_distributed --config llama4/scout_17B_16E_lora batch_size=8
#
# This config was only tested on an 8xA100 machine.

output_dir: /tmp/torchtune/llama4_17Bx16E/lora

# Modeling Arguments
model:
  _component_: torchtune.models.llama4.lora_llama4_scout_17b_16e
  decoder_trainable: "lora"
  encoder_trainable: "frozen"
  fusion_trainable: "lora"
  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 16  # higher increases accuracy and memory
  lora_alpha: 32  # usually alpha=2*rank
  lora_dropout: 0.0

tokenizer:
  _component_: torchtune.models.llama4.llama4_transform
  path: /tmp/Llama-4-Scout-17B-16E-Instruct/tokenizer.model
  max_seq_len: null
  max_num_tiles: 16

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-4-Scout-17B-16E-Instruct
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    max_filename: "00050"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA4
resume_from_checkpoint: False

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  packed: False
seed: null
shuffle: True

# Training arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 1
gradient_accumulation_steps: 1  # Use to increase effective batch size
optimizer:
  _component_: torch.optim.AdamW
  lr: 2e-5
  fused: False
optimizer_in_bwd: False
lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
clip_grad_norm: null

# cuda, cpu, rocm, xpu...
device: cuda

# Memory management / performance
enable_activation_checkpointing: True
enable_activation_offloading: False
custom_sharded_layers: ['tok_embeddings']
fsdp_cpu_offload: False
compile: False  # torch.compile, set to true for perf/memory improvement

# Reduced precision
dtype: bf16

# Log metrics during training
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

# Useful for understanding how to optimize memory and performance
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False
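The `model:` block above maps directly onto the keyword arguments of the builder named in `_component_`. As a rough sketch of what the recipe ends up doing with it (the actual recipe goes through config instantiation, meta-device init, and FSDP sharding rather than calling the builder directly, so treat the snippet as illustrative and assume the builder accepts exactly the keys shown in the config):

```python
# Minimal sketch: build the LoRA Llama4 Scout model from the same arguments as the
# config's `model:` section and freeze everything except the adapter weights.
import torch

from torchtune.models.llama4 import lora_llama4_scout_17b_16e
from torchtune.modules.peft import get_adapter_params, set_trainable_params

# The full Scout model is far too large for a single device, so construct it on
# the meta device here; the recipe materializes and shards the real weights with FSDP.
with torch.device("meta"):
    model = lora_llama4_scout_17b_16e(
        decoder_trainable="lora",
        encoder_trainable="frozen",
        fusion_trainable="lora",
        lora_attn_modules=["q_proj", "v_proj", "output_proj"],
        apply_lora_to_mlp=True,
        apply_lora_to_output=False,
        lora_rank=16,
        lora_alpha=32,
        lora_dropout=0.0,
    )

# Only the LoRA A/B matrices stay trainable; the base weights remain frozen,
# which is what keeps the memory footprint of finetuning manageable.
set_trainable_params(model, get_adapter_params(model))
```

With rank 16 and alpha 32 the config follows the alpha = 2 * rank rule of thumb noted in its comments; raising the rank increases adapter capacity at the cost of memory.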

recipes/dev/lora_finetune_distributed_multi_dataset.py (+2 −5)

@@ -26,12 +26,11 @@
 from torchtune.data._utils import get_dataloader, get_multi_dataset, load_hf_dataset
 from torchtune.datasets._sft import SFTTransform
 from torchtune.modules.peft import (
-    DoRALinear,
+    AdapterModule,
     get_adapter_params,
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
@@ -495,9 +494,7 @@ def _setup_model(
         with training.set_default_dtype(self._dtype), self._device:
             lora_device = "cpu" if fsdp_cpu_offload else self._device
             for m in model.modules():
-                if (
-                    isinstance(m, LoRALinear) or isinstance(m, DoRALinear)
-                ) and not lora_weights_state_dict:
+                if isinstance(m, AdapterModule) and not lora_weights_state_dict:
                     # lora may not be covered in state dict
                     # if finetune for the 1st time
                     m.to_empty(device=lora_device)
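The same edit repeats across the recipes below: the concrete `LoRALinear` / `DoRALinear` checks are replaced by a single structural check against `AdapterModule`, so new adapter types (such as the MoE `LoRAGroupedExperts` added for Llama4) get device-initialized without touching each recipe again. The sketch below illustrates the pattern with hypothetical toy classes; it assumes `AdapterModule` behaves like a runtime-checkable protocol keyed on an `adapter_params()` method, which is how torchtune's PEFT utilities describe it, so treat the exact definition as an assumption.

```python
# Illustrative sketch of the structural isinstance() check (toy classes, not torchtune's).
from typing import Protocol, runtime_checkable

import torch.nn as nn


@runtime_checkable
class AdapterModule(Protocol):
    """Anything exposing adapter_params() counts as an adapter."""

    def adapter_params(self) -> list[str]:
        ...


class ToyLoRALinear(nn.Linear):
    def adapter_params(self) -> list[str]:
        return ["lora_a.weight", "lora_b.weight"]


class ToyLoRAGroupedExperts(nn.Module):
    def adapter_params(self) -> list[str]:
        return ["lora_a", "lora_b"]


for m in (ToyLoRALinear(4, 4), ToyLoRAGroupedExperts(), nn.Linear(4, 4)):
    # The plain nn.Linear fails the check; both toy adapters pass without the
    # recipe having to enumerate their concrete classes.
    print(type(m).__name__, isinstance(m, AdapterModule))
```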

recipes/knowledge_distillation_distributed.py (+2 −5)

@@ -24,12 +24,11 @@
 from torchtune.data import padded_collate_packed, padded_collate_sft
 from torchtune.datasets import ConcatDataset
 from torchtune.modules.peft import (
-    DoRALinear,
+    AdapterModule,
     get_adapter_params,
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
@@ -478,9 +477,7 @@ def _setup_model(
         with training.set_default_dtype(self._dtype), self._device:
             lora_device = "cpu" if fsdp_cpu_offload else self._device
             for m in model.modules():
-                if (
-                    isinstance(m, LoRALinear) or isinstance(m, DoRALinear)
-                ) and not lora_weights_state_dict:
+                if isinstance(m, AdapterModule) and not lora_weights_state_dict:
                     # lora may not be covered in state dict
                     # if finetune for the 1st time
                     m.to_empty(device=lora_device)

recipes/lora_dpo_distributed.py (+2 −5)

@@ -23,13 +23,12 @@
 from torchtune.data import CROSS_ENTROPY_IGNORE_IDX, padded_collate_dpo
 from torchtune.datasets import ConcatDataset
 from torchtune.modules.peft import (
+    AdapterModule,
     disable_adapter,
-    DoRALinear,
     get_adapter_params,
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
@@ -407,9 +406,7 @@ def _setup_model(
         with training.set_default_dtype(self._dtype), self._device:
             lora_device = "cpu" if fsdp_cpu_offload else self._device
             for m in model.modules():
-                if (
-                    isinstance(m, LoRALinear) or isinstance(m, DoRALinear)
-                ) and not lora_weights_state_dict:
+                if isinstance(m, AdapterModule) and not lora_weights_state_dict:
                     # lora may not be covered in state dict
                     # if finetune for the 1st time
                     m.to_empty(device=lora_device)

recipes/lora_finetune_distributed.py (+2 −5)

@@ -25,12 +25,11 @@
 from torchtune.data import padded_collate_packed
 from torchtune.datasets import ConcatDataset
 from torchtune.modules.peft import (
-    DoRALinear,
+    AdapterModule,
     get_adapter_params,
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
@@ -519,9 +518,7 @@ def _setup_model(
         with training.set_default_dtype(self._dtype), self._device:
             lora_device = "cpu" if fsdp_cpu_offload else self._device
             for m in model.modules():
-                if (
-                    isinstance(m, LoRALinear) or isinstance(m, DoRALinear)
-                ) and not lora_weights_state_dict:
+                if isinstance(m, AdapterModule) and not lora_weights_state_dict:
                     # lora may not be covered in state dict
                     # if finetune for the 1st time
                     m.to_empty(device=lora_device)

recipes/qat_lora_finetune_distributed.py (+2 −4)

@@ -25,12 +25,12 @@
 from torchtune.data import padded_collate_packed
 from torchtune.datasets import ConcatDataset
 from torchtune.modules.peft import (
+    AdapterModule,
     DoRALinear,
     get_adapter_params,
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
@@ -532,9 +532,7 @@ def _setup_model(
         with training.set_default_dtype(self._dtype), self._device:
             lora_device = "cpu" if fsdp_cpu_offload else self._device
             for m in model.modules():
-                if (
-                    isinstance(m, LoRALinear) or isinstance(m, DoRALinear)
-                ) and not lora_weights_state_dict:
+                if isinstance(m, AdapterModule) and not lora_weights_state_dict:
                     # lora may not be covered in state dict
                     # if finetune for the 1st time
                     m.to_empty(device=lora_device)

tests/torchtune/models/llama3_2_vision/test_llama_vision_lora.py (+1 −2)

@@ -10,10 +10,9 @@
 from torchtune.models.llama3_2_vision._component_builders import (
     lora_llama3_2_vision_decoder,
     lora_llama3_2_vision_encoder,
-    LoRATrainable,
 )
 from torchtune.modules.model_fusion import DeepFusionModel
-from torchtune.modules.peft import get_adapter_params
+from torchtune.modules.peft import get_adapter_params, LoRATrainable
 from torchtune.training.seed import set_seed
 
 EMBED_DIM = 128

tests/torchtune/modules/moe/test_experts.py (+104 −1)

@@ -9,9 +9,14 @@
 import torch
 from tests.test_utils import assert_expected, fixed_init_model
 from torch import nn
-from torchtune.modules.moe import GroupedExperts
+from torchtune.modules.moe import GroupedExperts, LoRAGroupedExperts
+from torchtune.modules.peft import LoRALinear
 from torchtune.training.seed import set_seed
 
+RANK = 4
+ALPHA = 1.0
+SEQ_LEN = 32
+
 
 @pytest.fixture(autouse=True)
 def random():
@@ -57,3 +62,101 @@ def test_forward(self, experts, num_tokens_per_expert, dim):
 
         assert out.shape == (16, dim)
         assert_expected(out.mean().item(), 120.8260, atol=1e-3, rtol=1e-3)
+
+
+class TestLoRAGroupedExperts:
+    @pytest.fixture
+    def dim(self) -> int:
+        return 64
+
+    @pytest.fixture
+    def hidden_dim(self) -> int:
+        return 128
+
+    @pytest.fixture
+    def num_experts(self) -> int:
+        return 8
+
+    @pytest.fixture
+    def experts_per_token(self) -> int:
+        return 2
+
+    @pytest.fixture
+    def num_tokens_per_expert(self, num_experts) -> torch.Tensor:
+        return torch.tensor([1, 2, 1, 4, 3, 1, 2, 2], dtype=torch.int)
+
+    @pytest.fixture
+    def inputs(self, dim, num_experts, experts_per_token) -> torch.Tensor:
+        inputs = torch.randn(num_experts * experts_per_token, SEQ_LEN, dim)
+        return inputs
+
+    @pytest.fixture
+    def experts(self, dim, hidden_dim, num_experts) -> nn.Module:
+        experts = GroupedExperts(
+            dim=dim,
+            hidden_dim=hidden_dim,
+            num_experts=num_experts,
+        )
+        fixed_init_model(experts, min_val=-0.1, max_val=0.1)
+        return experts
+
+    @pytest.fixture
+    def lora_experts(self, dim, hidden_dim, num_experts) -> nn.Module:
+        experts = LoRAGroupedExperts(
+            dim=dim,
+            hidden_dim=hidden_dim,
+            num_experts=num_experts,
+            rank=RANK,
+            alpha=ALPHA,
+        )
+        fixed_init_model(experts, min_val=-0.1, max_val=0.1)
+        return experts
+
+    @pytest.fixture
+    def lora_linear(self, dim, hidden_dim):
+        def create_lora_linear(dim=dim, hidden_dim=hidden_dim):
+            lora_linear = LoRALinear(
+                in_dim=dim,
+                out_dim=hidden_dim,
+                rank=RANK,
+                alpha=ALPHA,
+            )
+            fixed_init_model(lora_linear)
+            return lora_linear
+
+        return create_lora_linear
+
+    def test_lora_tc_layer_forward(self, lora_linear, lora_experts, inputs):
+        """Compare TC forward with LoRALinear as reference."""
+        lora = lora_linear()
+        actual = lora_experts._lora_tc_layer_forward(
+            inputs[0],
+            lora.weight.T,
+            lora.lora_a.weight.T,
+            lora.lora_b.weight.T,
+        )
+        expected = lora(inputs[0])
+        assert_expected(actual, expected, rtol=1e-6, atol=1e-4)
+
+    def test_forward_disabled(
+        self, experts, lora_experts, inputs, num_tokens_per_expert
+    ):
+        """Test forward with lora layers disabled, comparing against GroupedExperts."""
+        lora_experts.disabled = True
+        actual = lora_experts(inputs, num_tokens_per_expert)
+        expected = experts(inputs, num_tokens_per_expert)
+        assert_expected(actual, expected, rtol=1e-6, atol=1e-4)
+
+    def test_forward(
+        self,
+        lora_experts,
+        inputs,
+        num_experts,
+        experts_per_token,
+        dim,
+        num_tokens_per_expert,
+    ) -> None:
+        expected = torch.tensor(0.441491)
+        actual = lora_experts(inputs, num_tokens_per_expert)
+        assert actual.shape == (num_experts * experts_per_token, SEQ_LEN, dim)
+        torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-6)
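`test_lora_tc_layer_forward` pins down what the per-expert ("token choice") LoRA projection computes by comparing it against a plain `LoRALinear`. Written out, each expert applies the usual LoRA decomposition to the tokens routed to it. The sketch below assumes the standard `alpha / rank` scaling and expert weights stored in a transposed `[dim, hidden_dim]` layout, which is why the test passes `.T` views of the `LoRALinear` weights; treat the exact torchtune internals as an assumption.

```python
# Reference computation implied by the equivalence test: base projection plus a
# scaled low-rank update, y = x @ W + (alpha / rank) * x @ A @ B.
import torch


def lora_expert_forward(
    x: torch.Tensor,       # [seq_len, dim] tokens routed to a single expert
    w: torch.Tensor,       # [dim, hidden_dim] frozen base weight (transposed layout)
    lora_a: torch.Tensor,  # [dim, rank] low-rank down-projection (transposed layout)
    lora_b: torch.Tensor,  # [rank, hidden_dim] low-rank up-projection (transposed layout)
    alpha: float,
    rank: int,
) -> torch.Tensor:
    return x @ w + (alpha / rank) * (x @ lora_a @ lora_b)


# Shapes mirroring the fixtures above: dim=64, hidden_dim=128, RANK=4, ALPHA=1.0.
x = torch.randn(32, 64)
w = torch.randn(64, 128)
a = torch.randn(64, 4)
b = torch.zeros(4, 128)  # LoRA's B matrix is typically zero-initialized, so the update starts as a no-op
out = lora_expert_forward(x, w, a, b, alpha=1.0, rank=4)
print(out.shape)  # torch.Size([32, 128])
```

With `disabled = True`, as exercised in `test_forward_disabled`, the low-rank term is skipped entirely and the output should match the plain `GroupedExperts`.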

torchtune/_recipe_registry.py (+4)

@@ -442,6 +442,10 @@ class Recipe:
                 name="llama3_2_vision/90B_qlora",
                 file_path="llama3_2_vision/90B_qlora.yaml",
             ),
+            Config(
+                name="llama4/scout_17B_16E_lora",
+                file_path="llama4/scout_17B_16E_lora.yaml",
+            ),
         ],
         supports_distributed=True,
     ),
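Registering the config here is what lets the `tune` CLI resolve `--config llama4/scout_17B_16E_lora` to the YAML file above. A minimal lookup sketch follows; it assumes `get_all_recipes()` is exposed by `torchtune._recipe_registry` and that the new entry sits under the `lora_finetune_distributed` recipe, so treat both as assumptions rather than a documented API.

```python
# Hypothetical lookup illustrating how a recipe/config name pair maps to a YAML path.
from torchtune._recipe_registry import get_all_recipes


def find_config_path(recipe_name: str, config_name: str) -> str:
    for recipe in get_all_recipes():
        if recipe.name == recipe_name:
            for config in recipe.configs:
                if config.name == config_name:
                    return config.file_path
    raise ValueError(f"{config_name!r} is not registered under {recipe_name!r}")


# Expected to return "llama4/scout_17B_16E_lora.yaml" after this commit.
print(find_config_path("lora_finetune_distributed", "llama4/scout_17B_16E_lora"))
```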

torchtune/models/llama3_2_vision/_component_builders.py (−7)

@@ -4,7 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from enum import Enum
 from functools import partial
 from typing import List, Optional
 
@@ -327,12 +326,6 @@ def llama3_2_vision_projection_head(
 # ------------------ LoRA Llama 3.2 Vision ------------------
 
 
-class LoRATrainable(Enum):
-    FULL = "full"
-    LORA = "lora"
-    FROZEN = "frozen"
-
-
 def lora_llama3_2_vision_encoder(
     encoder_lora: bool,
     fusion_lora: bool,
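The `LoRATrainable` enum is no longer defined alongside the Llama 3.2 Vision builders; per the test import change above it now lives in `torchtune.modules.peft`, where the Llama4 builders can share it. The `decoder_trainable` / `encoder_trainable` / `fusion_trainable` strings in the new config are its values. Below is a small sketch of that string-to-enum mapping: the enum mirrors the removed definition, while the `parse_trainability` helper is hypothetical and shown only for illustration.

```python
# Mirrors the enum removed above; the helper is a hypothetical illustration of how
# config strings such as decoder_trainable: "lora" map onto it.
from enum import Enum


class LoRATrainable(Enum):
    FULL = "full"
    LORA = "lora"
    FROZEN = "frozen"


def parse_trainability(value: str) -> LoRATrainable:
    # Enum lookup by value raises ValueError for anything other than "full"/"lora"/"frozen".
    return LoRATrainable(value.lower())


assert parse_trainability("lora") is LoRATrainable.LORA      # decoder_trainable / fusion_trainable
assert parse_trainability("frozen") is LoRATrainable.FROZEN  # encoder_trainable
```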
