Commit c3c532a

Allow disabling activation offloading streams in full finetune recipe (#2710)
Signed-off-by: Nathan Azrak <[email protected]>
1 parent 137eec3 commit c3c532a

File tree

2 files changed: +16 -3 lines

recipes/full_finetune_distributed.py

Lines changed: 13 additions & 1 deletion
```diff
@@ -227,6 +227,16 @@ def __init__(self, cfg: DictConfig) -> None:
         self._enable_activation_offloading = cfg.get(
             "enable_activation_offloading", False
         )
+        self._activation_offloading_use_streams = cfg.get(
+            "activation_offloading_use_streams", True
+        )
+        if self._activation_offloading_use_streams and self.parallel_dims.tp_enabled:
+            warn(
+                message=(
+                    "Using activation offloading with streams is not advised in tensor parallel, and may "
+                    "cause unstable training. It is advised to set activation_offloading_use_streams: False"
+                )
+            )
         if self._enable_activation_offloading:
             if device_type != "cuda":
                 raise RuntimeError(
@@ -339,6 +349,7 @@ def setup(self, cfg: DictConfig) -> None:
             cfg_model=cfg.model,
             enable_activation_checkpointing=self._enable_activation_checkpointing,
             enable_activation_offloading=self._enable_activation_offloading,
+            activation_offloading_use_streams=self._activation_offloading_use_streams,
             custom_sharded_layers=cfg.get("custom_sharded_layers", None),
             fsdp_cpu_offload=self.fsdp_cpu_offload,
             reshard_after_forward=cfg.get("fsdp_reshard_after_forward", True),
@@ -541,6 +552,7 @@ def _setup_model(
         cfg_model: DictConfig,
         enable_activation_checkpointing: bool,
         enable_activation_offloading: bool,
+        activation_offloading_use_streams: bool,
         fsdp_cpu_offload: bool,
         reshard_after_forward: bool,
         model_state_dict: Dict[str, Any],
@@ -659,7 +671,7 @@ def _setup_model(
 
         # activation offloading
         self.activations_handling_ctx = training.get_act_offloading_ctx_manager(
-            model, enable_activation_offloading
+            model, enable_activation_offloading, activation_offloading_use_streams
         )
 
         # Ensure no params and buffers are on meta device
```
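
The recipe now reads two config keys, `enable_activation_offloading` and the new `activation_offloading_use_streams` (treated as `True` when absent), and warns when streams are combined with tensor parallelism. A minimal sketch of that flow, using an in-memory OmegaConf config and a plain boolean as a stand-in for the recipe's `parallel_dims.tp_enabled` attribute:

```python
# Minimal sketch of how the new flag is read; the config values and the
# tp_enabled boolean are illustrative stand-ins, not the recipe's own objects.
from warnings import warn

from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "enable_activation_offloading": True,
        # New knob added by this commit; defaults to True when not set.
        "activation_offloading_use_streams": False,
    }
)

enable_activation_offloading = cfg.get("enable_activation_offloading", False)
use_streams = cfg.get("activation_offloading_use_streams", True)

tp_enabled = True  # stand-in for self.parallel_dims.tp_enabled
if use_streams and tp_enabled:
    warn(
        "Using activation offloading with streams is not advised in tensor "
        "parallel, and may cause unstable training. It is advised to set "
        "activation_offloading_use_streams: False"
    )
```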

torchtune/training/_activation_offloading.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -378,7 +378,7 @@ def noop(tensor):
 
 
 def get_act_offloading_ctx_manager(
-    model: nn.Module, enable_activation_offloading: bool
+    model: nn.Module, enable_activation_offloading: bool, use_streams: bool = True
 ) -> Union[OffloadActivations, contextlib.nullcontext]:
     """Returns the activation offloading context manager for the model, which will be
     a null context if enable_activation_offloading is False.
@@ -390,6 +390,7 @@ def get_act_offloading_ctx_manager(
         model (nn.Module): the model to wrap with the activation offloading context manager.
         enable_activation_offloading (bool): whether or not to enable activation offloading
             for the model.
+        use_streams (bool): whether or not to enable streams for overlapping communication.
 
     Returns:
         contextlib.ContextDecorator: the activation offloading context manager for the model.
@@ -398,7 +399,7 @@ def get_act_offloading_ctx_manager(
         NotImplementedError: If the model is a multimodal model and activation offloading is enabled.
     """
     if enable_activation_offloading:
-        activations_handling_ctx = OffloadActivations()
+        activations_handling_ctx = OffloadActivations(use_streams=use_streams)
 
         # Below is our hack to disable offloading the last output Linear in every
         # step, as the cost for offloading the activation and then soon after bringing
```
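
With the new `use_streams` parameter, callers can keep offloading activations to CPU while skipping the stream-based overlap of copies and compute. A minimal usage sketch, assuming a CUDA device and a toy `nn.Sequential` in place of a real torchtune model:

```python
# Minimal usage sketch; the toy model and tensor shapes are illustrative only.
import torch
import torch.nn as nn

from torchtune.training import get_act_offloading_ctx_manager

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16)).cuda()

# Offload activations to CPU, but without the extra CUDA streams that overlap
# host<->device copies with compute (the behavior this commit makes optional).
activations_handling_ctx = get_act_offloading_ctx_manager(
    model, enable_activation_offloading=True, use_streams=False
)

x = torch.randn(4, 16, device="cuda")
with activations_handling_ctx:
    out = model(x)
out.sum().backward()
```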
