I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions
--- a/rl/Lib/site-packages/torch/distributed/_tools/init.py
+++ b/rl/Lib/site-packages/torch/distributed/_tools/init.py
@ -0,0 +1,5 @@
+from .fsdp2_mem_tracker import FSDPMemTracker
+from .mem_tracker import MemTracker
+from .memory_tracker import MemoryTracker
+from .mod_tracker import ModTracker
+from .runtime_estimator import RuntimeEstimator
--- a/rl/Lib/site-packages/torch/distributed/_tools/pycache/init.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/distributed/_tools/pycache/init.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/distributed/_tools/pycache/fsdp2_mem_tracker.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/distributed/_tools/pycache/fsdp2_mem_tracker.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/distributed/_tools/pycache/mem_tracker.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/distributed/_tools/pycache/mem_tracker.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/distributed/_tools/pycache/memory_tracker.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/distributed/_tools/pycache/memory_tracker.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/distributed/_tools/pycache/mod_tracker.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/distributed/_tools/pycache/mod_tracker.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/distributed/_tools/pycache/runtime_estimator.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/distributed/_tools/pycache/runtime_estimator.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/distributed/_tools/fsdp2_mem_tracker.py
+++ b/rl/Lib/site-packages/torch/distributed/_tools/fsdp2_mem_tracker.py
@ -0,0 +1,610 @@
+from copy import deepcopy
+from datetime import timedelta
+from functools import partial, wraps
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union
+
+import torch
+import torch.distributed as dist
+from torch import nn, optim
+from torch._guards import active_fake_mode
+from torch.distributed._composable.fsdp import FSDPModule
+from torch.distributed._composable.fsdp._fsdp_param_group import FSDPParamGroup
+from torch.distributed._tools.mem_tracker import _RefType, _State, MemTracker
+from torch.distributed.distributed_c10d import (
+    _IllegalWork,
+    ProcessGroup,
+    ReduceOp,
+    Work,
+)
+from torch.futures import Future
+from torch.utils._python_dispatch import TorchDispatchMode
+from torch.utils._pytree import tree_map_only
+from torch.utils.weak import WeakIdKeyDictionary, weakref
+
+
+# Key of the aggregate entry inside each per-device snapshot dictionary; the tracker
+# reads it to seed its peak-memory bookkeeping.
+_TOTAL_KEY = "Total"
+
+# Only the tracker class itself is part of this module's public API.
+__all__ = ["FSDPMemTracker"]
+
+
+class _FSDPRefType(_RefType):
+    """
+    Enumerates categories of memory usage in FSDP modules, including parameters, buffers, gradients,
+    activations, communication buffers, and optimizer states.
+
+    Attributes:
+        SHARDED_PARAM (str): Memory usage of sharded parameters.
+        UNSHARDED_PARAM (str): Memory usage of unsharded parameters.
+        BUFFER (str): Memory usage of the buffers registered on the tracked module tree.
+        SHARDED_GRAD (str): Memory usage of sharded gradients corresponding to the sharded parameters.
+        UNSHARDED_GRAD (str): Memory usage of unsharded gradients corresponding to the unsharded parameters.
+        ACT (str): Memory usage of activations and tensors from forward and AC recomputation.
+        TEMP (str): Memory usage of temporary tensors during the backward pass including gradients of activations.
+        ALL_GATHER (str): Memory usage of all_gather output tensor.
+        REDUCE_SCATTER (str): Memory usage of reduce_scatter input tensor.
+        OPT (str): Memory usage of tensors storing optimizer states.
+        INP (str): Memory usage of input tensors.
+    """
+
+    SHARDED_PARAM = "Sharded Param"
+    UNSHARDED_PARAM = "Unsharded Param"
+    BUFFER = "Buffer"
+    SHARDED_GRAD = "Sharded Grad"
+    UNSHARDED_GRAD = "Unsharded Grad"
+    ACT = "Activation"
+    TEMP = "Temp"
+    ALL_GATHER = "All Gather"
+    REDUCE_SCATTER = "Reduce Scatter"
+    OPT = "OptState"
+    INP = "Inputs"
+
+
+class _SavedFSDPMethods(NamedTuple):
+    """
+    Holds the original ``FSDPParamGroup.pre_backward`` and ``FSDPParamGroup.post_backward`` callables
+    of one FSDP module so that they can be put back once tracking ends.
+    """
+
+    pre_backward: Callable
+    post_backward: Callable
+
+
+class _SavedCollectives(NamedTuple):
+    """
+    Holds the original ``torch.distributed`` collective functions that the tracker monkey-patches,
+    so that they can be delegated to and later restored.
+    """
+
+    all_gather_into_tensor: Callable
+    reduce_scatter_tensor: Callable
+    all_reduce: Callable
+    barrier: Callable
+
+
+class _FSDPModState(_State):
+    """
+    Enumerates the states of FSDP modules during the forward and backward passes.
+
+    Each member is used as a key under which memory snapshots of a module are stored.
+    """
+
+    # Recorded around the wrapped pre-forward / post-forward hooks during forward.
+    BEF_PRE_FW = "Before Pre-Forward"
+    AFT_PRE_FW = "After Pre-Forward"
+    BEF_POST_FW = "Before Post-Forward"
+    AFT_POST_FW = "After Post-Forward"
+    # Recorded around the wrapped ``pre_backward`` / ``post_backward`` methods.
+    BEF_PRE_BW = "Before Pre-Backward"
+    AFT_PRE_BW = "After Pre-Backward"
+    BEF_POST_BW = "Before Post-Backward"
+    AFT_POST_BW = "After Post-Backward"
+    # Recorded when the forward hooks fire while the module tracker reports backward (AC recomputation).
+    PRE_FW_AC = "Pre-Forward AC"
+    POST_FW_AC = "Post-Forward AC"
+    # Keys for the peak snapshots of the forward and backward passes.
+    PEAK_FW = "Peak Forward"
+    PEAK_BW = "Peak Backward"
+
+
+class _FSDPModMemStats:
+    """
+    A class to store the memory statistics of an FSDP module.
+
+    Args:
+        mod_fqn (str): The fully qualified name of the FSDP module.
+
+    Attributes:
+        mod_fqn (str): The fully qualified name of the FSDP module.
+        local_peak (Dict[torch.device, int]): Per-device total memory recorded as the module's current
+        local peak. It starts out empty and is filled in by the tracker hooks.
+        snapshots (Dict[_FSDPModState, List[Dict[torch.device, Dict[str, int]]]]): A dictionary of memory
+        snapshots of the module at different states as defined by ``_FSDPModState``. Each state maps to a
+        list of snapshots (one is appended every time the state is reached). In each snapshot, a key is
+        a device, and each value is another dictionary with keys as memory reference types defined by
+        ``_FSDPRefType`` and values as the memory consumed in bytes.
+
+    """
+
+    def __init__(self, mod_fqn: str) -> None:
+        self.mod_fqn = mod_fqn
+        self.local_peak: Dict[torch.device, int] = {}
+        self.snapshots: Dict[
+            _FSDPModState, List[Dict[torch.device, Dict[str, int]]]
+        ] = {}
+
+
+class FSDPMemTracker(MemTracker):
+    """
+    A ``TorchDispatchMode`` based context manager that extends ``torch.distributed._tools.mem_tracker.MemTracker`` to track
+    and categorize the peak memory and module-wise memory usage of FSDP modules.
+
+    It tracks the peak memory usage across all the devices of all the FSDP modules in the module tree and categorizes
+    the tensor memory usage as defined by ``_FSDPRefType``. Further, it captures memory `snapshots` at different stages of
+    the module execution defined by ``_FSDPModState``.
+
+    Attributes:
+        memory_tracking: A weakref key dictionary to store the memory statistics of each module. Each key is a reference
+        to a module, and each value is a ``_FSDPModMemStats`` object that stores the memory statistics of the module.
+
+    Args:
+        mod (torch.nn.Module): The root FSDP module to be tracked.
+        optm (torch.optim.Optimizer, optional): The optimizer to be tracked.
+
+    Note: Please refer to ``torch.distributed._tools.mem_tracker.MemTracker`` to learn about the limitations.
+
+    Example usage
+
+    .. code-block:: python
+
+        module = ...
+        optimizer = ...
+        inp = ...
+        fmt = FSDPMemTracker(module, optimizer)
+        fmt.track_inputs((inp,))
+        with fmt:
+            optimizer.zero_grad()
+            loss = module(inp)
+            print("After Forward:")
+            fmt.display_snapshot("current")
+            loss.backward()
+            optimizer.step()
+        fmt.display_snapshot("peak")
+        fmt.display_modulewise_snapshots(depth = 3, units = "MB")
+
+    """
+
+    def __init__(
+        self,
+        mod: torch.nn.Module,
+        optm: Optional[torch.optim.Optimizer] = None,
+    ) -> None:
+        super().__init__()
+        assert isinstance(mod, FSDPModule), "FSDPMemTracker only supports FSDP modules"
+        self._root_mod = mod
+        self._optm = optm
+        self._in_fake_mode: bool = False
+        self._fsdp_mod_to_saved_methods: WeakIdKeyDictionary = WeakIdKeyDictionary()
+        self._saved_collectives: _SavedCollectives
+        self._ref_class: Type[_RefType] = _FSDPRefType
+
+    def _instrument_fsdp_sharded_params_grads(
+        self, fsdp_param_group: FSDPParamGroup
+    ) -> None:
+        # Track sharded params and grads after initilization
+        for fsdp_param in fsdp_param_group.fsdp_params:
+            self._update_and_maybe_create_winfos(
+                fsdp_param.sharded_param,
+                _FSDPRefType.SHARDED_PARAM,
+            )
+            sharded_grad = fsdp_param.sharded_param.grad
+            if sharded_grad is not None:
+                self._update_and_maybe_create_winfos(
+                    sharded_grad,
+                    _FSDPRefType.SHARDED_GRAD,
+                )
+
+    def _fsdp_state_pre_forward(
+        self,
+        fsdp_mod: FSDPModule,
+        orig_fsdp_state_pre_fw: Callable,
+    ) -> Callable:
+        """
+        Build a wrapper around ``orig_fsdp_state_pre_fw`` that records memory snapshots for ``fsdp_mod``.
+
+        Args:
+            fsdp_mod (FSDPModule): The FSDP module whose statistics are updated.
+            orig_fsdp_state_pre_fw (Callable): The original pre-forward callable that the wrapper delegates to.
+
+        Returns:
+            Callable: The wrapper; it returns the ``(args, kwargs)`` pair produced by the original callable.
+
+        Raises:
+            NotImplementedError: Raised by the wrapper when an already-tracked module whose only remaining
+                parent is ``"Global"`` is called again outside of backward.
+        """
+        # We capture memory snapshots before and after ``FSDPState._pre_forward`` to attribute the `unsharded` params
+        # and `all_gather` buffers.  There are three cases:
+        # Case 1: If the module is not in the ``memory_tracking`` dictionary, create a new ``_FSDPModMemStats``
+        #         instance for the module and add it to the ``memory_tracking`` dictionary.
+        # Case 2: If the module is already in the ``memory_tracking`` dictionary and we are in backward, this means
+        #         we are in the AC region. We check if this is the top most module in the AC region. If it is,
+        #         we store a weak reference and set the flag ``_in_ac`` to True.
+        # Case 3: If the module is already in the ``memory_tracking`` dictionary and we are in forward, this means
+        #         this module is called for the second time. If it is a root module, that means we are in the next
+        #         iteration and we error out. If it is not a root module, that means it's a submodule that is being
+        #         used multiple times in the same iteration, which we allow and track.
+        # For Case 1 we also initialize the ``local_peak`` and ``PEAK_FW`` snapshot for the module.
+        # For Case 2 we only capture 1 snapshot after ``FSDPState._pre_forward`` runs because it is a no-op.
+        @wraps(orig_fsdp_state_pre_fw)
+        def inner(*args: Any, **kwargs: Any) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
+            mod_fqn = self._mod_tracker.get_known_fqn(fsdp_mod)
+            assert mod_fqn is not None
+            if fsdp_mod not in self.memory_tracking:
+                mod_stat = _FSDPModMemStats(mod_fqn)
+                self.memory_tracking[fsdp_mod] = mod_stat
+                snapshot = self.get_tracker_snapshot()
+                mod_stat.local_peak = {
+                    dev: dev_snap[_TOTAL_KEY] for dev, dev_snap in snapshot.items()
+                }
+                mod_stat.snapshots.setdefault(_FSDPModState.PEAK_FW, []).append(
+                    snapshot
+                )
+                # A separate copy is stored so the ``PEAK_FW`` and ``BEF_PRE_FW`` entries do not share one object.
+                mod_stat.snapshots.setdefault(_FSDPModState.BEF_PRE_FW, []).append(
+                    deepcopy(snapshot)
+                )
+            elif not self._mod_tracker.is_bw:
+                # Exclude the module itself; if only "Global" remains it has no enclosing tracked parent.
+                parents = self._mod_tracker.parents - {mod_fqn}
+                if len(parents) == 1 and "Global" in parents:
+                    raise NotImplementedError(
+                        "FSDPMemTracker does not support memory tracking for multiple iterative calls."
+                        " Either use ``reset_mod_stats`` to clear module memory stats for the previous iteration"
+                        " or file a github issue if you need this feature."
+                    )
+
+            args, kwargs = orig_fsdp_state_pre_fw(*args, **kwargs)
+
+            # After the original pre-forward has run, attribute the unsharded parameters.
+            fsdp_state = fsdp_mod._get_fsdp_state()
+            if fsdp_param_group := fsdp_state._fsdp_param_group:
+                for fsdp_param in fsdp_param_group.fsdp_params:
+                    self._update_and_maybe_create_winfos(
+                        fsdp_param.unsharded_param,
+                        _FSDPRefType.UNSHARDED_PARAM,
+                    )
+            mod_stat = self.memory_tracking[fsdp_mod]
+            if self._mod_tracker.is_bw:
+                state = _FSDPModState.PRE_FW_AC
+                # Only the first module entered during recomputation becomes the AC root.
+                if self._ac_mod is None:
+                    self._ac_mod = weakref.ref(fsdp_mod)
+                    self._in_ac = True
+            else:
+                state = _FSDPModState.AFT_PRE_FW
+            mod_stat.snapshots.setdefault(state, []).append(self.get_tracker_snapshot())
+            return args, kwargs
+
+        return inner
+
+    def _fsdp_state_post_forward(
+        self,
+        fsdp_mod: FSDPModule,
+        orig_fsdp_state_post_fw: Callable,
+    ) -> Callable:
+        """
+        Build a wrapper around ``orig_fsdp_state_post_fw`` that records memory snapshots for ``fsdp_mod``.
+
+        Args:
+            fsdp_mod (FSDPModule): The FSDP module whose statistics are updated; it must already have an
+                entry in ``memory_tracking``.
+            orig_fsdp_state_post_fw (Callable): The original post-forward callable that the wrapper delegates to.
+
+        Returns:
+            Callable: The wrapper; it returns whatever the original callable returns.
+        """
+        # We capture memory snapshots before and after ``FSDPState._post_forward`` to capture the resharded state
+        # if ``reshard_after_forward`` is not ``False``. There are two cases:
+        # Case 1: This is called in backward, which means we are in the AC region. If this is the top most module
+        #         in the AC region, we set the flag ``_in_ac`` to False.
+        # Case 2: This is called in forward.
+        @wraps(orig_fsdp_state_post_fw)
+        def inner(*args: Any, **kwargs: Any) -> Any:
+            mod_stat = self.memory_tracking[fsdp_mod]
+            if self._mod_tracker.is_bw:
+                state = _FSDPModState.POST_FW_AC
+                # Leave the AC region only when the module that opened it finishes.
+                if self._ac_mod is not None and self._ac_mod() is fsdp_mod:
+                    self._ac_mod = None
+                    self._in_ac = False
+            else:
+                state = _FSDPModState.BEF_POST_FW
+            mod_stat.snapshots.setdefault(state, []).append(self.get_tracker_snapshot())
+
+            output = orig_fsdp_state_post_fw(*args, **kwargs)
+
+            # The "after" snapshot is only recorded in forward; in backward a single snapshot is kept.
+            if not self._mod_tracker.is_bw:
+                mod_stat.snapshots.setdefault(_FSDPModState.AFT_POST_FW, []).append(
+                    self.get_tracker_snapshot()
+                )
+            return output
+
+        return inner
+
+    def _fsdp_param_group_pre_backward(
+        self,
+        fsdp_mod: FSDPModule,
+        orig_fsdp_param_group_pre_backward: Callable,
+    ) -> Callable:
+        """
+        Build a wrapper around ``orig_fsdp_param_group_pre_backward`` that records memory snapshots
+        for ``fsdp_mod``.
+
+        Args:
+            fsdp_mod (FSDPModule): The FSDP module whose statistics are updated; it must already have an
+                entry in ``memory_tracking``.
+            orig_fsdp_param_group_pre_backward (Callable): The original ``pre_backward`` callable to delegate to.
+
+        Returns:
+            Callable: The wrapper; it returns ``None``.
+        """
+        # We capture memory snapshots before and after ``FSDPParamGroup.pre_backward`` to capture the pre-fetching
+        # and unsharding of params. We also initialize ``local_peak`` and ``PEAK_BW`` snapshot for the module.
+        @wraps(orig_fsdp_param_group_pre_backward)
+        def inner(*args: Any, **kwargs: Any) -> None:
+            mod_stat = self.memory_tracking[fsdp_mod]
+            snapshot = self.get_tracker_snapshot()
+            # Reset the local peak to the per-device totals at the start of this module's backward.
+            mod_stat.local_peak = {
+                dev: dev_snap[_TOTAL_KEY] for dev, dev_snap in snapshot.items()
+            }
+            mod_stat.snapshots.setdefault(_FSDPModState.PEAK_BW, []).append(snapshot)
+            # A separate copy is stored so the ``PEAK_BW`` and ``BEF_PRE_BW`` entries do not share one object.
+            mod_stat.snapshots.setdefault(_FSDPModState.BEF_PRE_BW, []).append(
+                deepcopy(snapshot)
+            )
+            orig_fsdp_param_group_pre_backward(*args, **kwargs)
+
+            mod_stat.snapshots.setdefault(_FSDPModState.AFT_PRE_BW, []).append(
+                self.get_tracker_snapshot()
+            )
+
+        return inner
+
+    def _fsdp_param_group_post_backward(
+        self,
+        fsdp_mod: FSDPModule,
+        orig_fsdp_param_group_post_backward: Callable,
+    ) -> Callable:
+        """
+        Build a wrapper around ``orig_fsdp_param_group_post_backward`` that re-categorizes gradients and
+        records memory snapshots for ``fsdp_mod``.
+
+        Args:
+            fsdp_mod (FSDPModule): The FSDP module whose statistics are updated; it must already have an
+                entry in ``memory_tracking``.
+            orig_fsdp_param_group_post_backward (Callable): The original ``post_backward`` callable to delegate to.
+
+        Returns:
+            Callable: The wrapper; it returns ``None``.
+        """
+        # We capture the memory snapshots before and after ``FSDPParamGroup.post_backward`` to track and attribute
+        # the `unsharded` grads before the post backward and then `sharded` grads and `reduce_scatter`  buffers
+        # after the post backward.
+        @wraps(orig_fsdp_param_group_post_backward)
+        def inner(*args: Any, **kwargs: Any) -> None:
+            fsdp_state = fsdp_mod._get_fsdp_state()
+            if fsdp_param_group := fsdp_state._fsdp_param_group:
+                for fsdp_param in fsdp_param_group.fsdp_params:
+                    unsharded_grad = fsdp_param._unsharded_param.grad
+                    if unsharded_grad is not None:
+                        # ``update_existing=True``: the gradient tensor is already tracked under another
+                        # category and is re-labelled here.
+                        self._update_and_maybe_create_winfos(
+                            unsharded_grad,
+                            _FSDPRefType.UNSHARDED_GRAD,
+                            update_existing=True,
+                        )
+
+            mod_stat = self.memory_tracking[fsdp_mod]
+            mod_stat.snapshots.setdefault(_FSDPModState.BEF_POST_BW, []).append(
+                self.get_tracker_snapshot()
+            )
+
+            orig_fsdp_param_group_post_backward(*args, **kwargs)
+
+            # After the original post-backward has run, attribute whatever sharded gradients now exist.
+            if fsdp_param_group := fsdp_state._fsdp_param_group:
+                for fsdp_param in fsdp_param_group.fsdp_params:
+                    sharded_grad = fsdp_param.sharded_param.grad
+                    if sharded_grad is not None:
+                        self._update_and_maybe_create_winfos(
+                            sharded_grad,
+                            _FSDPRefType.SHARDED_GRAD,
+                        )
+
+            mod_stat.snapshots.setdefault(_FSDPModState.AFT_POST_BW, []).append(
+                self.get_tracker_snapshot()
+            )
+
+        return inner
+
+    def _instrument_fsdp_module(self) -> None:
+        """
+        Install the tracking wrappers on every FSDP module under the root module and register the
+        root module's buffers with the tracker.
+
+        FSDP modules whose state has no parameter group are left untouched.
+        """
+        # We uninstall the existing `FSDPState._pre_forward` and `FSDPState._post_forward` hooks and install
+        # our own hooks that wrap them. We choose this over monkey-patching `FSDPParamGroup.pre_forward` and
+        # `FSDPParamGroup.post_forward` because during AC these won't be called.
+        # TODO(@sanketpurandare): This will need to be modified after this PR (https://github.com/pytorch/pytorch/pull/127786)
+        # lands. For backward we monkey-patch the `FSDPParamGroup.pre_backward` and `FSDPParamGroup.post_backward`.
+        for module in self._root_mod.modules():
+            if isinstance(module, FSDPModule):
+                fsdp_state = module._get_fsdp_state()
+                if fsdp_param_group := fsdp_state._fsdp_param_group:
+                    self._instrument_fsdp_sharded_params_grads(fsdp_param_group)
+                    # Swap the forward hooks for wrapped versions and keep the new handles on the state
+                    # so they can be removed again on exit.
+                    fsdp_state._pre_forward_hook_handle.remove()
+                    fsdp_state._post_forward_hook_handle.remove()
+                    fsdp_state._pre_forward_hook_handle = (
+                        module.register_forward_pre_hook(
+                            self._fsdp_state_pre_forward(
+                                module, fsdp_state._pre_forward
+                            ),
+                            prepend=True,
+                            with_kwargs=True,
+                        )
+                    )
+                    fsdp_state._post_forward_hook_handle = module.register_forward_hook(
+                        self._fsdp_state_post_forward(module, fsdp_state._post_forward),
+                        prepend=False,
+                        always_call=True,
+                    )
+                    # Remember the original backward methods before overwriting them.
+                    self._fsdp_mod_to_saved_methods[module] = _SavedFSDPMethods(
+                        fsdp_param_group.pre_backward,
+                        fsdp_param_group.post_backward,
+                    )
+                    fsdp_param_group.pre_backward = self._fsdp_param_group_pre_backward(  # type: ignore[assignment]
+                        module, fsdp_param_group.pre_backward
+                    )
+                    fsdp_param_group.post_backward = (  # type: ignore[assignment]
+                        self._fsdp_param_group_post_backward(
+                            module, fsdp_param_group.post_backward
+                        )
+                    )
+
+        for buffer in self._root_mod.buffers():
+            self._update_and_maybe_create_winfos(
+                buffer,
+                _FSDPRefType.BUFFER,
+            )
+
+    def _instrument_optimizer(self) -> None:
+        # Register a hook on the optimizer step to track the optimizer states.
+        # The pre-hook is to set the flag ``_in_opt`` to True. The post-hook unsets the flag,
+        # and also tracks any optimizer states that are created during the optimizer step.
+        if self._optm is not None:
+            self._track_optimizer_states(_FSDPRefType.OPT, self._optm)
+
+            def _opt_step_pre_hook(
+                optimizer: optim.Optimizer, args: Any, kwargs: Any
+            ) -> None:
+                self._in_opt = True
+
+            def _opt_step_post_hook(
+                optimizer: optim.Optimizer, args: Any, kwargs: Any
+            ) -> None:
+                self._track_optimizer_states(_FSDPRefType.OPT, optimizer)
+                self._in_opt = False
+
+            self._optimizer_hook_handles = (
+                self._optm.register_step_pre_hook(_opt_step_pre_hook),
+                self._optm.register_step_post_hook(_opt_step_post_hook),
+            )
+
+    def _register_module_and_optimizer_hooks(self) -> None:
+        """Instrument the FSDP modules first and the optimizer (if any) second."""
+        self._instrument_fsdp_module()
+        self._instrument_optimizer()
+
+    def _deregister_module_and_optimizer_hooks(self) -> None:
+        """
+        Undo the instrumentation: re-register the unwrapped forward hooks, restore the saved backward
+        methods of every instrumented FSDP module, and remove the optimizer step hooks.
+        """
+        for (
+            fsdp_mod,
+            saved_methods,
+        ) in self._fsdp_mod_to_saved_methods.items():
+            fsdp_state = fsdp_mod._get_fsdp_state()
+            # Drop the wrapped hooks and register the state's own callables again.
+            # NOTE(review): presumably these arguments mirror how FSDP registered the hooks originally - confirm.
+            fsdp_state._pre_forward_hook_handle.remove()
+            fsdp_state._post_forward_hook_handle.remove()
+            fsdp_state._pre_forward_hook_handle = fsdp_mod.register_forward_pre_hook(
+                fsdp_state._pre_forward, prepend=True, with_kwargs=True
+            )
+            fsdp_state._post_forward_hook_handle = fsdp_mod.register_forward_hook(
+                fsdp_state._post_forward, prepend=False
+            )
+            if fsdp_param_group := fsdp_state._fsdp_param_group:
+                fsdp_param_group.pre_backward = saved_methods.pre_backward
+                fsdp_param_group.post_backward = saved_methods.post_backward
+        self._fsdp_mod_to_saved_methods.clear()
+
+        if self._optimizer_hook_handles is not None:
+            for handle in self._optimizer_hook_handles:
+                handle.remove()
+            self._optimizer_hook_handles = None
+
+    def _instrument_and_maybe_bypass_collectives(self) -> None:
+        # Monkey-patching collectives is required because they do not work with `FakeTensorMode`
+        # It's also easier to track `all_gather` and `reduce_scatter` buffers faithfully.
+        self._saved_collectives = _SavedCollectives(
+            dist.all_gather_into_tensor,
+            dist.reduce_scatter_tensor,
+            dist.all_reduce,
+            dist.barrier,
+        )
+
+        class FakeWork(Work):
+            def __init__(self) -> None:
+                super().__init__()
+
+            def get_future(self) -> Future:
+                future: Future = Future()
+                future.set_result(None)
+                return future
+
+            def wait(self, timeout: Optional[timedelta] = None) -> bool:
+                return True
+
+        @wraps(dist.all_gather_into_tensor)
+        def all_gather_into_tensor(
+            output_tensor: torch.Tensor,
+            input_tensor: torch.Tensor,
+            group: Union[ProcessGroup, None] = None,
+            async_op: bool = False,
+        ) -> Union[Work, _IllegalWork, None]:
+            self._update_and_maybe_create_winfos(
+                output_tensor,
+                _FSDPRefType.ALL_GATHER,
+                update_existing=True,
+            )
+
+            if self._in_fake_mode:
+                if async_op:
+                    return FakeWork()
+                return None
+            else:
+                return self._saved_collectives.all_gather_into_tensor(
+                    output_tensor, input_tensor, group, async_op
+                )
+
+        @wraps(dist.reduce_scatter_tensor)
+        def reduce_scatter_tensor(
+            output: torch.Tensor,
+            input: torch.Tensor,
+            op: ReduceOp.RedOpType = dist.ReduceOp.SUM,
+            group: Union[ProcessGroup, None] = None,
+            async_op: bool = False,
+        ) -> Union[Work, _IllegalWork, None]:
+            self._update_and_maybe_create_winfos(
+                input,
+                _FSDPRefType.REDUCE_SCATTER,
+                update_existing=True,
+            )
+
+            if self._in_fake_mode:
+                if async_op:
+                    return FakeWork()
+                return None
+            else:
+                return self._saved_collectives.reduce_scatter_tensor(
+                    output, input, op, group, async_op
+                )
+
+        @wraps(dist.all_reduce)
+        def all_reduce(
+            tensor: torch.Tensor,
+            op: ReduceOp.RedOpType = dist.ReduceOp.SUM,
+            group: Union[ProcessGroup, None] = None,
+            async_op: bool = False,
+        ) -> Union[Work, _IllegalWork, None]:
+            if self._in_fake_mode:
+                if async_op:
+                    return FakeWork()
+                return None
+            else:
+                return self._saved_collectives.all_reduce(tensor, op, group, async_op)
+
+        @wraps(dist.barrier)
+        def barrier(
+            group: Union[ProcessGroup, None] = dist.GroupMember.WORLD,
+            async_op: bool = False,
+            device_ids: Union[List[int], None] = None,
+        ) -> Union[Work, None]:
+            if self._in_fake_mode:
+                return None
+            else:
+                return self._saved_collectives.barrier(group, async_op, device_ids)
+
+        dist.all_gather_into_tensor = all_gather_into_tensor
+        dist.reduce_scatter_tensor = reduce_scatter_tensor
+        dist.all_reduce = all_reduce
+        dist.barrier = barrier
+
+    def _restore_collectives(self) -> None:
+        dist.all_gather_into_tensor = self._saved_collectives.all_gather_into_tensor
+        dist.reduce_scatter_tensor = self._saved_collectives.reduce_scatter_tensor
+        dist.all_reduce = self._saved_collectives.all_reduce
+        dist.barrier = self._saved_collectives.barrier
+        del self._saved_collectives
+
+    def track_inputs(self, inputs: Tuple[Any, ...]) -> None:
+        """
+        This is used to track the input tensors to the model and annotate them as ``Inputs``.
+        Args:
+            inputs (Tuple[Any]): A tuple containing the input data. This can include tensors
+                        as well as other data types. Only tensors will be tracked.
+        """
+
+        def _track_inputs(t: torch.Tensor) -> None:
+            self._update_and_maybe_create_winfos(
+                t,
+                _FSDPRefType.INP,
+            )
+
+        tree_map_only(torch.Tensor, _track_inputs, inputs)
+
+    def track_external(
+        self, *external: Union[nn.Module, optim.Optimizer, torch.Tensor]
+    ) -> None:
+        """This is a no-op for ``FSDPMemTracker``: the arguments are accepted and ignored."""
+
+    def __enter__(self) -> "FSDPMemTracker":
+        self._in_fake_mode = True if active_fake_mode() else False
+        self._register_module_and_optimizer_hooks()
+        self._instrument_and_maybe_bypass_collectives()
+        self._track_resize()
+        self._peak_mem_snap = self.get_tracker_snapshot()
+        self._peak_mem = {
+            dev: dev_snap[_TOTAL_KEY] for dev, dev_snap in self._peak_mem_snap.items()
+        }
+        self._mod_tracker.__enter__()
+        TorchDispatchMode.__enter__(self)
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        """
+        Stop tracking: remove the hooks, restore the collectives and resize handling, then exit the
+        dispatch mode and the module tracker.
+
+        Args:
+            *args (Any): The exception information supplied by the ``with`` statement; it is forwarded
+                unchanged to the dispatch mode and the module tracker.
+        """
+        self._deregister_module_and_optimizer_hooks()
+        self._restore_collectives()
+        self._restore_resize()
+        # Exit in the reverse order of ``__enter__``: dispatch mode first, module tracker last.
+        TorchDispatchMode.__exit__(self, *args)
+        self._mod_tracker.__exit__(*args)
+
+    def __torch_dispatch__(self, func, types, args=..., kwargs=None):  # type: ignore[no-untyped-def]
+        res = func(*args, **kwargs or {})
+        # If we are tracking an optimizer state, we use the optimizer reference type.
+        # If we are in backward region and not in AC region, we use the backward reference type.
+        # Else we use the forward reference type.
+        if self._in_opt:
+            reftype = _FSDPRefType.OPT
+        elif self._mod_tracker.is_bw and not self._in_ac:
+            reftype = _FSDPRefType.TEMP
+        else:
+            reftype = _FSDPRefType.ACT
+        tree_map_only(torch.Tensor, partial(self._track, reftype), res)
+        peak_state = (
+            _FSDPModState.PEAK_BW if self._mod_tracker.is_bw else _FSDPModState.PEAK_FW
+        )
+        self._update_peak_stats(peak_state)
+        return res
--- a/rl/Lib/site-packages/torch/distributed/_tools/mem_tracker.py
+++ b/rl/Lib/site-packages/torch/distributed/_tools/mem_tracker.py
@ -0,0 +1,943 @@
+import math
+import os
+import re
+import warnings
+from copy import deepcopy
+from enum import auto, Enum
+from functools import partial, wraps
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    TYPE_CHECKING,
+    Union,
+)
+from typing_extensions import Self
+
+import torch
+from torch import nn, optim
+from torch.distributed._tools.mod_tracker import ModTracker
+from torch.optim.optimizer import (
+    register_optimizer_step_post_hook,
+    register_optimizer_step_pre_hook,
+)
+from torch.utils._python_dispatch import (
+    is_traceable_wrapper_subclass,
+    TorchDispatchMode,
+)
+from torch.utils._pytree import tree_flatten, tree_map_only
+from torch.utils.weak import WeakIdKeyDictionary, weakref
+
+
+if TYPE_CHECKING:
+    from torch.utils.hooks import RemovableHandle
+
+# This value is hard-coded here:
+# https://github.com/pytorch/pytorch/blob/5fba5d83f0703ff8077ab65448a998e9ad6598fd/c10/cuda/CUDACachingAllocator.cpp#L117
+# Granularity, in bytes, to which ``_WeakRefInfo._calculate_mem_consumed`` rounds up the size of
+# CUDA storages. It is 512 (2**9) unless the ``PYTORCH_NO_CUDA_MEMORY_CACHING`` environment variable
+# parses to a non-zero integer, in which case the granularity is 1 (no rounding).
+_PYTORCH_MIN_ALLOCATE = (
+    2**9 if int(os.environ.get("PYTORCH_NO_CUDA_MEMORY_CACHING", 0)) == 0 else 1
+)
+# Key under which each per-device snapshot dictionary stores the sum over all reference types.
+_TOTAL_KEY = "Total"
+
+__all__ = ["MemTracker"]
+
+
+class _RefType(str, Enum):
+    """
+    Base class for memory reference types, which categorize tensors based on their usage within a model.
+
+    It declares no members itself; concrete enums such as ``_MemRefType`` subclass it.
+    """
+
+
+class _State(str, Enum):
+    """
+    Base class for module states at which memory snapshots are captured.
+
+    It declares no members itself; concrete enums such as ``_ModState`` subclass it.
+    """
+
+
+class _MemRefType(_RefType):
+    """
+    An enum to define memory reference types, categorizing tensors based on their usage within a model.
+
+        - PARAM: Tensors registered as nn.Parameter within modules.
+        - BUFFER: Tensors registered as nn.Buffer within modules.
+        - GRAD: Gradients associated with parameters.
+        - ACT: Tensors produced during the forward pass and recomputation in activation checkpointing.
+        - TEMP: Temporary memory used during the backward pass, including gradients of activations.
+        - OPT: Tensors holding optimizer states.
+        - OTH: Tensors registered via `track_external` that do not fit the above categories.
+    """
+
+    PARAM = "Parameter"
+    BUFFER = "Buffer"
+    GRAD = "Gradient"
+    ACT = "Activation"
+    TEMP = "Temp"
+    OPT = "Optstate"
+    OTH = "Other"
+
+
+class _ModState(_State):
+    """
+    An enum to define the state of a module.
+
+    Each member's value is a human-readable label for the state.
+
+        - PRE_FW: The module is about to run the forward pass.
+        - POST_FW: The module has finished running the forward pass.
+        - PEAK_FW: The module has reached the peak memory usage during the forward pass.
+        - PRE_BW: The module is about to run the backward pass.
+        - PRE_FW_AC: The module is about to run the forward pass with activation checkpointing.
+        - POST_FW_AC: The module has finished running the forward pass with activation checkpointing.
+        - POST_BW: The module has finished running the backward pass.
+        - PEAK_BW: The module has reached the peak memory usage during the backward pass.
+    """
+
+    PRE_FW = "Pre-Forward"
+    POST_FW = "Post-Forward"
+    PEAK_FW = "Peak-Forward"
+    PRE_BW = "Pre-Backward"
+    PRE_FW_AC = "Pre-Forward-AC"
+    POST_FW_AC = "Post-Forward-AC"
+    POST_BW = "Post-Backward"
+    PEAK_BW = "Peak-Backward"
+
+
+class _ModMemStats:
+    """
+    A class to store the memory statistics of a module.
+
+    Args:
+        mod_fqn (str): The fully qualified name of the module.
+    Attributes:
+        mod_fqn (str): The fully qualified name of the module.
+        parameter_mem (int): The memory usage of the parameters of the module.
+        buffer_mem (int): The memory usage of the buffers of the module.
+        input_mem (int): The memory usage of the inputs to the module.
+        output_mem (int): The memory usage of the outputs from the module.
+        local_peak (Dict[torch.device, int]): Per-device peak of the total memory recorded for
+        the module; starts out empty.
+        snapshots (Dict[_ModState, List[Dict[torch.device, Dict[str, int]]]]): A dictionary mapping
+        each state defined by ``_ModState`` to the list of memory snapshots captured at that state;
+        starts out empty.
+    Note:
+        The memory snapshot is stored as a dictionary - Dict[torch.device, Dict[str, int]], where each key is a device,
+         and each value is another dictionary with keys as memory reference types defined by `_MemRefType` and
+         values as the memory consumed in bytes.
+
+        ``parameter_mem``, ``buffer_mem``, ``input_mem`` and ``output_mem`` are only annotated in
+        ``__init__`` and not assigned, so they do not exist on an instance until they are set.
+    """
+
+    def __init__(self, mod_fqn: str):
+        self.mod_fqn = mod_fqn
+        # The four annotations below declare the attribute types without creating the attributes.
+        self.parameter_mem: int
+        self.buffer_mem: int
+        self.input_mem: int
+        self.output_mem: int
+        self.local_peak: Dict[torch.device, int] = {}
+        self.snapshots: Dict[_ModState, List[Dict[torch.device, Dict[str, int]]]] = {}
+
+
+class _WeakRefInfo:
+    """
+    Bookkeeping record for one tensor storage: its size, element size, device, reference
+    type and the memory it is considered to consume.
+    """
+
+    def __init__(
+        self, size: int, element_size: int, device: torch.device, reftype: _RefType
+    ) -> None:
+        """
+        Store the storage properties and derive the memory consumed from them.
+
+        Args:
+            size (int): The size of the storage, as reported by the storage's ``size()``.
+            element_size (int): The size of each element in the tensor storage.
+            device (torch.device): The device on which the tensor is allocated.
+            reftype (_RefType): The reference type of the tensor.
+        """
+        self.size = size
+        self.element_size = element_size
+        self.reftype = reftype
+        self.device = device
+        self.mem_consumed = self._calculate_mem_consumed()
+
+    def _calculate_mem_consumed(self) -> int:
+        """
+        Compute the memory attributed to the storage.
+
+        Returns:
+            int: ``size * element_size`` for non-CUDA devices; for CUDA devices the same
+            product rounded up to a multiple of ``_PYTORCH_MIN_ALLOCATE``.
+        """
+        raw_bytes = self.size * self.element_size
+        if self.device.type != "cuda":
+            return raw_bytes
+        num_blocks = math.ceil((raw_bytes) / _PYTORCH_MIN_ALLOCATE)
+        return num_blocks * _PYTORCH_MIN_ALLOCATE
+
+    def update_mem_consumed(self, st: torch.UntypedStorage) -> int:
+        """
+        Re-derive the memory consumed when the storage size differs from the recorded one.
+
+        Args:
+            st (torch.UntypedStorage): The tensor storage to check for size updates.
+
+        Returns:
+            int: The (possibly updated) memory consumed in bytes.
+        """
+        current_size = st.size()
+        if current_size != self.size:
+            self.size = current_size
+            self.mem_consumed = self._calculate_mem_consumed()
+        return self.mem_consumed
+
+    @staticmethod
+    def get_untyped_storages(t: torch.Tensor) -> Set[torch.UntypedStorage]:
+        """
+        Collect the untyped storages behind a tensor, descending into traceable wrapper subclasses.
+
+        Args:
+            t (torch.Tensor): The tensor to extract storages from.
+
+        Returns:
+            Set[torch.UntypedStorage]: A set of untyped storages.
+        """
+        found = set()
+        # Explicit work list instead of recursion: wrapper subclasses push their inner tensors.
+        pending = [t]
+        while len(pending) > 0:
+            candidate = pending.pop()
+            if is_traceable_wrapper_subclass(candidate):
+                inner_names, _ = candidate.__tensor_flatten__()  # type: ignore[attr-defined]
+                pending.extend([getattr(candidate, name) for name in inner_names])
+                continue
+            if hasattr(candidate, "untyped_storage"):
+                found.add(candidate.untyped_storage())
+                continue
+            # Neither a wrapper subclass nor something exposing a storage: warn and move on.
+            warnings.warn(
+                f"Expected a tensor or a traceable wrapper-subclass of tensor, but got {type(candidate)}",
+                category=UserWarning,
+                stacklevel=2,
+            )
+        return found
+
+    @classmethod
+    def create_winfo(
+        cls,
+        st: torch.UntypedStorage,
+        device: torch.device,
+        reftype: _RefType,
+        callback: Optional[Callable[[Self, weakref.ref], Any]] = None,
+    ) -> Tuple[Self, weakref.ref]:
+        """
+        Build a ``_WeakRefInfo`` for a storage together with a weak reference to that storage.
+
+        Args:
+            st (torch.UntypedStorage): The storage object for which to create the weak reference info.
+            device (torch.device): The device associated with the storage object.
+            reftype (_RefType): The type of reference, used to categorize the storage.
+            callback (Optional[Callable[[Self, weakref.ref]]]): If given (and truthy), it is attached
+                to the weak reference with the new ``_WeakRefInfo`` bound as its first argument, so it
+                is invoked as ``callback(winfo, weak_ref)`` when the storage is about to be finalized.
+        Returns:
+            Tuple[Self, weakref.ref]: The newly created ``_WeakRefInfo`` instance and the weak
+            reference to the storage object.
+        """
+        winfo = cls(st.size(), st.element_size(), device, reftype)
+        finalizer = partial(callback, winfo) if callback else None
+        return winfo, weakref.ref(st, finalizer)
+
+
+def _get_mem_divisor(units: str) -> int:
+    """
+    Return the number of bytes in one ``units``.
+
+    Args:
+        units (str): One of "B", "KiB", "MiB" or "GiB".
+
+    Returns:
+        int: The divisor that converts a byte count into ``units``.
+
+    Raises:
+        ValueError: If ``units`` is not one of the supported units.
+    """
+    unit_dict = {"B": 1, "KiB": 2**10, "MiB": 2**20, "GiB": 2**30}
+    divisor = unit_dict.get(units)
+    if divisor is None:
+        raise ValueError(
+            f"Unsupported unit: {units}. Supported units are: {', '.join(unit_dict.keys())}"
+        )
+    return divisor
+
+
+def _rounding_fn(value: int, divisor: int, precision: int) -> Union[float, int]:
+    """
+    Scale ``value`` by ``divisor`` and round the result to ``precision`` decimal places.
+
+    When ``divisor`` is 1 the value is returned untouched (no division, no rounding).
+    """
+    if divisor == 1:
+        return value
+    scaled = value / divisor
+    return round(scaled, precision)
+
+
+def _print_snapshot(snapshot: Dict[torch.device, Dict[str, int]], units: str) -> None:
+    """
+    Print a per-device memory breakdown, one entry per line.
+
+    Args:
+        snapshot (Dict[torch.device, Dict[str, int]]): Per-device mapping from category to bytes.
+        units (str): The unit to display the values in; see ``_get_mem_divisor``.
+    """
+    if len(snapshot) == 0:
+        print("No memory tracked.")
+        return
+    divisor = _get_mem_divisor(units)
+    for dev, dev_snap in snapshot.items():
+        # Devices whose rounded total is not positive are left out of the report.
+        if _rounding_fn(dev_snap[_TOTAL_KEY], divisor, 2) > 0:
+            report_lines = [f"Device: {dev}"]
+            report_lines.extend(
+                [
+                    f"\t{k}: {_rounding_fn(v, divisor, 2)} {units}"
+                    for k, v in dev_snap.items()
+                ]
+            )
+            print("\n".join(report_lines))
+
+
+def _print_snapshot_tabular(
+    snapshot: Dict[torch.device, Dict[str, int]], units: str
+) -> None:
+    """
+    Print a per-device memory breakdown as a table rendered by ``tabulate``.
+
+    Args:
+        snapshot (Dict[torch.device, Dict[str, int]]): Per-device mapping from category to bytes.
+        units (str): The unit to display the values in; see ``_get_mem_divisor``.
+
+    Raises:
+        ImportError: If ``tabulate`` is not installed.
+    """
+    if len(snapshot) == 0:
+        print("No memory tracked.")
+        return
+    try:
+        from tabulate import tabulate
+    except ImportError as err:
+        raise ImportError(
+            "Please install tabulate to use the tabulate option."
+        ) from err
+    divisor = _get_mem_divisor(units)
+    # The column headers are taken from the categories of the first device in the snapshot.
+    first_dev_snap = next(iter(snapshot.values()))
+    headers = ["Device", *[f"{key}" for key in list(first_dev_snap.keys())]]
+
+    # One row per device; devices whose rounded total is not positive are left out.
+    table_data = [
+        [str(dev), *[f"{_rounding_fn(v, divisor, 2)} {units}" for v in dev_snap.values()]]
+        for dev, dev_snap in snapshot.items()
+        if _rounding_fn(dev_snap[_TOTAL_KEY], divisor, 2) > 0
+    ]
+    print(tabulate(table_data, headers=headers, tablefmt="rst"))
+
+
+def _print_state_snapshots(
+    snapshots: Dict[_State, List[Dict[torch.device, Dict[str, int]]]], units: str
+) -> None:
+    """
+    Print, for every state, each of its snapshots preceded by a 1-based call number.
+
+    A single blank line is printed after all states have been reported.
+    """
+    for state, snapshot_list in snapshots.items():
+        print(f"{state}")
+        for call_no, snapshot in enumerate(snapshot_list, start=1):
+            print(f"# {call_no}:")
+            _print_snapshot(snapshot, units)
+    print()
+
+
+def _print_state_snapshots_tabular(
+    snapshots: Dict[_State, List[Dict[torch.device, Dict[str, int]]]], units: str
+) -> None:
+    """
+    Print all state snapshots as one table rendered by ``tabulate``.
+
+    Each row describes one device of one snapshot. The "State & Call" column is filled in
+    only on the first row of each state/call group and left empty on the following rows.
+
+    Raises:
+        ImportError: If ``tabulate`` is not installed.
+    """
+    try:
+        from tabulate import tabulate
+    except ImportError as err:
+        raise ImportError(
+            "Please install tabulate to use the tabulate option."
+        ) from err
+
+    rows = []
+    previous_label = None
+    divisor = _get_mem_divisor(units)
+    for state, snapshot_list in snapshots.items():
+        for call_no, snapshot in enumerate(snapshot_list, start=1):
+            label = f"{state} # {call_no}"
+            for dev, dev_snap in snapshot.items():
+                # Devices whose rounded total is not positive are left out.
+                if _rounding_fn(dev_snap[_TOTAL_KEY], divisor, 2) <= 0:
+                    continue
+                shown_label = "" if label == previous_label else label
+                previous_label = label
+                row = {"State & Call": shown_label, "Device": str(dev)}
+                row.update(
+                    [
+                        (f"{k}", f"{_rounding_fn(v, divisor, 2)} {units}")
+                        for k, v in dev_snap.items()
+                    ]
+                )
+                rows.append(row)
+    print(tabulate(rows, headers="keys", tablefmt="rst"))
+
+
+class _UpdateType(Enum):
+    # These are used for tracking updates to the continuously maintained memory snapshot.
+    # ADD - When a new tensor storage is tracked
+    # DEL - When a tensor storage is about to be finalized (garbage collected).
+    # REF - When a tensor reference is updated, for instance, the gradients are marked as
+    #       generic backward reference types until the grad_hook categorizes them as gradients.
+    # SIZE - When a tensor's storage is resized.
+    ADD = auto()
+    DEL = auto()
+    REF = auto()
+    SIZE = auto()
+
+
+class MemTracker(TorchDispatchMode):
+    """
+    A TorchDispatchMode to track, categorize and attribute the tensor memory created or accessed within its context.
+
+    It categorizes the tracked tensors as parameters, buffers, activations, gradients, temporary memory and optimizer states
+    as defined by ``_MemRefType`` within its context. It captures memory `snapshots` for the modules, called within its context,
+    at various states defined by ``_ModState``.
+
+    Attributes:
+        memory_tracking: A weakref key dictionary to store the memory statistics of each module. Each key
+        is a reference to a module, and each value is a ``_ModMemStats`` object that stores the memory
+        statistics of the module.
+
+    Note:
+        The MemTracker should be used as a context manager. The modules, optimizers, and any other tensors created within
+        the context of MemTracker will be tracked by default. Any tensors or stateful objects such as modules, optimizers etc.
+        that need to be tracked but are created outside the MemTracker should be registered using the `track_external` method.
+        The `track_external` method should be called before the MemTracker is used. Any tensors created outside the ``MemTracker``
+        and not supplied to the `track_external` method will not be tracked by the ``MemTracker``.
+
+    Example usage:
+
+        .. code-block:: python
+
+            module = ...
+            optimizer = ...
+            inp = ...
+            mem_tracker = MemTracker()
+            mem_tracker.track_external(module, optimizer, inp)
+            with mem_tracker as mt:
+                loss = module(inp)
+                print("After Forward:")
+                mt.display_snapshot("current")
+                loss.backward()
+                optimizer.step()
+                optimizer.zero_grad()
+            mt.display_snapshot("peak")
+            mt.display_modulewise_snapshots(depth = 3, units = "MiB")
+
+    Known Limitations:
+        - The ``MemTracker`` does not track memory for tensors that bypass the ``TorchDispatchMode`` ex. under ``no_dispatch``.
+        - Resizing tensor storages directly by using non-Tensor methods other than using ``torch.UntypedStorage.resize_``
+          is not tracked. File a GitHub issue if you have use-cases for this.
+        - If the tensors are not traceable or wrappable subclasses of ``torch.Tensor``, then the tracker does not know how to
+            track their storages. File a Github issue if you have use-cases for this.
+        - During AC in the backward pass there might be misattribution between activation and temp memory, but the peak memory
+          will be tracked accurately. This will be fixed in the next update by hooking intricately with ``torch.utils.checkpoint``.
+    """
+
+    def __init__(self) -> None:
+        """
+        Create an empty tracker: no modules, no storages and no snapshots recorded yet.
+        """
+        # NOTE(review): ``TorchDispatchMode.__init__`` is not invoked here - confirm this is intended.
+        # Per-module statistics, keyed by module.
+        self.memory_tracking = WeakIdKeyDictionary()
+        # Continuously maintained per-device breakdown of the currently tracked memory.
+        self._curr_mem_snap: Dict[torch.device, Dict[str, int]] = {}
+        # Highest per-device total seen so far and the breakdown captured at that moment.
+        self._peak_mem: Dict[torch.device, int] = {}
+        self._peak_mem_snap: Dict[torch.device, Dict[str, int]] = {}
+        # Gradient hook handles installed on parameters, keyed by parameter.
+        self._param_to_grad_hook_handles = WeakIdKeyDictionary()
+        self._optimizer_hook_handles: Optional[
+            Tuple[RemovableHandle, RemovableHandle]
+        ] = None
+        # Dictionary to store the ``_WeakRefInfo`` instances corresponding to each tensor's storage.
+        self._WINFO = WeakIdKeyDictionary()
+        self._mod_tracker = ModTracker()
+        # This is a general memory tracker which can be used with any ``_RefType`` subclass
+        self._ref_class: Type[_RefType] = _MemRefType
+        # Flags to track if we are in the AC region or optimizer step region
+        self._in_opt: bool = False
+        self._in_ac: bool = False
+        # Weak references to the topmost AC module currently active
+        self._ac_mod: Optional[weakref.ref] = None
+        # Whatever ``torch.UntypedStorage.resize_`` is at construction time; used by
+        # ``_track_resize`` and ``_restore_resize``.
+        self._orig_resize = torch.UntypedStorage.resize_
+
+    def _update_snap(
+        self,
+        u_type: _UpdateType,
+        winfo: _WeakRefInfo,
+        old_mem_consumed: Optional[int] = None,
+        old_reftype: Optional[_RefType] = None,
+    ) -> None:
+        """
+        Apply one storage-level change to the continuously maintained current snapshot.
+
+        Args:
+            u_type (_UpdateType): The kind of change (``ADD``, ``DEL``, ``REF`` or ``SIZE``).
+            winfo (_WeakRefInfo): The record of the affected storage, already holding its new state.
+            old_mem_consumed (Optional[int]): Memory consumed before the change; required for ``SIZE``.
+            old_reftype (Optional[_RefType]): Reference type before the change; required for ``REF``.
+
+        Raises:
+            ValueError: If ``u_type`` is not one of the four known update types.
+        """
+        # Fetch the breakdown for the device, creating an all-zero one (plus the total) on first use.
+        dev_snap = self._curr_mem_snap.setdefault(
+            winfo.device, dict.fromkeys(self._ref_class, 0)
+        )
+        dev_snap.setdefault(_TOTAL_KEY, 0)
+        # Only updates that can shrink the total may leave the device with nothing tracked.
+        total_may_hit_zero = False
+        if u_type == _UpdateType.ADD:
+            # A newly tracked storage grows both its category and the total.
+            added = winfo.mem_consumed
+            dev_snap[winfo.reftype] += added
+            dev_snap[_TOTAL_KEY] += added
+        elif u_type == _UpdateType.DEL:
+            # A finalized storage shrinks both its category and the total.
+            removed = winfo.mem_consumed
+            dev_snap[winfo.reftype] -= removed
+            dev_snap[_TOTAL_KEY] -= removed
+            total_may_hit_zero = True
+        elif u_type == _UpdateType.REF:
+            assert old_reftype is not None
+            # Re-categorization moves the bytes between categories; the total is unchanged.
+            moved = winfo.mem_consumed
+            dev_snap[old_reftype] -= moved
+            dev_snap[winfo.reftype] += moved
+        elif u_type == _UpdateType.SIZE:
+            assert old_mem_consumed is not None
+            # A resize changes the category and the total by the signed difference.
+            delta = winfo.mem_consumed - old_mem_consumed
+            dev_snap[winfo.reftype] += delta
+            dev_snap[_TOTAL_KEY] += delta
+            total_may_hit_zero = True
+        else:
+            raise ValueError(f"Invalid update type: {u_type}")
+        # Drop the device entry altogether once nothing is tracked on it any more.
+        if total_may_hit_zero and dev_snap[_TOTAL_KEY] == 0:
+            del self._curr_mem_snap[winfo.device]
+
+    def _update_and_maybe_create_winfos(
+        self,
+        t: torch.Tensor,
+        reftype: _RefType,
+        update_existing: bool = False,
+    ) -> Set[_WeakRefInfo]:
+        """
+        Make sure every storage of ``t`` is tracked under ``reftype``.
+
+        Storages that are already tracked are re-categorized if their reference type differs;
+        untracked storages get a new ``_WeakRefInfo`` unless ``update_existing`` is set.
+
+        Args:
+            t (torch.Tensor): The tensor whose storages are processed.
+            reftype (_RefType): The reference type the storages should carry.
+            update_existing (bool): If True, an untracked storage is an error.
+
+        Returns:
+            Set[_WeakRefInfo]: The records of all storages of ``t``.
+
+        Raises:
+            KeyError: If ``update_existing`` is True and a storage of ``t`` is not tracked.
+        """
+        collected = set()
+        for st in _WeakRefInfo.get_untyped_storages(t):
+            known, _ = self._WINFO.get(st, (None, None))
+            if known is None:
+                if update_existing:
+                    raise KeyError("No existing winfo found")
+                # First sighting of this storage: create its record and register it.
+                created, w_st = _WeakRefInfo.create_winfo(
+                    st, t.device, reftype, self._delete_callback
+                )
+                self._WINFO[st] = (created, w_st)
+                # Storages that consume no memory do not touch the snapshot.
+                if created.mem_consumed > 0:
+                    self._update_snap(_UpdateType.ADD, created)
+                collected.add(created)
+                continue
+            # Already tracked: move it to the requested category if it sits in another one.
+            previous_reftype = known.reftype
+            if previous_reftype != reftype:
+                known.reftype = reftype
+                self._update_snap(_UpdateType.REF, known, old_reftype=previous_reftype)
+            collected.add(known)
+        return collected
+
+    def _delete_callback(self, winfo: _WeakRefInfo, w_st: weakref.ref) -> None:
+        """
+        Weak-reference callback: remove the storage described by ``winfo`` from the snapshot.
+
+        It is attached to the storage's weak reference in ``create_winfo`` and therefore runs
+        when the storage is about to be finalized. ``w_st`` is not used.
+        """
+        # Storages that consume no memory were never added to the snapshot.
+        if not winfo.mem_consumed > 0:
+            return
+        self._update_snap(_UpdateType.DEL, winfo)
+
+    def _track_resize(self) -> None:
+        """
+        Replace ``torch.UntypedStorage.resize_`` with a wrapper that keeps the snapshot in sync.
+
+        The wrapper calls the saved original method, then updates the record of the storage
+        (if it is tracked and its size changed) and finally returns whatever the original
+        method returned, so it remains a drop-in replacement for callers that use the result.
+        """
+        # Need to monkey-patch this because ``torch.UntypedStorage.resize_`` is not captured
+        # by ``TorchDispatchMode``.
+        @wraps(self._orig_resize)
+        def resize_(st: torch.UntypedStorage, size: int) -> Any:
+            result = self._orig_resize(st, size)
+            winfo, _ = self._WINFO.get(st, (None, None))
+            if winfo is not None and winfo.size != st.size():
+                # Remember the old footprint so the snapshot can be adjusted by the difference.
+                old_mem_consumed = winfo.mem_consumed
+                winfo.update_mem_consumed(st)
+                self._update_snap(
+                    _UpdateType.SIZE, winfo, old_mem_consumed=old_mem_consumed
+                )
+            return result
+
+        torch.UntypedStorage.resize_ = resize_  # type: ignore[method-assign, assignment]
+
+    def _restore_resize(self) -> None:
+        """Put back the ``torch.UntypedStorage.resize_`` that was saved in ``__init__``."""
+        torch.UntypedStorage.resize_ = self._orig_resize  # type: ignore[method-assign]
+
+    def _update_peak_stats(self, peak_state: _State) -> None:
+        """
+        Refresh the per-module and the global peak statistics from the current snapshot.
+
+        For every tracked module that is currently active (its name is in
+        ``_mod_tracker.parents``) and that has a snapshot list for ``peak_state``, the
+        per-device local peak and the latest ``peak_state`` snapshot are updated whenever the
+        current device total exceeds the recorded local peak. The tracker-wide peak is
+        updated in the same manner.
+
+        Args:
+            peak_state (_State): The peak state to update; ``PEAK_FW`` or ``PEAK_BW``.
+        """
+        # The live snapshot is read directly; copies are only made when a new peak is stored.
+        live_snap = self._curr_mem_snap
+
+        for stats in self.memory_tracking.values():
+            if stats.mod_fqn not in self._mod_tracker.parents:
+                continue
+            if peak_state not in stats.snapshots:
+                continue
+            for device, breakdown in live_snap.items():
+                if breakdown[_TOTAL_KEY] > stats.local_peak.get(device, 0):
+                    stats.local_peak[device] = breakdown[_TOTAL_KEY]
+                    stats.snapshots[peak_state][-1][device] = deepcopy(breakdown)
+
+        for device, breakdown in live_snap.items():
+            if breakdown[_TOTAL_KEY] > self._peak_mem.get(device, 0):
+                self._peak_mem[device] = breakdown[_TOTAL_KEY]
+                self._peak_mem_snap[device] = deepcopy(breakdown)
+
+    def _track(self, reftype: _RefType, t: torch.Tensor) -> None:
+        """
+        Track every storage of ``t``, creating records for new storages and refreshing known ones.
+
+        Args:
+            reftype (_RefType): The reference type assigned to storages seen for the first time.
+                Storages that are already tracked keep their existing reference type.
+            t (torch.Tensor): The tensor whose storages are tracked.
+        """
+        # Get the storages of the tensor and check if we have already tracked them.
+        # If yes, then check if the storage size has changed and update the current snapshot.
+        # Else create a new ``_WeakRefInfo`` instance and add it to the dictionary.
+        sts = _WeakRefInfo.get_untyped_storages(t)
+        for st in sts:
+            winfo, _ = self._WINFO.get(st, (None, None))
+            if winfo is not None:
+                if winfo.size != st.size():
+                    old_mem_consumed = winfo.mem_consumed
+                    winfo.update_mem_consumed(st)
+                    self._update_snap(
+                        _UpdateType.SIZE, winfo, old_mem_consumed=old_mem_consumed
+                    )
+                # Move on to the next storage rather than leaving the function: a tensor can
+                # have several storages, and the remaining ones still have to be processed.
+                continue
+            winfo, w_st = _WeakRefInfo.create_winfo(
+                st, t.device, reftype, self._delete_callback
+            )
+            self._WINFO[st] = (winfo, w_st)
+            # Update the current snapshot for the newly added ``_WeakRefInfo``.
+            if winfo.mem_consumed > 0:
+                self._update_snap(_UpdateType.ADD, winfo)
+
+    def get_tracker_snapshot(
+        self, type: str = "current"
+    ) -> Dict[torch.device, Dict[str, int]]:
+        """
+        Return an independent copy of the per-device memory breakdown of the requested kind.
+
+        Args:
+            type (str): The type of snapshot to capture. Can be "current" for the current memory usage or "peak" for the
+                        peak memory usage. Defaults to "current".
+        Returns:
+            Dict[torch.device, Dict[str, int]]: A dictionary where each key is a torch.device, and each value is another
+                                                dictionary. This inner dictionary has keys representing memory reference
+                                                types as defined in ``_MemRefType`` and values representing the amount of
+                                                memory consumed in bytes.
+        Raises:
+            ValueError: If an invalid type is specified.
+        """
+        if type not in ("current", "peak"):
+            raise ValueError(f"Invalid type {type}")
+        source = self._curr_mem_snap if type == "current" else self._peak_mem_snap
+        # A deep copy is returned so callers cannot modify the tracker's own state.
+        return deepcopy(source)
+
+    def _track_module_params_and_buffers(
+        self, module: nn.Module, install_grad_hooks: bool = True
+    ) -> Tuple[int, int]:
+        """
+        Track the parameters, existing gradients and buffers of ``module``.
+
+        Args:
+            module (nn.Module): The module whose parameters and buffers are tracked.
+            install_grad_hooks (bool): If True, gradient hooks are installed on every parameter
+                that requires grad and does not have them installed yet.
+
+        Returns:
+            Tuple[int, int]: The memory consumed by the parameters and by the buffers, in bytes.
+        """
+        # Track the parameters and buffers of the module if not already tracked.
+        # If the parameters have gradients, track the gradients as well.
+        # If install_grad_hooks is True, install a gradient hook on the parameters
+        #  to track the gradients, if it has not already been installed.
+        # Return the total memory consumed by the parameters and buffers.
+        def _grad_hook(grad: torch.Tensor) -> None:
+            self._update_and_maybe_create_winfos(
+                grad,
+                _MemRefType.GRAD,
+            )
+
+        param_memory = 0
+        for param in module.parameters():
+            winfos = self._update_and_maybe_create_winfos(
+                param,
+                _MemRefType.PARAM,
+            )
+            param_memory += sum(winfo.mem_consumed for winfo in winfos)
+            if param.grad is not None:
+                self._update_and_maybe_create_winfos(
+                    param.grad,
+                    _MemRefType.GRAD,
+                )
+            # Hooks cannot be registered on tensors that do not require grad (registration
+            # raises a RuntimeError), so frozen parameters are skipped. No handle is stored for
+            # them, which means the hooks get installed on a later call if the parameter has
+            # started to require grad by then.
+            if (
+                install_grad_hooks
+                and param.requires_grad
+                and self._param_to_grad_hook_handles.get(param, None) is None
+            ):
+                grad_hook_handle = param.register_hook(_grad_hook)
+                post_acc_grad_hook_handle = param.register_post_accumulate_grad_hook(
+                    lambda p: (_grad_hook(p.grad))
+                )
+                self._param_to_grad_hook_handles[param] = (
+                    grad_hook_handle,
+                    post_acc_grad_hook_handle,
+                )
+        buffer_memory = 0
+        for buffer in module.buffers():
+            winfos = self._update_and_maybe_create_winfos(
+                buffer,
+                _MemRefType.BUFFER,
+            )
+            buffer_memory += sum(winfo.mem_consumed for winfo in winfos)
+        return (param_memory, buffer_memory)
+
+    def _track_inputs_or_outputs(self, args: Any) -> int:
+        """
+        Add up the memory consumed by the tracked storages of every tensor found in ``args``.
+
+        Storages that are not tracked contribute nothing. The sum is taken tensor by tensor.
+
+        Args:
+            args (Any): A (possibly nested) structure containing the inputs or outputs of a module.
+
+        Returns:
+            int: The accumulated memory in bytes.
+        """
+        consumed_per_storage = []
+
+        def _collect(t: torch.Tensor) -> None:
+            for st in _WeakRefInfo.get_untyped_storages(t):
+                winfo, _ = self._WINFO.get(st, (None, None))
+                if winfo is None:
+                    continue
+                consumed_per_storage.append(winfo.mem_consumed)
+
+        tree_map_only(torch.Tensor, _collect, args)
+        return sum(consumed_per_storage)
+
+    def _pre_fw_hook(self, module: nn.Module, inputs: Any) -> None:
+        # This is installed as a pre-fwd user hook with ``ModTracker.`` Based on the following cases we
+        # set the state and capture the memory snapshot for the module.
+        # Case 1: If the module is not in the ``memory_tracking`` dictionary, we track the parameters, buffers
+        #         and input memory of the module. Create a new ``_ModMemStats`` instance for the module
+        #         and add it to the ``memory_tracking`` dictionary.
+        # Case 2: If the module is already in the ``memory_tracking`` dictionary and we are in backward, this means
+        #         we are in the AC region. We check if this is the top most module in the AC region. If it is,
+        #         we store a weak reference and set the flag ``_in_ac`` to True.
+        # Case 3: If the module is already in the ``memory_tracking`` dictionary and we are in forward, this means
+        #         this module is called for the second time. If it is a root module, that means we are in the next
+        #         iteration and we error out. If it is not a root module, that means it's a submodule that is being
+        #         used multiple times in the same iteration, which we allow and track.
+        # For Case 1 and 3, we also initialize the ``local_peak`` and ``PEAK_FW`` snapshot for the module.
+        mod_name = self._mod_tracker.get_known_fqn(module)
+        assert mod_name is not None
+        if module not in self.memory_tracking:
+            # Case 1: first time this module is seen.
+            mod_stats = _ModMemStats(mod_name)
+            param_mem, buffer_mem = self._track_module_params_and_buffers(
+                module, install_grad_hooks=True
+            )
+            input_mem = self._track_inputs_or_outputs(inputs)
+            mod_stats.parameter_mem = param_mem
+            mod_stats.buffer_mem = buffer_mem
+            mod_stats.input_mem = input_mem
+            self.memory_tracking[module] = mod_stats
+            state = _ModState.PRE_FW
+
+        elif self._mod_tracker.is_bw:
+            # Case 2: forward re-run during backward (AC region).
+            mod_stats = self.memory_tracking[module]
+            state = _ModState.PRE_FW_AC
+            # Only the first module entered while no AC module is recorded becomes the AC root.
+            if self._ac_mod is None:
+                self._ac_mod = weakref.ref(module)
+                self._in_ac = True
+        else:
+            # Case 3: repeated forward call of an already tracked module.
+            parents = set(self._mod_tracker.parents) - {mod_name}
+            # Having only "Global" left as a parent identifies the module as a root module.
+            if len(parents) == 1 and "Global" in parents:
+                raise NotImplementedError(
+                    "MemTracker does not support memory tracking for multiple iterative calls."
+                    " Either use ``reset_mod_stats`` to clear module memory stats for the previous iteration"
+                    " or file a github issue if you need this feature."
+                )
+            mod_stats = self.memory_tracking[module]
+            state = _ModState.PRE_FW
+            input_mem = self._track_inputs_or_outputs(inputs)
+            mod_stats.input_mem = input_mem
+
+        mem_snapshot = self.get_tracker_snapshot()
+        if state == _ModState.PRE_FW:
+            mod_stats.local_peak = {
+                dev: dev_snap[_TOTAL_KEY] for dev, dev_snap in mem_snapshot.items()
+            }
+            # The PEAK_FW entry holds ``mem_snapshot`` itself; the state entry below gets its own copy.
+            mod_stats.snapshots.setdefault(_ModState.PEAK_FW, []).append(mem_snapshot)
+        mod_stats.snapshots.setdefault(state, []).append(deepcopy(mem_snapshot))
+
+    def _post_fw_hook(self, module: nn.Module, inputs: Any, outputs: Any) -> None:
+        # This is installed as a post-fwd user hook with ``ModTracker``. Based on the following cases we
+        # set the state and capture the memory snapshot for the module.
+        # Case 1: This is called in backward, which means we are in the AC region. If this is the top most module
+        #         in the AC region, we set the flag ``_in_ac`` to False.
+        # Case 2: This is called in forward so we calculate the output memory
+        #         of the module and update its mod_stats.
+        mod_stats = self.memory_tracking[module]
+        if self._mod_tracker.is_bw:
+            state = _ModState.POST_FW_AC
+            if self._ac_mod is not None and self._ac_mod() is module:
+                self._ac_mod = None
+                self._in_ac = False
+        else:
+            state = _ModState.POST_FW
+            output_mem = self._track_inputs_or_outputs(outputs)
+            mod_stats.output_mem = output_mem
+        mod_stats.snapshots.setdefault(state, []).append(self.get_tracker_snapshot())
+
+    def _pre_bw_hook(self, module: nn.Module, args: Any) -> None:
+        # This is installed as a pre-bwd user hook with ``ModTracker``. We set the state and capture the
+        # snapshot for the module. We also initialize the ``local_peak`` and ``PEAK_BW`` snapshot for it.
+        # If the module is None, we skip the hook.
+        # This can happen since this installed inside a multi-grad hook on the module's output tensors
+        # and the module itself may not be alive during backward.
+        if module is None:
+            warnings.warn("Module is None. Skipping PRE_BW hook.", stacklevel=2)
+            return
+        mod_stats = self.memory_tracking[module]
+        mem_snapshot = self.get_tracker_snapshot()
+        mod_stats.local_peak = {
+            dev: dev_snap[_TOTAL_KEY] for dev, dev_snap in mem_snapshot.items()
+        }
+        mod_stats.snapshots.setdefault(_ModState.PEAK_BW, []).append(mem_snapshot)
+        mod_stats.snapshots.setdefault(_ModState.PRE_BW, []).append(
+            deepcopy(mem_snapshot)
+        )
+
+    def _post_bw_hook(self, module: nn.Module, args: Any) -> None:
+        # This is installed as a post-bwd user hook with ``ModTracker``. We set the state and capture the
+        # snapshot for the module if it is not None.
+        # This can happen since this installed inside a multi-grad hook on the module's input tensors
+        # and the module itself may not be alive during backward.
+        if module is None:
+            warnings.warn("Module is None. Skipping POST_BW hook.", stacklevel=2)
+            return
+        mod_stats = self.memory_tracking[module]
+        mod_stats.snapshots.setdefault(_ModState.POST_BW, []).append(
+            self.get_tracker_snapshot()
+        )
+
+    def _track_optimizer_states(
+        self, reftype: _RefType, optimizer: optim.Optimizer
+    ) -> None:
+        for states in optimizer.state.values():
+            for val in states.values():
+                if isinstance(val, torch.Tensor):
+                    self._update_and_maybe_create_winfos(
+                        val,
+                        reftype,
+                    )
+
+    def _register_global_optimizer_hook(self) -> None:
+        # Register a hook on the optimizer step to track the optimizer states.
+        # The pre-hook is to set the flag ``_in_opt`` to True. The post-hook unsets the flag,
+        # and also tracks any optimizer states that are created during the optimizer step.
+        def _opt_step_pre_hook(
+            optimizer: optim.Optimizer, args: Any, kwargs: Any
+        ) -> None:
+            self._in_opt = True
+
+        def _opt_step_post_hook(
+            optimizer: optim.Optimizer, args: Any, kwargs: Any
+        ) -> None:
+            self._track_optimizer_states(_MemRefType.OPT, optimizer)
+            self._in_opt = False
+
+        self._optimizer_hook_handles = (
+            register_optimizer_step_pre_hook(_opt_step_pre_hook),
+            register_optimizer_step_post_hook(_opt_step_post_hook),
+        )
+
+    def _deregister_param_and_optimizer_hooks(self) -> None:
+        for (
+            grad_hook_handle,
+            post_acc_grad_hook_handle,
+        ) in self._param_to_grad_hook_handles.values():
+            grad_hook_handle.remove()
+            post_acc_grad_hook_handle.remove()
+        self._param_to_grad_hook_handles.clear()
+
+        if self._optimizer_hook_handles is not None:
+            for handle in self._optimizer_hook_handles:
+                handle.remove()
+            self._optimizer_hook_handles = None
+
+    def track_external(
+        self, *external: Union[nn.Module, optim.Optimizer, torch.Tensor]
+    ) -> None:
+        """
+        Track tensors and stateful objects like modules, optimizers etc. that are created outside the MemTracker.
+
+        This method should be called before the ``MemTracker`` is used. Any tensors that are not module parameters, buffers,
+        gradients activations, or optimizer states will be categorized as ``Other``. If you want them categorized with a
+        custom name, please file a GitHub issue. Any tensors created outside the MemTracker and not supplied to this
+        method will not be be tracked by ``MemTracker``.
+
+        Args:
+            *external (Union[nn.Module, optim.Optimizer, torch.Tensor]): The external modules, optimizers, and
+                                                                         tensors to be tracked.
+        """
+        flat_external, _ = tree_flatten(external)
+        for obj in flat_external:
+            if isinstance(obj, torch.Tensor):
+                self._update_and_maybe_create_winfos(
+                    obj,
+                    _MemRefType.OTH,
+                )
+            elif isinstance(obj, torch.nn.Module):
+                self._track_module_params_and_buffers(obj, install_grad_hooks=False)
+            elif isinstance(obj, optim.Optimizer):
+                self._track_optimizer_states(_MemRefType.OPT, obj)
+            else:
+                raise TypeError(
+                    f"Object of type {type(obj)} is not supported for tracking. "
+                    f"Only stateful objects like modules, optimizers, and tensors are supported."
+                )
+
+    def display_snapshot(
+        self, type: str = "current", units: str = "B", tabulate: bool = False
+    ) -> None:
+        """
+        Display the memory usage breakdown snapshot of the tracker based on the specified type and units.
+
+        Keyword args:
+            type (str): The type of snapshot to display. Can be "current" for the current memory usage or "peak" for the
+                        peak memory usage. Defaults to "current".
+            units (str): The units to use for displaying memory usage. Defaults to "B". Supports ["B", "KiB", "MiB", "GiB"].
+            tabulate (bool): Whether to display the snapshot in a tabular format. Defaults to False.
+        """
+        snapshot = self.get_tracker_snapshot(type)
+        if tabulate:
+            _print_snapshot_tabular(snapshot, units)
+        else:
+            _print_snapshot(snapshot, units)
+
+    def display_modulewise_snapshots(
+        self, depth: int = 2, units: str = "B", tabulate: bool = False
+    ) -> None:
+        """
+        Print per device memory breakdown snapshot for each module called within MemTracker.
+
+        Snapshots are displayed for the states defined by ``_ModState``.
+        The module hierarchy is displayed up to the specified depth.
+
+        Keyword Args:
+            depth (int, optional): The depth of the module hierarchy to display. Defaults to 2.
+            units (str, optional): The units to use for memory tracking. Defaults to "B". Supports ["B", "KiB", "MiB", "GiB"].
+            tabulate (bool, optional): Whether to display the snapshot in a tabular format. Defaults to False.
+        """
+
+        def natural_sort_key(s: str) -> List[Union[int, str]]:
+            return [
+                int(text) if text.isdigit() else text.lower()
+                for text in re.split("([0-9]+)", s)
+            ]
+
+        for mod_stats in sorted(
+            self.memory_tracking.values(),
+            key=lambda m_stats: natural_sort_key(m_stats.mod_fqn),
+        ):
+            mod_fqn = mod_stats.mod_fqn
+            mod_depth = mod_fqn.count(".") + 1
+            if mod_depth > depth:
+                continue
+            print(f"Module:  {mod_fqn}")
+            if tabulate:
+                _print_state_snapshots_tabular(mod_stats.snapshots, units)
+            else:
+                _print_state_snapshots(mod_stats.snapshots, units)
+
+    def reset_mod_stats(self) -> None:
+        """
+        Reset all the module memory stats. Clears ``memory_tracking`` dictionary.
+
+        The pre-forward hook raises ``NotImplementedError`` when a root module is called again
+        while its stats from a previous iteration are still present; call this method between
+        iterations to clear them.
+        """
+        self.memory_tracking.clear()
+
+    def __enter__(self) -> "MemTracker":
+        # Start tracking. The setup steps below run in this exact order.
+        # 1. Install the optimizer-step hooks that toggle ``_in_opt`` and track optimizer states.
+        self._register_global_optimizer_hook()
+        # 2. Route ``ModTracker``'s per-module forward/backward callbacks to this tracker's hooks.
+        self._mod_tracker.register_user_hooks(
+            self._pre_fw_hook,
+            self._post_fw_hook,
+            self._pre_bw_hook,
+            self._post_bw_hook,
+        )
+        # 3. Helper defined elsewhere in this class; presumably the counterpart of the
+        #    ``_restore_resize`` call made in ``__exit__`` -- confirm against its definition.
+        self._track_resize()
+        # 4. Seed the peak bookkeeping with the current snapshot and its per-device totals.
+        self._peak_mem_snap = self.get_tracker_snapshot()
+        self._peak_mem = {
+            dev: dev_snap[_TOTAL_KEY] for dev, dev_snap in self._peak_mem_snap.items()
+        }
+        # 5. Enter the module tracker, then the base-class context manager.
+        self._mod_tracker.__enter__()
+        super().__enter__()
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        # Stop tracking: remove the parameter/optimizer hooks, detach this tracker's callbacks
+        # from the ``ModTracker``, call ``_restore_resize`` (helper defined elsewhere in this
+        # class), then leave the base-class context and finally the module tracker, the reverse
+        # of the order in which ``__enter__`` entered them.
+        self._deregister_param_and_optimizer_hooks()
+        self._mod_tracker.clear_user_hooks()
+        self._restore_resize()
+        super().__exit__(*args)
+        self._mod_tracker.__exit__(*args)
+
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):  # type: ignore[no-untyped-def]
+        res = func(*args, **kwargs or {})
+        # If we are tracking an optimizer state, we use the optimizer reference type.
+        # If we are in backward region and not in AC region, we use the backward reference type.
+        # Else we use the forward reference type.
+        if self._in_opt:
+            reftype = _MemRefType.OPT
+        elif self._mod_tracker.is_bw and not self._in_ac:
+            reftype = _MemRefType.TEMP
+        else:
+            reftype = _MemRefType.ACT
+        tree_map_only(torch.Tensor, partial(self._track, reftype), res)
+        peak_state = _ModState.PEAK_BW if self._mod_tracker.is_bw else _ModState.PEAK_FW
+        self._update_peak_stats(peak_state)
+        return res
--- a/rl/Lib/site-packages/torch/distributed/_tools/memory_tracker.py
+++ b/rl/Lib/site-packages/torch/distributed/_tools/memory_tracker.py
@ -0,0 +1,295 @@
+# mypy: allow-untyped-defs
+import operator
+import pickle
+from collections import defaultdict
+from itertools import chain
+from typing import Any, Callable, Dict, List, no_type_check, Sequence, TYPE_CHECKING
+
+import torch
+import torch.nn as nn
+from torch.utils._python_dispatch import TorchDispatchMode
+
+
+if TYPE_CHECKING:
+    from torch.utils.hooks import RemovableHandle
+
+
+# Divisor used to convert byte counts to MB (1024 * 1024 bytes); a float, so the division
+# results recorded by ``MemoryTracker._record_memory_stats`` are fractional.
+BYTES_PER_MB = 1024 * 1024.0
+
+
+class MemoryProfileDispatchMode(TorchDispatchMode):
+    """Run in ``TorchDispatchMode`` to get memory stats at operator level."""
+
+    def __init__(self, memory_tracker) -> None:
+        self.memory_tracker = memory_tracker
+
+    def __torch_dispatch__(self, func, types, args=..., kwargs=None):
+        rs = func(*args, **kwargs)
+        if func == torch.ops.aten.detach.default:
+            return rs
+        func_name: str = (
+            self.memory_tracker._cur_module_name
+            + "."
+            + func.__name__
+            + "_"
+            + str(self.memory_tracker._operator_names[func.__name__])
+        )
+        self.memory_tracker._operator_names[func.__name__] = (
+            self.memory_tracker._operator_names[func.__name__] + 1
+        )
+        self.memory_tracker._record_memory_stats(func_name)
+
+        return rs
+
+
+class MemoryTracker:
+    """
+    Collect and plot the memory stats at operator level.
+
+    Includes ``memories_allocated``, ``memories_active`` and ``memories_reserved``.
+    It also prints a summary for the top 20 operators that generate the most memories.
+
+    Example usage:
+
+        >>> # xdoctest: +SKIP(failing)
+        >>> net.cuda()
+        >>> input = input.cuda()
+
+        >>> mem_tracker = MemoryTracker()
+        >>> mem_tracker.start_monitor(net)
+
+        >>> net.zero_grad(True)
+        >>> loss = net(input)
+        >>> if isinstance(loss, dict):
+        >>>    loss = loss['out']
+        >>> loss.sum().backward()
+        >>> net.zero_grad(set_to_none=True)
+
+        >>> mem_tracker.stop()
+        >>> mem_tracker.summary()
+        >>> mem_tracker.show_traces()
+    """
+
+    def __init__(self) -> None:
+        torch._C._log_api_usage_once("torch.distributed.memory_tracker")
+        self._hooks: List[RemovableHandle] = []
+        self._operator_names: Dict[str, int] = defaultdict(int)
+        self.memories_allocated: Dict[int, Dict[str, float]] = defaultdict()
+        self.memories_active: Dict[int, Dict[str, float]] = defaultdict()
+        self.memories_reserved: Dict[int, Dict[str, float]] = defaultdict()
+        self._markers: Dict[str, int] = defaultdict(int)
+        self._cur_module_name: str = ""
+        self._op_index: int = 0
+        self._num_cuda_retries: int = 0
+
+    @no_type_check
+    def start_monitor(self, root_module: nn.Module) -> None:
+        """
+        Register module hooks and entering ``MemoryProfileDispatchMode``.
+
+        This enables operator level memory stats can be tracked during module runtime.
+        """
+        self._clear_state()
+        root_module.__setattr__("_memory_tracker_is_root", True)
+        for name, m in root_module.named_modules():
+            if m is not root_module:
+                m.__setattr__("_memory_tracker_is_root", False)
+            # fused_proxy_group does not support hooks
+            if ".fused_proxy_grouped_embedding_bag" in name:
+                continue
+            # hook ordering with other hooks added by users is not managed, so
+            # the memory stats tracked here may not completely accurate.
+            h1 = m.register_forward_pre_hook(self._create_pre_forward_hook(name))
+            h2 = m.register_forward_hook(self._create_post_forward_hook(name))
+            # it does not work well with jagged tensor somehow, the root cause is not
+            # clear and remove it for now as it does not really capture important info.
+            # h3 = m.register_backward_hook(self._create_backward_hook(name))
+            self._hooks.extend([h1, h2])
+        torch.cuda.empty_cache()
+        assert getattr(self, "profile_mode", None) is None
+        self.profile_mode = MemoryProfileDispatchMode(self)
+        self.profile_mode.__enter__()
+
+    @no_type_check
+    def stop(self) -> None:
+        """
+        Remove module hooks and exit ``MemoryProfileDispatchMode`` to stop tracking memory stats at operator level.
+
+        Get some aggregated stats when the memory_tracker() is enabled, like cuda ``num_alloc_retries``.
+        """
+        self._num_cuda_retries = torch.cuda.memory_stats().get("num_alloc_retries", 0)
+
+        for h in self._hooks:
+            h.remove()
+        self._hooks.clear()
+        assert getattr(self, "profile_mode", None) is not None
+        self.profile_mode.__exit__(None, None, None)
+        self.profile_mode = None
+
+    @no_type_check
+    def summary(self, top: int = 20) -> None:
+        """
+        Print out the top operators that generate the most memories.
+
+        The number of the top operators can be configured.
+        """
+        op_diff: Dict[str, float] = defaultdict(float)
+        op_name, previous_allocated_memory = self.memories_allocated[0]
+        for i in range(1, self._op_index):
+            op_name, current_allocated_memory = self.memories_allocated[i]
+            op_diff[op_name] = current_allocated_memory - previous_allocated_memory
+            previous_allocated_memory = current_allocated_memory
+
+        print("------------------------------------------------")
+        print(f"The number of cuda retries are: {self._num_cuda_retries}")
+        print(f"Top {top} ops that generates memory are:")
+        for k, v in sorted(op_diff.items(), key=operator.itemgetter(1), reverse=True)[
+            :top
+        ]:
+            print(f"{k}: {v}MB")
+        print("------------------------------------------------")
+
+    @no_type_check
+    def show_traces(self, path: str = "") -> None:
+        import matplotlib.pyplot as plt
+
+        def _plot_figure(x, y_values, labels):
+            min_val = min(list(chain(*y_values))) * 0.999
+            max_val = max(list(chain(*y_values))) * 1.001
+            plt.figure()
+            for y, label in zip(y_values, labels):
+                plt.plot(x, y, label=label)
+            plt.xlabel("# Operator Calls")
+            plt.ylabel("Memory (MB)")
+            plt.legend()
+            for marker_name, marker in self._markers.items():
+                if marker_name == "fw_bw_boundary":
+                    plt.plot(
+                        [marker, marker],
+                        [min_val, max_val],
+                        "r",
+                        lw=2,
+                        label=marker_name,
+                    )
+                else:
+                    plt.plot(
+                        [marker, marker],
+                        [min_val, max_val],
+                        "k-",
+                        lw=2,
+                        label=marker_name,
+                    )
+
+        if path != "":
+            self.load(path)
+
+        y_1 = [gb for (name, gb) in self.memories_allocated.values()]
+        y_2 = [gb for (name, gb) in self.memories_active.values()]
+        y_3 = [gb for (name, gb) in self.memories_reserved.values()]
+        x = list(range(len(y_1)))
+        # Split figures when there is big difference between
+        # "reserved_memory" and "allocated_memory" or "active_memory".
+        _plot_figure(
+            x,
+            [list(y_1), list(y_2), list(y_3)],
+            ["allocated_memory", "active_memory", "reserved_memory"],
+        )
+        _plot_figure(x, [list(y_1)], ["allocated_memory"])
+        _plot_figure(x, [list(y_2)], ["active_memory"])
+        _plot_figure(x, [list(y_3)], ["reserved_memory"])
+
+    def save_stats(self, path: str) -> None:
+        """Save the stats using pickle during runtime if users want to plot the traces in other places like notebook."""
+        stats = {
+            "memories_allocated": self.memories_allocated,
+            "memories_active": self.memories_active,
+            "memories_reserved": self.memories_reserved,
+            "markers": self._markers,
+            "num_alloc_retries": self._num_cuda_retries,
+        }
+
+        with open(path, "wb") as f:
+            pickle.dump(stats, f, pickle.HIGHEST_PROTOCOL)
+
+    def load(self, path: str) -> None:
+        """Load the pickled memory stats to plot the traces or print the summary."""
+        with open(path, "rb") as f:
+            stats = pickle.load(f)
+
+        self.memories_allocated = stats["memories_allocated"]
+        self.memories_active = stats["memories_active"]
+        self.memories_reserved = stats["memories_reserved"]
+        self._markers = stats["markers"]
+        self._num_cuda_retries = stats["num_alloc_retries"]
+
+    def _create_pre_forward_hook(self, name: str) -> Callable:
+        """Prefix operator name with current module and 'forward', and insert 'fw_start' marker at forward pass start."""
+
+        def _pre_forward_hook(module: nn.Module, inputs: Any) -> None:
+            self._cur_module_name = f"{name}.forward"
+            if (
+                hasattr(module, "_memory_tracker_is_root")
+                and module._memory_tracker_is_root
+            ):
+                self._add_marker("fw_start")
+
+        return _pre_forward_hook
+
+    def _create_post_forward_hook(self, name: str) -> Callable:
+        """Insert the marker 'fw_bw_boundary' at the boundary of forward and backward pass."""
+
+        def _post_forward_hook(
+            module: nn.Module,
+            inputs: Sequence[torch.Tensor],
+            outputs: Sequence[torch.Tensor],
+        ) -> None:
+            if (
+                hasattr(module, "_memory_tracker_is_root")
+                and module._memory_tracker_is_root
+            ):
+                self._add_marker("fw_bw_boundary")
+
+        return _post_forward_hook
+
+    def _create_backward_hook(self, name: str) -> Callable:
+        """Insert the current module name with backward prefix for the operator name."""
+
+        def _backward_hook(
+            module: nn.Module, grad_input: torch.Tensor, grad_output: torch.Tensor
+        ) -> None:
+            self._cur_module_name = f"{name}.backward"
+
+        return _backward_hook
+
+    @no_type_check
+    def _record_memory_stats(self, fn_name: str) -> None:
+        """
+        Record current memory allocated, current memory active and current memory reserved.
+
+        The memory stats dict is indexed with ``self._op_index``.
+        """
+        memory_allocated: float = torch.cuda.memory_allocated() / BYTES_PER_MB
+        memory_reserved: float = torch.cuda.memory_reserved() / BYTES_PER_MB
+        memory_active: float = (
+            torch.cuda.memory_stats().get("active_bytes.all.current", 0) / BYTES_PER_MB
+        )
+        self.memories_allocated[self._op_index] = (fn_name, memory_allocated)
+        self.memories_reserved[self._op_index] = (fn_name, memory_reserved)
+        self.memories_active[self._op_index] = (fn_name, memory_active)
+        self._op_index += 1
+
+    def _add_marker(self, marker_name: str) -> None:
+        """Set the marker's x-axis value."""
+        marker_val = len(self.memories_allocated.values())
+        self._markers[marker_name] = marker_val
+
+    def _clear_state(self) -> None:
+        """Clear states when start_monitor() is called."""
+        self._operator_names.clear()
+        self.memories_allocated.clear()
+        self.memories_active.clear()
+        self.memories_reserved.clear()
+        self._markers.clear()
+        self._cur_module_name = ""
+        self._op_index = 0
+        self._num_cuda_retries = 0
--- a/rl/Lib/site-packages/torch/distributed/_tools/mod_tracker.py
+++ b/rl/Lib/site-packages/torch/distributed/_tools/mod_tracker.py
@ -0,0 +1,238 @@
+# mypy: allow-untyped-defs
+import warnings
+import weakref
+from typing import Callable, Optional, Set
+
+import torch
+from torch.autograd.graph import register_multi_grad_hook
+from torch.nn.modules.module import (
+    register_module_forward_hook,
+    register_module_forward_pre_hook,
+)
+from torch.utils._pytree import tree_flatten
+
+
+__all__ = ["ModTracker"]
+
+
+class ModTracker:
+    """
+    ``ModTracker`` is a context manager that tracks the nn.Module hierarchy during execution
+    so that other system can query which Module is currently being executed (or its backward is being
+    executed).
+
+    You can access the ``parents`` attribute on this context manager to get the set of all the
+    Modules currently being executed via their fqn (fully qualified name, also used as the key within
+    the state_dict).
+    You can access the ``is_bw`` attribute to know if you are currently running in backward or not.
+
+    Note that ``parents`` is never empty and always contains the "Global" key. The ``is_bw`` flag
+    will remain ``True`` after the forward until another Module is executed. If you need it to be
+    more accurate, please submit an issue requesting this. Adding a map from fqn to the module instance
+    is possible but not done yet, please submit an issue requesting this if you need it.
+
+    Example usage
+
+    .. code-block:: python
+
+        mod = torch.nn.Linear(2, 2)
+
+        with ModTracker() as tracker:
+            # Access anything during the forward pass
+            def my_linear(m1, m2, bias):
+                print(f"Current modules: {tracker.parents}")
+                return torch.mm(m1, m2.t()) + bias
+            torch.nn.functional.linear = my_linear
+
+            mod(torch.rand(2, 2))
+
+    """
+
+    parents: Set[str]
+    """
+    A Set containing the fqn for each module currently running their forward
+    """
+
+    def __init__(self):
+        self.parents = {"Global"}
+        self._active_module_cnt = {}
+        self._known_modules: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+        self._seen_modules: weakref.WeakSet = weakref.WeakSet()
+        self._has_callback = False
+        self._user_pre_fw_hook = None
+        self._user_post_fw_hook = None
+        self._user_pre_bw_hook = None
+        self._user_post_bw_hook = None
+
+    def _maybe_set_engine_callback(self):
+        # This assumes no concurrent calls to backward
+        if self._has_callback:
+            return
+
+        def callback():
+            self.parents = {"Global"}
+            self._has_callback = False
+
+        torch.autograd.Variable._execution_engine.queue_callback(callback)
+        self._has_callback = True
+
+    @property
+    def is_bw(self):
+        """
+        A boolean marking if this is currently running during the backward pass or not
+        """
+        return torch._C._current_graph_task_id() != -1
+
+    def get_known_fqn(self, mod):
+        """
+        Return the fqn for the given module if it is known to the ``ModTracker``, otherwise ``None``.
+        """
+        return self._known_modules.get(mod, None)
+
+    def register_user_hooks(
+        self,
+        pre_fw_hook: Optional[Callable] = None,
+        post_fw_hook: Optional[Callable] = None,
+        pre_bw_hook: Optional[Callable] = None,
+        post_bw_hook: Optional[Callable] = None,
+    ):
+        """
+        Registers user-specified hooks to be called before/after the forward/backward pass for each
+        module tracked by the ``ModTracker``. One or more can be ``None``.
+        Args:
+            pre_fw_hook (Callable, optional): A hook to be called before the forward pass for the
+                module. It should have the following signature:
+                pre_fw_hook (module, input) -> None
+            post_fw_hook (Callable, optional): A hook to be called after the forward pass for the
+                module. It should have the following signature:
+                post_fw_hook (module, input, output) -> None
+            pre_bw_hook (Callable, optional): A multi-grad hook to be called on all the outputs of
+                the module that require gradients. It should have the following signature:
+                pre_bw_hook (module, grad_output) -> None
+            post_bw_hook (Callable, optional): A multi-grad hook to be called on all the inputs of
+                the module that require gradients. It should have the following signature:
+                post_bw_hook (module, grad_input) -> None
+        Raises:
+            AssertionError: If a new hook is provided when one is already registered.
+        Note:
+            If the module is not alive during the backward pass, the pre_bw_hook and post_bw_hook will
+            will receive None as the module argument.
+            The module fqn will be present in the ``parents`` attribute when each of the hooks is called.
+            Hooks are intended to be used as markers only not to modify the inputs/outputs.
+        """
+
+        def set_hook(hook, user_hook, hook_name):
+            if hook is not None and user_hook is not None:
+                raise AssertionError(
+                    f"Only one {hook_name} can be registered at a time"
+                    f" Clear the existing hook by calling ``clear_user_hooks`` before registering a new one"
+                )
+            return hook
+
+        self._user_pre_fw_hook = set_hook(
+            pre_fw_hook, self._user_pre_fw_hook, "pre_fw_hook"
+        )
+        self._user_post_fw_hook = set_hook(
+            post_fw_hook, self._user_post_fw_hook, "post_fw_hook"
+        )
+        self._user_pre_bw_hook = set_hook(
+            pre_bw_hook, self._user_pre_bw_hook, "pre_bw_hook"
+        )
+        self._user_post_bw_hook = set_hook(
+            post_bw_hook, self._user_post_bw_hook, "post_bw_hook"
+        )
+
+    def clear_user_hooks(self):
+        """
+        Clears the user specified hooks registered with ``register_user_hooks``
+        """
+        self._user_pre_fw_hook = None
+        self._user_post_fw_hook = None
+        self._user_pre_bw_hook = None
+        self._user_post_bw_hook = None
+
+    def _get_mod_name(self, mod):
+        if mod not in self._known_modules:
+            self._known_modules[mod] = type(mod).__name__
+        mod_name = self._known_modules[mod]
+        if mod not in self._seen_modules:
+            for name, submod in mod.named_children():
+                self._known_modules[submod] = f"{mod_name}.{name}"
+                self._get_mod_name(submod)
+            self._seen_modules.add(mod)
+        return mod_name
+
+    def _get_append_fn(self, w_mod, name, is_bw):
+        def fn(*args):
+            if is_bw:
+                self._maybe_set_engine_callback()
+            if name in self.parents and not self.is_bw:
+
+                def custom_formatwarning(msg, category, filename, lineno, line=None):
+                    return f"{filename}:{lineno}: {category.__name__}: {msg} \n"
+
+                warnings.formatwarning = custom_formatwarning
+                warnings.warn(
+                    "The module hierarchy tracking maybe be messed up."
+                    " Please file a bug to PyTorch, if it is the case."
+                )
+            if name not in self.parents:
+                self._active_module_cnt[name] = 1
+                self.parents.add(name)
+            else:
+                self._active_module_cnt[name] += 1
+
+            if self._user_pre_bw_hook is not None and is_bw:
+                self._user_pre_bw_hook(w_mod(), args)
+
+        return fn
+
+    def _get_pop_fn(self, w_mod, name, is_bw):
+        def fn(*args):
+            if self._user_post_bw_hook is not None and is_bw:
+                self._user_post_bw_hook(w_mod(), args)
+            if name in self.parents:
+                self._active_module_cnt[name] -= 1
+                if self._active_module_cnt[name] == 0:
+                    self.parents.remove(name)
+            elif not self.is_bw:
+                # Due to some input/output not requiring gradients, we cannot enforce
+                # proper nesting in backward
+                raise RuntimeError(
+                    "The Module hierarchy tracking is wrong. Report a bug to PyTorch"
+                )
+
+        return fn
+
+    def _fw_pre_hook(self, mod, input):
+        name = self._get_mod_name(mod)
+        w_mod = weakref.ref(mod)
+        self._get_append_fn(w_mod, name, False)()
+        if self._user_pre_fw_hook is not None:
+            self._user_pre_fw_hook(mod, input)
+        args, _ = tree_flatten(input)
+        tensors = [a for a in args if isinstance(a, torch.Tensor) and a.requires_grad]
+        if not self.is_bw and tensors:
+            register_multi_grad_hook(tensors, self._get_pop_fn(w_mod, name, True))
+
+    def _fw_post_hook(self, mod, input, output):
+        name = self._get_mod_name(mod)
+        w_mod = weakref.ref(mod)
+        if self._user_post_fw_hook is not None:
+            self._user_post_fw_hook(mod, input, output)
+        self._get_pop_fn(w_mod, name, False)()
+        args, _ = tree_flatten(output)
+        tensors = [a for a in args if isinstance(a, torch.Tensor) and a.requires_grad]
+        if not self.is_bw and tensors:
+            register_multi_grad_hook(tensors, self._get_append_fn(w_mod, name, True))
+
+    def __enter__(self):
+        self._fw_pre_handle = register_module_forward_pre_hook(self._fw_pre_hook)
+        self._fw_post_handle = register_module_forward_hook(
+            self._fw_post_hook, always_call=True
+        )
+        return self
+
+    def __exit__(self, *args):
+        self._fw_pre_handle.remove()
+        self._fw_post_handle.remove()
--- a/rl/Lib/site-packages/torch/distributed/_tools/runtime_estimator.py
+++ b/rl/Lib/site-packages/torch/distributed/_tools/runtime_estimator.py
@ -0,0 +1,527 @@
+# Owner(s): ["module: unknown"]
+import math
+import os
+from collections import defaultdict
+from typing import Any, Callable, Dict, List, Set, Tuple
+from typing_extensions import Self
+
+import torch
+import torch.utils._pytree as pytree
+from torch._guards import active_fake_mode
+from torch._inductor.utils import get_device_tflops, get_gpu_dram_gbps
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.distributed._tools.mod_tracker import ModTracker
+from torch.utils._mode_utils import no_dispatch
+from torch.utils._python_dispatch import TorchDispatchMode
+from torch.utils.flop_counter import flop_registry
+
+
+# Short alias for the ATen operator namespace used by the op tables below.
+aten = torch.ops.aten
+
+# Allocation granularity (in bytes) used when rounding tensor storage sizes.
+# It is 2**9 unless the PYTORCH_NO_CUDA_MEMORY_CACHING environment variable is
+# set to a non-zero integer, in which case it is 1 (no rounding).
+# The 2**9 value is hard-coded here:
+# https://github.com/pytorch/pytorch/blob/5fba5d83f0703ff8077ab65448a998e9ad6598fd/c10/cuda/CUDACachingAllocator.cpp#L117
+_PYTORCH_MIN_ALLOCATE = (
+    2**9 if int(os.environ.get("PYTORCH_NO_CUDA_MEMORY_CACHING", 0)) == 0 else 1
+)
+
+# View ops: no fall-back kernel is needed (or exists) for these, so they are
+# never benchmarked and are assigned zero cost.
+_VIEW_OPS = {
+    aten.lift_fresh,
+    aten.t,
+    aten.transpose,
+    aten.view,
+    aten.detach,
+    aten._unsafe_view,
+    aten.split,
+    aten.adjoint,
+    aten.as_strided,
+    aten.diagonal,
+    aten.expand,
+    aten.expand_as,
+    aten.movedim,
+    aten.permute,
+    aten.select,
+    aten.squeeze,
+    aten.mT,
+    aten.mH,
+    aten.real,
+    aten.imag,
+    aten.view_as,
+    aten.unflatten,
+    aten.unfold,
+    aten.unbind,
+    aten.unsqueeze,
+    aten.vsplit,
+    aten.hsplit,
+    aten.split_with_sizes,
+    aten.swapaxes,
+    aten.swapdims,
+    aten.chunk,
+}
+# Tensor creation ops: these can be ignored when estimating runtime.
+_CREATE_OPS = {
+    aten.randint,
+    aten.randn,
+    aten.rand,
+    aten.randn_like,
+    aten.rand_like,
+    aten.randint_like,
+    aten.arange,
+    aten.ones_like,
+    aten.zeros_like,
+}
+
+# Union of the two tables above; the roofline estimator assigns these ops
+# zero cost.
+_IGNORE_OPS = _VIEW_OPS | _CREATE_OPS
+
+# Public API of this module.
+__all__ = ["RuntimeEstimator"]
+
+
+class RuntimeEstimator(TorchDispatchMode):
+    """
+    Estimates the GPU runtime in milliseconds using various estimation methods under the ``FakeTensorMode``.
+
+    This class provides a ``TorchDispatchMode`` based context manager that can be used to estimate the eager
+    runtime of PyTorch functions. It supports two estimation modes, benchmarking (`operator-level-benchmark`) and
+    roofline cost modeling (`operator-level-cost-model`).
+    For modules executed under this context manager, it aggregates the forward and backward operation runtimes
+    and also records their execution orders.
+
+    Attributes:
+        mod_runtimes (Dict[str, Dict[str, float]]): A dictionary of module runtimes. The key to the outer dictionary
+            is the fully qualified name (FQN) of the module. For each module the forward and backward runtimes of the
+            operations are aggregated in the inner dictionary keyed by 'fw' and 'bw'.
+        mod_fw_pre_order (List[str]): List of module FQNs in pre-forward execution order.
+        mod_bw_pre_order (List[str]): List of module FQNs in pre-backward execution order.
+        mod_fw_post_order (List[str]): List of module FQNs in post-forward execution order.
+        mod_bw_post_order (List[str]): List of module FQNs in post-backward execution order.
+        total_runtime (float): The total estimated runtime in milliseconds.
+
+    Note:
+        1) The benchmarking estimate mode will execute kernels on GPU and assumes that every operation can run in
+            isolation without causing an OOM error. It is also designed to be used only under ``FakeTensorMode``.
+        2) Currently wrapper tensor sub-classes such as ``DTensor`` won't produce correct estimates. We plan to support
+            them in future PRs.
+        3) We only estimate the compute time, if your code has communication, it will not be considered. Again, we will
+            support this in future PRs.
+
+    Example usage:
+
+        .. code-block:: python
+
+            runtime_estimator = RuntimeEstimator()
+            with FakeTensorMode():
+                module = ...
+                optimizer = ...
+                inp = ...
+                with runtime_estimator(estimate_mode_type="operator-level-cost-model"):
+                    loss = module(inp)
+                    loss.backward()
+                    optimizer.step()
+                    optimizer.zero_grad()
+                runtime_estimator.display_modulewise_stats()
+    """
+
+    _float_types: Set[torch.dtype] = {
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+        torch.float64,
+    }
+    _no_fallback_kernel: Set[torch._ops._OpNamespace] = set()
+    fake_mode: FakeTensorMode
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._estimate: Callable
+        self._estimate_mode_type: str
+        self._mod_tracker = ModTracker()
+        self.mod_runtimes: Dict[str, Dict[str, float]] = defaultdict(
+            lambda: defaultdict(lambda: 0.0)
+        )
+        self.mod_fw_pre_order: List[str] = []
+        self.mod_bw_pre_order: List[str] = []
+        self.mod_fw_post_order: List[str] = []
+        self.mod_bw_post_order: List[str] = []
+        self.total_runtime: float = 0.0
+
+    # Adapted from: https://github.com/pytorch/pytorch/blob/9b902b3ee3bd608a19543362b66bf06c373dd374/torch/_subclasses/fake_tensor.py#L1969  # noqa: PGH004,B950
+    # NB: returns fake tensors
+    @classmethod
+    def _maybe_run_and_benchmark_fallback_kernel(  # type: ignore[no-untyped-def]
+        cls,
+        func,
+        args,
+        kwargs,
+        orig_not_implemented_exception,
+    ):
+        """
+        Runs a real kernel for ``func`` on materialized inputs and times it with CUDA events.
+
+        Fake tensors owned by ``cls.fake_mode`` are replaced by real tensors on their
+        ``fake_device`` (random values for floating dtypes, ones otherwise). ``func`` is run
+        once to obtain the result, then for 2 warm-up iterations, then for 3 timed iterations
+        bracketed by CUDA events. Tensor outputs are mapped back to fake tensors before
+        being returned.
+
+        Args:
+            func (Callable): The operator to run and benchmark.
+            args (Tuple): The arguments to pass to the function.
+            kwargs (Dict[str, Any]): The keyword arguments to pass to the function.
+            orig_not_implemented_exception (Exception): The exception to raise when the fallback
+                cannot be used for this operator.
+
+        Returns:
+            Tuple[Any, float]: A tuple containing the result of the function (with tensors
+                converted back to fake tensors) and the mean time of the timed iterations,
+                as reported by ``torch.cuda.Event.elapsed_time`` divided by the iteration count.
+
+        Raises:
+            orig_not_implemented_exception: If ``func`` is tagged ``inplace_view``, or if a
+                non-sparse output tensor that is not one of the materialized inputs shares
+                storage with one of them.
+        """
+        # These should all be supported; just to be safe,
+        # avoid fallback for operators which modify metadata in place,
+        # because the input fake tensors would be unmodified.
+        if torch.Tag.inplace_view in func.tags:  # type: ignore[attr-defined]
+            raise orig_not_implemented_exception
+
+        # Maps id(real tensor) -> the fake tensor it was materialized from.
+        inp_impls = {}
+        flat_args, args_spec = pytree.tree_flatten((args, kwargs))
+        # Don't use in_kernel_invocation_manager(fake_mode) as we want to do
+        # REAL compute (not with meta device)
+        with no_dispatch():
+
+            def to_real_tensor(e):  # type: ignore[no-untyped-def]
+                # Materialize a fake tensor of ours; pass everything else through unchanged.
+                if cls.fake_mode.is_our_fake(e):
+                    if e.dtype in cls._float_types:
+                        out = torch.rand_like(e, device=e.fake_device)
+                    else:
+                        out = torch.ones_like(e, device=e.fake_device)
+                    if e.is_sparse:
+                        out._coalesced_(e.is_coalesced())
+                    inp_impls[id(out)] = e
+                    return out
+                return e
+
+            flat_args = [to_real_tensor(a) for a in flat_args]
+            args, kwargs = pytree.tree_unflatten(flat_args, args_spec)
+            # This first call produces the result that is returned to the caller.
+            r = func(*args, **kwargs)
+            warmup_iters, actual_iters = 2, 3
+            for _ in range(warmup_iters):
+                func(*args, **kwargs)
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record(torch.cuda.current_stream())
+            for _ in range(actual_iters):
+                func(*args, **kwargs)
+            end_event.record(torch.cuda.current_stream())
+            # Wait for the recorded events to complete before reading the elapsed time.
+            torch.cuda.synchronize()
+            cuda_time = start_event.elapsed_time(end_event)
+            mean_op_time = cuda_time / actual_iters
+
+        # Storage handles of all non-sparse tensor arguments, used below to detect
+        # outputs that alias an input.
+        storages = set()
+
+        for e in flat_args:
+            if isinstance(e, torch.Tensor):
+                if not e.is_sparse:
+                    storages.add(e._typed_storage()._cdata)
+
+        # TODO: also check metadata change on inputs
+        # proper aliasing/metadata relationship between outputs and inputs will
+        # not be set up, bc of conversion to device, unless we can reuse an
+        # input impl
+
+        def map_out(e):  # type: ignore[no-untyped-def]
+            # An output that aliases an input's storage but is not the very same
+            # tensor object as a materialized input cannot be mapped back faithfully.
+            if id(e) not in inp_impls and (
+                isinstance(e, torch.Tensor)
+                and not e.is_sparse
+                and e._typed_storage()._cdata in storages
+            ):
+                raise orig_not_implemented_exception
+
+            if isinstance(e, torch.Tensor):
+                if id(e) in inp_impls:
+                    # The output is one of the materialized inputs: hand back its original fake.
+                    return inp_impls[id(e)]
+                else:
+                    return cls.fake_mode.fake_tensor_converter.from_real_tensor(
+                        cls.fake_mode, e
+                    )
+            else:
+                return e
+
+        return (pytree.tree_map(map_out, r), mean_op_time)
+
+    @classmethod
+    def _benchmark_estimate(cls, func, args, kwargs) -> Tuple[Any, float]:  # type: ignore[no-untyped-def]
+        """
+        Estimates the runtime of a function using benchmarking.
+
+        Args:
+            func: The function to estimate.
+            args: The arguments to pass to the function.
+            kwargs: The keyword arguments to pass to the function.
+            res: The result of the function.
+
+        Returns:
+            Tuple[Any, float]: A tuple containing the result of the function and
+                the mean operation time in milliseconds.
+        """
+        assert isinstance(
+            cls.fake_mode, FakeTensorMode
+        ), "Initialize/Assign FakeTensorMode before using this function"
+        mean_op_time = 0.0
+        if func._overloadpacket not in _VIEW_OPS:
+            try:
+                res, mean_op_time = cls._maybe_run_and_benchmark_fallback_kernel(
+                    func,
+                    args,
+                    kwargs,
+                    NotImplementedError,
+                )
+                return (res, mean_op_time)
+            except NotImplementedError:
+                cls._no_fallback_kernel.add(func._overloadpacket)
+        res = func(*args, **kwargs or {})
+        return (res, mean_op_time)
+
+    # Adapted from: https://github.com/pytorch/pytorch/blob/9b902b3ee3bd608a19543362b66bf06c373dd374/torch/_inductor/scheduler.py#L589  # noqa: PGH004,B950
+    @classmethod
+    def _roofline_estimate(cls, func, args, kwargs) -> Tuple[Any, float]:  # type: ignore[no-untyped-def]
+        """
+        Estimates the runtime of a function using a roofline cost model.
+
+        Args:
+            func: The function to estimate.
+            args: The arguments to pass to the function.
+            kwargs: The keyword arguments to pass to the function.
+            out: The output of the function.
+
+        Returns:
+            Tuple[Any, float]: A tuple containing the result of the function and
+                the mean operation time in milliseconds.
+        """
+        assert (
+            torch.cuda.is_available()
+        ), "Roofline estimation needs to access CUDA capabilities to make estimations"
+
+        def get_num_bytes(t: torch.Tensor) -> int:
+            """
+            Calculates the memory consumption of a tensor.
+
+            Args:
+                t (torch.Tensor): The input tensor.
+
+            Returns:
+                int: The memory consumption of the tensor in bytes.
+            """
+            num_bytes = t.untyped_storage().nbytes()
+            mem_consumed = (
+                math.ceil(num_bytes / _PYTORCH_MIN_ALLOCATE) * _PYTORCH_MIN_ALLOCATE
+            )
+            return mem_consumed
+
+        def get_compute_time(func_packet, args, kwargs, out, out_dtypes) -> float:  # type: ignore[no-untyped-def]
+            """
+            Estimates the compute time of an aten operator.
+
+            Args:
+                func_packet: The operator overload packet.
+                args: The arguments to the operator.
+                kwargs: The keyword arguments to the operator.
+                out: The output of the operator.
+                out_dtypes: The output data types.
+
+            Returns:
+                float: The estimated compute time in nanoseconds.
+            """
+            if func_packet in flop_registry:
+                assert (
+                    len(out_dtypes) == 1
+                ), f"Only support single out dtype got {out_dtypes} for {func_packet}"
+                dtype = out_dtypes.pop()
+                # This actually gives peta-FLOPs/s hence multiply by 1e15 to get the FLOPs/s
+                peak_gpu_flops = get_device_tflops(dtype) * 1e15
+                # We can expect to achieve 75% of theoretical peak flops
+                factor = 0.75
+                peak_empirical_flops = factor * peak_gpu_flops
+                flop_count_func = flop_registry[func_packet]
+                # We divide by a factor of 2 to get the MACs (multiply and accumulate)
+                flop_count = flop_count_func(*args, **kwargs, out_val=out) / 2
+                # We multiply by 1e9 to get the time in nano seconds
+                compute_time = (flop_count / peak_empirical_flops) * 1e9
+                return compute_time
+            return 0.0
+
+        def get_transfer_time(flat_args_kwargs, flat_outs) -> float:  # type: ignore[no-untyped-def]
+            """
+            Estimates the memory transfer time of input and output tensors.
+
+            Args:
+                flat_args_kwargs (List[torch.Tensor]): The flat list of arguments and keyword arguments.
+                flat_outs (List[torch.Tensor]): The flat list of outputs.
+
+            Returns:
+                float: The estimated memory transfer time in nanoseconds.
+            """
+            gpu_memory_bandwidth = get_gpu_dram_gbps()
+            read_bytes = sum(
+                get_num_bytes(t)
+                for t in flat_args_kwargs
+                if isinstance(t, torch.Tensor)
+            )
+            write_bytes = sum(
+                get_num_bytes(t) for t in flat_outs if isinstance(t, torch.Tensor)
+            )
+            counted_bytes = read_bytes + write_bytes
+            # The GPU memory bandwidth is in GB/s so the transfer time is in nanoseconds
+            transfer_time = counted_bytes / gpu_memory_bandwidth
+            return transfer_time
+
+        # Roofline Cost Model Explanation
+
+        # The roofline cost model estimates the execution time of an operator based on
+        # the device's empirical maximum FLOPs/sec (pi) and device DRAM bandwidth (beta).
+
+        # Variables:
+        # - pi: Maximum empirical FLOPs/sec of the device
+        # - beta: Maximum empirical device DRAM bandwidth (bytes/sec) of the device
+        # - I: Arithmetic intensity of the operator (FLOPs/bytes)
+        # - op_flops: FLOPs required by the operator
+        # - op_bytes: Bytes transferred to and from DRAM for the operator
+
+        # Calculation Steps:
+        # 1. Calculate arithmetic intensity: I = op_flops / op_bytes
+        # 2. Calculate estimated FLOPs/sec: est_flops_sec = min(pi, beta * I)
+        # 3. Calculate estimated operator time: estimated_op_time = op_flops / est_flops_sec
+        #    This simplifies to: estimated_op_time = max(op_flops / pi, op_flops / (beta * I))
+        #    Further simplifying: estimated_op_time = max(op_flops / pi, op_bytes / beta)
+
+        # Simplified Formulas:
+        # - compute_time = op_flops / pi
+        # - transfer_time = op_bytes / beta
+        # - estimated_op_time = max(compute_time, transfer_time)
+
+        kwargs = kwargs if kwargs else {}
+        out = func(*args, **kwargs)
+        op_time = 0.0
+        func_packet = func._overloadpacket
+        if func_packet not in _IGNORE_OPS:
+            flat_args_kwargs, args_spec = pytree.tree_flatten((args, kwargs))
+            flat_outs, out_spec = pytree.tree_flatten(out)
+            transfer_time = get_transfer_time(flat_args_kwargs, flat_outs)
+
+            out_dtypes = {
+                t.dtype
+                for t in flat_outs
+                if isinstance(t, torch.Tensor) and t.dtype in cls._float_types
+            }
+
+            args, kwargs = pytree.tree_unflatten(flat_args_kwargs, args_spec)
+            out = pytree.tree_unflatten(flat_outs, out_spec)
+
+            compute_time = get_compute_time(func_packet, args, kwargs, out, out_dtypes)
+            # We get the estimated time as the max of the transfer time and
+            # compute time. We divide by 1e6 to get the time in ms
+            op_time = max(transfer_time, compute_time) / 1e6
+
+        return (out, op_time)
+
+    def display_modulewise_stats(self, depth: int = 2) -> None:
+        """
+        Displays module-wise statistics collected by ``RuntimeEstimator``.
+
+        Prints the pre-forward and pre-backward execution orders.
+        Displays the module-wise forward and backward runtimes in milliseconds.
+
+        Args:
+            depth (int): The maximum depth of module hierarchy to display (default to 2).
+        """
+        print("Pre-Forward Execution Order: ")
+        for mod_fqn in self.mod_fw_pre_order:
+            mod_depth = mod_fqn.count(".") + 1
+            if mod_depth > depth:
+                continue
+            print(mod_fqn)
+        print("Pre-Backward Execution Order: ")
+        for mod_fqn in self.mod_bw_pre_order:
+            mod_depth = mod_fqn.count(".") + 1
+            if mod_depth > depth:
+                continue
+            print(mod_fqn)
+        for mod_fqn, runtimes in self.mod_runtimes.items():
+            mod_depth = mod_fqn.count(".") + 1
+            if mod_depth > depth:
+                continue
+            print(
+                f"{mod_fqn} fw: {runtimes.get('fw', 0.0):.3f}ms bw: {runtimes.get('bw', 0.0):.3f}ms"
+            )
+
+    def __torch_dispatch__(self, func, types, args=..., kwargs=None):  # type: ignore[no-untyped-def]
+        # TODO: @sanketpurandare: Flatten tensors by desugaring the tensor subclasses
+        # TODO: @sanketpurandare: Add logic for incorporating communication time
+        res, op_time = self._estimate(func, args, kwargs)
+        for par in self._mod_tracker.parents:
+            if self._mod_tracker.is_bw:
+                self.mod_runtimes[par]["bw"] += op_time
+            else:
+                self.mod_runtimes[par]["fw"] += op_time
+        self.total_runtime += op_time
+        return res
+
+    def __call__(self, estimate_mode_type: str) -> Self:
+        """
+        Sets the estimate mode type.
+
+        Currently supported modes:
+            - "operator-level-benchmark": Estimates runtime using operator benchmarking.
+            - "operator-level-cost-model": Estimates runtime using roofline cost model.
+
+        Args:
+            estimate_mode_type (str): The type of estimate mode to use.
+
+        Returns:
+            RuntimeEstimator: The runtime estimator instance.
+
+        Raises:
+            NotImplementedError: If the estimate mode type is not supported.
+        """
+        if estimate_mode_type == "operator-level-benchmark":
+            self._estimate = RuntimeEstimator._benchmark_estimate
+        elif estimate_mode_type == "operator-level-cost-model":
+            self._estimate = RuntimeEstimator._roofline_estimate
+        else:
+            raise NotImplementedError(
+                f"estimate_mode_type {estimate_mode_type} not supported"
+            )
+        self._estimate_mode_type = estimate_mode_type
+        return self
+
+    def __enter__(self) -> Self:
+        fake_mode = active_fake_mode()
+        assert isinstance(
+            fake_mode, FakeTensorMode
+        ), "No FakeTensorMode found, designed to used under FakeTensorMode"
+        RuntimeEstimator.fake_mode = fake_mode
+        self.total_runtime = 0.0
+        self.mod_runtimes = defaultdict(lambda: defaultdict(lambda: 0.0))
+        self.mod_fw_pre_order.clear()
+        self.mod_bw_pre_order.clear()
+        self.mod_fw_post_order.clear()
+        self.mod_bw_post_order.clear()
+        self._mod_tracker.register_user_hooks(
+            pre_fw_hook=lambda mod, inp: self.mod_fw_pre_order.append(
+                self._mod_tracker.get_known_fqn(mod)
+            ),
+            pre_bw_hook=lambda mod, g_out: self.mod_bw_pre_order.append(
+                self._mod_tracker.get_known_fqn(mod)
+            ),
+            post_fw_hook=lambda mod, inp, out: self.mod_fw_post_order.append(
+                self._mod_tracker.get_known_fqn(mod)
+            ),
+            post_bw_hook=lambda mod, g_inp: self.mod_bw_post_order.append(
+                self._mod_tracker.get_known_fqn(mod)
+            ),
+        )
+        self._mod_tracker.__enter__()
+        super().__enter__()
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        print(
+            f"Estimated ({self._estimate_mode_type})"
+            f"total_time: {self.total_runtime:.3f} ms"
+        )
+        if len(self._no_fallback_kernel) > 0:
+            print("no_fallback_kernel: ", list(self._no_fallback_kernel))
+        super().__exit__(*args)
+        self._mod_tracker.clear_user_hooks()
+        self._mod_tracker.__exit__()