I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions
--- a/rl/Lib/site-packages/torch/profiler/__init__.py
+++ b/rl/Lib/site-packages/torch/profiler/__init__.py
@ -0,0 +1,50 @@
+# mypy: allow-untyped-defs
+r"""
+PyTorch Profiler is a tool that allows the collection of performance metrics during training and inference.
+Profiler's context manager API can be used to better understand what model operators are the most expensive,
+examine their input shapes and stack traces, study device kernel activity and visualize the execution trace.
+
+.. note::
+    An earlier version of the API in :mod:`torch.autograd` module is considered legacy and will be deprecated.
+
+"""
+import os
+
+from torch._C._autograd import _supported_activities, DeviceType, kineto_available
+from torch._C._profiler import _ExperimentalConfig, ProfilerActivity, RecordScope
+from torch.autograd.profiler import KinetoStepTracker, record_function
+from torch.optim.optimizer import register_optimizer_step_post_hook
+
+from .profiler import (
+    _KinetoProfile,
+    ExecutionTraceObserver,
+    profile,
+    ProfilerAction,
+    schedule,
+    supported_activities,
+    tensorboard_trace_handler,
+)
+
+
+__all__ = [
+    "profile",
+    "schedule",
+    "supported_activities",
+    "tensorboard_trace_handler",
+    "ProfilerAction",
+    "ProfilerActivity",
+    "kineto_available",
+    "DeviceType",
+    "record_function",
+    "ExecutionTraceObserver",
+]
+
+from . import itt
+
+
+def _optimizer_post_hook(optimizer, args, kwargs):
+    """Call ``KinetoStepTracker.increment_step`` with the argument ``"Optimizer"``.
+
+    The ``optimizer``, ``args`` and ``kwargs`` parameters are accepted but not used.
+    """
+    KinetoStepTracker.increment_step("Optimizer")
+
+
+# Register the hook only when the KINETO_USE_DAEMON environment variable is set
+# to a non-empty value; the value returned by the registration is discarded.
+if os.environ.get("KINETO_USE_DAEMON", None):
+    _ = register_optimizer_step_post_hook(_optimizer_post_hook)
--- a/rl/Lib/site-packages/torch/profiler/__pycache__/__init__.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/profiler/__pycache__/__init__.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/profiler/__pycache__/_memory_profiler.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/profiler/__pycache__/_memory_profiler.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/profiler/__pycache__/_pattern_matcher.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/profiler/__pycache__/_pattern_matcher.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/profiler/__pycache__/_utils.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/profiler/__pycache__/_utils.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/profiler/__pycache__/itt.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/profiler/__pycache__/itt.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/profiler/__pycache__/profiler.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/profiler/__pycache__/profiler.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/profiler/__pycache__/python_tracer.cpython-312.pyc
+++ b/rl/Lib/site-packages/torch/profiler/__pycache__/python_tracer.cpython-312.pyc
--- a/rl/Lib/site-packages/torch/profiler/_memory_profiler.py
+++ b/rl/Lib/site-packages/torch/profiler/_memory_profiler.py
--- a/rl/Lib/site-packages/torch/profiler/_pattern_matcher.py
+++ b/rl/Lib/site-packages/torch/profiler/_pattern_matcher.py
@ -0,0 +1,663 @@
+# mypy: allow-untyped-defs
+import json
+import math
+import os
+import re
+from typing import Dict, List, Optional, Set
+
+import torch
+import torch.utils.benchmark as benchmark
+from torch._C._profiler import (
+    _EventType,
+    _ExtraFields_PyCall,
+    _ExtraFields_PyCCall,
+    _ExtraFields_TorchOp,
+    _ProfilerEvent,
+)
+from torch.profiler import profile
+from torch.profiler._utils import index_of_first_match, traverse_bfs, traverse_dfs
+
+
+class Pattern:
+    """
+    Base class for all patterns, subclass this class and implement match()
+    to define custom patterns.
+
+    In subclass, define description and skip property. A subclass may also
+    define benchmark(events); summary() uses it when should_benchmark is set.
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        """
+        Store the profile and group the roots of its event tree by thread id.
+
+        Args:
+            prof: profile whose ``profiler.kineto_results`` must already exist.
+            should_benchmark: when True, summary() prefers benchmark_summary().
+        """
+        self.prof = prof
+        self.should_benchmark = should_benchmark
+        self.name = "Please specify a name for pattern"
+        self.description = "Please specify a description for pattern"
+        self.url = ""
+        assert prof.profiler is not None and prof.profiler.kineto_results is not None
+        self.event_tree = prof.profiler.kineto_results.experimental_event_tree()
+        # Root events per start_tid; siblings_of() needs them for parentless events.
+        self.tid_root: Dict[int, List[_ProfilerEvent]] = {}
+        for event in self.event_tree:
+            self.tid_root.setdefault(event.start_tid, []).append(event)
+
+    @property
+    def skip(self):
+        """Return True when the pattern should not be evaluated (never, by default)."""
+        return False
+
+    def report(self, event: _ProfilerEvent):
+        """Return the description followed by the source location of ``event``."""
+        msg = (
+            f"{self.description}\n[Source Code Location] {source_code_location(event)}"
+        )
+        return msg
+
+    def eventTreeTraversal(self):
+        """
+        Traverse the event tree and yield all events.
+        Override this method in subclass to customize the traversal.
+        """
+        yield from traverse_dfs(self.event_tree)
+
+    def summary(self, events: List[_ProfilerEvent]):
+        """
+        Return a one-line summary for the matched ``events``.
+
+        The benchmark summary is used only when should_benchmark is set and
+        the instance defines a ``benchmark`` attribute.
+        """
+        default_summary = f"{self.name}: {len(events)} events matched."
+        if self.should_benchmark and hasattr(self, "benchmark"):
+            return self.benchmark_summary(events)
+        return default_summary
+
+    @staticmethod
+    def _format_time(time_ns):
+        """Format a nanosecond quantity using the first unit that keeps it below 1000."""
+        for unit in ("ns", "us", "ms"):
+            # abs() lets a negative difference (a slowdown) be scaled as well.
+            if abs(time_ns) < 1000:
+                return f"{time_ns:.2f} {unit}"
+            # True division keeps the fraction that the .2f format prints;
+            # floor division would turn 1500 ns into "1.00 us".
+            time_ns /= 1000
+        return f"{time_ns:.2f} s"
+
+    def benchmark_summary(self, events: List[_ProfilerEvent]):
+        """
+        Return a summary line with the estimated speedup for ``events``.
+
+        ``self.benchmark(events)`` must return a mapping from input shapes to a
+        time factor; each event's duration is multiplied by the factor of its
+        input shapes to estimate the improved time.
+
+        Raises:
+            AssertionError: if the instance does not define ``benchmark``.
+        """
+        assert hasattr(self, "benchmark"), "Please implement benchmark()"
+        shapes_factor_map = self.benchmark(events)  # type: ignore[attr-defined]
+        original_time = sum(event.duration_time_ns for event in events)
+        new_time = sum(
+            shapes_factor_map[input_shapes(event)] * event.duration_time_ns
+            for event in events
+        )
+        # An estimated time of zero would otherwise raise ZeroDivisionError.
+        speedup = round(original_time / new_time, 2) if new_time else float("inf")
+        return (
+            f"{self.name}: {len(events)} events matched. "
+            f"Total Estimated Speedup: {self._format_time(original_time - new_time)} ({speedup}X)"
+        )
+
+    def match(self, event: _ProfilerEvent):
+        """
+        Return True if the event matches the pattern.
+        This method should be overridden in subclass.
+        """
+        raise NotImplementedError
+
+    def matched_events(self):
+        """Return the traversed events for which match() is true ([] when skipped)."""
+        if self.skip:
+            return []
+        return [event for event in self.eventTreeTraversal() if self.match(event)]
+
+    def root_of(self, event: _ProfilerEvent):
+        """Follow parent links from ``event`` and return the topmost ancestor."""
+        while event.parent:
+            event = event.parent
+        return event
+
+    def siblings_of(self, event: _ProfilerEvent):
+        """
+        Return ``(previous_siblings, next_siblings)`` of ``event``.
+
+        Events without a parent are looked up among the roots of their thread.
+        """
+        if event.parent:
+            children = event.parent.children
+        else:
+            children = self.tid_root[event.start_tid]
+        index = children.index(event)
+        return children[:index], children[index + 1 :]
+
+    def next_of(self, event: _ProfilerEvent):
+        """Return the sibling right after ``event``, or None if it is the last."""
+        _, next_events = self.siblings_of(event)
+        return next_events[0] if next_events else None
+
+    def prev_of(self, event: _ProfilerEvent):
+        """Return the sibling right before ``event``, or None if it is the first."""
+        prev_events, _ = self.siblings_of(event)
+        return prev_events[-1] if prev_events else None
+
+    def go_up_until(self, event: _ProfilerEvent, predicate):
+        """
+        Walk from ``event`` towards the root until ``predicate(event)`` is true.
+
+        Returns None for a falsy ``event``. When no event on the way satisfies
+        the predicate, the parentless ancestor is returned without the
+        predicate having been evaluated on it.
+        """
+        if not event:
+            return None
+        while event.parent and not predicate(event):
+            event = event.parent
+        return event
+
+
+# Patterns
+
+
+class NamePattern(Pattern):
+    """Pattern that matches events whose name contains a regex hit for ``name``."""
+
+    def __init__(self, prof: profile, name: str, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        # ``name`` doubles as the regular expression applied by match().
+        self.name = name
+        self.description = f"Matched Name Event: {name}"
+
+    def match(self, event: _ProfilerEvent):
+        """Return True when ``re.search`` finds ``self.name`` inside ``event.name``."""
+        hit = re.search(self.name, event.name)
+        return hit is not None
+
+
+class ExtraCUDACopyPattern(Pattern):
+    """
+    This pattern identifies a constant tensor that is created on CPU and
+    immediately moved to GPU.
+    example: torch.zeros((100, 100)).to("cuda")
+
+    Pattern:
+    built-in method                 |built-in method
+        ...                         |    aten::to
+            aten::fill_/aten::zero_ |        aten::_to_copy
+
+    Algorithm:
+    Starting at an aten::to node, verify its aten::_to_copy / aten::copy_ chain,
+    then move to the previous sibling of its parent and descend through the
+    last child at every level, looking for one of the init ops.
+    If any step fails, it is not a match.
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        self.name = "Extra CUDA Copy Pattern"
+        self.description = "Filled a CPU tensor and immediately moved it to GPU. Please initialize it on GPU."
+        self.url = "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#create-tensors-directly-on-the-target-device"
+        # Op names treated as "initialize a tensor in place".
+        self.init_ops = {
+            "aten::fill_",
+            "aten::zero_",
+            "aten::normal_",
+            "aten::uniform_",
+        }
+
+    @property
+    def skip(self):
+        """Skip unless the profile has both with_stack and record_shapes set."""
+        return not (self.prof.with_stack and self.prof.record_shapes)
+
+    def match(self, event):
+        """Return True when ``event`` is an aten::to preceded by an init op."""
+        # TODO: We should also check tensor identities
+        if event.name != "aten::to" or not event.children:
+            return False
+        to_copy_event = event.children[-1]
+        if to_copy_event.name != "aten::_to_copy" or not to_copy_event.children:
+            return False
+        copy_event = to_copy_event.children[-1]
+        if copy_event.name != "aten::copy_":
+            return False
+        # aten::copy_ should have the first 2 args dtype the same
+        dtypes = input_dtypes(copy_event)
+        if len(dtypes) < 2 or dtypes[0] is None or dtypes[0] != dtypes[1]:
+            return False
+        # Up one level from the aten::to event
+        enclosing = event.parent
+        if enclosing is None:
+            return False
+        # Look for an init op along the last-child chain of the previous sibling
+        node = self.prev_of(enclosing)
+        if node is None:
+            return False
+        while node.children:
+            node = node.children[-1]
+            # aten::zero_ is a special optimization case where fill_ is not called
+            if node.name in self.init_ops:
+                return True
+        return node.name in self.init_ops
+        # TODO: Check if tensor is reused
+
+    def benchmark(self, events: List[_ProfilerEvent]):
+        """
+        Map each distinct input shape of ``events`` to a time factor.
+
+        The factor is the mean time of creating a ones tensor directly on
+        "cuda" divided by the mean time of creating it and calling .to("cuda").
+        """
+        shapes_factor_map = dict.fromkeys((input_shapes(e) for e in events), 0.0)
+        for shape in shapes_factor_map:
+            timer_globals = {"size": shape[0]}
+            to_timer = benchmark.Timer(
+                stmt='torch.ones(size).to("cuda")', globals={**timer_globals}
+            )
+            de_timer = benchmark.Timer(
+                stmt='torch.ones(size, device="cuda")', globals={**timer_globals}
+            )
+            to_time = to_timer.timeit(10).mean
+            de_time = de_timer.timeit(10).mean
+            shapes_factor_map[shape] = de_time / to_time
+        return shapes_factor_map
+
+
+class ForLoopIndexingPattern(Pattern):
+    """
+    This pattern identifies a for loop that indexes a tensor element by element
+    where a vectorized operation could be used.
+    example:
+    tensor = torch.empty((100, 100))
+    for i in range(100):
+        tensor[i] = i
+
+    Pattern:
+    aten::select | ... | aten::select | ... (Repeat)
+
+    Algorithm:
+    Starting at an aten::select node, take the ops up to the next aten::select
+    and check how often that op sequence repeats back to back among the
+    following siblings. Ids of counted aten::select events are remembered so
+    the same loop is not matched again.
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        # Ids of aten::select events already counted as part of a match attempt.
+        self.visited: Set[int] = set()
+        self.description = "For loop indexing detected. Vectorization recommended."
+        self.name = "For Loop Indexing Pattern"
+
+    def eventTreeTraversal(self):
+        """
+        We need to use BFS traversal order to avoid duplicate match.
+        """
+        yield from traverse_bfs(self.event_tree)
+
+    def match(self, event: _ProfilerEvent):
+        """Return True when the op group starting at ``event`` repeats at least 10 times."""
+        if event.name != "aten::select" or event.id in self.visited:
+            return False
+        _, following = self.siblings_of(event)
+        if len(following) <= 1:
+            return False
+
+        # Two op lists are the same when they have equal length and equal names
+        def same_ops(ops_a, ops_b):
+            return len(ops_a) == len(ops_b) and all(
+                op_a.name == op_b.name for op_a, op_b in zip(ops_a, ops_b)
+            )
+
+        # Record the ops between two aten::select
+        next_select_idx = index_of_first_match(
+            following, lambda e: e.name == "aten::select"
+        )
+        if next_select_idx is None:
+            return False
+        indexing_ops = [event, *following[:next_select_idx]]
+        stride = len(indexing_ops)
+        following = following[stride - 1 :]
+        repeat_count = 1
+        for start in range(0, len(following), stride):
+            if not same_ops(indexing_ops, following[start : start + stride]):
+                break
+            repeat_count += 1
+            self.visited.add(following[start].id)
+        return repeat_count >= 10
+
+
+class FP32MatMulPattern(Pattern):
+    """
+    This pattern identifies aten::mm calls recorded with TF32 disabled
+    (``allow_tf32_cublas`` is False) on a build whose CUDA architectures are
+    all sm_80 or newer.
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        self.name = "FP32 MatMul Pattern"
+        self.description = (
+            "You are currently using GPU that supports TF32. "
+            "Please enable TF32 by setting 'torch.backends.cuda.matmul.allow_tf32 = True'"
+        )
+        self.url = "https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+
+    @staticmethod
+    def _arch_number(arch: str) -> int:
+        """Return the digits after the last underscore of ``arch`` as an int (0 if none).
+
+        Only digits are kept so that entries with a suffix letter or a longer
+        prefix do not raise ValueError.
+        """
+        digits = "".join(ch for ch in arch.rpartition("_")[2] if ch.isdigit())
+        return int(digits) if digits else 0
+
+    @property
+    def skip(self):
+        """Skip on HIP builds, without TF32-capable architectures, or without record_shapes."""
+        if torch.version.hip is not None:
+            has_tf32 = False
+        else:
+            # Anything less than sm_80 is not Ampere which doesn't support TF32.
+            # An empty arch list must not count as support: all() over nothing
+            # is vacuously True.
+            arch_list = torch.cuda.get_arch_list()
+            has_tf32 = bool(arch_list) and all(
+                self._arch_number(arch) >= 80 for arch in arch_list
+            )
+        return has_tf32 is False or super().skip or not self.prof.record_shapes
+
+    def match(self, event: _ProfilerEvent):
+        """Return True for a TorchOp event named aten::mm with allow_tf32_cublas False."""
+        if event.tag != _EventType.TorchOp:
+            return False
+        assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
+        if event.name == "aten::mm":
+            if event.extra_fields.allow_tf32_cublas is False:
+                return True
+        return False
+
+    def report(self, event: _ProfilerEvent):
+        """Return the description only; no source location is appended."""
+        return self.description
+
+    def benchmark(self, events: List[_ProfilerEvent]):
+        """
+        Map each distinct input shape of ``events`` to ``tf32_time / fp32_time``.
+
+        The timings toggle the process-wide
+        ``torch.backends.cuda.matmul.allow_tf32`` flag; the value found on
+        entry is restored before returning, also when timing raises.
+        """
+        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
+        previous_allow_tf32 = torch.backends.cuda.matmul.allow_tf32
+        try:
+            for shape in shapes_factor_map:
+                matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float32)
+                matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float32)
+                fp32_timer = benchmark.Timer(
+                    stmt="torch.mm(matrixA, matrixB)",
+                    globals={"matrixA": matrixA, "matrixB": matrixB},
+                )
+                tf32_timer = benchmark.Timer(
+                    stmt="torch.mm(matrixA, matrixB)",
+                    setup="torch.backends.cuda.matmul.allow_tf32 = True",
+                    globals={"matrixA": matrixA, "matrixB": matrixB},
+                )
+                # The FP32 run needs the flag off; the TF32 timer's setup
+                # statement switches it on again.
+                torch.backends.cuda.matmul.allow_tf32 = False
+                fp32_time = fp32_timer.timeit(10).mean
+                tf32_time = tf32_timer.timeit(10).mean
+                shapes_factor_map[shape] = tf32_time / fp32_time
+        finally:
+            torch.backends.cuda.matmul.allow_tf32 = previous_allow_tf32
+        return shapes_factor_map
+
+
+class OptimizerSingleTensorPattern(Pattern):
+    """
+    This pattern identifies use of the single-tensor version of an optimizer.
+    example:
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+    Passing foreach=True enables the multi-tensor optimizer, which can give a
+    speedup when the kernels are relatively small.
+
+    Pattern:
+    XXXXX: _single_tensor_<OPTIMIZER_NAME>
+
+    Algorithm:
+    String match
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        self.name = "Optimizer Single Tensor Pattern"
+        self.url = ""
+        self.description = (
+            "Deteced optimizer running with single tensor implementation. "
+            "Please enable multi tensor implementation by passing 'foreach=True' into optimizer."
+        )
+        # Optimizer names whose single-tensor functions are looked for.
+        self.optimizers_with_foreach = ["adam", "sgd", "adamw"]
+
+    def match(self, event: _ProfilerEvent):
+        """Return True when the event name ends with _single_tensor_<optimizer>."""
+        return any(
+            event.name.endswith(f"_single_tensor_{optimizer}")
+            for optimizer in self.optimizers_with_foreach
+        )
+
+
+class SynchronizedDataLoaderPattern(Pattern):
+    """
+    This pattern identifies a DataLoader used with num_workers=0.
+    example:
+    torch.utils.data.DataLoader(dataset, batch_size=batch_size)
+    Add num_workers=N to the arguments. N depends on system configuration.
+
+    Pattern:
+    dataloader.py(...): __iter__
+        dataloader.py(...): _get_iterator
+            NOT dataloader.py(...): check_worker_number_rationality
+
+    Algorithm:
+    When the first call below _get_iterator in the dataloader __iter__ is not
+    check_worker_number_rationality, the dataloader is treated as synchronous.
+
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        self.name = "Synchronized DataLoader Pattern"
+        self.url = (
+            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
+            "#enable-async-data-loading-and-augmentation"
+        )
+        self.description = (
+            "Detected DataLoader running with synchronized implementation. "
+            "Please enable asynchronous dataloading by setting num_workers > 0 when initializing DataLoader."
+        )
+
+    def match(self, event: _ProfilerEvent):
+        """Return True for a dataloader __iter__ call that matches the pattern above."""
+        dataloader_path = os.path.join("torch", "utils", "data", "dataloader.py")
+
+        def is_dataloader_function(name: str, function_name: str):
+            if not name.startswith(dataloader_path):
+                return False
+            return name.endswith(function_name)
+
+        # TODO: fixme! Due to lifetime issues of the function name, this field might
+        # actually point to an already freed string when the event is a PyCall.
+        # Just silently skip this to unblock testing.
+        try:
+            event.name
+        except UnicodeDecodeError:
+            return False
+
+        if not is_dataloader_function(event.name, "__iter__") or not event.children:
+            return False
+        get_iterator_event = event.children[0]
+        if not is_dataloader_function(get_iterator_event.name, "_get_iterator"):
+            return False
+        if not get_iterator_event.children:
+            return False
+        first_call = get_iterator_event.children[0]
+        # TODO: We should also check if the loader is bottleneck.
+        return not is_dataloader_function(
+            first_call.name, "check_worker_number_rationality"
+        )
+        # TODO: We should also check if the loader is bottleneck.
+
+
+class GradNotSetToNonePattern(Pattern):
+    """
+    This pattern identifies zero_grad calls that zero the gradients instead of
+    setting them to None.
+    example:
+    optimizer.zero_grad()
+    By setting set_to_none=True, we can gain speedup
+
+    Pattern:
+    XXXXX: zero_grad
+        NOT aten::zeros
+            aten::zero_
+
+    aten::zero_ is called on each parameter in the model.
+    We also want to make sure it is not called by aten::zeros.
+
+    Algorithm:
+    String match
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        self.name = "Gradient Set To Zero Instead of None Pattern"
+        self.url = (
+            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
+            "#disable-gradient-calculation-for-validation-or-inference"
+        )
+        self.description = (
+            "Detected gradient set to zero instead of None. "
+            "Please add 'set_to_none=True' when calling zero_grad()."
+        )
+
+    def match(self, event: _ProfilerEvent):
+        """Return True when a zero_grad event contains an aten::zero_ not parented by aten::zeros."""
+        if not event.name.endswith(": zero_grad") or not event.children:
+            return False
+
+        # TODO: We should also check if the optimizer's numerical behavior will change.
+        return any(
+            sub_event.name == "aten::zero_" and sub_event.parent.name != "aten::zeros"
+            for sub_event in traverse_dfs(event.children)
+        )
+
+
+class Conv2dBiasFollowedByBatchNorm2dPattern(Pattern):
+    """
+    This pattern identifies a Conv2d with bias enabled that is followed by BatchNorm2d.
+    Bias doesn't do anything when followed by batchnorm.
+    Pattern:
+    nn.Module: Conv2d            | nn.Module: BatchNorm2d
+        ...
+            aten::conv2d AND dtype of third argument is not null
+    The third argument is the bias
+    Algorithm:
+    String match
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        self.name = "Enabling Bias in Conv2d Followed By BatchNorm Pattern"
+        self.url = (
+            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
+            "#disable-bias-for-convolutions-directly-followed-by-a-batch-norm"
+        )
+        self.description = "Detected bias enabled in Conv2d that is followed by BatchNorm2d. Please set 'bias=False' in Conv2d."
+
+    @property
+    def skip(self):
+        """Skip when record_shapes is exactly False, otherwise defer to the base class."""
+        if self.prof.record_shapes is False:
+            return True
+        return super().skip
+
+    def match(self, event: _ProfilerEvent):
+        """Return True for an aten::conv2d with a bias dtype whose module is followed by BatchNorm2d."""
+        if event.name != "aten::conv2d":
+            return False
+        dtypes = input_dtypes(event)
+        # A missing or None third dtype means no bias was passed
+        if len(dtypes) < 3 or dtypes[2] is None:
+            return False
+        # This means bias=True
+        conv_module = self.go_up_until(
+            event, lambda e: e.name.startswith("nn.Module: Conv2d")
+        )
+        if not conv_module:
+            return False
+        follower = self.next_of(conv_module)
+        if not follower:
+            return False
+        return follower.name.startswith("nn.Module: BatchNorm2d")
+
+
+class MatMulDimInFP16Pattern(Pattern):
+    """
+    This pattern identifies aten::mm / aten::bmm / aten::addmm calls whose first
+    input is bfloat16 or half and whose trailing input dimensions are not all
+    multiples of 8.
+    """
+
+    def __init__(self, prof: profile, should_benchmark: bool = False):
+        super().__init__(prof, should_benchmark)
+        self.name = "Matrix Multiplication Dimension Not Aligned Pattern"
+        self.url = "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#use-mixed-precision-and-amp"
+        self.description = "Detected matmul with dimension not aligned. Please use matmul with aligned dimension."
+
+    @property
+    def skip(self):
+        """Skip unless the profile has both with_stack and record_shapes set."""
+        return not (self.prof.with_stack and self.prof.record_shapes)
+
+    def match(self, event: _ProfilerEvent):
+        """Return True for a half-precision matmul with an unaligned dimension."""
+
+        # Only the last two dimensions of every input shape are inspected
+        def is_aligned(shapes, multiple):
+            return all(dim % multiple == 0 for shape in shapes for dim in shape[-2:])
+
+        if event.name not in ("aten::mm", "aten::bmm", "aten::addmm"):
+            return False
+        dtypes = input_dtypes(event)
+        if not dtypes:
+            return False
+        if dtypes[0] not in (torch.bfloat16, torch.half):
+            return False
+        return not is_aligned(input_shapes(event), 8)
+
+    def benchmark(self, events: List[_ProfilerEvent]):
+        """
+        Map each distinct input shape of ``events`` to
+        ``aligned_dim_time / not_aligned_dim_time`` measured with torch.mm on
+        float16 "cuda" tensors.
+        """
+
+        # Round every dimension up to the next multiple
+        def closest_multiple(shapes, multiple):
+            return [multiple * math.ceil(dim / multiple) for dim in shapes]
+
+        shapes_factor_map = dict.fromkeys((input_shapes(e) for e in events), 0.0)
+        for shape in shapes_factor_map:
+            shape_a, shape_b = shape[0], shape[1]
+            # Operands with the recorded shapes
+            matrixA = torch.randn(shape_a, device="cuda", dtype=torch.float16)
+            matrixB = torch.randn(shape_b, device="cuda", dtype=torch.float16)
+            not_aligned_dim_timer = benchmark.Timer(
+                stmt="torch.mm(matrixA, matrixB)",
+                globals={"matrixA": matrixA, "matrixB": matrixB},
+            )
+            # Operands with every dimension rounded up to a multiple of 8
+            matrixA = torch.randn(
+                closest_multiple(shape_a, 8), device="cuda", dtype=torch.float16
+            )
+            matrixB = torch.randn(
+                closest_multiple(shape_b, 8), device="cuda", dtype=torch.float16
+            )
+            aligned_dim_timer = benchmark.Timer(
+                stmt="torch.mm(matrixA, matrixB)",
+                globals={"matrixA": matrixA, "matrixB": matrixB},
+            )
+            not_aligned_dim_time = not_aligned_dim_timer.timeit(10).mean
+            aligned_dim_time = aligned_dim_timer.timeit(10).mean
+            shapes_factor_map[shape] = aligned_dim_time / not_aligned_dim_time
+        return shapes_factor_map
+
+
+def source_code_location(event: Optional[_ProfilerEvent]):
+    """
+    Return "<file>:<line>" of the nearest Python call outside of torch.
+
+    Walks from ``event`` up through its parents and uses the caller of the
+    first PyCall/PyCCall event whose caller file name does not start with
+    "torch" followed by the path separator. When there is no such event, the
+    text "No source code location found" is returned.
+    """
+    python_call_tags = (_EventType.PyCall, _EventType.PyCCall)
+    torch_prefix = "torch" + os.sep
+    node = event
+    while node:
+        if node.tag in python_call_tags:
+            assert isinstance(
+                node.extra_fields, (_ExtraFields_PyCall, _ExtraFields_PyCCall)
+            )
+            caller = node.extra_fields.caller
+            if not caller.file_name.startswith(torch_prefix):
+                return f"{caller.file_name}:{caller.line_number}"
+        node = node.parent
+    return "No source code location found"
+
+
+def input_shapes(event: _ProfilerEvent):
+    """Return a tuple with the ``sizes`` of every input as a tuple (``()`` when absent).
+
+    Raises:
+        AssertionError: if ``event.extra_fields`` is not an _ExtraFields_TorchOp.
+    """
+    fields = event.extra_fields
+    assert isinstance(fields, _ExtraFields_TorchOp)
+    shapes = [tuple(getattr(arg, "sizes", ())) for arg in fields.inputs]
+    return tuple(shapes)
+
+
+def input_dtypes(event: _ProfilerEvent):
+    """Return a tuple with the ``dtype`` of every input (``None`` when absent).
+
+    Raises:
+        AssertionError: if ``event.extra_fields`` is not an _ExtraFields_TorchOp.
+    """
+    fields = event.extra_fields
+    assert isinstance(fields, _ExtraFields_TorchOp)
+    dtypes = [getattr(arg, "dtype", None) for arg in fields.inputs]
+    return tuple(dtypes)
+
+
+def report_all_anti_patterns(
+    prof,
+    should_benchmark: bool = False,
+    print_enable: bool = True,
+    json_report_dir: Optional[str] = None,
+):
+    """
+    Run every enabled anti-pattern against ``prof`` and report the matches.
+
+    Args:
+        prof: profile handed to each pattern's constructor.
+        should_benchmark: forwarded to each pattern.
+        print_enable: when True, the assembled text report is printed.
+        json_report_dir: when not None, the per-file findings are merged into
+            "torchtidy_report.json" inside this directory.
+
+    Matches whose source location cannot be split into a file and a numeric
+    line are still listed in the text report but are left out of the JSON data.
+    """
+    report_dict: Dict = {}
+    anti_patterns = [
+        ExtraCUDACopyPattern(prof, should_benchmark),
+        # ForLoopIndexingPattern(prof, should_benchmark),
+        FP32MatMulPattern(prof, should_benchmark),
+        OptimizerSingleTensorPattern(prof, should_benchmark),
+        SynchronizedDataLoaderPattern(prof, should_benchmark),
+        GradNotSetToNonePattern(prof, should_benchmark),
+        Conv2dBiasFollowedByBatchNorm2dPattern(prof, should_benchmark),
+        MatMulDimInFP16Pattern(prof, should_benchmark),
+    ]
+    reported = set()
+    summaries = []
+    message_list = [f"{'-'*40}TorchTidy Report{'-'*40}"]
+    message_list.append("Matched Events:")
+
+    for anti_pattern in anti_patterns:
+        matched_events = anti_pattern.matched_events()
+        if not matched_events:
+            continue
+        summaries.append(anti_pattern.summary(matched_events))
+        for event in matched_events:
+            report_msg = anti_pattern.report(event)
+            if report_msg in reported:
+                continue
+            message_list.append(report_msg)
+            reported.add(report_msg)
+            # Split on the LAST colon: a file name may itself contain one
+            # (e.g. a Windows drive letter), and the "no location" text has
+            # none at all. A plain split(":") raised ValueError in both cases.
+            src_location, sep, line_no = source_code_location(event).rpartition(":")
+            if not sep or not line_no.isdigit():
+                continue
+            report_dict.setdefault(src_location, []).append(
+                {
+                    "line_number": int(line_no),
+                    "name": anti_pattern.name,
+                    "url": anti_pattern.url,
+                    "message": anti_pattern.description,
+                }
+            )
+
+    if json_report_dir is not None:
+        json_report_path = os.path.join(json_report_dir, "torchtidy_report.json")
+        if os.path.exists(json_report_path):
+            # Entries from this run replace those of an existing report per file.
+            with open(json_report_path) as f:
+                existing_report = json.load(f)
+            existing_report.update(report_dict)
+            report_dict = existing_report
+        with open(json_report_path, "w") as f:
+            json.dump(report_dict, f, indent=4)
+
+    message_list.append("Summary:")
+    message_list += summaries
+    message_list.append(f"{'-'*40}TorchTidy Report{'-'*40}")
+    if print_enable:
+        print("\n".join(message_list))
--- a/rl/Lib/site-packages/torch/profiler/_utils.py
+++ b/rl/Lib/site-packages/torch/profiler/_utils.py
@ -0,0 +1,385 @@
+# mypy: allow-untyped-defs
+import functools
+import operator
+import re
+from collections import deque
+from dataclasses import dataclass
+from typing import Dict, List, TYPE_CHECKING
+
+from torch.autograd.profiler import profile
+from torch.profiler import DeviceType
+
+
+if TYPE_CHECKING:
+    from torch.autograd import _KinetoEvent
+
+
+def _traverse(tree, next_fn, children_fn=lambda x: x.children, reverse: bool = False):
+    order = reversed if reverse else lambda x: x
+    remaining = deque(order(tree))
+    while remaining:
+        curr_event = next_fn(remaining)
+        yield curr_event
+        for child_event in order(children_fn(curr_event)):
+            remaining.append(child_event)
+
+
+traverse_dfs = functools.partial(_traverse, next_fn=lambda x: x.pop(), reverse=True)
+traverse_bfs = functools.partial(
+    _traverse, next_fn=lambda x: x.popleft(), reverse=False
+)
+
+
+@dataclass
+class EventMetrics:
+    duration_time_ns: int = 0
+    self_time_ns: int = 0
+    idle_time_ns: int = 0
+    queue_depth: int = 0
+
+    @property
+    def fraction_idle_time(self):
+        if self.duration_time_ns == 0:
+            return 0.0
+        return self.idle_time_ns / self.duration_time_ns
+
+
+@dataclass
+class Interval:
+    """A time span with an optional queue-depth sample attached.
+
+    In this file the bounds are filled with nanosecond timestamps
+    (see ``BasicEvaluation.compute_queue_depth`` / ``compute_idle_time``).
+    """
+
+    # Start of the span.
+    start: int
+    # End of the span.
+    end: int
+    # Queue depth observed for this span; left at 0 when not applicable.
+    queue_depth: int = 0
+
+
+class EventKey:
+    def __init__(self, event):
+        self.event = event
+
+    def __hash__(self):
+        return hash(self.event.id)
+
+    def __eq__(self, other):
+        return self.event.id == other.event.id
+
+    def __repr__(self):
+        return f"{self.event.name}"
+
+    def intervals_overlap(self, intervals: List[Interval]):
+        overlap_time = 0
+        intervals = sorted(intervals, key=lambda x: x.start)
+
+        if intervals:
+            overlap_start = max(self.event.start_time_ns, intervals[0].start)
+            overlap_end = min(self.event.end_time_ns, intervals[0].end)
+
+            if overlap_start < overlap_end:
+                overlap_time += overlap_end - overlap_start
+
+        i, j = 0, 1
+        while j < len(intervals):
+            prev_interval = intervals[i]
+            curr_interval = intervals[j]
+            j += 1
+            if prev_interval.end > curr_interval.start:
+                # Completely subsumed by previous interval
+                if prev_interval.end > curr_interval.end:
+                    j += 1
+                    continue
+                else:
+                    curr_interval.start = prev_interval.end
+                    i = j
+
+            overlap_start = max(self.event.start_time_ns, curr_interval.start)
+            overlap_end = min(self.event.end_time_ns, curr_interval.end)
+            if overlap_start < overlap_end:
+                overlap_time += overlap_end - overlap_start
+
+        return overlap_time
+
+
+class BasicEvaluation:
+    """Derive per-event metrics (self time, queue depth, idle time) from a profile.
+
+    Construction eagerly runs the whole analysis: ``self.metrics`` maps an
+    ``EventKey`` for every node of the profiler's event tree to its
+    ``EventMetrics``; ``self.queue_depth_list`` holds one ``Interval`` per
+    CUDA launch/kernel event.
+    """
+
+    def __init__(self, prof: profile):
+        # The order of the calls below matters: each step reads state
+        # produced by the previous one.
+        self.profile = prof
+        self.metrics: Dict[EventKey, EventMetrics] = {}
+        self.compute_self_time()
+        # Event-tree nodes ordered by start time.
+        self.event_keys = sorted(
+            (e for e in self.metrics.keys()), key=lambda x: x.event.start_time_ns
+        )
+        self.events = [e.event for e in self.event_keys]
+        # Filled in by compute_queue_depth().
+        self.cuda_events: List[_KinetoEvent] = []
+        self.queue_depth_list = self.compute_queue_depth()
+        self.compute_idle_time()
+
+    def compute_self_time(self):
+        """
+        Computes event's self time(total time - time in child ops).
+
+        Populates ``self.metrics`` with one ``EventMetrics`` per event-tree
+        node, setting ``self_time_ns`` and ``duration_time_ns``.
+        """
+        assert self.profile.kineto_results is not None
+        stack = deque(self.profile.kineto_results.experimental_event_tree())
+
+        # standard iterating dfs
+        while stack:
+            curr_event = stack.pop()
+            self_time = curr_event.duration_time_ns
+            for child_event in curr_event.children:
+                self_time -= child_event.duration_time_ns
+                stack.append(child_event)
+            # Every event id is expected to appear only once in the tree.
+            assert (
+                EventKey(curr_event) not in self.metrics
+            ), f"Duplicate id: {curr_event.id}, {curr_event.name}"
+            self.metrics[EventKey(curr_event)] = EventMetrics(self_time_ns=self_time)
+            self.metrics[
+                EventKey(curr_event)
+            ].duration_time_ns = curr_event.duration_time_ns
+
+    def compute_queue_depth(self):
+        """
+        Computes queue_depth at each event. This will calculate the queue depth data for
+        All the events in the tree.
+        This will return a list of Interval of queue depth data of cuda launch and kernels.
+
+        Side effects: sets ``self.cuda_events`` and writes ``queue_depth`` into
+        ``self.metrics`` for every event-tree node.
+        """
+        assert self.profile.kineto_results is not None
+        cuda_event_list = self.profile.kineto_results.events()
+
+        def is_cuda_launch_kernel(e):
+            # TODO: find a better way to identify cudaLaunchKernel
+            return e.name == "cudaLaunchKernel"
+
+        def is_cuda_kernel(e):
+            # TODO: find a better way to identify CUDA Kernel
+            return e.device_type() == DeviceType.CUDA and "mem" not in e.name.lower()
+
+        cuda_launch_events = sorted(
+            (e for e in cuda_event_list if is_cuda_launch_kernel(e)),
+            key=lambda x: x.start_ns(),
+        )
+        cuda_kernel_events = sorted(
+            (e for e in cuda_event_list if is_cuda_kernel(e)),
+            key=lambda x: x.start_ns(),
+        )
+
+        self.cuda_events = sorted(
+            cuda_launch_events + cuda_kernel_events, key=lambda x: x.start_ns()
+        )
+
+        # Map each launch event to the index (in cuda_kernel_events) of the
+        # first kernel sharing its linked correlation id, or None if no match.
+        # The search resumes from the last successful match.
+        kernel_mapping: Dict[_KinetoEvent, int] = {}
+        last_mapped_kernel = 0
+        for cuda_launch_event in cuda_launch_events:
+            index = index_of_first_match(
+                cuda_kernel_events,
+                lambda x: x.linked_correlation_id()
+                == cuda_launch_event.linked_correlation_id(),
+                start=last_mapped_kernel,
+            )
+            kernel_mapping[cuda_launch_event] = index
+            last_mapped_kernel = index if index is not None else last_mapped_kernel
+
+        # current_kernel_index: number of kernels that started at or before the
+        # event being visited; spawned_kernel_index: index of the kernel mapped
+        # to the most recently visited launch event.
+        current_kernel_index = 0
+        spawned_kernel_index = -1
+
+        all_events = cuda_launch_events + cuda_kernel_events + self.events
+
+        def new_old_event_comparator(event):
+            # Sort key: start time in ns, whichever accessor the event offers.
+            if hasattr(event, "start_us"):
+                return event.start_us() * 1000
+            if hasattr(event, "start_ns"):
+                return event.start_ns()
+            if hasattr(event, "start_time_ns"):
+                return event.start_time_ns
+            raise Exception("Unknown Event Type")  # noqa: TRY002
+
+        queue_depth_list: List[Interval] = []
+        all_events.sort(key=new_old_event_comparator)
+        for event in all_events:
+            # Find latest cuda kernel event
+            if hasattr(event, "start_us"):
+                start_time = event.start_us() * 1000
+                end_time = (event.start_us() + event.duration_us()) * 1000
+                # Find current spawned cuda kernel event
+                if event in kernel_mapping and kernel_mapping[event] is not None:
+                    spawned_kernel_index = kernel_mapping[event]
+            # NOTE(review): this is `if`, not `elif`, so an event exposing both
+            # start_us and start_ns has its times overwritten here.
+            if hasattr(event, "start_ns"):
+                start_time = event.start_ns()
+                end_time = event.start_ns() + event.duration_ns()
+                # Find current spawned cuda kernel event
+                if event in kernel_mapping and kernel_mapping[event] is not None:
+                    spawned_kernel_index = kernel_mapping[event]
+            elif hasattr(event, "start_time_ns"):
+                start_time = event.start_time_ns  # type: ignore[attr-defined]
+                end_time = event.end_time_ns  # type: ignore[attr-defined]
+
+            # Skip past every kernel that has already started by start_time.
+            while (
+                current_kernel_index < len(cuda_kernel_events)
+                and (cuda_kernel_events[current_kernel_index].start_ns())
+                <= start_time  # type: ignore[possibly-undefined]
+            ):
+                current_kernel_index += 1
+            # Launched-but-not-yet-started kernels, clamped at zero.
+            current_queue_depth = spawned_kernel_index - current_kernel_index + 1
+            current_queue_depth = max(current_queue_depth, 0)
+
+            # Launch/kernel events become intervals of the returned list;
+            # event-tree nodes get the depth stored in their metrics.
+            if hasattr(event, "start_us") or hasattr(event, "start_ns"):
+                queue_depth_list.append(
+                    Interval(start_time, end_time, current_queue_depth)  # type: ignore[possibly-undefined]
+                )
+            elif hasattr(event, "start_time_ns"):
+                self.metrics[EventKey(event)].queue_depth = current_queue_depth
+
+        return queue_depth_list
+
+    def compute_idle_time(self):
+        """
+        Computes idle time of the profile.
+
+        Builds idle intervals from ``self.queue_depth_list`` and stores, for
+        every event in ``self.metrics``, its overlap with them in ``idle_time_ns``.
+        """
+        # Based on queue_depth_list, we can calculate idle time for all the events
+        idle = False
+        idle_start = 0
+        idle_intervals: List[Interval] = []
+        if self.queue_depth_list and self.events:
+            # Time before the first and after the last queue-depth sample.
+            idle_intervals += [
+                Interval(self.events[0].start_time_ns, self.queue_depth_list[0].start),
+                Interval(self.queue_depth_list[-1].end, self.events[-1].end_time_ns),
+            ]
+
+        # An idle interval runs from the end of the first zero-depth sample
+        # to the start of the next sample with a positive depth.
+        for data_point in self.queue_depth_list:
+            if data_point.queue_depth == 0 and not idle:
+                idle_start = data_point.end
+                idle = True
+            if data_point.queue_depth > 0 and idle:
+                idle_intervals.append(Interval(idle_start, data_point.start))
+                idle = False
+
+        event_list = [e.event for e in self.metrics.keys()]
+        for event in event_list:
+            self.metrics[EventKey(event)].idle_time_ns = EventKey(
+                event
+            ).intervals_overlap(idle_intervals)
+
+    def rank_events(self, length):
+        """
+        Filter and Rank the events based on some heuristics:
+        1) Events that are in the falling phase of the queue depth.
+        2) Events that have a high idle_time, self_time difference.
+
+        Parameters:
+            length: The number of events to return.
+
+        Returns:
+            A list of at most ``length`` ``EventKey`` objects, best score first
+            (empty when no event overlaps a falling interval).
+        """
+
+        # Find the interval when qd is falling to 0
+        import torch
+
+        # Work on the samples in reverse chronological order.
+        queue_depth_list = list(reversed(self.queue_depth_list))
+        qd_values = [e.queue_depth for e in queue_depth_list]
+
+        bottom_threashold = 0
+        top_threashold = 4
+        decrease_interval = []
+        i = 0
+        while i < len(qd_values):
+            # Only positions where the depth is at the bottom are candidates.
+            if qd_values[i] > bottom_threashold:
+                i += 1
+                continue
+            for j in range(i + 1, len(qd_values)):
+                # Find next zero and if the max value between them exceeds
+                # the threshold, then we have a falling interval
+                next_minimum_idx = index_of_first_match(
+                    qd_values, lambda x: x <= bottom_threashold, start=j
+                )
+                peak_idx = argmax(qd_values, start=j, end=next_minimum_idx)
+
+                # if is a valid peak, we add to list and continue
+                if peak_idx is not None and qd_values[peak_idx] >= top_threashold:
+                    decrease_interval.append(
+                        Interval(
+                            queue_depth_list[peak_idx].start, queue_depth_list[i].start
+                        )
+                    )
+                    i = next_minimum_idx if next_minimum_idx is not None else i
+                    break
+            i += 1
+        # Filter out events that are not in the decrease interval
+        event_list = [
+            event
+            for event in self.metrics.keys()
+            if event.intervals_overlap(decrease_interval)
+        ]
+        if event_list:
+            self_time = torch.tensor(
+                [self.metrics[event].self_time_ns for event in event_list],
+                dtype=torch.float32,
+            )
+            idle_time = torch.tensor(
+                [self.metrics[event].fraction_idle_time for event in event_list],
+                dtype=torch.float32,
+            )
+            # Standardize both signals, then combine them with a fixed 0.6
+            # weight on self time.
+            normalized_gain = (idle_time - torch.mean(idle_time)) / torch.std(idle_time)
+            normalized_self = (self_time - torch.mean(self_time)) / torch.std(self_time)
+            heuristic_score_list = normalized_gain + 0.6 * normalized_self
+
+            # Sort events by heuristic
+            event_list = [
+                event
+                for _, event in sorted(
+                    zip(heuristic_score_list, event_list),
+                    key=operator.itemgetter(0),
+                    reverse=True,
+                )
+            ]
+            event_list = event_list[:length]
+        return event_list
+
+    def get_optimizable_events(self, length: int = 1, print_enable: bool = True):
+        """Return the top ``length`` events from ``rank_events``.
+
+        When ``print_enable`` is True, a report with each event's name, source
+        code location and idle-time percentage is also printed.
+        """
+        event_list = self.rank_events(length)
+        if not print_enable:
+            return event_list
+        output = "Optimizable events:\n" if event_list else "No events to optimize\n"
+
+        output += "\n".join(
+            [
+                f"""{'-'*80}
+Event:                {event}
+Source code location: {source_code_location(event.event)}
+Percentage idle time: {self.metrics[event].fraction_idle_time * 100:.2f}%
+{'-'*80}"""
+                for event in event_list
+            ]
+        )
+        # Always true here: the print_enable=False case returned above.
+        if print_enable:
+            print(output)
+        return event_list
+
+
+def index_of_first_match(seq, predicate, start=0, end=None):
+    if end is None or end >= len(seq):
+        end = len(seq)
+    for i in range(start, end):
+        if predicate(seq[i]):
+            return i
+    return None
+
+
+def argmax(seq, key=lambda x: x, start=0, end=None):
+    seq = seq[start:end]
+    if len(seq) == 0:
+        return None
+    return seq.index(max(seq, key=key)) + start
+
+
+def source_code_location(event):
+    while event is not None:
+        match = re.search(r"\.py\(.*\)", event.name)
+        if match is None:
+            event = event.parent
+            continue
+        return event.name
+    return "No source code location found"
+
+
+# Provide an OSS workaround for cudagraphs + CUPTI issue
+# https://github.com/pytorch/pytorch/issues/75504
+# TODO(dberard) - deprecate / remove workaround for CUDA >= 12, when
+# we stop supporting older CUDA versions.
+def _init_for_cuda_graphs():
+    """Enter and immediately exit an empty ``torch.autograd.profiler.profile`` context.
+
+    Nothing is profiled; the call exists only for the side effects of
+    starting and stopping the profiler once (see the workaround note above
+    this function).
+    """
+    # Imported locally; this intentionally rebinds ``profile`` inside the function.
+    from torch.autograd.profiler import profile
+
+    with profile():
+        pass
--- a/rl/Lib/site-packages/torch/profiler/itt.py
+++ b/rl/Lib/site-packages/torch/profiler/itt.py
@ -0,0 +1,80 @@
+# mypy: allow-untyped-defs
+from contextlib import contextmanager
+
+
+try:
+    from torch._C import _itt
+except ImportError:
+
+    class _ITTStub:
+        @staticmethod
+        def _fail(*args, **kwargs):
+            raise RuntimeError(
+                "ITT functions not installed. Are you sure you have a ITT build?"
+            )
+
+        @staticmethod
+        def is_available():
+            return False
+
+        rangePush = _fail
+        rangePop = _fail
+        mark = _fail
+
+    _itt = _ITTStub()  # type: ignore[assignment]
+
+
+# Public API of this module. Note that ``range`` shadows the builtin of the
+# same name for any code placed after its definition in this module.
+__all__ = ["is_available", "range_push", "range_pop", "mark", "range"]
+
+
+def is_available():
+    """
+    Return whether the ITT feature is available.
+
+    Delegates to ``_itt.is_available()``; when the ``torch._C._itt`` import
+    failed, the fallback stub always answers ``False``.
+    """
+    return _itt.is_available()
+
+
+def range_push(msg):
+    """
+    Pushes a range onto a stack of nested range spans.  Returns the zero-based
+    depth of the range that is started.
+
+    Arguments:
+        msg (str): ASCII message to associate with range
+
+    Raises:
+        RuntimeError: if the ITT bindings are not part of this build
+            (the fallback stub rejects the call).
+    """
+    return _itt.rangePush(msg)
+
+
+def range_pop():
+    """
+    Pops a range off of a stack of nested range spans. Returns the
+    zero-based depth of the range that is ended.
+
+    Raises:
+        RuntimeError: if the ITT bindings are not part of this build
+            (the fallback stub rejects the call).
+    """
+    return _itt.rangePop()
+
+
+def mark(msg):
+    """
+    Describe an instantaneous event that occurred at some point.
+
+    Arguments:
+        msg (str): ASCII message to associate with the event.
+
+    Raises:
+        RuntimeError: if the ITT bindings are not part of this build
+            (the fallback stub rejects the call).
+    """
+    return _itt.mark(msg)
+
+
+@contextmanager
+def range(msg, *args, **kwargs):
+    """
+    Context manager / decorator that pushes an ITT range at the beginning
+    of its scope, and pops it at the end. If extra arguments are given,
+    they are passed as arguments to msg.format().
+
+    Args:
+        msg (str): message to associate with the range
+    """
+    range_push(msg.format(*args, **kwargs))
+    try:
+        yield
+    finally:
+        range_pop()
--- a/rl/Lib/site-packages/torch/profiler/profiler.py
+++ b/rl/Lib/site-packages/torch/profiler/profiler.py
@ -0,0 +1,935 @@
+# mypy: allow-untyped-defs
+import gzip
+import json
+import os
+import shutil
+import tempfile
+from abc import ABC, abstractmethod
+from enum import Enum
+from functools import partial
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing_extensions import Self
+from warnings import warn
+
+import torch
+import torch.autograd.profiler as prof
+from torch._C import _get_privateuse1_backend_name
+from torch._C._profiler import (
+    _add_execution_trace_observer,
+    _disable_execution_trace_observer,
+    _enable_execution_trace_observer,
+    _ExperimentalConfig,
+    _remove_execution_trace_observer,
+)
+from torch.autograd import kineto_available, ProfilerActivity
+from torch.profiler._memory_profiler import MemoryProfile, MemoryProfileTimeline
+
+
+# Names exported by a star-import of this module.
+__all__ = [
+    "supported_activities",
+    "ProfilerAction",
+    "schedule",
+    "tensorboard_trace_handler",
+    "profile",
+    "ExecutionTraceObserver",
+]
+# NOTE(review): presumably the label prefix for per-step profiler ranges; it is
+# not referenced in the part of the file reviewed here - confirm at its use site.
+PROFILER_STEP_NAME = "ProfilerStep"
+
+
+def supported_activities():
+    """
+    Returns a set of supported profiler tracing activities.
+
+    Note: profiler uses CUPTI library to trace on-device CUDA kernels.
+    In case when CUDA is enabled but CUPTI is not available, passing
+    ``ProfilerActivity.CUDA`` to profiler results in using the legacy CUDA
+    profiling code (same as in the legacy ``torch.autograd.profiler``).
+    This, in turn, results in including CUDA time in the profiler table output,
+    but not in the JSON trace.
+    """
+    return torch.autograd._supported_activities()
+
+
+class _ITraceObserver(ABC):
+    """Abstract interface for a Trace observer.
+    This satisfies 3 methods: start, stop and cleanup"""
+
+    @abstractmethod
+    def start(self):
+        pass
+
+    @abstractmethod
+    def stop(self):
+        pass
+
+    @abstractmethod
+    def cleanup(self):
+        pass
+
+
+class _KinetoProfile:
+    """Low-level profiler wrap the autograd profile
+
+    Args:
+        activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
+            ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``,
+            ``torch.profiler.ProfilerActivity.XPU``.
+            Default value: ProfilerActivity.CPU and (when available) ProfilerActivity.CUDA
+            or (when available) ProfilerActivity.XPU.
+        record_shapes (bool): save information about operator's input shapes.
+        profile_memory (bool): track tensor memory allocation/deallocation (see ``export_memory_timeline``
+            for more details).
+        with_stack (bool): record source information (file and line number) for the ops.
+        with_flops (bool): use formula to estimate the FLOPS of specific operators
+            (matrix multiplication and 2D convolution).
+        with_modules (bool): record module hierarchy (including function names)
+            corresponding to the callstack of the op. e.g. If module A's forward call's
+            module B's forward which contains an aten::add op,
+            then aten::add's module hierarchy is A.B
+            Note that this support exist, at the moment, only for TorchScript models
+            and not eager mode models.
+        experimental_config (_ExperimentalConfig) : A set of experimental options
+            used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed.
+        execution_trace_observer (ExecutionTraceObserver) : A PyTorch Execution Trace Observer object.
+            `PyTorch Execution Traces <https://arxiv.org/pdf/2305.14516.pdf>`__ offer a graph based
+            representation of AI/ML workloads and enable replay benchmarks, simulators, and emulators.
+            When this argument is included the observer start() and stop() will be called for the
+            same time window as PyTorch profiler.
+        acc_events (bool): Enable the accumulation of FunctionEvents across multiple profiling cycles
+
+
+    .. note::
+        This API is experimental and subject to change in the future.
+
+        Enabling shape and stack tracing results in additional overhead.
+        When record_shapes=True is specified, profiler will temporarily hold references to the tensors;
+        that may further prevent certain optimizations that depend on the reference count and introduce
+        extra tensor copies.
+    """
+
+    def __init__(
+        self,
+        *,
+        activities: Optional[Iterable[ProfilerActivity]] = None,
+        record_shapes: bool = False,
+        profile_memory: bool = False,
+        with_stack: bool = False,
+        with_flops: bool = False,
+        with_modules: bool = False,
+        experimental_config: Optional[_ExperimentalConfig] = None,
+        execution_trace_observer: Optional[_ITraceObserver] = None,
+        acc_events: bool = False,
+    ):
+        self.activities = set(activities) if activities else supported_activities()
+        self.record_shapes = record_shapes
+        self.with_flops = with_flops
+        self.profile_memory = profile_memory
+        self.with_stack = with_stack
+        self.with_modules = with_modules
+        self.experimental_config = experimental_config
+        self.execution_trace_observer = execution_trace_observer
+        self.acc_events = acc_events
+        self.profiler: Optional[prof.profile] = None
+        self.mem_tl: Optional[MemoryProfileTimeline] = None
+        self.use_device = None
+        if ProfilerActivity.CUDA in self.activities:
+            self.use_device = "cuda"
+        elif ProfilerActivity.XPU in self.activities:
+            self.use_device = "xpu"
+        elif ProfilerActivity.MTIA in self.activities:
+            self.use_device = "mtia"
+        elif ProfilerActivity.PrivateUse1 in self.activities:
+            self.use_device = _get_privateuse1_backend_name()
+
+        # user-defined metadata to be amended to the trace
+        self.preset_metadata: Dict[str, str] = {}
+
+    def start(self):
+        self.prepare_trace()
+        self.start_trace()
+
+    def stop(self):
+        self.stop_trace()
+
+    def prepare_trace(self):
+        if (self.profiler is None) or (not self.acc_events):
+            self.profiler = prof.profile(
+                use_cpu=(ProfilerActivity.CPU in self.activities),
+                use_device=self.use_device,
+                record_shapes=self.record_shapes,
+                with_flops=self.with_flops,
+                profile_memory=self.profile_memory,
+                with_stack=self.with_stack,
+                with_modules=self.with_modules,
+                use_kineto=True,
+                experimental_config=self.experimental_config,
+                acc_events=self.acc_events,
+            )
+        self.profiler._prepare_trace()
+
+    def start_trace(self):
+        if self.execution_trace_observer:
+            self.execution_trace_observer.start()
+        assert self.profiler is not None
+        self.profiler._start_trace()
+
+        if self.profile_memory:
+            self.add_metadata_json("profile_memory", "1")
+        if self.with_stack:
+            self.add_metadata_json("with_stack", "1")
+        if self.record_shapes:
+            self.add_metadata_json("record_shapes", "1")
+        if self.with_modules:
+            self.add_metadata_json("with_modules", "1")
+        if self.with_flops:
+            self.add_metadata_json("with_flops", "1")
+
+        if kineto_available():
+            dist_info = self._get_distributed_info()
+            if dist_info:
+                self.add_metadata_json("distributedInfo", json.dumps(dist_info))
+
+            if hasattr(torch, "_inductor"):
+                import torch._inductor.config as inductor_config
+
+                if inductor_config.triton.cudagraphs:
+                    os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
+                    self.add_metadata_json("DISABLE_CUPTI_LAZY_REINIT", "1")
+                    # FIXME: CUDA Graph does not work well with CUPTI teardown.
+                    #   1) crashes on 1st lazy CUPTI re-init after teardown (CUDA 11)
+                    #   2) crashes on 2nd non-lazy CUPTI re-init after teardown (CUDA 12)
+                    # Workaround: turn off CUPTI teardown when using CUDA Graphs.
+                    os.environ["TEARDOWN_CUPTI"] = "0"
+
+            # Insert the preset user metadata to the trace
+            for k, v in self.preset_metadata.items():
+                self.add_metadata_json(k, v)
+
+    def stop_trace(self):
+        if self.execution_trace_observer:
+            self.execution_trace_observer.stop()
+        assert self.profiler is not None
+        self.profiler.__exit__(None, None, None)
+
+    def export_chrome_trace(self, path: str):
+        """
+        Exports the collected trace in Chrome JSON format. If kineto is enabled, only
+        last cycle in schedule is exported.
+        """
+        assert self.profiler
+        if path.endswith(".gz"):
+            fp = tempfile.NamedTemporaryFile("w+t", suffix=".json", delete=False)
+            fp.close()
+            retvalue = self.profiler.export_chrome_trace(fp.name)
+            with open(fp.name) as fin:
+                with gzip.open(path, "wt") as fout:
+                    fout.writelines(fin)
+            os.remove(fp.name)
+            return retvalue
+        else:
+            return self.profiler.export_chrome_trace(path)
+
+    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
+        """Save stack traces to a file
+
+        Args:
+            path (str): save stacks file to this location;
+            metric (str): metric to use: "self_cpu_time_total" or "self_cuda_time_total"
+        """
+        assert self.profiler
+        return self.profiler.export_stacks(path, metric)
+
+    def toggle_collection_dynamic(
+        self, enable: bool, activities: Iterable[ProfilerActivity]
+    ):
+        """Toggle collection of activities on/off at any point of collection. Currently supports toggling Torch Ops
+        (CPU) and CUDA activity supported in Kineto
+
+        Args:
+            activities (iterable): list of activity groups to use in profiling, supported values:
+                ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``
+        Examples:
+
+        .. code-block:: python
+
+            with torch.profiler.profile(
+                activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+                ]
+            ) as p:
+                code_to_profile_0()
+                // turn off collection of all CUDA activity
+                p.toggle_collection_dynamic(False, [torch.profiler.ProfilerActivity.CUDA])
+                code_to_profile_1()
+                // turn on collection of all CUDA activity
+                p.toggle_collection_dynamic(True, [torch.profiler.ProfilerActivity.CUDA])
+                code_to_profile_2()
+            print(p.key_averages().table(
+                sort_by="self_cuda_time_total", row_limit=-1))
+        """
+        if not self.profiler:
+            return
+        self.profiler.toggle_collection_dynamic(enable, activities)
+
+    def key_averages(
+        self, group_by_input_shape: bool = False, group_by_stack_n: int = 0
+    ):
+        """Averages events, grouping them by operator name and (optionally) input shapes and
+        stack.
+
+        .. note::
+            To use shape/stack functionality make sure to set record_shapes/with_stack
+            when creating profiler context manager.
+        """
+        assert self.profiler
+        return self.profiler.key_averages(group_by_input_shape, group_by_stack_n)
+
+    def events(self):
+        """
+        Returns the list of unaggregated profiler events,
+        to be used in the trace callback or after the profiling is finished
+        """
+        assert self.profiler
+        return self.profiler.function_events
+
+    def add_metadata(self, key: str, value: str):
+        """
+        Adds a user defined metadata with a string key and a string value
+        into the trace file
+        """
+        wrapped_value = '"' + value.replace('"', '\\"') + '"'
+        torch.autograd._add_metadata_json(key, wrapped_value)
+
+    def add_metadata_json(self, key: str, value: str):
+        """
+        Adds a user defined metadata with a string key and a valid json value
+        into the trace file
+        """
+        torch.autograd._add_metadata_json(key, value)
+
+    def preset_metadata_json(self, key: str, value: str):
+        """
+        Preset a user defined metadata when the profiler is not started
+        and added into the trace file later.
+        Metadata is in the format of a string key and a valid json value
+        """
+        self.preset_metadata[key] = value
+
+    def _get_distributed_info(self):
+        import torch.distributed as dist
+
+        if not dist.is_available() or not dist.is_initialized():
+            return None
+
+        backend = dist.get_backend()
+        dist_info = {
+            "backend": backend,
+            "rank": dist.get_rank(),
+            "world_size": dist.get_world_size(),
+            "pg_count": dist.get_pg_count(),
+            "pg_config": dist.distributed_c10d._get_all_pg_configs(),
+        }
+        if backend == "nccl":
+            nccl_version = torch.cuda.nccl.version()
+            dist_info["nccl_version"] = ".".join(str(v) for v in nccl_version)
+        return dist_info
+
+    def _memory_profile(self) -> MemoryProfile:
+        required = ("record_shapes", "profile_memory", "with_stack")
+        missing = [f"{i}=True" for i in required if not getattr(self, i)]
+        if missing:
+            raise ValueError(f"{', '.join(missing)} required for memory profiling.")
+
+        assert self.profiler is not None and self.profiler.kineto_results is not None
+        return MemoryProfile(self.profiler.kineto_results)
+
+    def export_memory_timeline(self, path: str, device: Optional[str] = None) -> None:
+        """Export memory event information from the profiler collected
+        tree for a given device, and export a timeline plot. There are 3
+        exportable files using ``export_memory_timeline``, each controlled by the
+        ``path``'s suffix.
+
+        - For an HTML compatible plot, use the suffix ``.html``, and a memory timeline
+          plot will be embedded as a PNG file in the HTML file.
+
+        - For plot points consisting of ``[times, [sizes by category]]``, where
+          ``times`` are timestamps and ``sizes`` are memory usage for each category.
+          The memory timeline plot will be saved a JSON (``.json``) or gzipped JSON
+          (``.json.gz``) depending on the suffix.
+
+        - For raw memory points, use the suffix ``.raw.json.gz``. Each raw memory
+          event will consist of ``(timestamp, action, numbytes, category)``, where
+          ``action`` is one of ``[PREEXISTING, CREATE, INCREMENT_VERSION, DESTROY]``,
+          and ``category`` is one of the enums from
+          ``torch.profiler._memory_profiler.Category``.
+
+        Output: Memory timeline written as gzipped JSON, JSON, or HTML.
+        """
+        # Default to device 0, if unset. Fallback on cpu.
+        if device is None and self.use_device and self.use_device != "cuda":
+            device = self.use_device + ":0"
+
+        if device is None:
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+        # Construct the memory timeline plot data
+        self.mem_tl = MemoryProfileTimeline(self._memory_profile())
+
+        # Depending on the file suffix, save the data as json.gz or json.
+        # For html, we can embed the image into an HTML file.
+        if path.endswith(".html"):
+            self.mem_tl.export_memory_timeline_html(path, device)
+        elif path.endswith(".gz"):
+            fp = tempfile.NamedTemporaryFile("w+t", suffix=".json", delete=False)
+            fp.close()
+            if path.endswith("raw.json.gz"):
+                self.mem_tl.export_memory_timeline_raw(fp.name, device)
+            else:
+                self.mem_tl.export_memory_timeline(fp.name, device)
+            with open(fp.name) as fin:
+                with gzip.open(path, "wt") as fout:
+                    fout.writelines(fin)
+            os.remove(fp.name)
+        else:
+            self.mem_tl.export_memory_timeline(path, device)
+
+
+class ProfilerAction(Enum):
+    """
+    Profiler actions that can be taken at the specified intervals
+
+    A profiler schedule callable maps a step number to one of these members.
+    """
+
+    # No profiling for this step (e.g. skipped or waiting steps of a schedule).
+    NONE = 0
+    # Warm-up step that precedes active recording.
+    WARMUP = 1
+    # Active recording step.
+    RECORD = 2
+    # Active recording step after which the trace is handed to ``on_trace_ready``;
+    # ``schedule`` returns it for the last active step of a cycle.
+    RECORD_AND_SAVE = 3
+
+
+def schedule(
+    *, wait: int, warmup: int, active: int, repeat: int = 0, skip_first: int = 0
+) -> Callable:
+    """
+    Returns a callable that can be used as profiler ``schedule`` argument. The profiler will skip
+    the first ``skip_first`` steps, then wait for ``wait`` steps, then do the warmup for the next ``warmup`` steps,
+    then do the active recording for the next ``active`` steps and then repeat the cycle starting with ``wait`` steps.
+    The optional number of cycles is specified with the ``repeat`` parameter, the zero value means that
+    the cycles will continue until the profiling is finished.
+    """
+
+    def schedule_fn(step: int) -> ProfilerAction:
+        assert step >= 0
+        if step < skip_first:
+            return ProfilerAction.NONE
+        else:
+            step -= skip_first
+        num_steps = wait + warmup + active
+        if repeat > 0 and step / num_steps >= repeat:
+            return ProfilerAction.NONE
+        mod_step = step % num_steps
+        if mod_step < wait:
+            return ProfilerAction.NONE
+        elif mod_step < wait + warmup:
+            return ProfilerAction.WARMUP
+        else:
+            return (
+                ProfilerAction.RECORD
+                if mod_step < num_steps - 1
+                else ProfilerAction.RECORD_AND_SAVE
+            )
+
+    assert (
+        wait >= 0 and warmup >= 0 and active > 0 and repeat >= 0 and skip_first >= 0
+    ), "Invalid profiler schedule arguments"
+    if warmup == 0:
+        warn("Profiler won't be using warmup, this can skew profiler results")
+    return schedule_fn
+
+
+def _default_schedule_fn(_: int) -> ProfilerAction:
+    """
+    Default profiler behavior - immediately starts recording the events,
+    keeps doing it on every profiler step.
+
+    The step argument is ignored; ``ProfilerAction.RECORD`` is returned
+    unconditionally.
+    """
+    return ProfilerAction.RECORD
+
+
+def tensorboard_trace_handler(
+    dir_name: str, worker_name: Optional[str] = None, use_gzip: bool = False
+):
+    """
+    Outputs tracing files to directory of ``dir_name``, then that directory can be
+    directly delivered to tensorboard as logdir.
+    ``worker_name`` should be unique for each worker in distributed scenario,
+    it will be set to '[hostname]_[pid]' by default.
+    """
+    import os
+    import socket
+    import time
+
+    def handler_fn(prof) -> None:
+        nonlocal worker_name
+        if not os.path.isdir(dir_name):
+            try:
+                os.makedirs(dir_name, exist_ok=True)
+            except Exception as e:
+                raise RuntimeError("Can't create directory: " + dir_name) from e
+        if not worker_name:
+            worker_name = f"{socket.gethostname()}_{os.getpid()}"
+        # Use nanosecond here to avoid naming clash when exporting the trace
+        file_name = f"{worker_name}.{time.time_ns()}.pt.trace.json"
+        if use_gzip:
+            file_name = file_name + ".gz"
+        prof.export_chrome_trace(os.path.join(dir_name, file_name))
+
+    return handler_fn
+
+
+class profile(_KinetoProfile):
+    """Profiler context manager.
+
+    Args:
+        activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
+            ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``,
+            ``torch.profiler.ProfilerActivity.XPU``.
+            Default value: ProfilerActivity.CPU and (when available) ProfilerActivity.CUDA
+            or (when available) ProfilerActivity.XPU.
+        schedule (Callable): callable that takes step (int) as a single parameter and returns
+            ``ProfilerAction`` value that specifies the profiler action to perform at each step.
+        on_trace_ready (Callable): callable that is called at each step when ``schedule``
+            returns ``ProfilerAction.RECORD_AND_SAVE`` during the profiling.
+        record_shapes (bool): save information about operator's input shapes.
+        profile_memory (bool): track tensor memory allocation/deallocation.
+        with_stack (bool): record source information (file and line number) for the ops.
+        with_flops (bool): use formula to estimate the FLOPs (floating point operations) of specific operators
+            (matrix multiplication and 2D convolution).
+        with_modules (bool): record module hierarchy (including function names)
+            corresponding to the callstack of the op. e.g. If module A's forward call's
+            module B's forward which contains an aten::add op,
+            then aten::add's module hierarchy is A.B
+            Note that this support exist, at the moment, only for TorchScript models
+            and not eager mode models.
+        experimental_config (_ExperimentalConfig) : A set of experimental options
+            used for Kineto library features. Note, backward compatibility is not guaranteed.
+        execution_trace_observer (ExecutionTraceObserver) : A PyTorch Execution Trace Observer object.
+            `PyTorch Execution Traces <https://arxiv.org/pdf/2305.14516.pdf>`__ offer a graph based
+            representation of AI/ML workloads and enable replay benchmarks, simulators, and emulators.
+            When this argument is included the observer start() and stop() will be called for the
+            same time window as PyTorch profiler. See the examples section below for a code sample.
+        acc_events (bool): Enable the accumulation of FunctionEvents across multiple profiling cycles
+        use_cuda (bool):
+            .. deprecated:: 1.8.1
+                use ``activities`` instead.
+
+    .. note::
+        Use :func:`~torch.profiler.schedule` to generate the callable schedule.
+        Non-default schedules are useful when profiling long training jobs
+        and allow the user to obtain multiple traces at the different iterations
+        of the training process.
+        The default schedule simply records all the events continuously for the
+        duration of the context manager.
+
+    .. note::
+        Use :func:`~torch.profiler.tensorboard_trace_handler` to generate result files for TensorBoard:
+
+        ``on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name)``
+
+        After profiling, result files can be found in the specified directory. Use the command:
+
+        ``tensorboard --logdir dir_name``
+
+        to see the results in TensorBoard.
+        For more information, see
+        `PyTorch Profiler TensorBoard Plugin <https://github.com/pytorch/kineto/tree/master/tb_plugin>`__
+
+    .. note::
+        Enabling shape and stack tracing results in additional overhead.
+        When record_shapes=True is specified, profiler will temporarily hold references to the tensors;
+        that may further prevent certain optimizations that depend on the reference count and introduce
+        extra tensor copies.
+
+
+    Examples:
+
+    .. code-block:: python
+
+        with torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ]
+        ) as p:
+            code_to_profile()
+        print(p.key_averages().table(
+            sort_by="self_cuda_time_total", row_limit=-1))
+
+    Using the profiler's ``schedule``, ``on_trace_ready`` and ``step`` functions:
+
+    .. code-block:: python
+
+        # Non-default profiler schedule allows user to turn profiler on and off
+        # on different iterations of the training loop;
+        # trace_handler is called every time a new trace becomes available
+        def trace_handler(prof):
+            print(prof.key_averages().table(
+                sort_by="self_cuda_time_total", row_limit=-1))
+            # prof.export_chrome_trace("/tmp/test_trace_" + str(prof.step_num) + ".json")
+
+        with torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+
+            # In this example with wait=1, warmup=1, active=2, repeat=1,
+            # profiler will skip the first step/iteration,
+            # start warming up on the second, record
+            # the third and the forth iterations,
+            # after which the trace will become available
+            # and on_trace_ready (when set) is called;
+            # the cycle repeats starting with the next step
+
+            schedule=torch.profiler.schedule(
+                wait=1,
+                warmup=1,
+                active=2,
+                repeat=1),
+            on_trace_ready=trace_handler
+            # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
+            # used when outputting for tensorboard
+            ) as p:
+                for iter in range(N):
+                    code_iteration_to_profile(iter)
+                    # send a signal to the profiler that the next iteration has started
+                    p.step()
+
+    The following sample shows how to setup up an Execution Trace Observer (`execution_trace_observer`)
+
+    .. code-block:: python
+
+        with torch.profiler.profile(
+            ...
+            execution_trace_observer=(
+                ExecutionTraceObserver().register_callback("./execution_trace.json")
+            ),
+        ) as p:
+            for iter in range(N):
+                code_iteration_to_profile(iter)
+                p.step()
+
+    You can also refer to test_execution_trace_with_kineto() in tests/profiler/test_profiler.py.
+    Note: One can also pass any object satisfying the _ITraceObserver interface.
+    """
+
+    def __init__(
+        self,
+        *,
+        activities: Optional[Iterable[ProfilerActivity]] = None,
+        schedule: Optional[Callable[[int], ProfilerAction]] = None,
+        on_trace_ready: Optional[Callable[..., Any]] = None,
+        record_shapes: bool = False,
+        profile_memory: bool = False,
+        with_stack: bool = False,
+        with_flops: bool = False,
+        with_modules: bool = False,
+        experimental_config: Optional[_ExperimentalConfig] = None,
+        execution_trace_observer: Optional[_ITraceObserver] = None,
+        acc_events: bool = False,
+        # deprecated:
+        use_cuda: Optional[bool] = None,
+    ):
+        activities_set = set(activities) if activities else supported_activities()
+        if use_cuda is not None:
+            warn(
+                "`use_cuda` is deprecated, use `activities` argument instead",
+                FutureWarning,
+                stacklevel=2,
+            )
+            if use_cuda:
+                activities_set.add(ProfilerActivity.CUDA)
+            elif ProfilerActivity.CUDA in activities_set:
+                activities_set.remove(ProfilerActivity.CUDA)
+        assert len(activities_set) > 0, "No valid profiler activities found"
+
+        super().__init__(
+            activities=activities,
+            record_shapes=record_shapes,
+            profile_memory=profile_memory,
+            with_stack=with_stack,
+            with_flops=with_flops,
+            with_modules=with_modules,
+            experimental_config=experimental_config,
+            execution_trace_observer=execution_trace_observer,
+            acc_events=acc_events,
+        )
+
+        if schedule:
+            self.schedule = schedule
+            # add step markers into the trace and table view
+            self.record_steps = True
+        else:
+            self.schedule = _default_schedule_fn
+            self.record_steps = False
+        self.on_trace_ready = on_trace_ready
+        self.step_num = 0
+        self.current_action = self.schedule(self.step_num)
+        self.step_rec_fn: Optional[prof.record_function] = None
+
+        self.action_map: Dict[
+            Tuple[ProfilerAction, Optional[ProfilerAction]], List[Any]
+        ] = {
+            # key is (prev_action, current_action), value is action list corresponding to the state pair.
+            (ProfilerAction.NONE, ProfilerAction.NONE): [],
+            (ProfilerAction.NONE, ProfilerAction.WARMUP): [self.prepare_trace],
+            (ProfilerAction.NONE, ProfilerAction.RECORD): [
+                self.prepare_trace,
+                self.start_trace,
+            ],
+            (ProfilerAction.NONE, ProfilerAction.RECORD_AND_SAVE): [
+                self.prepare_trace,
+                self.start_trace,
+            ],
+            (ProfilerAction.WARMUP, ProfilerAction.NONE): [
+                partial(warn, "Incorrect schedule: WARMUP followed by NONE"),
+                self.start_trace,
+                self.stop_trace,
+            ],
+            (ProfilerAction.WARMUP, ProfilerAction.WARMUP): [],
+            (ProfilerAction.WARMUP, ProfilerAction.RECORD): [self.start_trace],
+            (ProfilerAction.WARMUP, ProfilerAction.RECORD_AND_SAVE): [self.start_trace],
+            (ProfilerAction.RECORD, ProfilerAction.NONE): [
+                partial(warn, "Incorrect schedule: RECORD followed by NONE"),
+                self.stop_trace,
+            ],
+            (ProfilerAction.RECORD, ProfilerAction.WARMUP): [
+                partial(warn, "Incorrect schedule: RECORD followed by WARMUP"),
+                self.stop_trace,
+            ],
+            (ProfilerAction.RECORD, ProfilerAction.RECORD): [],
+            (ProfilerAction.RECORD, ProfilerAction.RECORD_AND_SAVE): [],
+            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.NONE): [
+                self.stop_trace,
+                self._trace_ready,
+            ],
+            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.WARMUP): [
+                self.stop_trace,
+                self._trace_ready,
+                self.prepare_trace,
+            ],
+            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.RECORD): [
+                self.stop_trace,
+                self._trace_ready,
+                self.prepare_trace,
+                self.start_trace,
+            ],
+            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.RECORD_AND_SAVE): [
+                self.stop_trace,
+                self._trace_ready,
+                self.prepare_trace,
+                self.start_trace,
+            ],
+            # used for exit action
+            (ProfilerAction.WARMUP, None): [self.start_trace, self.stop_trace],
+            (ProfilerAction.RECORD, None): [self.stop_trace, self._trace_ready],
+            (ProfilerAction.RECORD_AND_SAVE, None): [
+                self.stop_trace,
+                self._trace_ready,
+            ],
+        }
+        # Start tracking increments to profiler step, this will be used
+        # by Kineto
+        prof.KinetoStepTracker.init_step_count(PROFILER_STEP_NAME)
+
+    def __enter__(self):
+        self.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop()
+        prof.KinetoStepTracker.erase_step_count(PROFILER_STEP_NAME)
+        if self.execution_trace_observer:
+            self.execution_trace_observer.cleanup()
+
+    def start(self):
+        self._transit_action(ProfilerAction.NONE, self.current_action)
+        if self.record_steps:
+            self.step_rec_fn = prof.record_function(
+                "ProfilerStep#" + str(self.step_num)
+            )
+            self.step_rec_fn.__enter__()
+
+    def stop(self):
+        if self.record_steps and self.step_rec_fn:
+            self.step_rec_fn.__exit__(None, None, None)
+        self._transit_action(self.current_action, None)
+
+    def step(self):
+        """
+        Signals the profiler that the next profiling step has started.
+        """
+        if self.record_steps and self.step_rec_fn:
+            self.step_rec_fn.__exit__(None, None, None)
+        prev_action = self.current_action
+        self.step_num += 1
+        self.current_action = self.schedule(self.step_num)
+
+        self._transit_action(prev_action, self.current_action)
+        prof.KinetoStepTracker.increment_step(PROFILER_STEP_NAME)
+
+        if self.record_steps:
+            self.step_rec_fn = prof.record_function(
+                "ProfilerStep#" + str(self.step_num)
+            )
+            self.step_rec_fn.__enter__()
+
+    def _trace_ready(self):
+        if self.on_trace_ready:
+            self.on_trace_ready(self)
+
+    def _transit_action(self, prev_action, current_action):
+        action_list = self.action_map.get((prev_action, current_action))
+        if action_list:
+            for action in action_list:
+                action()
+
+    def _stats(self) -> Optional[prof._ProfilerStats]:
+        if self.profiler is None:
+            return None
+        return self.profiler._stats
+
+
+class ExecutionTraceObserver(_ITraceObserver):
+    """Execution Trace Observer
+
+    Each process can have a single ExecutionTraceObserver instance. The observer
+    can be added to record function callbacks via calling register_callback()
+    explicitly. Without calling unregister_callback(), repeated calls to
+    register_callback() will not add additional observers to record function
+    callbacks. Once an ExecutionTraceObserver is created, the start() and stop()
+    methods control when the event data is recorded.
+
+    Deleting or calling unregister_callback() will remove the observer from the
+    record function callbacks, finalize the output file, and will stop
+    incurring any overheads.
+    """
+
+    def __init__(self) -> None:
+        """
+        Initializes the default states.
+        """
+        self._registered = False
+        self._execution_trace_running = False
+
+    def __del__(self):
+        """
+        Calls unregister_callback() to make sure to finalize outputs.
+        """
+        self.unregister_callback()
+
+    def register_callback(self, output_file_path: str) -> Self:
+        """
+        Adds ET observer to record function callbacks. The data will be
+        written to output_file_path.
+        """
+        if not self._registered:
+            self._output_file_path = output_file_path
+            self._registered = _add_execution_trace_observer(output_file_path)
+        return self
+
+    def unregister_callback(self):
+        """
+        Removes ET observer from record function callbacks.
+        """
+
+        def _save_triton_kernels():
+            # Save the kernel paths for the generated kernels
+            from torch._inductor.codecache import PyCodeCache as PyCodeCache
+
+            kernel_files = [
+                v.__file__
+                for v in PyCodeCache.cache.values()
+                if getattr(v, "__file__", None) is not None
+            ]
+            work_dir, file_name = os.path.split(self._output_file_path)
+            resource_dir = os.path.join(
+                work_dir, os.path.splitext(file_name)[0] + "_resources"
+            )
+            if not os.path.exists(resource_dir):
+                os.mkdir(resource_dir)
+
+            for kernel_file in kernel_files:
+                if kernel_file is None:
+                    continue
+                path, name = os.path.split(kernel_file)
+                dst = os.path.join(resource_dir, name)
+                shutil.copyfile(kernel_file, dst)
+
+        if self._registered:
+            self.stop()
+            try:
+                _save_triton_kernels()
+            except Exception as e:
+                warn(f"Execution trace failed to save kernels: {e}")
+            _remove_execution_trace_observer()
+            self._registered = False
+
+    @property
+    def is_registered(self):
+        """
+        Returns True if the execution trace observer is registered, otherwise False.
+        """
+        return self._registered
+
+    def is_running(self):
+        """
+        Returns True if the observer is running, otherwise False.
+        """
+        return self._execution_trace_running
+
+    def start(self):
+        """
+        Starts to capture.
+        """
+        if self._registered and not self._execution_trace_running:
+            _enable_execution_trace_observer()
+            self._execution_trace_running = True
+            self._record_pg_config()
+
+    def stop(self):
+        """
+        Stops to capture.
+        """
+        if self._execution_trace_running:
+            _disable_execution_trace_observer()
+            self._execution_trace_running = False
+
+    def cleanup(self):
+        """
+        Calls unregister_callback() to make sure to finalize outputs.
+        """
+        self.unregister_callback()
+
+    def get_output_file_path(self) -> str:
+        """
+        Returns the output file name.
+        """
+        if self.is_registered:
+            return self._output_file_path
+        else:
+            raise RuntimeError(
+                "A callback to the ET profiler needs to be registered "
+                "first before getting the output file path"
+            )
+
+    def _record_pg_config(self) -> None:
+        # Records the PG config info to the trace as node:
+        #  ## process_group:init ##
+        if (
+            self.is_registered
+            and torch.distributed.is_available()
+            and torch.distributed.is_initialized()
+        ):
+            pg_config_info = torch.distributed.distributed_c10d._world.pg_config_info
+            torch.autograd._record_function_with_args_enter(
+                "## process_group:init ##", json.dumps(pg_config_info)
+            )
--- a/rl/Lib/site-packages/torch/profiler/python_tracer.py
+++ b/rl/Lib/site-packages/torch/profiler/python_tracer.py
@ -0,0 +1,20 @@
+import os
+import site
+import sys
+import typing
+
+import torch
+
+
+def _prefix_regex() -> typing.List[str]:
+    raw_paths = (
+        site.getsitepackages()
+        + sys.path
+        + [site.getuserbase()]
+        + [site.getusersitepackages()]
+        + [os.path.dirname(os.path.dirname(torch.__file__))]
+    )
+
+    path_prefixes = sorted({os.path.abspath(i) for i in raw_paths}, reverse=True)
+    assert all(isinstance(i, str) for i in path_prefixes)
+    return [i + os.sep for i in path_prefixes]