I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
from torch.utils.benchmark.utils.common import * # noqa: F403
from torch.utils.benchmark.utils.timer import * # noqa: F403
from torch.utils.benchmark.utils.compare import * # noqa: F403
from torch.utils.benchmark.utils.fuzzer import * # noqa: F403
from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import * # noqa: F403
from torch.utils.benchmark.utils.sparse_fuzzer import * # noqa: F403

View File

@@ -0,0 +1,222 @@
# mypy: allow-untyped-defs
import collections
import os
import shutil
import subprocess
try:
# no type stub for conda command line interface
import conda.cli.python_api # type: ignore[import]
from conda.cli.python_api import Commands as conda_commands
except ImportError:
# blas_compare.py will fail to import these when it's inside a conda env,
# but that's fine as it only wants the constants.
pass
WORKING_ROOT = "/tmp/pytorch_blas_compare_environments"
MKL_2020_3 = "mkl_2020_3"
MKL_2020_0 = "mkl_2020_0"
OPEN_BLAS = "open_blas"
EIGEN = "eigen"
GENERIC_ENV_VARS = ("USE_CUDA=0", "USE_ROCM=0")
BASE_PKG_DEPS = (
"cmake",
"hypothesis",
"ninja",
"numpy",
"pyyaml",
"setuptools",
"typing_extensions",
)
SubEnvSpec = collections.namedtuple(
"SubEnvSpec", (
"generic_installs",
"special_installs",
"environment_variables",
# Validate install.
"expected_blas_symbols",
"expected_mkl_version",
))
SUB_ENVS = {
MKL_2020_3: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.3", "mkl-include=2020.3")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.3",
),
MKL_2020_0: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.0", "mkl-include=2020.0")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.0",
),
OPEN_BLAS: SubEnvSpec(
generic_installs=("openblas",),
special_installs=(),
environment_variables=("BLAS=OpenBLAS",) + GENERIC_ENV_VARS,
expected_blas_symbols=("exec_blas",),
expected_mkl_version=None,
),
# EIGEN: SubEnvSpec(
# generic_installs=(),
# special_installs=(),
# environment_variables=("BLAS=Eigen",) + GENERIC_ENV_VARS,
# expected_blas_symbols=(),
# ),
}
def conda_run(*args):
"""Convenience method."""
stdout, stderr, retcode = conda.cli.python_api.run_command(*args)
if retcode:
raise OSError(f"conda error: {str(args)} retcode: {retcode}\n{stderr}")
return stdout
def main():
if os.path.exists(WORKING_ROOT):
print("Cleaning: removing old working root.")
shutil.rmtree(WORKING_ROOT)
os.makedirs(WORKING_ROOT)
git_root = subprocess.check_output(
"git rev-parse --show-toplevel",
shell=True,
cwd=os.path.dirname(os.path.realpath(__file__))
).decode("utf-8").strip()
for env_name, env_spec in SUB_ENVS.items():
env_path = os.path.join(WORKING_ROOT, env_name)
print(f"Creating env: {env_name}: ({env_path})")
conda_run(
conda_commands.CREATE,
"--no-default-packages",
"--prefix", env_path,
"python=3",
)
print("Testing that env can be activated:")
base_source = subprocess.run(
f"source activate {env_path}",
shell=True,
capture_output=True,
check=False,
)
if base_source.returncode:
raise OSError(
"Failed to source base environment:\n"
f" stdout: {base_source.stdout.decode('utf-8')}\n"
f" stderr: {base_source.stderr.decode('utf-8')}"
)
print("Installing packages:")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
*(BASE_PKG_DEPS + env_spec.generic_installs)
)
if env_spec.special_installs:
channel, channel_deps = env_spec.special_installs
print(f"Installing packages from channel: {channel}")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
"-c", channel, *channel_deps
)
if env_spec.environment_variables:
print("Setting environment variables.")
# This does not appear to be possible using the python API.
env_set = subprocess.run(
f"source activate {env_path} && "
f"conda env config vars set {' '.join(env_spec.environment_variables)}",
shell=True,
capture_output=True,
check=False,
)
if env_set.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {env_set.stdout.decode('utf-8')}\n"
f" stderr: {env_set.stderr.decode('utf-8')}"
)
# Check that they were actually set correctly.
actual_env_vars = subprocess.run(
f"source activate {env_path} && env",
shell=True,
capture_output=True,
check=True,
).stdout.decode("utf-8").strip().splitlines()
for e in env_spec.environment_variables:
assert e in actual_env_vars, f"{e} not in envs"
print(f"Building PyTorch for env: `{env_name}`")
# We have to re-run during each build to pick up the new
# build config settings.
build_run = subprocess.run(
f"source activate {env_path} && "
f"cd {git_root} && "
"python setup.py install --cmake",
shell=True,
capture_output=True,
check=True,
)
print("Checking configuration:")
check_run = subprocess.run(
# Shameless abuse of `python -c ...`
f"source activate {env_path} && "
'python -c "'
"import torch;"
"from torch.utils.benchmark import Timer;"
"print(torch.__config__.show());"
"setup = 'x=torch.ones((128, 128));y=torch.ones((128, 128))';"
"counts = Timer('torch.mm(x, y)', setup).collect_callgrind(collect_baseline=False);"
"stats = counts.as_standardized().stats(inclusive=True);"
"print(stats.filter(lambda l: 'blas' in l.lower()))\"",
shell=True,
capture_output=True,
check=False,
)
if check_run.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {check_run.stdout.decode('utf-8')}\n"
f" stderr: {check_run.stderr.decode('utf-8')}"
)
check_run_stdout = check_run.stdout.decode('utf-8')
print(check_run_stdout)
for e in env_spec.environment_variables:
if "BLAS" in e:
assert e in check_run_stdout, f"PyTorch build did not respect `BLAS=...`: {e}"
for s in env_spec.expected_blas_symbols:
assert s in check_run_stdout
if env_spec.expected_mkl_version is not None:
assert f"- Intel(R) Math Kernel Library Version {env_spec.expected_mkl_version}" in check_run_stdout
print(f"Build complete: {env_name}")
if __name__ == "__main__":
main()
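
For readability, the configuration check that the script above squeezes into a single `python -c` invocation is sketched below as a standalone snippet. It is only a sketch: it assumes it is executed inside one of the freshly built sub-environments (and, for the Callgrind portion, that Valgrind is available), and it mirrors the statements embedded above.

import torch
from torch.utils.benchmark import Timer

# Confirm which BLAS backend the build picked up.
print(torch.__config__.show())

# Count instructions for a small matmul and keep only BLAS-related entries.
setup = "x = torch.ones((128, 128)); y = torch.ones((128, 128))"
counts = Timer("torch.mm(x, y)", setup).collect_callgrind(collect_baseline=False)
stats = counts.as_standardized().stats(inclusive=True)
print(stats.filter(lambda l: "blas" in l.lower()))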

View File

@@ -0,0 +1,99 @@
# mypy: allow-untyped-defs
"""Example of Timer and Compare APIs:
$ python -m examples.compare
"""
import pickle
import sys
import time
import torch
import torch.utils.benchmark as benchmark_utils
class FauxTorch:
"""Emulate different versions of pytorch.
In normal circumstances this would be done with multiple processes
writing serialized measurements, but this simplifies that model to
make the example clearer.
"""
def __init__(self, real_torch, extra_ns_per_element):
self._real_torch = real_torch
self._extra_ns_per_element = extra_ns_per_element
def extra_overhead(self, result):
# time.sleep has a ~65 us overhead, so only fake a
# per-element overhead if numel is large enough.
numel = int(result.numel())
if numel > 5000:
time.sleep(numel * self._extra_ns_per_element * 1e-9)
return result
def add(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.add(*args, **kwargs))
def mul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.mul(*args, **kwargs))
def cat(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.cat(*args, **kwargs))
def matmul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.matmul(*args, **kwargs))
def main():
tasks = [
("add", "add", "torch.add(x, y)"),
("add", "add (extra +0)", "torch.add(x, y + zero)"),
]
serialized_results = []
repeats = 2
timers = [
benchmark_utils.Timer(
stmt=stmt,
globals={
"torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
"x": torch.ones((size, 4)),
"y": torch.ones((1, 4)),
"zero": torch.zeros(()),
},
label=label,
sub_label=sub_label,
description=f"size: {size}",
env=branch,
num_threads=num_threads,
)
for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)]
for label, sub_label, stmt in tasks
for size in [1, 10, 100, 1000, 10000, 50000]
for num_threads in [1, 4]
]
for i, timer in enumerate(timers * repeats):
serialized_results.append(pickle.dumps(
timer.blocked_autorange(min_run_time=0.05)
))
print(f"\r{i + 1} / {len(timers) * repeats}", end="")
sys.stdout.flush()
print()
comparison = benchmark_utils.Compare([
pickle.loads(i) for i in serialized_results
])
print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
comparison.print()
print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
comparison.trim_significant_figures()
comparison.colorize()
comparison.print()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
# mypy: allow-untyped-defs
"""Example of the Timer and Fuzzer APIs:
$ python -m examples.fuzzer
"""
import sys
import torch.utils.benchmark as benchmark_utils
def main():
add_fuzzer = benchmark_utils.Fuzzer(
parameters=[
[
benchmark_utils.FuzzedParameter(
name=f"k{i}",
minval=16,
maxval=16 * 1024,
distribution="loguniform",
) for i in range(3)
],
benchmark_utils.FuzzedParameter(
name="d",
distribution={2: 0.6, 3: 0.4},
),
],
tensors=[
[
benchmark_utils.FuzzedTensor(
name=name,
size=("k0", "k1", "k2"),
dim_parameter="d",
probability_contiguous=0.75,
min_elements=64 * 1024,
max_elements=128 * 1024,
) for name in ("x", "y")
],
],
seed=0,
)
n = 250
measurements = []
for i, (tensors, tensor_properties, _) in enumerate(add_fuzzer.take(n=n)):
x, x_order = tensors["x"], str(tensor_properties["x"]["order"])
y, y_order = tensors["y"], str(tensor_properties["y"]["order"])
shape = ", ".join(tuple(f'{i:>4}' for i in x.shape))
description = "".join([
f"{x.numel():>7} | {shape:<16} | ",
f"{'contiguous' if x.is_contiguous() else x_order:<12} | ",
f"{'contiguous' if y.is_contiguous() else y_order:<12} | ",
])
timer = benchmark_utils.Timer(
stmt="x + y",
globals=tensors,
description=description,
)
measurements.append(timer.blocked_autorange(min_run_time=0.1))
measurements[-1].metadata = {"numel": x.numel()}
print(f"\r{i + 1} / {n}", end="")
sys.stdout.flush()
print()
# More string munging to make pretty output.
print(f"Average attempts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
def time_fn(m):
return m.median / m.metadata["numel"]
measurements.sort(key=time_fn)
template = f"{{:>6}}{' ' * 19}Size Shape{' ' * 13}X order Y order\n{'-' * 80}"
print(template.format("Best:"))
for m in measurements[:15]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
print("\n" + template.format("Worst:"))
for m in measurements[-15:]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,105 @@
# mypy: allow-untyped-defs
"""Example use of Timer and op fuzzers to measure kernel performance.
$ python -m examples.op_benchmark
"""
import numpy as np
import torch
from torch.utils.benchmark import Timer
from torch.utils.benchmark.op_fuzzers.binary import BinaryOpFuzzer
from torch.utils.benchmark.op_fuzzers.unary import UnaryOpFuzzer
import operator
_MEASURE_TIME = 1.0
def assert_dicts_equal(dict_0, dict_1):
"""Builtin dict comparison will not compare numpy arrays.
e.g.
x = {"a": np.ones((2, 1))}
x == {"a": np.ones((2, 1))} # Raises ValueError
"""
assert set(dict_0.keys()) == set(dict_1.keys())
assert all(np.all(v == dict_1[k]) for k, v in dict_0.items() if k != "dtype")
def run(n, stmt, fuzzer_cls):
float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
int_iter = fuzzer_cls(seed=0, dtype=torch.int32).take(n)
raw_results = []
for i, (float_values, int_values) in enumerate(zip(float_iter, int_iter)):
float_tensors, float_tensor_params, float_params = float_values
int_tensors, int_tensor_params, int_params = int_values
# This benchmark assumes that the two fuzzers generate identically
# sized and strided Tensors, since the same seed is used.
assert_dicts_equal(float_params, int_params)
assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])
float_measurement, int_measurement = (
Timer(
stmt,
globals=tensors,
).blocked_autorange(min_run_time=_MEASURE_TIME)
for tensors in (float_tensors, int_tensors)
)
descriptions = []
for name in float_tensors:
shape_str = "(" + ", ".join([
f"2 ** {int(np.log2(i))}"
if 2 ** int(np.log2(i)) == i and i > 1
else str(i)
for i in float_tensors[name].shape
]) + ")"
order = float_tensor_params[name]["order"]
order_str = ("" if all(order == np.arange(len(order))) else str(tuple(order)))
steps = float_tensor_params[name]["steps"]
steps_str = str(steps) if sum(steps) > len(steps) else ""
descriptions.append((name, shape_str, order_str, steps_str))
raw_results.append((float_measurement, int_measurement, descriptions))
print(f"\r{i + 1} / {n}", end="")
print()
parsed_results, name_len, shape_len, order_len, steps_len = [], 0, 0, 0, 0
for float_measurement, int_measurement, descriptions in raw_results:
t_float = float_measurement.median * 1e6
t_int = int_measurement.median * 1e6
rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
parsed_results.append((t_float, t_int, rel_diff, descriptions))
for name, shape, order, steps in descriptions:
name_len = max(name_len, len(name))
shape_len = max(shape_len, len(shape))
order_len = max(order_len, len(order))
steps_len = max(steps_len, len(steps))
parsed_results.sort(key=operator.itemgetter(2))
print(f"stmt: {stmt}")
print(f" diff faster{'':>17}{' ' * name_len} ", end="")
print(f"{'shape'.ljust(shape_len)}{'':>16}{'order'.ljust(order_len)}", end="")
print(f" steps\n{'-' * 100}")
for results, spacer in [(parsed_results[:10], "..."), (parsed_results[-10:], "")]:
for t_float, t_int, rel_diff, descriptions in results:
time_str = [f"{rel_diff * 100:>4.1f}% {'int' if t_int < t_float else 'float':<20}"]
time_str.extend(["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
for t_str, (name, shape, order, steps) in zip(time_str, descriptions):
name = f"{name}:".ljust(name_len + 1)
shape = shape.ljust(shape_len + 10)
order = order.ljust(order_len)
print(f"{t_str} {name} {shape}| {order} | {steps}")
print(spacer)
def main():
run(n=100, stmt="torch.median(x, dim=0)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="torch.square(x)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="x + y", fuzzer_cls=BinaryOpFuzzer)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,26 @@
# mypy: allow-untyped-defs
"""Trivial use of Timer API:
$ python -m examples.simple_timeit
"""
import torch
import torch.utils.benchmark as benchmark_utils
def main():
timer = benchmark_utils.Timer(
stmt="x + y",
globals={"x": torch.ones((4, 8)), "y": torch.ones((1, 8))},
label="Broadcasting add (4x8)",
)
for i in range(3):
print(f"Run: {i}\n{'-' * 40}")
print(f"timeit:\n{timer.timeit(10000)}\n")
print(f"autorange:\n{timer.blocked_autorange()}\n\n")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,114 @@
# mypy: allow-untyped-defs
"""Microbenchmarks for the torch.fft module"""
from argparse import ArgumentParser
from collections import namedtuple
from collections.abc import Iterable
import torch
import torch.fft
from torch.utils import benchmark
from torch.utils.benchmark.op_fuzzers.spectral import SpectralOpFuzzer
def _dim_options(ndim):
if ndim == 1:
return [None]
elif ndim == 2:
return [0, 1, None]
elif ndim == 3:
return [0, 1, 2, (0, 1), (0, 2), None]
raise ValueError(f"Expected ndim in range 1-3, got {ndim}")
def run_benchmark(name: str, function: object, dtype: torch.dtype, seed: int, device: str, samples: int,
probability_regular: float):
cuda = device == 'cuda'
spectral_fuzzer = SpectralOpFuzzer(seed=seed, dtype=dtype, cuda=cuda,
probability_regular=probability_regular)
results = []
for tensors, tensor_params, params in spectral_fuzzer.take(samples):
shape = [params['k0'], params['k1'], params['k2']][:params['ndim']]
str_shape = ' x '.join([f"{s:<4}" for s in shape])
sub_label = f"{str_shape} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}"
for dim in _dim_options(params['ndim']):
for nthreads in (1, 4, 16) if not cuda else (1,):
measurement = benchmark.Timer(
stmt='func(x, dim=dim)',
globals={'func': function, 'x': tensors['x'], 'dim': dim},
label=f"{name}_{device}",
sub_label=sub_label,
description=f"dim={dim}",
num_threads=nthreads,
).blocked_autorange(min_run_time=1)
measurement.metadata = {
'name': name,
'device': device,
'dim': dim,
'shape': shape,
}
measurement.metadata.update(tensor_params['x'])
results.append(measurement)
return results
Benchmark = namedtuple('Benchmark', ['name', 'function', 'dtype'])
BENCHMARKS = [
Benchmark('fft_real', torch.fft.fftn, torch.float32),
Benchmark('fft_complex', torch.fft.fftn, torch.complex64),
Benchmark('ifft', torch.fft.ifftn, torch.complex64),
Benchmark('rfft', torch.fft.rfftn, torch.float32),
Benchmark('irfft', torch.fft.irfftn, torch.complex64),
]
BENCHMARK_MAP = {b.name: b for b in BENCHMARKS}
BENCHMARK_NAMES = [b.name for b in BENCHMARKS]
DEVICE_NAMES = ['cpu', 'cuda']
def _output_csv(file, results):
file.write('benchmark,device,num_threads,numel,shape,contiguous,dim,mean (us),median (us),iqr (us)\n')
for measurement in results:
metadata = measurement.metadata
device, dim, shape, name, numel, contiguous = (
metadata['device'], metadata['dim'], metadata['shape'],
metadata['name'], metadata['numel'], metadata['is_contiguous'])
if isinstance(dim, Iterable):
dim_str = '-'.join(str(d) for d in dim)
else:
dim_str = str(dim)
shape_str = 'x'.join(str(s) for s in shape)
print(name, device, measurement.task_spec.num_threads, numel, shape_str, contiguous, dim_str, # type: ignore[possibly-undefined]
measurement.mean * 1e6, measurement.median * 1e6, measurement.iqr * 1e6,
sep=',', file=file)
if __name__ == '__main__':
parser = ArgumentParser(description=__doc__)
parser.add_argument('--device', type=str, choices=DEVICE_NAMES, nargs='+', default=DEVICE_NAMES)
parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--samples', type=int, default=10)
parser.add_argument('--probability-regular', '--probability_regular', type=float, default=1.0)
parser.add_argument('-o', '--output', type=str)
args = parser.parse_args()
num_benchmarks = len(args.device) * len(args.bench)
i = 0
results = []
for device in args.device:
for bench in (BENCHMARK_MAP[b] for b in args.bench):
results += run_benchmark(
name=bench.name, function=bench.function, dtype=bench.dtype,
seed=args.seed, device=device, samples=args.samples,
probability_regular=args.probability_regular)
i += 1
print(f'Completed {bench.name} benchmark on {device} ({i} of {num_benchmarks})')
if args.output is not None:
with open(args.output, 'w') as f:
_output_csv(f, results)
compare = benchmark.Compare(results)
compare.trim_significant_figures()
compare.colorize()
compare.print()

View File

@@ -0,0 +1,107 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class BinaryOpFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Dimensionality of x and y. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
# Shapes for `x` and `y`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
# Moreover, `y` will occasionally have singleton
# dimensions in order to test broadcasting.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
[
FuzzedParameter(
name=f"y_k{i}",
distribution={
ParameterAlias(f"k{i}"): 0.8,
1: 0.2,
},
strict=True,
) for i in range(3)
],
# Steps for `x` and `y`. (Benchmarks strided memory access.)
[
FuzzedParameter(
name=f"{name}_step_{i}",
distribution={1: 0.8, 2: 0.06, 4: 0.06, 8: 0.04, 16: 0.04},
)
for i in range(3)
for name in ("x", "y")
],
# Repeatable entropy for downstream applications.
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedTensor(
name="x",
size=("k0", "k1", "k2"),
steps=("x_step_0", "x_step_1", "x_step_2"),
probability_contiguous=0.75,
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="dim",
dtype=dtype,
cuda=cuda,
),
FuzzedTensor(
name="y",
size=("y_k0", "y_k1", "y_k2"),
steps=("x_step_0", "x_step_1", "x_step_2"),
probability_contiguous=0.75,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="dim",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)
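
As a quick illustration of how this fuzzer is consumed (the same pattern as the op_benchmark example earlier in this commit), a short sketch; the import path matches the one used there and is otherwise an assumption about where this file lives.

import torch
from torch.utils.benchmark.op_fuzzers.binary import BinaryOpFuzzer

# Draw a few fuzzed (x, y) pairs and inspect the sampled shapes.
fuzzer = BinaryOpFuzzer(seed=0, dtype=torch.float32)
for tensors, tensor_params, params in fuzzer.take(3):
    x, y = tensors["x"], tensors["y"]
    # `y` may carry singleton dimensions to exercise broadcasting.
    print(tuple(x.shape), tuple(y.shape), params["dim"])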

View File

@@ -0,0 +1,107 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedSparseTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class BinaryOpSparseFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Dimensionality of x and y. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim_parameter", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
FuzzedParameter(
name="sparse_dim",
distribution={1: 0.4, 2: 0.4, 3: 0.2},
strict=True
),
# Shapes for `x` and `y`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
# Moreover, `y` will occasionally have singleton
# dimensions in order to test broadcasting.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
[
FuzzedParameter(
name=f"y_k{i}",
distribution={
ParameterAlias(f"k{i}"): 1.0},
strict=True,
) for i in range(3)
],
FuzzedParameter(
name="density",
distribution={0.1: 0.4, 0.05: 0.3, 0.01: 0.3},
),
FuzzedParameter(
name="coalesced",
distribution={True: 0.5, False: 0.5},
),
# Repeatable entropy for downstream applications.
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedSparseTensor(
name="x",
size=("k0", "k1", "k2"),
dim_parameter="dim_parameter",
sparse_dim="sparse_dim",
density="density",
coalesced="coalesced",
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
dtype=dtype,
cuda=cuda,
),
FuzzedSparseTensor(
name="y",
size=("y_k0", "y_k1", "y_k2"),
dim_parameter="dim_parameter",
sparse_dim="sparse_dim",
density="density",
coalesced="coalesced",
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)

View File

@@ -0,0 +1,83 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedSparseTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class UnaryOpSparseFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Sparse dim parameter of x. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim_parameter", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
FuzzedParameter(
name="sparse_dim",
distribution={1: 0.4, 2: 0.4, 3: 0.2},
strict=True
),
# Shapes for `x`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
FuzzedParameter(
name="density",
distribution={0.1: 0.4, 0.05: 0.3, 0.01: 0.3},
),
FuzzedParameter(
name="coalesced",
distribution={True: 0.5, False: 0.5},
),
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedSparseTensor(
name="x",
size=("k0", "k1", "k2"),
dim_parameter="dim_parameter",
sparse_dim="sparse_dim",
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
density="density",
coalesced="coalesced",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)

View File

@@ -0,0 +1,94 @@
# mypy: allow-untyped-defs
import math
import torch
from torch.utils import benchmark
from torch.utils.benchmark import FuzzedParameter, FuzzedTensor, ParameterAlias
__all__ = ['SpectralOpFuzzer']
MIN_DIM_SIZE = 16
MAX_DIM_SIZE = 16 * 1024
def power_range(upper_bound, base):
return (base ** i for i in range(int(math.log(upper_bound, base)) + 1))
# List of regular numbers from MIN_DIM_SIZE to MAX_DIM_SIZE
# These numbers factorize into multiples of prime factors 2, 3, and 5 only
# and are usually the fastest in FFT implementations.
REGULAR_SIZES = []
for i in power_range(MAX_DIM_SIZE, 2):
for j in power_range(MAX_DIM_SIZE // i, 3):
ij = i * j
for k in power_range(MAX_DIM_SIZE // ij, 5):
ijk = ij * k
if ijk > MIN_DIM_SIZE:
REGULAR_SIZES.append(ijk)
REGULAR_SIZES.sort()
class SpectralOpFuzzer(benchmark.Fuzzer):
def __init__(self, *, seed: int, dtype=torch.float64,
cuda: bool = False, probability_regular: float = 1.0):
super().__init__(
parameters=[
# Dimensionality of x. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("ndim", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
# Shapes for `x`.
# It is important to test all shapes, however
# regular sizes are especially important to the FFT and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the regular numbers
# (both distributions are loguniform) and then randomly
# selecting between the two.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=MIN_DIM_SIZE,
maxval=MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_regular_{i}",
distribution={size: 1. / len(REGULAR_SIZES) for size in REGULAR_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_regular_{i}"): probability_regular,
ParameterAlias(f"k_any_{i}"): 1 - probability_regular,
},
strict=True,
) for i in range(3)
],
# Steps for `x`. (Benchmarks strided memory access.)
[
FuzzedParameter(
name=f"step_{i}",
distribution={1: 0.8, 2: 0.06, 4: 0.06, 8: 0.04, 16: 0.04},
) for i in range(3)
],
],
tensors=[
FuzzedTensor(
name="x",
size=("k0", "k1", "k2"),
steps=("step_0", "step_1", "step_2"),
probability_contiguous=0.75,
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="ndim",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)
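
To make the "regular number" comment concrete: a regular (5-smooth) size factors into powers of 2, 3, and 5 only. A small verification sketch, assuming the file above is importable as torch.utils.benchmark.op_fuzzers.spectral (the path used by the spectral benchmark earlier in this commit):

from torch.utils.benchmark.op_fuzzers.spectral import REGULAR_SIZES

def is_regular(n: int) -> bool:
    # Strip out factors of 2, 3, and 5; a regular number reduces to 1.
    for p in (2, 3, 5):
        while n % p == 0:
            n //= p
    return n == 1

assert all(is_regular(n) for n in REGULAR_SIZES)
print(REGULAR_SIZES[:8])  # the smallest regular sizes above MIN_DIM_SIZE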

View File

@@ -0,0 +1,82 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class UnaryOpFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Dimensionality of x. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
# Shapes for `x`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
# Steps for `x`. (Benchmarks strided memory access.)
[
FuzzedParameter(
name=f"x_step_{i}",
distribution={1: 0.8, 2: 0.06, 4: 0.06, 8: 0.04, 16: 0.04},
) for i in range(3)
],
# Repeatable entropy for downstream applications.
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedTensor(
name="x",
size=("k0", "k1", "k2"),
steps=("x_step_0", "x_step_1", "x_step_2"),
probability_contiguous=0.75,
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="dim",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)

View File

@@ -0,0 +1,40 @@
from typing import Any, Callable, Dict, Protocol, runtime_checkable
class TimerClass(Protocol):
"""This is the portion of the `timeit.Timer` API used by benchmark utils."""
def __init__(
self,
stmt: str,
setup: str,
timer: Callable[[], float],
globals: Dict[str, Any],
**kwargs: Any,
) -> None:
...
def timeit(self, number: int) -> float:
...
@runtime_checkable
class TimeitModuleType(Protocol):
"""Modules generated from `timeit_template.cpp`."""
def timeit(self, number: int) -> float:
...
class CallgrindModuleType(Protocol):
"""Replicates the valgrind endpoints in `torch._C`.
These bindings are used to collect Callgrind profiles on earlier versions
of PyTorch and will eventually be removed.
"""
__file__: str
__name__: str
def _valgrind_supported_platform(self) -> bool:
...
def _valgrind_toggle(self) -> None:
...
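
For context, `timeit.Timer` from the standard library is the reference object satisfying the TimerClass protocol above; a minimal sketch of that surface:

import time
import timeit

# timeit.Timer accepts stmt/setup/timer/globals and exposes `timeit(number)`,
# which is exactly the portion of the API described by TimerClass.
t = timeit.Timer(stmt="x * 2", setup="pass", timer=time.perf_counter, globals={"x": 3})
print(t.timeit(number=1000))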

View File

@@ -0,0 +1,355 @@
"""Base shared classes and utilities."""
import collections
import contextlib
import dataclasses
import os
import shutil
import tempfile
import textwrap
import time
from typing import cast, Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple
import uuid
import torch
__all__ = ["TaskSpec", "Measurement", "select_unit", "unit_to_english", "trim_sigfig", "ordered_unique", "set_torch_threads"]
_MAX_SIGNIFICANT_FIGURES = 4
_MIN_CONFIDENCE_INTERVAL = 25e-9 # 25 ns
# Measurement will include a warning if the distribution is suspect. All
# runs are expected to have some variation; these parameters set the
# thresholds.
_IQR_WARN_THRESHOLD = 0.1
_IQR_GROSS_WARN_THRESHOLD = 0.25
@dataclasses.dataclass(init=True, repr=False, eq=True, frozen=True)
class TaskSpec:
"""Container for information used to define a Timer. (except globals)"""
stmt: str
setup: str
global_setup: str = ""
label: Optional[str] = None
sub_label: Optional[str] = None
description: Optional[str] = None
env: Optional[str] = None
num_threads: int = 1
@property
def title(self) -> str:
"""Best effort attempt at a string label for the measurement."""
if self.label is not None:
return self.label + (f": {self.sub_label}" if self.sub_label else "")
elif "\n" not in self.stmt:
return self.stmt + (f": {self.sub_label}" if self.sub_label else "")
return (
f"stmt:{f' ({self.sub_label})' if self.sub_label else ''}\n"
f"{textwrap.indent(self.stmt, ' ')}"
)
def setup_str(self) -> str:
return (
"" if (self.setup == "pass" or not self.setup)
else f"setup:\n{textwrap.indent(self.setup, ' ')}" if "\n" in self.setup
else f"setup: {self.setup}"
)
def summarize(self) -> str:
"""Build TaskSpec portion of repr string for other containers."""
sections = [
self.title,
self.description or "",
self.setup_str(),
]
return "\n".join([f"{i}\n" if "\n" in i else i for i in sections if i])
_TASKSPEC_FIELDS = tuple(i.name for i in dataclasses.fields(TaskSpec))
@dataclasses.dataclass(init=True, repr=False)
class Measurement:
"""The result of a Timer measurement.
This class stores one or more measurements of a given statement. It is
serializable and provides several convenience methods
(including a detailed __repr__) for downstream consumers.
"""
number_per_run: int
raw_times: List[float]
task_spec: TaskSpec
metadata: Optional[Dict[Any, Any]] = None # Reserved for user payloads.
def __post_init__(self) -> None:
self._sorted_times: Tuple[float, ...] = ()
self._warnings: Tuple[str, ...] = ()
self._median: float = -1.0
self._mean: float = -1.0
self._p25: float = -1.0
self._p75: float = -1.0
def __getattr__(self, name: str) -> Any:
# Forward TaskSpec fields for convenience.
if name in _TASKSPEC_FIELDS:
return getattr(self.task_spec, name)
return super().__getattribute__(name)
# =========================================================================
# == Convenience methods for statistics ===================================
# =========================================================================
#
# These methods use raw time divided by number_per_run; this is an
# extrapolation and hides the fact that different number_per_run will
# result in different amortization of overheads; however, if Timer has
# selected an appropriate number_per_run then this is a non-issue, and
# forcing users to handle that division would result in a poor experience.
@property
def times(self) -> List[float]:
return [t / self.number_per_run for t in self.raw_times]
@property
def median(self) -> float:
self._lazy_init()
return self._median
@property
def mean(self) -> float:
self._lazy_init()
return self._mean
@property
def iqr(self) -> float:
self._lazy_init()
return self._p75 - self._p25
@property
def significant_figures(self) -> int:
"""Approximate significant figure estimate.
This property is intended to give a convenient way to estimate the
precision of a measurement. It only uses the interquartile region to
estimate statistics to try to mitigate skew from the tails, and
uses a static z value of 1.645 since it is not expected to be used
for small values of `n`, so z can approximate `t`.
The significant figure estimation is used in conjunction with the
`trim_sigfig` method to provide a more human-interpretable data
summary. __repr__ does not use this method; it simply displays raw
values. Significant figure estimation is intended for `Compare`.
"""
self._lazy_init()
n_total = len(self._sorted_times)
lower_bound = int(n_total // 4)
upper_bound = int(torch.tensor(3 * n_total / 4).ceil())
interquartile_points: Tuple[float, ...] = self._sorted_times[lower_bound:upper_bound]
std = torch.tensor(interquartile_points).std(unbiased=False).item()
sqrt_n = torch.tensor(len(interquartile_points)).sqrt().item()
# Rough estimates. These are by no means statistically rigorous.
confidence_interval = max(1.645 * std / sqrt_n, _MIN_CONFIDENCE_INTERVAL)
relative_ci = torch.tensor(self._median / confidence_interval).log10().item()
num_significant_figures = int(torch.tensor(relative_ci).floor())
return min(max(num_significant_figures, 1), _MAX_SIGNIFICANT_FIGURES)
@property
def has_warnings(self) -> bool:
self._lazy_init()
return bool(self._warnings)
def _lazy_init(self) -> None:
if self.raw_times and not self._sorted_times:
self._sorted_times = tuple(sorted(self.times))
_sorted_times = torch.tensor(self._sorted_times, dtype=torch.float64)
self._median = _sorted_times.quantile(.5).item()
self._mean = _sorted_times.mean().item()
self._p25 = _sorted_times.quantile(.25).item()
self._p75 = _sorted_times.quantile(.75).item()
def add_warning(msg: str) -> None:
rel_iqr = self.iqr / self.median * 100
self._warnings += (
f" WARNING: Interquartile range is {rel_iqr:.1f}% "
f"of the median measurement.\n {msg}",
)
if not self.meets_confidence(_IQR_GROSS_WARN_THRESHOLD):
add_warning("This suggests significant environmental influence.")
elif not self.meets_confidence(_IQR_WARN_THRESHOLD):
add_warning("This could indicate system fluctuation.")
def meets_confidence(self, threshold: float = _IQR_WARN_THRESHOLD) -> bool:
return self.iqr / self.median < threshold
@property
def title(self) -> str:
return self.task_spec.title
@property
def env(self) -> str:
return (
"Unspecified env" if self.taskspec.env is None
else cast(str, self.taskspec.env)
)
@property
def as_row_name(self) -> str:
return self.sub_label or self.stmt or "[Unknown]"
def __repr__(self) -> str:
"""
Example repr:
<utils.common.Measurement object at 0x7f395b6ac110>
Broadcasting add (4x8)
Median: 5.73 us
IQR: 2.25 us (4.01 to 6.26)
372 measurements, 100 runs per measurement, 1 thread
WARNING: Interquartile range is 39.4% of the median measurement.
This suggests significant environmental influence.
"""
self._lazy_init()
skip_line, newline = "MEASUREMENT_REPR_SKIP_LINE", "\n"
n = len(self._sorted_times)
time_unit, time_scale = select_unit(self._median)
iqr_filter = '' if n >= 4 else skip_line
repr_str = f"""
{super().__repr__()}
{self.task_spec.summarize()}
{'Median: ' if n > 1 else ''}{self._median / time_scale:.2f} {time_unit}
{iqr_filter}IQR: {self.iqr / time_scale:.2f} {time_unit} ({self._p25 / time_scale:.2f} to {self._p75 / time_scale:.2f})
{n} measurement{'s' if n > 1 else ''}, {self.number_per_run} runs {'per measurement,' if n > 1 else ','} {self.num_threads} thread{'s' if self.num_threads > 1 else ''}
{newline.join(self._warnings)}""".strip() # noqa: B950
return "\n".join(l for l in repr_str.splitlines(keepends=False) if skip_line not in l)
@staticmethod
def merge(measurements: Iterable["Measurement"]) -> List["Measurement"]:
"""Convenience method for merging replicates.
Merge will extrapolate times to `number_per_run=1` and will not
transfer any metadata. (Since it might differ between replicates)
"""
grouped_measurements: DefaultDict[TaskSpec, List[Measurement]] = collections.defaultdict(list)
for m in measurements:
grouped_measurements[m.task_spec].append(m)
def merge_group(task_spec: TaskSpec, group: List["Measurement"]) -> "Measurement":
times: List[float] = []
for m in group:
# Different measurements could have different `number_per_run`,
# so we call `.times` which normalizes the results.
times.extend(m.times)
return Measurement(
number_per_run=1,
raw_times=times,
task_spec=task_spec,
metadata=None,
)
return [merge_group(t, g) for t, g in grouped_measurements.items()]
def select_unit(t: float) -> Tuple[str, float]:
"""Determine how to scale times for O(1) magnitude.
This utility is used to format numbers for human consumption.
"""
time_unit = {-3: "ns", -2: "us", -1: "ms"}.get(int(torch.tensor(t).log10().item() // 3), "s")
time_scale = {"ns": 1e-9, "us": 1e-6, "ms": 1e-3, "s": 1}[time_unit]
return time_unit, time_scale
def unit_to_english(u: str) -> str:
return {
"ns": "nanosecond",
"us": "microsecond",
"ms": "millisecond",
"s": "second",
}[u]
def trim_sigfig(x: float, n: int) -> float:
"""Trim `x` to `n` significant figures. (e.g. 3.14159, 2 -> 3.10000)"""
assert n == int(n)
magnitude = int(torch.tensor(x).abs().log10().ceil().item())
scale = 10 ** (magnitude - n)
return float(torch.tensor(x / scale).round() * scale)
def ordered_unique(elements: Iterable[Any]) -> List[Any]:
return list(collections.OrderedDict(dict.fromkeys(elements)).keys())
@contextlib.contextmanager
def set_torch_threads(n: int) -> Iterator[None]:
prior_num_threads = torch.get_num_threads()
try:
torch.set_num_threads(n)
yield
finally:
torch.set_num_threads(prior_num_threads)
def _make_temp_dir(prefix: Optional[str] = None, gc_dev_shm: bool = False) -> str:
"""Create a temporary directory. The caller is responsible for cleanup.
This function is conceptually similar to `tempfile.mkdtemp`, but with
the key additional feature that it will use shared memory if the
`BENCHMARK_USE_DEV_SHM` environment variable is set. This is an
implementation detail, but an important one for cases where many Callgrind
measurements are collected at once. (Such as when collecting
microbenchmarks.)
This is an internal utility, and is exported solely so that microbenchmarks
can reuse the util.
"""
use_dev_shm: bool = (os.getenv("BENCHMARK_USE_DEV_SHM") or "").lower() in ("1", "true")
if use_dev_shm:
root = "/dev/shm/pytorch_benchmark_utils"
assert os.name == "posix", f"tmpfs (/dev/shm) is POSIX only, current platform is {os.name}"
assert os.path.exists("/dev/shm"), "This system does not appear to support tmpfs (/dev/shm)."
os.makedirs(root, exist_ok=True)
# Because we're working in shared memory, it is more important than
# usual to clean up ALL intermediate files. However we don't want every
# worker to walk over all outstanding directories, so instead we only
# check when we are sure that it won't lead to contention.
if gc_dev_shm:
for i in os.listdir(root):
owner_file = os.path.join(root, i, "owner.pid")
if not os.path.exists(owner_file):
continue
with open(owner_file) as f:
owner_pid = int(f.read())
if owner_pid == os.getpid():
continue
try:
# https://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python
os.kill(owner_pid, 0)
except OSError:
print(f"Detected that {os.path.join(root, i)} was orphaned in shared memory. Cleaning up.")
shutil.rmtree(os.path.join(root, i))
else:
root = tempfile.gettempdir()
# We include the time so names sort by creation time, and add a UUID
# to ensure we don't collide.
name = f"{prefix or tempfile.gettempprefix()}__{int(time.time())}__{uuid.uuid4()}"
path = os.path.join(root, name)
os.makedirs(path, exist_ok=False)
if use_dev_shm:
with open(os.path.join(path, "owner.pid"), "w") as f:
f.write(str(os.getpid()))
return path
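
As a small illustration of the scaling helpers defined above (values chosen purely for illustration; the import path follows the package __init__ at the top of this commit):

from torch.utils.benchmark.utils.common import select_unit, trim_sigfig, unit_to_english

# A median of ~5.73 microseconds maps to the "us" unit with a 1e-6 scale.
time_unit, time_scale = select_unit(5.73e-6)
print(time_unit, time_scale, unit_to_english(time_unit))  # us 1e-06 microsecond

# Keep roughly two significant figures for display.
print(trim_sigfig(3.14159, 2))  # ~3.1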

View File

@@ -0,0 +1,348 @@
# mypy: allow-untyped-defs
"""Display class to aggregate and print the results of many measurements."""
import collections
import enum
import itertools as it
from typing import DefaultDict, List, Optional, Tuple
from torch.utils.benchmark.utils import common
from torch import tensor as _tensor
import operator
__all__ = ["Colorize", "Compare"]
BEST = "\033[92m"
GOOD = "\033[34m"
BAD = "\033[2m\033[91m"
VERY_BAD = "\033[31m"
BOLD = "\033[1m"
TERMINATE = "\033[0m"
class Colorize(enum.Enum):
NONE = "none"
COLUMNWISE = "columnwise"
ROWWISE = "rowwise"
# Classes to separate internal bookkeeping from what is rendered.
class _Column:
def __init__(
self,
grouped_results: List[Tuple[Optional[common.Measurement], ...]],
time_scale: float,
time_unit: str,
trim_significant_figures: bool,
highlight_warnings: bool,
):
self._grouped_results = grouped_results
self._flat_results = list(it.chain(*grouped_results))
self._time_scale = time_scale
self._time_unit = time_unit
self._trim_significant_figures = trim_significant_figures
self._highlight_warnings = (
highlight_warnings
and any(r.has_warnings for r in self._flat_results if r)
)
leading_digits = [
int(_tensor(r.median / self._time_scale).log10().ceil()) if r else None
for r in self._flat_results
]
unit_digits = max(d for d in leading_digits if d is not None)
decimal_digits = min(
max(m.significant_figures - digits, 0)
for digits, m in zip(leading_digits, self._flat_results)
if (m is not None) and (digits is not None)
) if self._trim_significant_figures else 1
length = unit_digits + decimal_digits + (1 if decimal_digits else 0)
self._template = f"{{:>{length}.{decimal_digits}f}}{{:>{7 if self._highlight_warnings else 0}}}"
def get_results_for(self, group):
return self._grouped_results[group]
def num_to_str(self, value: Optional[float], estimated_sigfigs: int, spread: Optional[float]):
if value is None:
return " " * len(self.num_to_str(1, estimated_sigfigs, None))
if self._trim_significant_figures:
value = common.trim_sigfig(value, estimated_sigfigs)
return self._template.format(
value,
f" (! {spread * 100:.0f}%)" if self._highlight_warnings and spread is not None else "")
def optional_min(seq):
l = list(seq)
return None if len(l) == 0 else min(l)
class _Row:
def __init__(self, results, row_group, render_env, env_str_len,
row_name_str_len, time_scale, colorize, num_threads=None):
super().__init__()
self._results = results
self._row_group = row_group
self._render_env = render_env
self._env_str_len = env_str_len
self._row_name_str_len = row_name_str_len
self._time_scale = time_scale
self._colorize = colorize
self._columns: Tuple[_Column, ...] = ()
self._num_threads = num_threads
def register_columns(self, columns: Tuple[_Column, ...]):
self._columns = columns
def as_column_strings(self):
concrete_results = [r for r in self._results if r is not None]
env = f"({concrete_results[0].env})" if self._render_env else ""
env = env.ljust(self._env_str_len + 4)
output = [" " + env + concrete_results[0].as_row_name]
for m, col in zip(self._results, self._columns or ()):
if m is None:
output.append(col.num_to_str(None, 1, None))
else:
output.append(col.num_to_str(
m.median / self._time_scale,
m.significant_figures,
m.iqr / m.median if m.has_warnings else None
))
return output
@staticmethod
def color_segment(segment, value, best_value):
if value <= best_value * 1.01 or value <= best_value + 100e-9:
return BEST + BOLD + segment + TERMINATE * 2
if value <= best_value * 1.1:
return GOOD + BOLD + segment + TERMINATE * 2
if value >= best_value * 5:
return VERY_BAD + BOLD + segment + TERMINATE * 2
if value >= best_value * 2:
return BAD + segment + TERMINATE * 2
return segment
def row_separator(self, overall_width):
return (
[f"{self._num_threads} threads: ".ljust(overall_width, "-")]
if self._num_threads is not None else []
)
def finalize_column_strings(self, column_strings, col_widths):
best_values = [-1 for _ in column_strings]
if self._colorize == Colorize.ROWWISE:
row_min = min(r.median for r in self._results if r is not None)
best_values = [row_min for _ in column_strings]
elif self._colorize == Colorize.COLUMNWISE:
best_values = [
optional_min(r.median for r in column.get_results_for(self._row_group) if r is not None)
for column in (self._columns or ())
]
row_contents = [column_strings[0].ljust(col_widths[0])]
for col_str, width, result, best_value in zip(column_strings[1:], col_widths[1:], self._results, best_values):
col_str = col_str.center(width)
if self._colorize != Colorize.NONE and result is not None and best_value is not None:
col_str = self.color_segment(col_str, result.median, best_value)
row_contents.append(col_str)
return row_contents
class Table:
def __init__(
self,
results: List[common.Measurement],
colorize: Colorize,
trim_significant_figures: bool,
highlight_warnings: bool
):
assert len({r.label for r in results}) == 1
self.results = results
self._colorize = colorize
self._trim_significant_figures = trim_significant_figures
self._highlight_warnings = highlight_warnings
self.label = results[0].label
self.time_unit, self.time_scale = common.select_unit(
min(r.median for r in results)
)
self.row_keys = common.ordered_unique([self.row_fn(i) for i in results])
self.row_keys.sort(key=operator.itemgetter(slice(2))) # preserve stmt order
self.column_keys = common.ordered_unique([self.col_fn(i) for i in results])
self.rows, self.columns = self.populate_rows_and_columns()
@staticmethod
def row_fn(m: common.Measurement) -> Tuple[int, Optional[str], str]:
return m.num_threads, m.env, m.as_row_name
@staticmethod
def col_fn(m: common.Measurement) -> Optional[str]:
return m.description
def populate_rows_and_columns(self) -> Tuple[Tuple[_Row, ...], Tuple[_Column, ...]]:
rows: List[_Row] = []
columns: List[_Column] = []
ordered_results: List[List[Optional[common.Measurement]]] = [
[None for _ in self.column_keys]
for _ in self.row_keys
]
row_position = {key: i for i, key in enumerate(self.row_keys)}
col_position = {key: i for i, key in enumerate(self.column_keys)}
for r in self.results:
i = row_position[self.row_fn(r)]
j = col_position[self.col_fn(r)]
ordered_results[i][j] = r
unique_envs = {r.env for r in self.results}
render_env = len(unique_envs) > 1
env_str_len = max(len(i) for i in unique_envs) if render_env else 0
row_name_str_len = max(len(r.as_row_name) for r in self.results)
prior_num_threads = -1
prior_env = ""
row_group = -1
rows_by_group: List[List[List[Optional[common.Measurement]]]] = []
for (num_threads, env, _), row in zip(self.row_keys, ordered_results):
thread_transition = (num_threads != prior_num_threads)
if thread_transition:
prior_num_threads = num_threads
prior_env = ""
row_group += 1
rows_by_group.append([])
rows.append(
_Row(
results=row,
row_group=row_group,
render_env=(render_env and env != prior_env),
env_str_len=env_str_len,
row_name_str_len=row_name_str_len,
time_scale=self.time_scale,
colorize=self._colorize,
num_threads=num_threads if thread_transition else None,
)
)
rows_by_group[-1].append(row)
prior_env = env
for i in range(len(self.column_keys)):
grouped_results = [tuple(row[i] for row in g) for g in rows_by_group]
column = _Column(
grouped_results=grouped_results,
time_scale=self.time_scale,
time_unit=self.time_unit,
trim_significant_figures=self._trim_significant_figures,
highlight_warnings=self._highlight_warnings,)
columns.append(column)
rows_tuple, columns_tuple = tuple(rows), tuple(columns)
for ri in rows_tuple:
ri.register_columns(columns_tuple)
return rows_tuple, columns_tuple
def render(self) -> str:
string_rows = [[""] + self.column_keys]
for r in self.rows:
string_rows.append(r.as_column_strings())
num_cols = max(len(i) for i in string_rows)
for sr in string_rows:
sr.extend(["" for _ in range(num_cols - len(sr))])
col_widths = [max(len(j) for j in i) for i in zip(*string_rows)]
finalized_columns = [" | ".join(i.center(w) for i, w in zip(string_rows[0], col_widths))]
overall_width = len(finalized_columns[0])
for string_row, row in zip(string_rows[1:], self.rows):
finalized_columns.extend(row.row_separator(overall_width))
finalized_columns.append(" | ".join(row.finalize_column_strings(string_row, col_widths)))
newline = "\n"
has_warnings = self._highlight_warnings and any(ri.has_warnings for ri in self.results)
return f"""
[{(' ' + (self.label or '') + ' ').center(overall_width - 2, '-')}]
{newline.join(finalized_columns)}
Times are in {common.unit_to_english(self.time_unit)}s ({self.time_unit}).
{'(! XX%) Measurement has high variance, where XX is the IQR / median * 100.' + newline if has_warnings else ""}"""[1:]
class Compare:
"""Helper class for displaying the results of many measurements in a
formatted table.
The table format is based on the information fields provided in
:class:`torch.utils.benchmark.Timer` (`description`, `label`, `sub_label`,
`num_threads`, etc).
The table can be directly printed using :meth:`print` or cast as a `str`.
For a full tutorial on how to use this class, see:
https://pytorch.org/tutorials/recipes/recipes/benchmark.html
Args:
results: List of Measurements to display.
"""
def __init__(self, results: List[common.Measurement]):
self._results: List[common.Measurement] = []
self.extend_results(results)
self._trim_significant_figures = False
self._colorize = Colorize.NONE
self._highlight_warnings = False
def __str__(self):
return "\n".join(self._render())
def extend_results(self, results):
"""Append results to already stored ones.
All added results must be instances of ``Measurement``.
"""
for r in results:
if not isinstance(r, common.Measurement):
raise ValueError(
"Expected an instance of `Measurement`, " f"got {type(r)} instead."
)
self._results.extend(results)
def trim_significant_figures(self):
"""Enables trimming of significant figures when building the formatted table."""
self._trim_significant_figures = True
def colorize(self, rowwise=False):
"""Colorize formatted table.
Colorize columnwise by default.
"""
self._colorize = Colorize.ROWWISE if rowwise else Colorize.COLUMNWISE
def highlight_warnings(self):
"""Enables warning highlighting when building formatted table."""
self._highlight_warnings = True
def print(self):
"""Print formatted table"""
print(str(self))
def _render(self):
results = common.Measurement.merge(self._results)
grouped_results = self._group_by_label(results)
output = []
for group in grouped_results.values():
output.append(self._layout(group))
return output
def _group_by_label(self, results: List[common.Measurement]):
grouped_results: DefaultDict[str, List[common.Measurement]] = collections.defaultdict(list)
for r in results:
grouped_results[r.label].append(r)
return grouped_results
def _layout(self, results: List[common.Measurement]):
table = Table(
results,
self._colorize,
self._trim_significant_figures,
self._highlight_warnings
)
return table.render()
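
A minimal end-to-end sketch of the workflow the Compare docstring describes (Timer produces Measurements, Compare lays them out in a table); it mirrors the examples earlier in this commit:

import torch
import torch.utils.benchmark as benchmark_utils

results = []
for num_threads in (1, 4):
    for size in (64, 256):
        timer = benchmark_utils.Timer(
            stmt="torch.mm(x, y)",
            globals={"x": torch.ones((size, size)), "y": torch.ones((size, size))},
            label="mm",
            sub_label=f"{size} x {size}",
            description=f"{num_threads} thread(s)",
            num_threads=num_threads,
        )
        results.append(timer.blocked_autorange(min_run_time=0.05))

compare = benchmark_utils.Compare(results)
compare.trim_significant_figures()
compare.colorize()
compare.print()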

View File

@@ -0,0 +1,191 @@
# mypy: allow-untyped-defs
from typing import Any, Callable, cast, List, Optional, Union
import torch
import torch._dynamo
from torch._dynamo.testing import CompileCounterWithBackend
from torch.utils.benchmark import Timer
__all__ = ["bench_all", "benchmark_compile"]
_warned_tensor_cores = False
_default_float_32_precision = torch.get_float32_matmul_precision()
try:
from tabulate import tabulate
HAS_TABULATE = True
except ModuleNotFoundError:
HAS_TABULATE = False
tabulate = None # type: ignore[assignment]
print("tabulate is not installed, please pip install tabulate to use this utility")
if HAS_TABULATE:
def _enable_tensor_cores():
global _warned_tensor_cores
if torch.cuda.is_available():
if torch.backends.cuda.matmul.allow_tf32 is False and torch.cuda.get_device_capability() >= (8, 0):
torch.set_float32_matmul_precision("high")
if not _warned_tensor_cores:
print("Your GPU supports tensor cores")
print("we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`")
_warned_tensor_cores = True
def _disable_tensor_cores():
torch.set_float32_matmul_precision(_default_float_32_precision)
def bench_loop(
model: Union[torch.nn.Module, Callable],
sample_input: Union[torch.Tensor, Any],
num_iters: int = 5,
optimizer: Optional[torch.optim.Optimizer] = None,
loss_fn: Optional[Callable] = None,
):
# Define the statement and setup for the benchmark
if optimizer and loss_fn:
# Training mode
stmt = """
output = model(sample_input)
loss = loss_fn(output) if loss_fn else output.sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
"""
else:
# Inference mode
stmt = "model(sample_input)"
# Create the Timer object
timer = Timer(
stmt=stmt,
globals={"model": model, "sample_input": sample_input, "optimizer": optimizer, "loss_fn": loss_fn},
)
result = timer.timeit(number=num_iters)
# Get the average time per iteration in milliseconds
avg_time = result.mean * 1000
return round(avg_time, 2)
def benchmark_compile(
model: Union[torch.nn.Module, Callable],
sample_input: Union[torch.Tensor, Any],
num_iters: int = 5,
backend: Optional[str] = None,
mode: Optional[str] = "default",
optimizer: Optional[torch.optim.Optimizer] = None,
loss_fn : Union[torch.nn.Module, Callable, None] = None,
):
"""
Use this utility to benchmark torch.compile
"""
if backend:
try:
torch._dynamo.reset()
compile_counter_with_backend = CompileCounterWithBackend(backend)
opt_model = torch.compile(model, backend=compile_counter_with_backend, mode=mode)
# Compilation only happens after the first inference
compilation_time = bench_loop(opt_model, sample_input, 1, optimizer, loss_fn)
running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)
if compile_counter_with_backend.frame_count == 0:
raise RuntimeError("No compilation occurred during benchmarking.")
if compile_counter_with_backend.frame_count > 1:
raise RuntimeError("Recompilation occurred during benchmarking.")
except Exception as e:
print(e)
print(f"Failed to compile {backend} with mode {mode}")
return None, None
else:
opt_model = model
compilation_time = None
running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)
compilation_time = round(compilation_time, 2) if compilation_time else None
running_time = round(running_time, 2) if running_time else None
return compilation_time, running_time
def bench_all(
model : Union[torch.nn.Module, Callable],
sample_input: Union[torch.Tensor, Any],
num_iters : int = 5,
optimizer: Optional[torch.optim.Optimizer] = None,
loss_fn : Union[torch.nn.Module, Callable, None] = None,
):
"""
This is a simple utility that can be used to benchmark torch.compile
        In particular it ensures that your GPU is set up to use tensor cores if it supports them
        It also tries out all the main backends and prints a table of results so you can easily compare them all
        Many of the backends have their own optional dependencies so please pip install them separately
You will get one table for inference and another for training
If you'd like to leverage this utility for training make sure to pass in a torch.optim.Optimizer
The important warnings are
Your GPU supports tensor cores
we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`
If a compilation fails for any reason including the dependency not being included
then we will print Failed to compile {backend} with mode {mode}
"""
field_names = ["Train/Inference", "Backend", "Mode", "Compilation Time", "Average Running Time"]
table = []
eager_time = None
torch._dynamo.reset()
        _, eager_time = benchmark_compile(model, sample_input, num_iters, None, None, optimizer, loss_fn)
table.append(
[("Training" if optimizer else "Inference"), "Eager", "-", "-", f"{eager_time} ms"]
)
for backend in torch._dynamo.list_backends():
if backend == "inductor":
mode_options = cast(List[Optional[str]], list(torch._inductor.list_mode_options().keys())) + [None]
for mode in mode_options:
if mode == "default":
continue
torch._dynamo.reset()
try:
if torch.cuda.is_available():
_enable_tensor_cores()
compilation_time, running_time = benchmark_compile(
model, sample_input, num_iters, backend, mode, optimizer, loss_fn)
finally:
if torch.cuda.is_available():
_disable_tensor_cores()
table.append([
("Training" if optimizer else "Inference"),
backend if backend else "-",
mode if mode is not None else "-",
f"{compilation_time} ms " if compilation_time else "-",
f"{running_time} ms " if running_time else "-",
])
else:
torch._dynamo.reset()
compilation_time, running_time = benchmark_compile(
model, sample_input, num_iters, backend, None, optimizer, loss_fn)
if running_time is not None:
table.append([
("Training" if optimizer else "Inference"),
backend, "-",
f"{compilation_time} ms " or "-",
f"{running_time} ms ",
])
return tabulate(table, headers=field_names, tablefmt="github")
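# A minimal usage sketch (added for illustration; not part of the original file). It assumes
# `tabulate` is installed; the model, input shape, and iteration count are arbitrary examples.
if __name__ == "__main__" and HAS_TABULATE:
    example_model = torch.nn.Linear(64, 64)
    example_input = torch.randn(16, 64)
    # Prints an inference table; pass an optimizer (and optionally a loss_fn) to benchmark training.
    print(bench_all(example_model, example_input, num_iters=5))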

View File

@ -0,0 +1,172 @@
"""JIT C++ strings into executables."""
import atexit
import os
import re
import shutil
import textwrap
import threading
from typing import Any, List, Optional
import torch
from torch.utils.benchmark.utils._stubs import CallgrindModuleType, TimeitModuleType
from torch.utils.benchmark.utils.common import _make_temp_dir
from torch.utils import cpp_extension
LOCK = threading.Lock()
SOURCE_ROOT = os.path.split(os.path.abspath(__file__))[0]
# We calculate uuid once at import time so that separate processes will have
# separate build roots, but threads will share the same build root.
# `cpp_extension` uses build root as part of the cache key, so per-invocation
# uuid's (e.g. different build root per _compile_template call) would lead to
# a 0% cache hit rate and spurious recompilation. Consider the following:
# ```
# setup = "auto x = torch::ones({1024, 1024});"
# stmt = "torch::mm(x, x);"
# for num_threads in [1, 2, 4, 8]:
# print(Timer(stmt, setup, num_threads=num_threads, language="c++").blocked_autorange())
# ```
# `setup` and `stmt` do not change, so we can reuse the executable from the
# first pass through the loop.
_BUILD_ROOT: Optional[str] = None
def _get_build_root() -> str:
global _BUILD_ROOT
if _BUILD_ROOT is None:
_BUILD_ROOT = _make_temp_dir(prefix="benchmark_utils_jit_build")
atexit.register(shutil.rmtree, _BUILD_ROOT)
return _BUILD_ROOT
# BACK_TESTING_NOTE:
# There are two workflows where this code could be used. One is the obvious
# case where someone simply builds or installs PyTorch and uses Timer.
# The other is that the entire `torch/utils/benchmark` folder from a CURRENT
# PyTorch checkout is copy-pasted into a much OLDER version of the PyTorch
# source code. This is what we refer to here as "back testing". The rationale
# is that we might want to use current tooling to study some aspect of an
# earlier version of PyTorch. (e.g. a regression.)
#
# The problem is that Timer relies on several aspects of core PyTorch, namely
# some binding functions for Valgrind symbols in `torch._C` and the
# `torch.__config__._cxx_flags()` method. If we were to naively copy code
# around this wouldn't work as the symbols of interest aren't present in
# earlier versions of PyTorch. In order to work around this, we must add back
# testing shims. These shims will never activate during normal use, but will
# allow Timer to function outside of the "correct" version of PyTorch by
# emulating functionality that was added later.
#
# These shims are temporary, and as Timer becomes more integrated with
# PyTorch the cost and complexity of such shims will increase. Once back
# testing is no longer required (which is to say we have done enough historic
# analysis and the shims no longer justify their maintenance and code
# complexity costs) back testing paths will be removed.
CXX_FLAGS: Optional[List[str]]
if hasattr(torch.__config__, "_cxx_flags"):
try:
CXX_FLAGS = torch.__config__._cxx_flags().strip().split()
if CXX_FLAGS is not None and "-g" not in CXX_FLAGS:
CXX_FLAGS.append("-g")
# remove "-W" flags to allow build benchmarks
# with a relaxed constraint of compiler versions
if CXX_FLAGS is not None:
CXX_FLAGS = list(filter(lambda x: not x.startswith("-W"), CXX_FLAGS))
except RuntimeError:
# We are in FBCode.
CXX_FLAGS = None
else:
# FIXME: Remove when back testing is no longer required.
CXX_FLAGS = ["-O2", "-fPIC", "-g"]
EXTRA_INCLUDE_PATHS: List[str] = [os.path.join(SOURCE_ROOT, "valgrind_wrapper")]
CONDA_PREFIX = os.getenv("CONDA_PREFIX")
if CONDA_PREFIX is not None:
# Load will automatically search /usr/include, but not conda include.
EXTRA_INCLUDE_PATHS.append(os.path.join(CONDA_PREFIX, "include"))
COMPAT_CALLGRIND_BINDINGS: Optional[CallgrindModuleType] = None
def get_compat_bindings() -> CallgrindModuleType:
with LOCK:
global COMPAT_CALLGRIND_BINDINGS
if COMPAT_CALLGRIND_BINDINGS is None:
COMPAT_CALLGRIND_BINDINGS = cpp_extension.load(
name="callgrind_bindings",
sources=[os.path.join(
SOURCE_ROOT,
"valgrind_wrapper",
"compat_bindings.cpp"
)],
extra_cflags=CXX_FLAGS,
extra_include_paths=EXTRA_INCLUDE_PATHS,
)
return COMPAT_CALLGRIND_BINDINGS
def _compile_template(
*,
stmt: str,
setup: str,
global_setup: str,
src: str,
is_standalone: bool
) -> Any:
for before, after, indentation in (
("// GLOBAL_SETUP_TEMPLATE_LOCATION", global_setup, 0),
("// SETUP_TEMPLATE_LOCATION", setup, 4),
("// STMT_TEMPLATE_LOCATION", stmt, 8)
):
# C++ doesn't care about indentation so this code isn't load
# bearing the way it is with Python, but this makes the source
# look nicer if a human has to look at it.
src = re.sub(
before,
textwrap.indent(after, " " * indentation)[indentation:],
src
)
# We want to isolate different Timers. However `cpp_extension` will
# cache builds which will significantly reduce the cost of repeated
# invocations.
with LOCK:
name = f"timer_cpp_{abs(hash(src))}"
build_dir = os.path.join(_get_build_root(), name)
os.makedirs(build_dir, exist_ok=True)
src_path = os.path.join(build_dir, "timer_src.cpp")
with open(src_path, "w") as f:
f.write(src)
# `cpp_extension` has its own locking scheme, so we don't need our lock.
return cpp_extension.load(
name=name,
sources=[src_path],
build_directory=build_dir,
extra_cflags=CXX_FLAGS,
extra_include_paths=EXTRA_INCLUDE_PATHS,
is_python_module=not is_standalone,
is_standalone=is_standalone,
)
def compile_timeit_template(*, stmt: str, setup: str, global_setup: str) -> TimeitModuleType:
template_path: str = os.path.join(SOURCE_ROOT, "timeit_template.cpp")
with open(template_path) as f:
src: str = f.read()
module = _compile_template(stmt=stmt, setup=setup, global_setup=global_setup, src=src, is_standalone=False)
assert isinstance(module, TimeitModuleType)
return module
def compile_callgrind_template(*, stmt: str, setup: str, global_setup: str) -> str:
template_path: str = os.path.join(SOURCE_ROOT, "valgrind_wrapper", "timer_callgrind_template.cpp")
with open(template_path) as f:
src: str = f.read()
target = _compile_template(stmt=stmt, setup=setup, global_setup=global_setup, src=src, is_standalone=True)
assert isinstance(target, str)
return target
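# A minimal usage sketch (added for illustration; not part of the original file). It assumes a
# working C++ toolchain and the PyTorch headers are available, and it really builds an extension,
# so the first call may take a while; the stmt/setup strings below are arbitrary examples.
if __name__ == "__main__":
    example_module = compile_timeit_template(
        stmt="torch::mm(x, x);",
        setup="auto x = torch::ones({128, 128});",
        global_setup="",
    )
    print(f"100 iterations of 128x128 mm took {example_module.timeit(100):.6f} s")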

View File

@ -0,0 +1,462 @@
# mypy: allow-untyped-defs
import functools
import itertools as it
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
__all__ = [
"Fuzzer",
"FuzzedParameter", "ParameterAlias",
"FuzzedTensor",
]
_DISTRIBUTIONS = (
"loguniform",
"uniform",
)
class FuzzedParameter:
"""Specification for a parameter to be generated during fuzzing."""
def __init__(
self,
name: str,
minval: Optional[Union[int, float]] = None,
maxval: Optional[Union[int, float]] = None,
distribution: Optional[Union[str, Dict[Any, float]]] = None,
strict: bool = False,
):
"""
Args:
name:
A string name with which to identify the parameter.
FuzzedTensors can reference this string in their
specifications.
minval:
The lower bound for the generated value. See the description
of `distribution` for type behavior.
maxval:
The upper bound for the generated value. Type behavior is
identical to `minval`.
distribution:
Specifies the distribution from which this parameter should
be drawn. There are three possibilities:
- "loguniform"
Samples between `minval` and `maxval` (inclusive) such
that the probabilities are uniform in log space. As a
concrete example, if minval=1 and maxval=100, a sample
is as likely to fall in [1, 10) as it is [10, 100].
- "uniform"
Samples are chosen with uniform probability between
`minval` and `maxval` (inclusive). If either `minval`
or `maxval` is a float then the distribution is the
continuous uniform distribution; otherwise samples
are constrained to the integers.
- dict:
If a dict is passed, the keys are taken to be choices
for the variables and the values are interpreted as
probabilities. (And must sum to one.)
If a dict is passed, `minval` and `maxval` must not be set.
Otherwise, they must be set.
strict:
If a parameter is strict, it will not be included in the
iterative resampling process which Fuzzer uses to find a
valid parameter configuration. This allows an author to
prevent skew from resampling for a given parameter (for
instance, a low size limit could inadvertently bias towards
Tensors with fewer dimensions) at the cost of more iterations
when generating parameters.
"""
self._name = name
self._minval = minval
self._maxval = maxval
self._distribution = self._check_distribution(distribution)
self.strict = strict
@property
def name(self):
return self._name
def sample(self, state):
if self._distribution == "loguniform":
return self._loguniform(state)
if self._distribution == "uniform":
return self._uniform(state)
if isinstance(self._distribution, dict):
return self._custom_distribution(state)
def _check_distribution(self, distribution):
if not isinstance(distribution, dict):
assert distribution in _DISTRIBUTIONS
else:
assert not any(i < 0 for i in distribution.values()), "Probabilities cannot be negative"
assert abs(sum(distribution.values()) - 1) <= 1e-5, "Distribution is not normalized"
assert self._minval is None
assert self._maxval is None
return distribution
def _loguniform(self, state):
import numpy as np
output = int(2 ** state.uniform(
low=np.log2(self._minval) if self._minval is not None else None,
high=np.log2(self._maxval) if self._maxval is not None else None,
))
if self._minval is not None and output < self._minval:
return self._minval
if self._maxval is not None and output > self._maxval:
return self._maxval
return output
def _uniform(self, state):
if isinstance(self._minval, int) and isinstance(self._maxval, int):
return int(state.randint(low=self._minval, high=self._maxval + 1))
return state.uniform(low=self._minval, high=self._maxval)
def _custom_distribution(self, state):
import numpy as np
# If we directly pass the keys to `choice`, numpy will convert
# them to numpy dtypes.
index = state.choice(
np.arange(len(self._distribution)),
p=tuple(self._distribution.values()))
return list(self._distribution.keys())[index]
class ParameterAlias:
"""Indicates that a parameter should alias the value of another parameter.
When used in conjunction with a custom distribution, this allows fuzzed
tensors to represent a broader range of behaviors. For example, the
following sometimes produces Tensors which broadcast:
Fuzzer(
parameters=[
FuzzedParameter("x_len", 4, 1024, distribution="uniform"),
# `y` will either be size one, or match the size of `x`.
FuzzedParameter("y_len", distribution={
0.5: 1,
0.5: ParameterAlias("x_len")
}),
],
tensors=[
FuzzedTensor("x", size=("x_len",)),
FuzzedTensor("y", size=("y_len",)),
],
)
    Chains of aliases are allowed, but may not contain cycles.
"""
def __init__(self, alias_to):
self.alias_to = alias_to
def __repr__(self):
return f"ParameterAlias[alias_to: {self.alias_to}]"
def dtype_size(dtype):
if dtype == torch.bool:
return 1
if dtype.is_floating_point or dtype.is_complex:
return int(torch.finfo(dtype).bits / 8)
return int(torch.iinfo(dtype).bits / 8)
def prod(values, base=1):
"""np.prod can overflow, so for sizes the product should be done in Python.
Even though np.prod type promotes to int64, it can still overflow in which
case the negative value will pass the size check and OOM when attempting to
actually allocate the Tensor.
"""
return functools.reduce(lambda x, y: int(x) * int(y), values, base)
class FuzzedTensor:
def __init__(
self,
name: str,
size: Tuple[Union[str, int], ...],
steps: Optional[Tuple[Union[str, int], ...]] = None,
probability_contiguous: float = 0.5,
min_elements: Optional[int] = None,
max_elements: Optional[int] = None,
max_allocation_bytes: Optional[int] = None,
dim_parameter: Optional[str] = None,
roll_parameter: Optional[str] = None,
dtype=torch.float32,
cuda=False,
tensor_constructor: Optional[Callable] = None
):
"""
Args:
name:
A string identifier for the generated Tensor.
size:
A tuple of integers or strings specifying the size of the generated
                Tensor. String values will be replaced with a concrete int during the
generation process, while ints are simply passed as literals.
steps:
An optional tuple with the same length as `size`. This indicates
that a larger Tensor should be allocated, and then sliced to
produce the generated Tensor. For instance, if size is (4, 8)
and steps is (1, 4), then a tensor `t` of size (4, 32) will be
created and then `t[:, ::4]` will be used. (Allowing one to test
Tensors with strided memory.)
probability_contiguous:
A number between zero and one representing the chance that the
generated Tensor has a contiguous memory layout. This is achieved by
randomly permuting the shape of a Tensor, calling `.contiguous()`,
and then permuting back. This is applied before `steps`, which can
also cause a Tensor to be non-contiguous.
min_elements:
                The minimum number of elements that this Tensor must have for a
set of parameters to be valid. (Otherwise they are resampled.)
max_elements:
Like `min_elements`, but setting an upper bound.
max_allocation_bytes:
Like `max_elements`, but for the size of Tensor that must be
allocated prior to slicing for `steps` (if applicable). For
example, a FloatTensor with size (1024, 1024) and steps (4, 4)
would have 1M elements, but would require a 64 MB allocation.
dim_parameter:
The length of `size` and `steps` will be truncated to this value.
This allows Tensors of varying dimensions to be generated by the
Fuzzer.
dtype:
The PyTorch dtype of the generated Tensor.
cuda:
Whether to place the Tensor on a GPU.
tensor_constructor:
Callable which will be used instead of the default Tensor
construction method. This allows the author to enforce properties
of the Tensor (e.g. it can only have certain values). The dtype and
concrete shape of the Tensor to be created will be passed, and
concrete values of all parameters will be passed as kwargs. Note
that transformations to the result (permuting, slicing) will be
performed by the Fuzzer; the tensor_constructor is only responsible
for creating an appropriately sized Tensor.
"""
self._name = name
self._size = size
self._steps = steps
self._probability_contiguous = probability_contiguous
self._min_elements = min_elements
self._max_elements = max_elements
self._max_allocation_bytes = max_allocation_bytes
self._dim_parameter = dim_parameter
self._dtype = dtype
self._cuda = cuda
self._tensor_constructor = tensor_constructor
@property
def name(self):
return self._name
@staticmethod
def default_tensor_constructor(size, dtype, **kwargs):
if dtype.is_floating_point or dtype.is_complex:
return torch.rand(size=size, dtype=dtype, device="cpu")
else:
return torch.randint(1, 127, size=size, dtype=dtype, device="cpu")
def _make_tensor(self, params, state):
import numpy as np
size, steps, allocation_size = self._get_size_and_steps(params)
constructor = (
self._tensor_constructor or
self.default_tensor_constructor
)
raw_tensor = constructor(size=allocation_size, dtype=self._dtype, **params)
if self._cuda:
raw_tensor = raw_tensor.cuda()
# Randomly permute the Tensor and call `.contiguous()` to force re-ordering
# of the memory, and then permute it back to the original shape.
dim = len(size)
order = np.arange(dim)
if state.rand() > self._probability_contiguous:
while dim > 1 and np.all(order == np.arange(dim)):
order = state.permutation(raw_tensor.dim())
raw_tensor = raw_tensor.permute(tuple(order)).contiguous()
raw_tensor = raw_tensor.permute(tuple(np.argsort(order)))
slices = [slice(0, size * step, step) for size, step in zip(size, steps)]
tensor = raw_tensor[slices]
properties = {
"numel": int(tensor.numel()),
"order": order,
"steps": steps,
"is_contiguous": tensor.is_contiguous(),
"dtype": str(self._dtype),
}
return tensor, properties
def _get_size_and_steps(self, params):
dim = (
params[self._dim_parameter]
if self._dim_parameter is not None
else len(self._size)
)
def resolve(values, dim):
"""Resolve values into concrete integers."""
values = tuple(params.get(i, i) for i in values)
if len(values) > dim:
values = values[:dim]
if len(values) < dim:
values = values + tuple(1 for _ in range(dim - len(values)))
return values
size = resolve(self._size, dim)
steps = resolve(self._steps or (), dim)
allocation_size = tuple(size_i * step_i for size_i, step_i in zip(size, steps))
return size, steps, allocation_size
def satisfies_constraints(self, params):
size, _, allocation_size = self._get_size_and_steps(params)
# Product is computed in Python to avoid integer overflow.
num_elements = prod(size)
assert num_elements >= 0
allocation_bytes = prod(allocation_size, base=dtype_size(self._dtype))
def nullable_greater(left, right):
if left is None or right is None:
return False
return left > right
return not any((
nullable_greater(num_elements, self._max_elements),
nullable_greater(self._min_elements, num_elements),
nullable_greater(allocation_bytes, self._max_allocation_bytes),
))
class Fuzzer:
def __init__(
self,
parameters: List[Union[FuzzedParameter, List[FuzzedParameter]]],
tensors: List[Union[FuzzedTensor, List[FuzzedTensor]]],
constraints: Optional[List[Callable]] = None,
seed: Optional[int] = None
):
"""
Args:
parameters:
List of FuzzedParameters which provide specifications
for generated parameters. Iterable elements will be
unpacked, though arbitrary nested structures will not.
tensors:
List of FuzzedTensors which define the Tensors which
will be created each step based on the parameters for
that step. Iterable elements will be unpacked, though
arbitrary nested structures will not.
constraints:
List of callables. They will be called with params
as kwargs, and if any of them return False the current
set of parameters will be rejected.
seed:
Seed for the RandomState used by the Fuzzer. This will
also be used to set the PyTorch random seed so that random
ops will create reproducible Tensors.
"""
import numpy as np
if seed is None:
seed = np.random.RandomState().randint(0, 2 ** 32 - 1, dtype=np.int64)
self._seed = seed
self._parameters = Fuzzer._unpack(parameters, FuzzedParameter)
self._tensors = Fuzzer._unpack(tensors, FuzzedTensor)
self._constraints = constraints or ()
p_names = {p.name for p in self._parameters}
t_names = {t.name for t in self._tensors}
name_overlap = p_names.intersection(t_names)
if name_overlap:
raise ValueError(f"Duplicate names in parameters and tensors: {name_overlap}")
self._rejections = 0
self._total_generated = 0
@staticmethod
def _unpack(values, cls):
return tuple(it.chain(
*[[i] if isinstance(i, cls) else i for i in values]
))
def take(self, n):
import numpy as np
state = np.random.RandomState(self._seed)
torch.manual_seed(state.randint(low=0, high=2 ** 63, dtype=np.int64))
for _ in range(n):
params = self._generate(state)
tensors = {}
tensor_properties = {}
for t in self._tensors:
tensor, properties = t._make_tensor(params, state)
tensors[t.name] = tensor
tensor_properties[t.name] = properties
yield tensors, tensor_properties, params
@property
def rejection_rate(self):
if not self._total_generated:
return 0.
return self._rejections / self._total_generated
def _generate(self, state):
strict_params: Dict[str, Union[float, int, ParameterAlias]] = {}
for _ in range(1000):
candidate_params: Dict[str, Union[float, int, ParameterAlias]] = {}
for p in self._parameters:
if p.strict:
if p.name in strict_params:
candidate_params[p.name] = strict_params[p.name]
else:
candidate_params[p.name] = p.sample(state)
strict_params[p.name] = candidate_params[p.name]
else:
candidate_params[p.name] = p.sample(state)
candidate_params = self._resolve_aliases(candidate_params)
self._total_generated += 1
if not all(f(candidate_params) for f in self._constraints):
self._rejections += 1
continue
if not all(t.satisfies_constraints(candidate_params) for t in self._tensors):
self._rejections += 1
continue
return candidate_params
raise ValueError("Failed to generate a set of valid parameters.")
@staticmethod
def _resolve_aliases(params):
params = dict(params)
alias_count = sum(isinstance(v, ParameterAlias) for v in params.values())
keys = list(params.keys())
while alias_count:
for k in keys:
v = params[k]
if isinstance(v, ParameterAlias):
params[k] = params[v.alias_to]
alias_count_new = sum(isinstance(v, ParameterAlias) for v in params.values())
if alias_count == alias_count_new:
raise ValueError(f"ParameterAlias cycle detected\n{params}")
alias_count = alias_count_new
return params
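# A minimal usage sketch (added for illustration; not part of the original file). The parameter
# names, bounds, and seed below are arbitrary examples chosen for this sketch.
if __name__ == "__main__":
    example_fuzzer = Fuzzer(
        parameters=[
            FuzzedParameter("n", minval=4, maxval=1024, distribution="loguniform"),
            FuzzedParameter("m", minval=4, maxval=1024, distribution="loguniform"),
        ],
        tensors=[
            FuzzedTensor("x", size=("n", "m"), probability_contiguous=0.75, max_elements=2 ** 18),
        ],
        seed=0,
    )
    for example_tensors, example_properties, example_params in example_fuzzer.take(3):
        print(example_params["n"], example_params["m"],
              example_properties["x"]["is_contiguous"], tuple(example_tensors["x"].shape))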

View File

@ -0,0 +1,121 @@
# mypy: allow-untyped-defs
from typing import Optional, Tuple, Union
from numbers import Number
import torch
from torch.utils.benchmark import FuzzedTensor
import math
class FuzzedSparseTensor(FuzzedTensor):
def __init__(
self,
name: str,
size: Tuple[Union[str, int], ...],
min_elements: Optional[int] = None,
max_elements: Optional[int] = None,
dim_parameter: Optional[str] = None,
sparse_dim: Optional[str] = None,
nnz: Optional[str] = None,
density: Optional[str] = None,
coalesced: Optional[str] = None,
dtype=torch.float32,
cuda=False
):
"""
Args:
name:
A string identifier for the generated Tensor.
size:
A tuple of integers or strings specifying the size of the generated
                Tensor. String values will be replaced with a concrete int during the
generation process, while ints are simply passed as literals.
min_elements:
                The minimum number of elements that this Tensor must have for a
set of parameters to be valid. (Otherwise they are resampled.)
max_elements:
Like `min_elements`, but setting an upper bound.
dim_parameter:
The length of `size` will be truncated to this value.
This allows Tensors of varying dimensions to be generated by the
Fuzzer.
sparse_dim:
The number of sparse dimensions in a sparse tensor.
density:
This value allows tensors of varying sparsities to be generated by the Fuzzer.
coalesced:
The sparse tensor format permits uncoalesced sparse tensors,
where there may be duplicate coordinates in the indices.
dtype:
The PyTorch dtype of the generated Tensor.
cuda:
Whether to place the Tensor on a GPU.
"""
super().__init__(name=name, size=size, min_elements=min_elements,
max_elements=max_elements, dim_parameter=dim_parameter, dtype=dtype, cuda=cuda)
self._density = density
self._coalesced = coalesced
self._sparse_dim = sparse_dim
@staticmethod
def sparse_tensor_constructor(size, dtype, sparse_dim, nnz, is_coalesced):
"""sparse_tensor_constructor creates a sparse tensor with coo format.
        Note that when `is_coalesced` is False the number of stored values is doubled, but the
        indices still describe the same number of non-zeros `nnz`; i.e., this is virtually the
        same tensor with the same sparsity pattern. Moreover, most sparse operations will call
        the coalesce() method, and what we want here is a sparse tensor with the same `nnz`
        whether or not it is coalesced. On the other hand, when `is_coalesced` is True the number
        of elements is reduced by the coalescing process by an unclear amount; however, the
        probability of generating duplicate indices is low in most cases. This decision was made
        deliberately to keep the construction cost as low as possible.
"""
if isinstance(size, Number):
size = [size] * sparse_dim
assert all(size[d] > 0 for d in range(sparse_dim)) or nnz == 0, 'invalid arguments'
v_size = [nnz] + list(size[sparse_dim:])
if dtype.is_floating_point:
v = torch.rand(size=v_size, dtype=dtype, device="cpu")
else:
v = torch.randint(1, 127, size=v_size, dtype=dtype, device="cpu")
i = torch.rand(sparse_dim, nnz, device="cpu")
i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i))
i = i.to(torch.long)
if not is_coalesced:
v = torch.cat([v, torch.randn_like(v)], 0)
i = torch.cat([i, i], 1)
x = torch.sparse_coo_tensor(i, v, torch.Size(size))
if is_coalesced:
x = x.coalesce()
return x
def _make_tensor(self, params, state):
size, _, _ = self._get_size_and_steps(params)
density = params['density']
nnz = math.ceil(sum(size) * density)
assert nnz <= sum(size)
is_coalesced = params['coalesced']
sparse_dim = params['sparse_dim'] if self._sparse_dim else len(size)
sparse_dim = min(sparse_dim, len(size))
tensor = self.sparse_tensor_constructor(size, self._dtype, sparse_dim, nnz, is_coalesced)
if self._cuda:
tensor = tensor.cuda()
sparse_dim = tensor.sparse_dim()
dense_dim = tensor.dense_dim()
is_hybrid = len(size[sparse_dim:]) > 0
properties = {
"numel": int(tensor.numel()),
"shape": tensor.size(),
"is_coalesced": tensor.is_coalesced(),
"density": density,
"sparsity": 1.0 - density,
"sparse_dim": sparse_dim,
"dense_dim": dense_dim,
"is_hybrid": is_hybrid,
"dtype": str(self._dtype),
}
return tensor, properties
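# A minimal usage sketch (added for illustration; not part of the original file). The parameter
# names, choices, and seed below are arbitrary examples; note that `density` and `coalesced`
# must name FuzzedParameters because _make_tensor reads them from the sampled params.
if __name__ == "__main__":
    from torch.utils.benchmark import Fuzzer, FuzzedParameter
    example_fuzzer = Fuzzer(
        parameters=[
            FuzzedParameter("n", minval=16, maxval=4096, distribution="loguniform"),
            FuzzedParameter("density", distribution={0.1: 0.4, 0.5: 0.4, 0.9: 0.2}),
            FuzzedParameter("coalesced", distribution={True: 0.5, False: 0.5}),
        ],
        tensors=[FuzzedSparseTensor("x", size=("n", "n"), density="density", coalesced="coalesced")],
        seed=0,
    )
    for example_tensors, example_properties, _ in example_fuzzer.take(3):
        print(example_tensors["x"].shape, example_properties["x"]["is_coalesced"],
              example_properties["x"]["sparsity"])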

View File

@ -0,0 +1,43 @@
/* C++ template for Timer.timeit
This template will be consumed by `cpp_jit.py`, and will replace:
`GLOBAL_SETUP_TEMPLATE_LOCATION`,
`SETUP_TEMPLATE_LOCATION`
and
`STMT_TEMPLATE_LOCATION`
sections with user provided statements.
*/
#include <chrono>
#include <c10/util/irange.h>
#include <torch/csrc/utils/pybind.h>
#include <pybind11/pybind11.h>
#include <torch/extension.h>
// Global setup. (e.g. #includes)
// GLOBAL_SETUP_TEMPLATE_LOCATION
double timeit(int n) {
pybind11::gil_scoped_release no_gil;
// Setup
// SETUP_TEMPLATE_LOCATION
{
// Warmup
// STMT_TEMPLATE_LOCATION
}
// Main loop
auto start_time = std::chrono::high_resolution_clock::now();
for (const auto loop_idx : c10::irange(n)) {
(void)loop_idx;
// STMT_TEMPLATE_LOCATION
}
auto end_time = std::chrono::high_resolution_clock::now();
return std::chrono::duration<double>(end_time - start_time).count();
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("timeit", &timeit);
}

View File

@ -0,0 +1,537 @@
"""Timer class based on the timeit.Timer class, but torch aware."""
import enum
import timeit
import textwrap
from typing import overload, Any, Callable, Dict, List, NoReturn, Optional, Tuple, Type, Union
import torch
from torch.utils.benchmark.utils import common, cpp_jit
from torch.utils.benchmark.utils._stubs import TimerClass, TimeitModuleType
from torch.utils.benchmark.utils.valgrind_wrapper import timer_interface as valgrind_timer_interface
__all__ = ["Timer", "timer", "Language"]
if torch.backends.cuda.is_built() and torch.cuda.is_available(): # type: ignore[no-untyped-call]
def timer() -> float:
torch.cuda.synchronize()
return timeit.default_timer()
elif torch._C._get_privateuse1_backend_name() != "privateuseone":
privateuse1_device_handler = getattr(torch, torch._C._get_privateuse1_backend_name(), None) \
if torch._C._get_privateuse1_backend_name() != "cpu" else None
def timer() -> float:
if privateuse1_device_handler:
privateuse1_device_handler.synchronize()
return timeit.default_timer()
else:
timer = timeit.default_timer
class Language(enum.Enum):
PYTHON = 0
CPP = 1
class CPPTimer:
def __init__(
self,
stmt: str,
setup: str,
global_setup: str,
timer: Callable[[], float],
globals: Dict[str, Any],
) -> None:
if timer is not timeit.default_timer:
raise NotImplementedError(
"PyTorch was built with CUDA and a GPU is present; however "
"Timer does not yet support GPU measurements. If your "
"code is CPU only, pass `timer=timeit.default_timer` to the "
"Timer's constructor to indicate this. (Note that this will "
"produce incorrect results if the GPU is in fact used, as "
"Timer will not synchronize CUDA.)"
)
if globals:
raise ValueError("C++ timing does not support globals.")
self._stmt: str = textwrap.dedent(stmt)
self._setup: str = textwrap.dedent(setup)
self._global_setup: str = textwrap.dedent(global_setup)
self._timeit_module: Optional[TimeitModuleType] = None
def timeit(self, number: int) -> float:
if self._timeit_module is None:
self._timeit_module = cpp_jit.compile_timeit_template(
stmt=self._stmt,
setup=self._setup,
global_setup=self._global_setup,
)
return self._timeit_module.timeit(number)
class Timer:
"""Helper class for measuring execution time of PyTorch statements.
For a full tutorial on how to use this class, see:
https://pytorch.org/tutorials/recipes/recipes/benchmark.html
The PyTorch Timer is based on `timeit.Timer` (and in fact uses
`timeit.Timer` internally), but with several key differences:
1) Runtime aware:
Timer will perform warmups (important as some elements of PyTorch are
lazily initialized), set threadpool size so that comparisons are
apples-to-apples, and synchronize asynchronous CUDA functions when
necessary.
2) Focus on replicates:
When measuring code, and particularly complex kernels / models,
run-to-run variation is a significant confounding factor. It is
expected that all measurements should include replicates to quantify
noise and allow median computation, which is more robust than mean.
To that effect, this class deviates from the `timeit` API by
conceptually merging `timeit.Timer.repeat` and `timeit.Timer.autorange`.
(Exact algorithms are discussed in method docstrings.) The `timeit`
method is replicated for cases where an adaptive strategy is not
desired.
3) Optional metadata:
When defining a Timer, one can optionally specify `label`, `sub_label`,
`description`, and `env`. (Defined later) These fields are included in
the representation of result object and by the `Compare` class to group
and display results for comparison.
4) Instruction counts
In addition to wall times, Timer can run a statement under Callgrind
and report instructions executed.
Directly analogous to `timeit.Timer` constructor arguments:
`stmt`, `setup`, `timer`, `globals`
PyTorch Timer specific constructor arguments:
`label`, `sub_label`, `description`, `env`, `num_threads`
Args:
stmt: Code snippet to be run in a loop and timed.
setup: Optional setup code. Used to define variables used in `stmt`
global_setup: (C++ only)
Code which is placed at the top level of the file for things like
`#include` statements.
timer:
Callable which returns the current time. If PyTorch was built
without CUDA or there is no GPU present, this defaults to
`timeit.default_timer`; otherwise it will synchronize CUDA before
measuring the time.
globals:
A dict which defines the global variables when `stmt` is being
executed. This is the other method for providing variables which
`stmt` needs.
label:
String which summarizes `stmt`. For instance, if `stmt` is
"torch.nn.functional.relu(torch.add(x, 1, out=out))"
one might set label to "ReLU(x + 1)" to improve readability.
sub_label:
Provide supplemental information to disambiguate measurements
with identical stmt or label. For instance, in our example
above sub_label might be "float" or "int", so that it is easy
to differentiate:
"ReLU(x + 1): (float)"
"ReLU(x + 1): (int)"
when printing Measurements or summarizing using `Compare`.
description:
String to distinguish measurements with identical label and
sub_label. The principal use of `description` is to signal to
`Compare` the columns of data. For instance one might set it
based on the input size to create a table of the form: ::
| n=1 | n=4 | ...
------------- ...
ReLU(x + 1): (float) | ... | ... | ...
ReLU(x + 1): (int) | ... | ... | ...
using `Compare`. It is also included when printing a Measurement.
env:
This tag indicates that otherwise identical tasks were run in
different environments, and are therefore not equivalent, for
instance when A/B testing a change to a kernel. `Compare` will
treat Measurements with different `env` specification as distinct
when merging replicate runs.
num_threads:
The size of the PyTorch threadpool when executing `stmt`. Single
threaded performance is important as both a key inference workload
and a good indicator of intrinsic algorithmic efficiency, so the
default is set to one. This is in contrast to the default PyTorch
threadpool size which tries to utilize all cores.
"""
_timer_cls: Type[TimerClass] = timeit.Timer
def __init__(
self,
stmt: str = "pass",
setup: str = "pass",
global_setup: str = "",
timer: Callable[[], float] = timer,
globals: Optional[Dict[str, Any]] = None,
label: Optional[str] = None,
sub_label: Optional[str] = None,
description: Optional[str] = None,
env: Optional[str] = None,
num_threads: int = 1,
language: Union[Language, str] = Language.PYTHON,
):
if not isinstance(stmt, str):
raise ValueError("Currently only a `str` stmt is supported.")
# We copy `globals` to prevent mutations from leaking.
# (For instance, `eval` adds the `__builtins__` key)
self._globals = dict(globals or {})
timer_kwargs = {}
if language in (Language.PYTHON, "py", "python"):
# Include `torch` if not specified as a convenience feature.
self._globals.setdefault("torch", torch)
self._language: Language = Language.PYTHON
if global_setup:
raise ValueError(
f"global_setup is C++ only, got `{global_setup}`. Most "
"likely this code can simply be moved to `setup`."
)
elif language in (Language.CPP, "cpp", "c++"):
assert self._timer_cls is timeit.Timer, "_timer_cls has already been swapped."
self._timer_cls = CPPTimer
setup = ("" if setup == "pass" else setup)
self._language = Language.CPP
timer_kwargs["global_setup"] = global_setup
else:
raise ValueError(f"Invalid language `{language}`.")
# Convenience adjustment so that multi-line code snippets defined in
# functions do not IndentationError (Python) or look odd (C++). The
# leading newline removal is for the initial newline that appears when
# defining block strings. For instance:
# textwrap.dedent("""
# print("This is a stmt")
# """)
# produces '\nprint("This is a stmt")\n'.
#
# Stripping this down to 'print("This is a stmt")' doesn't change
# what gets executed, but it makes __repr__'s nicer.
stmt = textwrap.dedent(stmt)
stmt = (stmt[1:] if stmt and stmt[0] == "\n" else stmt).rstrip()
setup = textwrap.dedent(setup)
setup = (setup[1:] if setup and setup[0] == "\n" else setup).rstrip()
self._timer = self._timer_cls(
stmt=stmt,
setup=setup,
timer=timer,
globals=valgrind_timer_interface.CopyIfCallgrind.unwrap_all(self._globals),
**timer_kwargs,
)
self._task_spec = common.TaskSpec(
stmt=stmt,
setup=setup,
global_setup=global_setup,
label=label,
sub_label=sub_label,
description=description,
env=env,
num_threads=num_threads,
)
def _timeit(self, number: int) -> float:
# Even calling a timer in C++ takes ~50 ns, so no real operation should
# take less than 1 ns. (And this prevents divide by zero errors.)
return max(self._timer.timeit(number), 1e-9)
def timeit(self, number: int = 1000000) -> common.Measurement:
"""Mirrors the semantics of timeit.Timer.timeit().
Execute the main statement (`stmt`) `number` times.
https://docs.python.org/3/library/timeit.html#timeit.Timer.timeit
"""
with common.set_torch_threads(self._task_spec.num_threads):
# Warmup
self._timeit(number=max(int(number // 100), 2))
return common.Measurement(
number_per_run=number,
raw_times=[self._timeit(number=number)],
task_spec=self._task_spec
)
def repeat(self, repeat: int = -1, number: int = -1) -> None:
raise NotImplementedError("See `Timer.blocked_autorange.`")
def autorange(self, callback: Optional[Callable[[int, float], NoReturn]] = None) -> None:
raise NotImplementedError("See `Timer.blocked_autorange.`")
def _threaded_measurement_loop(
self,
number: int,
time_hook: Callable[[], float],
stop_hook: Callable[[List[float]], bool],
min_run_time: float,
max_run_time: Optional[float] = None,
callback: Optional[Callable[[int, float], NoReturn]] = None
) -> List[float]:
total_time = 0.0
can_stop = False
times: List[float] = []
with common.set_torch_threads(self._task_spec.num_threads):
while (total_time < min_run_time) or (not can_stop):
time_spent = time_hook()
times.append(time_spent)
total_time += time_spent
if callback:
callback(number, time_spent)
can_stop = stop_hook(times)
if max_run_time and total_time > max_run_time:
break
return times
def _estimate_block_size(self, min_run_time: float) -> int:
with common.set_torch_threads(self._task_spec.num_threads):
# Estimate the block size needed for measurement to be negligible
# compared to the inner loop. This also serves as a warmup.
overhead = torch.tensor([self._timeit(0) for _ in range(5)]).median().item()
number = 1
while True:
time_taken = self._timeit(number)
relative_overhead = overhead / time_taken
if relative_overhead <= 1e-4 and time_taken >= min_run_time / 1000:
break
if time_taken > min_run_time:
break
# Avoid overflow in C++ pybind11 interface
if number * 10 > 2147483647:
break
number *= 10
return number
def blocked_autorange(
self,
callback: Optional[Callable[[int, float], NoReturn]] = None,
min_run_time: float = 0.2,
) -> common.Measurement:
"""Measure many replicates while keeping timer overhead to a minimum.
At a high level, blocked_autorange executes the following pseudo-code::
`setup`
total_time = 0
while total_time < min_run_time
start = timer()
for _ in range(block_size):
`stmt`
total_time += (timer() - start)
Note the variable `block_size` in the inner loop. The choice of block
size is important to measurement quality, and must balance two
competing objectives:
1) A small block size results in more replicates and generally
better statistics.
2) A large block size better amortizes the cost of `timer`
invocation, and results in a less biased measurement. This is
important because CUDA synchronization time is non-trivial
(order single to low double digit microseconds) and would
otherwise bias the measurement.
blocked_autorange sets block_size by running a warmup period,
increasing block size until timer overhead is less than 0.1% of
the overall computation. This value is then used for the main
measurement loop.
Returns:
A `Measurement` object that contains measured runtimes and
repetition counts, and can be used to compute statistics.
(mean, median, etc.)
"""
number = self._estimate_block_size(min_run_time)
def time_hook() -> float:
return self._timeit(number)
def stop_hook(times: List[float]) -> bool:
return True
times = self._threaded_measurement_loop(
number, time_hook, stop_hook,
min_run_time=min_run_time,
callback=callback)
return common.Measurement(
number_per_run=number,
raw_times=times,
task_spec=self._task_spec
)
def adaptive_autorange(
self,
threshold: float = 0.1,
*,
min_run_time: float = 0.01,
max_run_time: float = 10.0,
callback: Optional[Callable[[int, float], NoReturn]] = None,
) -> common.Measurement:
"""Similar to `blocked_autorange` but also checks for variablility in measurements
and repeats until iqr/median is smaller than `threshold` or `max_run_time` is reached.
At a high level, adaptive_autorange executes the following pseudo-code::
`setup`
times = []
while times.sum < max_run_time
start = timer()
for _ in range(block_size):
`stmt`
times.append(timer() - start)
                enough_data = len(times) > 3 and times.sum > min_run_time
                small_iqr = times.iqr / times.median < threshold
if enough_data and small_iqr:
break
Args:
threshold: value of iqr/median threshold for stopping
min_run_time: total runtime needed before checking `threshold`
max_run_time: total runtime for all measurements regardless of `threshold`
Returns:
A `Measurement` object that contains measured runtimes and
repetition counts, and can be used to compute statistics.
(mean, median, etc.)
"""
number = self._estimate_block_size(min_run_time=0.05)
def time_hook() -> float:
return self._timeit(number)
def stop_hook(times: List[float]) -> bool:
if len(times) > 3:
return common.Measurement(
number_per_run=number,
raw_times=times,
task_spec=self._task_spec
).meets_confidence(threshold=threshold)
return False
times = self._threaded_measurement_loop(
number, time_hook, stop_hook, min_run_time, max_run_time, callback=callback)
return common.Measurement(
number_per_run=number,
raw_times=times,
task_spec=self._task_spec
)
@overload
def collect_callgrind(
self,
number: int,
*,
repeats: None,
collect_baseline: bool,
retain_out_file: bool,
) -> valgrind_timer_interface.CallgrindStats:
...
@overload
def collect_callgrind(
self,
number: int,
*,
repeats: int,
collect_baseline: bool,
retain_out_file: bool,
) -> Tuple[valgrind_timer_interface.CallgrindStats, ...]:
...
def collect_callgrind(
self,
number: int = 100,
*,
repeats: Optional[int] = None,
collect_baseline: bool = True,
retain_out_file: bool = False,
) -> Any:
"""Collect instruction counts using Callgrind.
Unlike wall times, instruction counts are deterministic
(modulo non-determinism in the program itself and small amounts of
jitter from the Python interpreter.) This makes them ideal for detailed
performance analysis. This method runs `stmt` in a separate process
so that Valgrind can instrument the program. Performance is severely
degraded due to the instrumentation, however this is ameliorated by
the fact that a small number of iterations is generally sufficient to
obtain good measurements.
        In order to use this method `valgrind`, `callgrind_control`, and
`callgrind_annotate` must be installed.
Because there is a process boundary between the caller (this process)
and the `stmt` execution, `globals` cannot contain arbitrary in-memory
data structures. (Unlike timing methods) Instead, globals are
restricted to builtins, `nn.Modules`'s, and TorchScripted functions/modules
to reduce the surprise factor from serialization and subsequent
deserialization. The `GlobalsBridge` class provides more detail on this
subject. Take particular care with nn.Modules: they rely on pickle and
you may need to add an import to `setup` for them to transfer properly.
By default, a profile for an empty statement will be collected and
cached to indicate how many instructions are from the Python loop which
drives `stmt`.
Returns:
A `CallgrindStats` object which provides instruction counts and
some basic facilities for analyzing and manipulating results.
"""
if not isinstance(self._task_spec.stmt, str):
raise ValueError("`collect_callgrind` currently only supports string `stmt`")
if repeats is not None and repeats < 1:
raise ValueError("If specified, `repeats` must be >= 1")
# Check that the statement is valid. It doesn't guarantee success, but it's much
# simpler and quicker to raise an exception for a faulty `stmt` or `setup` in
# the parent process rather than the valgrind subprocess.
self._timeit(1)
is_python = (self._language == Language.PYTHON)
assert is_python or not self._globals
result = valgrind_timer_interface.wrapper_singleton().collect_callgrind(
task_spec=self._task_spec,
globals=self._globals,
number=number,
repeats=repeats or 1,
collect_baseline=collect_baseline and is_python,
is_python=is_python,
retain_out_file=retain_out_file,
)
return (result[0] if repeats is None else result)
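# A minimal usage sketch (added for illustration; not part of the original file). The statement,
# shape, and labels below are arbitrary examples; instruction counting via `collect_callgrind`
# additionally requires valgrind to be installed.
if __name__ == "__main__":
    example_timer = Timer(
        stmt="torch.mm(x, x)",
        setup="x = torch.ones((256, 256))",
        label="mm",
        description="256x256",
        num_threads=1,
    )
    print(example_timer.timeit(100))           # fixed number of runs, like timeit.Timer
    print(example_timer.blocked_autorange())   # adaptive block size, many replicates
    print(example_timer.adaptive_autorange())  # repeats until IQR/median is below the threshold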

View File

@ -0,0 +1,129 @@
/*
----------------------------------------------------------------
Notice that the following BSD-style license applies to this one
file (callgrind.h) only. The rest of Valgrind is licensed under the
terms of the GNU General Public License, version 2, unless
otherwise indicated. See the COPYING file in the source
distribution for details.
----------------------------------------------------------------
This file is part of callgrind, a valgrind tool for cache simulation
and call tree tracing.
Copyright (C) 2003-2017 Josef Weidendorfer. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product
documentation would be appreciated but is not required.
3. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.
4. The name of the author may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------
Notice that the above BSD-style license applies to this one file
(callgrind.h) only. The entire rest of Valgrind is licensed under
the terms of the GNU General Public License, version 2. See the
COPYING file in the source distribution for details.
----------------------------------------------------------------
*/
#ifndef __CALLGRIND_H
#define __CALLGRIND_H
#include "valgrind.h"
/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
This enum comprises an ABI exported by Valgrind to programs
which use client requests. DO NOT CHANGE THE ORDER OF THESE
ENTRIES, NOR DELETE ANY -- add new ones at the end.
The identification ('C','T') for Callgrind has historical
reasons: it was called "Calltree" before. Besides, ('C','G') would
clash with cachegrind.
*/
typedef
enum {
VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C','T'),
VG_USERREQ__ZERO_STATS,
VG_USERREQ__TOGGLE_COLLECT,
VG_USERREQ__DUMP_STATS_AT,
VG_USERREQ__START_INSTRUMENTATION,
VG_USERREQ__STOP_INSTRUMENTATION
} Vg_CallgrindClientRequest;
/* Dump current state of cost centers, and zero them afterwards */
#define CALLGRIND_DUMP_STATS \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS, \
0, 0, 0, 0, 0)
/* Dump current state of cost centers, and zero them afterwards.
The argument is appended to a string stating the reason which triggered
the dump. This string is written as a description field into the
profile data dump. */
#define CALLGRIND_DUMP_STATS_AT(pos_str) \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS_AT, \
pos_str, 0, 0, 0, 0)
/* Zero cost centers */
#define CALLGRIND_ZERO_STATS \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ZERO_STATS, \
0, 0, 0, 0, 0)
/* Toggles collection state.
The collection state specifies whether the happening of events
should be noted or if they are to be ignored. Events are noted
by increment of counters in a cost center */
#define CALLGRIND_TOGGLE_COLLECT \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, \
0, 0, 0, 0, 0)
/* Start full callgrind instrumentation if not already switched on.
When cache simulation is done, it will flush the simulated cache;
this will lead to an artificial cache warmup phase afterwards with
cache misses which would not have happened in reality. */
#define CALLGRIND_START_INSTRUMENTATION \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, \
0, 0, 0, 0, 0)
/* Stop full callgrind instrumentation if not already switched off.
This flushes Valgrinds translation cache, and does no additional
instrumentation afterwards, which effectivly will run at the same
speed as the "none" tool (ie. at minimal slowdown).
Use this to bypass Callgrind aggregation for uninteresting code parts.
To start Callgrind in this mode to ignore the setup phase, use
the option "--instr-atstart=no". */
#define CALLGRIND_STOP_INSTRUMENTATION \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, \
0, 0, 0, 0, 0)
#endif /* __CALLGRIND_H */

View File

@ -0,0 +1,35 @@
/* Used to collect profiles of old versions of PyTorch. */
#include <callgrind.h>
#include <pybind11/pybind11.h>
bool _valgrind_supported_platform() {
#if defined(NVALGRIND)
return false;
#else
return true;
#endif
}
void _valgrind_toggle() {
#if defined(NVALGRIND)
TORCH_CHECK(false, "Valgrind is not supported.");
#else
CALLGRIND_TOGGLE_COLLECT;
#endif
}
void _valgrind_toggle_and_dump_stats() {
#if defined(NVALGRIND)
TORCH_CHECK(false, "Valgrind is not supported.");
#else
// NB: See note in Module.cpp
CALLGRIND_TOGGLE_COLLECT;
CALLGRIND_DUMP_STATS;
#endif
}
PYBIND11_MODULE(callgrind_bindings, m) {
m.def("_valgrind_supported_platform", &_valgrind_supported_platform);
m.def("_valgrind_toggle", &_valgrind_toggle);
m.def("_valgrind_toggle_and_dump_stats", &_valgrind_dump_stats);
}

View File

@ -0,0 +1,68 @@
/* C++ template for Timer.collect_callgrind
This template will be consumed by `cpp_jit.py`, and will replace:
`GLOBAL_SETUP_TEMPLATE_LOCATION`,
`SETUP_TEMPLATE_LOCATION`
and
`STMT_TEMPLATE_LOCATION`
sections with user provided statements.
*/
#include <c10/util/irange.h>
#include <callgrind.h>
#include <torch/torch.h>
#include <string>
// Global setup. (e.g. #includes)
// GLOBAL_SETUP_TEMPLATE_LOCATION
#if defined(NVALGRIND)
static_assert(false);
#endif
int main(int argc, char* argv[]) {
// This file should only be called inside of `Timer`, so we can adopt a
// very simple and rigid argument parsing scheme.
TORCH_CHECK(argc == 9);
TORCH_CHECK(std::string(argv[1]) == "--number");
auto number = std::stoi(argv[2]);
TORCH_CHECK(
std::string(argv[3]) == "--number-warmup" ||
std::string(argv[3]) == "--number_warmup");
auto number_warmup = std::stoi(argv[4]);
TORCH_CHECK(std::string(argv[5]) == "--repeats");
auto repeats = std::stoi(argv[6]);
TORCH_CHECK(
std::string(argv[7]) == "--number-threads" ||
std::string(argv[7]) == "--number_threads");
auto number_threads = std::stoi(argv[8]);
torch::set_num_threads(number_threads);
// Setup
// SETUP_TEMPLATE_LOCATION
// Warmup
for (const auto i : c10::irange(number_warmup)) {
(void)i;
// STMT_TEMPLATE_LOCATION
}
// Main loop
for (const auto repeat : c10::irange(repeats)) {
(void)repeat;
CALLGRIND_TOGGLE_COLLECT;
for (const auto i : c10::irange(number)) {
(void)i;
// STMT_TEMPLATE_LOCATION
}
// NB: See note in Module.cpp
CALLGRIND_TOGGLE_COLLECT;
CALLGRIND_DUMP_STATS;
}
}

View File

@ -0,0 +1,907 @@
"""Intermediate layer between `Timer` and `valgrind`."""
import collections
import enum
import dataclasses
import itertools as it
import os
import pickle
import re
import shutil
import subprocess
import sys
import textwrap
from typing import (
cast, Any, Callable, DefaultDict, Dict, Iterator, List, NamedTuple,
Optional, Tuple, Union, TYPE_CHECKING)
import torch
from torch.utils.benchmark.utils import common, cpp_jit
from torch.utils.benchmark.utils._stubs import CallgrindModuleType
import operator
__all__ = ["FunctionCount", "FunctionCounts", "CallgrindStats", "CopyIfCallgrind"]
if TYPE_CHECKING:
CompletedProcessType = subprocess.CompletedProcess[str]
else:
CompletedProcessType = subprocess.CompletedProcess
class FunctionCount(NamedTuple):
# TODO(#105471): Rename the count field
count: int # type: ignore[assignment]
function: str
@dataclasses.dataclass(repr=False, eq=False, frozen=True)
class FunctionCounts:
"""Container for manipulating Callgrind results.
It supports:
1) Addition and subtraction to combine or diff results.
2) Tuple-like indexing.
3) A `denoise` function which strips CPython calls which are known to
be non-deterministic and quite noisy.
4) Two higher order methods (`filter` and `transform`) for custom
manipulation.
"""
_data: Tuple[FunctionCount, ...]
inclusive: bool
truncate_rows: bool = True
# For normal use, torch._tensor_str.PRINT_OPTS.linewidth determines
# the print settings. This is simply to allow hermetic unit tests.
_linewidth: Optional[int] = None
def __iter__(self) -> Iterator[FunctionCount]:
yield from self._data
def __len__(self) -> int:
return len(self._data)
def __getitem__(self, item: Any) -> Union[FunctionCount, "FunctionCounts"]:
data: Union[FunctionCount, Tuple[FunctionCount, ...]] = self._data[item]
return (
FunctionCounts(cast(Tuple[FunctionCount, ...], data), self.inclusive, truncate_rows=False)
if isinstance(data, tuple) else data
)
def __repr__(self) -> str:
count_len = 0
for c, _ in self:
# Account for sign in string length.
count_len = max(count_len, len(str(c)) + int(c < 0))
lines = []
linewidth = self._linewidth or torch._tensor_str.PRINT_OPTS.linewidth
fn_str_len = max(linewidth - count_len - 4, 40)
for c, fn in self:
if len(fn) > fn_str_len:
left_len = int((fn_str_len - 5) // 2)
fn = fn[:left_len] + " ... " + fn[-(fn_str_len - left_len - 5):]
lines.append(f" {c:>{count_len}} {fn}")
if self.truncate_rows and len(lines) > 18:
lines = lines[:9] + ["...".rjust(count_len + 2)] + lines[-9:]
if not self.inclusive:
lines.extend(["", f"Total: {self.sum()}"])
return "\n".join([super().__repr__()] + lines)
def __add__(
self,
other: "FunctionCounts",
) -> "FunctionCounts":
return self._merge(other, lambda c: c)
def __sub__(
self,
other: "FunctionCounts",
) -> "FunctionCounts":
return self._merge(other, operator.neg)
def __mul__(self, other: Union[int, float]) -> "FunctionCounts":
return self._from_dict({
fn: int(c * other) for c, fn in self._data
}, self.inclusive)
def transform(self, map_fn: Callable[[str], str]) -> "FunctionCounts":
"""Apply `map_fn` to all of the function names.
This can be used to regularize function names (e.g. stripping irrelevant
parts of the file path), coalesce entries by mapping multiple functions
to the same name (in which case the counts are added together), etc.
"""
counts: DefaultDict[str, int] = collections.defaultdict(int)
for c, fn in self._data:
counts[map_fn(fn)] += c
return self._from_dict(counts, self.inclusive)
def filter(self, filter_fn: Callable[[str], bool]) -> "FunctionCounts":
"""Keep only the elements where `filter_fn` applied to function name returns True."""
return FunctionCounts(tuple(i for i in self if filter_fn(i.function)), self.inclusive)
def sum(self) -> int:
return sum(c for c, _ in self)
def denoise(self) -> "FunctionCounts":
"""Remove known noisy instructions.
Several instructions in the CPython interpreter are rather noisy. These
instructions involve unicode to dictionary lookups which Python uses to
map variable names. FunctionCounts is generally a content agnostic
container, however this is sufficiently important for obtaining
reliable results to warrant an exception."""
return self.filter(lambda fn: "dictobject.c:lookdict_unicode" not in fn)
def _merge(
self,
second: "FunctionCounts",
merge_fn: Callable[[int], int]
) -> "FunctionCounts":
assert self.inclusive == second.inclusive, "Cannot merge inclusive and exclusive counts."
counts: DefaultDict[str, int] = collections.defaultdict(int)
for c, fn in self:
counts[fn] += c
for c, fn in second:
counts[fn] += merge_fn(c)
return self._from_dict(counts, self.inclusive)
@staticmethod
def _from_dict(counts: Dict[str, int], inclusive: bool) -> "FunctionCounts":
flat_counts = (FunctionCount(c, fn) for fn, c in counts.items() if c)
return FunctionCounts(tuple(sorted(flat_counts, reverse=True)), inclusive)
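# Illustrative helper (added for this sketch; not part of the original file): one common way to
# combine the FunctionCounts operations above when comparing two CallgrindStats results. The
# function name and the substring filter are arbitrary examples.
def _example_denoised_delta(before: "CallgrindStats", after: "CallgrindStats") -> FunctionCounts:
    # Standardize paths so equivalent call sites cancel, diff the runs, then drop noisy CPython
    # dict lookups and keep only PyTorch-related symbols.
    delta = after.as_standardized().delta(before.as_standardized())
    return delta.denoise().filter(lambda fn: "torch" in fn or "aten" in fn)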
@dataclasses.dataclass(repr=False, eq=False, frozen=True)
class CallgrindStats:
"""Top level container for Callgrind results collected by Timer.
Manipulation is generally done using the FunctionCounts class, which is
obtained by calling `CallgrindStats.stats(...)`. Several convenience
methods are provided as well; the most significant is
`CallgrindStats.as_standardized()`.
"""
task_spec: common.TaskSpec
number_per_run: int
built_with_debug_symbols: bool
baseline_inclusive_stats: FunctionCounts
baseline_exclusive_stats: FunctionCounts
stmt_inclusive_stats: FunctionCounts
stmt_exclusive_stats: FunctionCounts
stmt_callgrind_out: Optional[str]
def __repr__(self) -> str:
newline = "\n" # `\` cannot appear in fstring code section.
base_stats = self.baseline_exclusive_stats
output = f"""
{super().__repr__()}
{self.task_spec.summarize()}
{'':>25}All{'':>10}Noisy symbols removed
Instructions: {self.counts(denoise=False):>12}{'':>15}{self.counts(denoise=True):>12}
Baseline: {base_stats.sum():>12}{'':>15}{base_stats.denoise().sum():>12}
{self.number_per_run} runs per measurement, {self.task_spec.num_threads} thread{'s' if self.task_spec.num_threads > 1 else ''}
""".strip()
if not self.built_with_debug_symbols:
output += textwrap.dedent("""
Warning: PyTorch was not built with debug symbols.
Source information may be limited. Rebuild with
REL_WITH_DEB_INFO=1 for more detailed results.""")
return output
def stats(self, inclusive: bool = False) -> FunctionCounts:
"""Returns detailed function counts.
Conceptually, the FunctionCounts returned can be thought of as a tuple
of (count, path_and_function_name) tuples.
`inclusive` matches the semantics of callgrind. If True, the counts
include instructions executed by children. `inclusive=True` is useful
for identifying hot spots in code; `inclusive=False` is useful for
reducing noise when diffing counts from two different runs. (See
CallgrindStats.delta(...) for more details)
"""
return self.stmt_inclusive_stats if inclusive else self.stmt_exclusive_stats
def counts(self, *, denoise: bool = False) -> int:
"""Returns the total number of instructions executed.
See `FunctionCounts.denoise()` for an explanation of the `denoise` arg.
"""
stats = self.stmt_exclusive_stats
return (stats.denoise() if denoise else stats).sum()
# FIXME: Once 3.7 is the minimum version, type annotate `other` per PEP 563
def delta(
self,
other: "CallgrindStats",
inclusive: bool = False,
) -> FunctionCounts:
"""Diff two sets of counts.
        One common reason to collect instruction counts is to determine the
        effect that a particular change will have on the number of instructions
        needed to perform some unit of work. If a change increases that number, the
        next logical question is "why". This generally involves looking at what part
        of the code increased in instruction count. This function automates that
        process so that one can easily diff counts on both an inclusive and
        exclusive basis.
        """
return self.stats(inclusive=inclusive) - other.stats(inclusive=inclusive)
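    # Sketch (the two stats objects are hypothetical): an exclusive delta keeps
    # shared interpreter frames from drowning out the change being measured,
    # while an inclusive delta is better for locating the responsible call site.
    #
    #   increase = stats_new.delta(stats_old)                    # exclusive
    #   by_callsite = stats_new.delta(stats_old, inclusive=True)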
def as_standardized(self) -> "CallgrindStats":
"""Strip library names and some prefixes from function strings.
        When comparing two different sets of instruction counts, one stumbling
        block can be path prefixes. Callgrind includes the full filepath
        when reporting a function (as it should). However, this can cause
        issues when diffing profiles. If a key component such as Python
        or PyTorch was built in separate locations in the two profiles, this
        can result in something resembling::
23234231 /tmp/first_build_dir/thing.c:foo(...)
9823794 /tmp/first_build_dir/thing.c:bar(...)
...
53453 .../aten/src/Aten/...:function_that_actually_changed(...)
...
-9823794 /tmp/second_build_dir/thing.c:bar(...)
-23234231 /tmp/second_build_dir/thing.c:foo(...)
Stripping prefixes can ameliorate this issue by regularizing the
strings and causing better cancellation of equivalent call sites
when diffing.
"""
def strip(stats: FunctionCounts) -> FunctionCounts:
transforms = (
# PyTorch may have been built in different locations.
(r"^.+build/\.\./", "build/../"),
(r"^.+/" + re.escape("build/aten/"), "build/aten/"),
# "Python" and "Objects" come from CPython.
(r"^.+/" + re.escape("Python/"), "Python/"),
(r"^.+/" + re.escape("Objects/"), "Objects/"),
# Strip library name. e.g. `libtorch.so`
(r"\s\[.+\]$", ""),
)
for before, after in transforms:
stats = stats.transform(lambda fn: re.sub(before, after, fn))
return stats
return CallgrindStats(
task_spec=self.task_spec,
number_per_run=self.number_per_run,
built_with_debug_symbols=self.built_with_debug_symbols,
baseline_inclusive_stats=strip(self.baseline_inclusive_stats),
baseline_exclusive_stats=strip(self.baseline_exclusive_stats),
stmt_inclusive_stats=strip(self.stmt_inclusive_stats),
stmt_exclusive_stats=strip(self.stmt_exclusive_stats),
# `as_standardized` will change symbol names, so the contents will
# no longer map directly to `callgrind.out`
stmt_callgrind_out=None,
)
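# A workflow sketch combining the pieces above (stats_a / stats_b are
# hypothetical results collected from two builds): standardize both profiles
# first so that build-path differences cancel, then diff and denoise.
#
#   diff = (stats_a.as_standardized()
#           .delta(stats_b.as_standardized())
#           .denoise())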
class Serialization(enum.Enum):
PICKLE = 0
TORCH = 1
TORCH_JIT = 2
_GLOBALS_ALLOWED_TYPES: Dict[Serialization, Tuple[Any, ...]] = {
Serialization.PICKLE: (str, bytes, bool, int, float, complex),
Serialization.TORCH_JIT: (torch.jit.ScriptFunction, torch.jit.ScriptModule),
Serialization.TORCH: (torch.nn.Module,),
}
class CopyIfCallgrind:
"""Signal that a global may be replaced with a deserialized copy.
See `GlobalsBridge` for why this matters.
"""
def __init__(self, value: Any, *, setup: Optional[str] = None):
for method, supported_types in _GLOBALS_ALLOWED_TYPES.items():
if any(isinstance(value, t) for t in supported_types):
self._value: Any = value
self._setup: Optional[str] = setup
self._serialization: Serialization = method
break
else:
supported_str = "\n".join([
getattr(t, "__name__", repr(t))
                for t in it.chain(*_GLOBALS_ALLOWED_TYPES.values())])
raise ValueError(
f"Unsupported type: {type(value)}\n"
f"`collect_callgrind` restricts globals to the following types:\n"
f"{textwrap.indent(supported_str, ' ')}"
)
@property
def value(self) -> Any:
return self._value
@property
def setup(self) -> Optional[str]:
return self._setup
@property
def serialization(self) -> Serialization:
return self._serialization
@staticmethod
def unwrap_all(globals: Dict[str, Any]) -> Dict[str, Any]:
return {
k: (v.value if isinstance(v, CopyIfCallgrind) else v)
for k, v in globals.items()
}
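# Hedged usage sketch (the `model` object and the Timer import are assumptions,
# not part of this module): globals passed to `Timer.collect_callgrind` must be
# wrapped explicitly so that serialization into the Valgrind subprocess is
# opt-in rather than implicit.
#
#   from torch.utils.benchmark import Timer
#   timer = Timer(
#       "y = model(x)",
#       setup="x = torch.ones((4, 4))",
#       globals={"model": CopyIfCallgrind(model)},
#   )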
class GlobalsBridge:
"""Handle the transfer of (certain) globals when collecting Callgrind statistics.
Key takeaway: Any globals passed must be wrapped in `CopyIfCallgrind` to
work with `Timer.collect_callgrind`.
Consider the following code snippet:
```
import pickle
import timeit
class Counter:
value = 0
def __call__(self):
self.value += 1
counter = Counter()
timeit.Timer("counter()", globals={"counter": counter}).timeit(10)
print(counter.value) # 10
timeit.Timer(
"counter()",
globals={"counter": pickle.loads(pickle.dumps(counter))}
).timeit(20)
print(counter.value) # Still 10
```
In the first case, `stmt` is executed using the objects in `globals`;
however, the addition of serialization and deserialization changes the
semantics and may meaningfully change behavior.
This is a practical consideration when collecting Callgrind statistics.
    Unlike `exec`-based execution (which `timeit` uses under the hood), which
    can share in-memory data structures with the caller, Callgrind collection
    requires an entirely new process in order to run under Valgrind. This means
that any data structures used for statement execution will have to be
serialized and deserialized in the subprocess.
In order to avoid surprising semantics from (user invisible) process
boundaries, what can be passed through `globals` is severely restricted
for `Timer.collect_callgrind`. It is expected that most setup should be
achievable (albeit perhaps less ergonomically) by passing a `setup`
string.
    There are, however, exceptions. One such class is TorchScripted functions.
    Because they require a concrete file with source code, it is not possible
    to define them using a `setup` string. Another group is torch.nn.Modules,
whose construction can be complex and prohibitively cumbersome to coerce
into a `setup` string. Finally, most builtin types are sufficiently well
behaved and sufficiently common to warrant allowing as well. (e.g.
`globals={"n": 1}` is very convenient.)
Fortunately, all have well defined serialization semantics. This class
is responsible for enabling the Valgrind subprocess to use elements in
`globals` so long as they are an allowed type.
Caveats:
The user is required to acknowledge this serialization by wrapping
elements in `globals` with `CopyIfCallgrind`.
While ScriptFunction and ScriptModule are expected to save and load
quite robustly, it is up to the user to ensure that an nn.Module can
un-pickle successfully.
`torch.Tensor` and `np.ndarray` are deliberately excluded. The
serialization/deserialization process perturbs the representation of a
tensor in ways that could result in incorrect measurements. For example,
if a tensor lives in pinned CPU memory, this fact would not be preserved
by a dump, and that will in turn change the performance of certain CUDA
operations.
"""
def __init__(self, globals: Dict[str, Any], data_dir: str) -> None:
self._globals: Dict[str, CopyIfCallgrind] = {}
self._data_dir = data_dir
if not os.path.exists(data_dir):
os.mkdir(data_dir)
if globals.get("torch", torch) is not torch:
raise ValueError("`collect_callgrind` does not support mocking out `torch`.")
for name, value in globals.items():
if name in ("torch", "__builtins__"):
# Torch will be imported by the collection script, and
# __builtins__ is added by Timer.
continue
if not isinstance(value, CopyIfCallgrind):
raise ValueError(
"`collect_callgrind` requires that globals be wrapped in "
"`CopyIfCallgrind` so that serialization is explicit."
)
self._globals[name] = value
def construct(self) -> str:
load_lines = []
for name, wrapped_value in self._globals.items():
if wrapped_value.setup is not None:
load_lines.append(textwrap.dedent(wrapped_value.setup))
if wrapped_value.serialization == Serialization.PICKLE:
path = os.path.join(self._data_dir, f"{name}.pkl")
load_lines.append(
f"with open({repr(path)}, 'rb') as f:\n {name} = pickle.load(f)")
with open(path, "wb") as f:
pickle.dump(wrapped_value.value, f)
elif wrapped_value.serialization == Serialization.TORCH:
path = os.path.join(self._data_dir, f"{name}.pt")
load_lines.append(f"{name} = torch.load({repr(path)})")
torch.save(wrapped_value.value, path)
elif wrapped_value.serialization == Serialization.TORCH_JIT:
path = os.path.join(self._data_dir, f"{name}.pt")
load_lines.append(f"{name} = torch.jit.load({repr(path)})")
with open(path, "wb") as f:
torch.jit.save(wrapped_value.value, f) # type: ignore[no-untyped-call]
else:
raise NotImplementedError(
f"Unknown serialization method: {wrapped_value.serialization}")
return "\n".join(load_lines)
class _ValgrindWrapper:
def __init__(self) -> None:
self._bindings_module: Optional[CallgrindModuleType] = None
valgrind_symbols = (
"_valgrind_supported_platform",
"_valgrind_toggle",
"_valgrind_toggle_and_dump_stats",
)
if all(hasattr(torch._C, symbol) for symbol in valgrind_symbols):
self._supported_platform: bool = torch._C._valgrind_supported_platform()
else:
print("Callgrind bindings are not present in `torch._C`. JIT-ing bindings.")
self._bindings_module = cpp_jit.get_compat_bindings()
assert all(hasattr(self._bindings_module, symbol) for symbol in valgrind_symbols)
self._supported_platform = self._bindings_module._valgrind_supported_platform()
self._commands_available: Dict[str, bool] = {}
if self._supported_platform:
# Only bother checking on supported platforms.
for cmd in ("valgrind", "callgrind_control", "callgrind_annotate"):
self._commands_available[cmd] = not subprocess.run(
["which", cmd],
capture_output=True,
check=False,
).returncode
self._build_type: Optional[str] = None
build_search = re.search("BUILD_TYPE=(.+),", torch.__config__.show()) # type: ignore[no-untyped-call]
if build_search is not None:
self._build_type = build_search.groups()[0].split(",")[0]
def _validate(self) -> None:
if not self._supported_platform:
raise OSError("Valgrind is not supported on this platform.")
missing_cmds = [cmd for cmd, available in self._commands_available.items() if not available]
if missing_cmds:
raise OSError("Missing: " + ", ".join(missing_cmds))
def collect_callgrind(
self,
task_spec: common.TaskSpec,
globals: Dict[str, Any],
*,
number: int,
repeats: int,
collect_baseline: bool,
is_python: bool,
retain_out_file: bool,
) -> Tuple[CallgrindStats, ...]:
"""Collect stats, and attach a reference run which can be used to filter interpreter overhead."""
self._validate()
assert is_python or not collect_baseline
*task_stats, baseline_stats = self._invoke(
task_spec=task_spec,
globals=globals,
number=number,
repeats=repeats,
collect_baseline=collect_baseline,
is_python=is_python,
retain_out_file=retain_out_file,
)
assert len(task_stats) == repeats
return tuple(
CallgrindStats(
task_spec=task_spec,
number_per_run=number,
built_with_debug_symbols=self._build_type == "RelWithDebInfo",
baseline_inclusive_stats=baseline_stats[0],
baseline_exclusive_stats=baseline_stats[1],
stmt_inclusive_stats=stmt_inclusive_stats,
stmt_exclusive_stats=stmt_exclusive_stats,
stmt_callgrind_out=out_contents,
)
for stmt_inclusive_stats, stmt_exclusive_stats, out_contents in task_stats
)
def _invoke(
self,
*,
task_spec: common.TaskSpec,
globals: Dict[str, Any],
number: int,
repeats: int,
collect_baseline: bool,
is_python: bool,
retain_out_file: bool,
) -> Tuple[Tuple[FunctionCounts, FunctionCounts, Optional[str]], ...]:
"""Core invocation method for Callgrind collection.
Valgrind operates by effectively replacing the CPU with an emulated
version which allows it to instrument any code at the cost of severe
performance degradation. This has the practical effect that in order
to collect Callgrind statistics, a new process has to be created
running under `valgrind`. The steps for this process are:
1) Create a scratch directory.
2) Codegen a run script. (_ValgrindWrapper._construct_script)
Inside the run script:
* Validate that Python and torch match the parent process
* Validate that it is indeed running under valgrind
* Execute `setup` and warm up `stmt`
* Begin collecting stats
* Run the `stmt` loop
* Stop collecting stats
3) Parse the run results.
4) Cleanup the scratch directory.
"""
working_dir = common._make_temp_dir(prefix="callgrind")
data_dir = os.path.join(working_dir, "data")
script_file = os.path.join(working_dir, "timer_callgrind.py")
callgrind_out = os.path.join(working_dir, "callgrind.out")
error_log = os.path.join(working_dir, "error.txt")
stat_log = os.path.join(working_dir, "callgrind_stat.txt")
stdout_stderr_log = os.path.join(working_dir, "stdout_stderr.log")
def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]:
# https://thraxil.org/users/anders/posts/2008/03/13/Subprocess-Hanging-PIPE-is-your-enemy/
f_stdout_stderr = open(stdout_stderr_log, "wb")
try:
invocation = subprocess.run(
args,
stdout=f_stdout_stderr,
stderr=subprocess.STDOUT,
**kwargs,
)
with open(stdout_stderr_log) as f:
return invocation, f.read()
finally:
f_stdout_stderr.close()
try:
if is_python:
if self._bindings_module is not None:
shutil.copy(
self._bindings_module.__file__,
os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1])
)
script_file = os.path.join(working_dir, "timer_callgrind.py")
with open(script_file, "w") as f:
f.write(self._construct_script(
task_spec,
globals=GlobalsBridge(globals, data_dir),
number=number,
repeats=repeats,
collect_baseline=collect_baseline,
error_log=error_log,
stat_log=stat_log,
bindings=self._bindings_module))
run_loop_cmd = ["python", script_file]
else:
assert not collect_baseline
run_loop_exec = cpp_jit.compile_callgrind_template(
stmt=task_spec.stmt,
setup=task_spec.setup,
global_setup=task_spec.global_setup,
)
run_loop_cmd = [
run_loop_exec,
"--number", str(number),
"--number-warmup", str(min(number, 10)),
"--repeats", str(repeats),
"--number-threads", str(task_spec.num_threads),
]
valgrind_invocation, valgrind_invocation_output = run([
"valgrind",
"--tool=callgrind",
f"--callgrind-out-file={callgrind_out}",
"--dump-line=yes",
"--dump-instr=yes",
"--instr-atstart=yes",
"--collect-atstart=no",
] + run_loop_cmd)
if valgrind_invocation.returncode:
error_report = ""
if os.path.exists(error_log):
with open(error_log) as f:
error_report = f.read()
if not error_report:
error_report = "Unknown error.\n" + valgrind_invocation_output
raise OSError(f"Failed to collect callgrind profile:\n{error_report}")
def parse_output(fpath: str, inclusive: bool) -> FunctionCounts:
annotate_invocation, annotate_invocation_output = run([
"callgrind_annotate",
f"--inclusive={'yes' if inclusive else 'no'}",
"--threshold=100",
"--show-percs=no",
fpath
], check=True)
total_pattern = re.compile(r"^([0-9,]+)\s+PROGRAM TOTALS")
begin_pattern = re.compile(r"Ir\s+file:function")
function_pattern = re.compile(r"^\s*([0-9,]+)\s+(.+:.+)$")
class ScanState(enum.Enum):
SCANNING_FOR_TOTAL = 0
SCANNING_FOR_START = 1
PARSING = 2
scan_state = ScanState.SCANNING_FOR_TOTAL
fn_counts = []
for l in annotate_invocation_output.splitlines(keepends=False):
if scan_state == ScanState.SCANNING_FOR_TOTAL:
total_match = total_pattern.match(l)
if total_match:
program_totals = int(total_match.groups()[0].replace(",", ""))
scan_state = ScanState.SCANNING_FOR_START
elif scan_state == ScanState.SCANNING_FOR_START:
if begin_pattern.match(l):
scan_state = ScanState.PARSING
else:
assert scan_state == ScanState.PARSING
fn_match = function_pattern.match(l)
if fn_match:
ir_str, file_function = fn_match.groups()
ir = int(ir_str.replace(",", ""))
if ir == program_totals: # type: ignore[possibly-undefined]
# Callgrind includes some top level red herring symbols when
# a program dumps multiple profiles.
continue
fn_counts.append(FunctionCount(ir, file_function))
elif re.match(r"-+", l):
# Ignore heading separator lines.
continue
else:
break
assert scan_state == ScanState.PARSING, f"Failed to parse {fpath}"
return FunctionCounts(tuple(sorted(fn_counts, reverse=True)), inclusive=inclusive)
def read_results(i: int) -> Tuple[FunctionCounts, FunctionCounts, Optional[str]]:
if i == repeats and not collect_baseline:
# Null baseline.
return (
FunctionCounts((), inclusive=True),
FunctionCounts((), inclusive=False),
None,
)
fpath = f"{callgrind_out}.{i + 1}" # Callgrind one-indexes files.
callgrind_out_contents: Optional[str] = None
if retain_out_file:
with open(fpath) as f:
callgrind_out_contents = f.read()
return (
parse_output(fpath, inclusive=True),
parse_output(fpath, inclusive=False),
callgrind_out_contents
)
return tuple(read_results(i) for i in range(repeats + 1))
finally:
shutil.rmtree(working_dir)
@staticmethod
def _construct_script(
task_spec: common.TaskSpec,
globals: GlobalsBridge,
*,
number: int,
repeats: int,
collect_baseline: bool,
error_log: str,
stat_log: str,
bindings: Optional[CallgrindModuleType],
) -> str:
def block_stmt(stmt: str, indent: int = 0) -> str:
"""Partially unroll benchmark loop.
The naive template looks something like:
"for _ in range({number}): {stmt}"
            However, a loop in Python is surprisingly expensive, and significantly
increases the number of background Python instructions. So instead we
partially unroll the loops, with a block size of 100 chosen to keep
the instruction overhead from `range` low while also not ballooning
the size of the generated file.
"""
block_size = 100
loop_count = number // block_size
if loop_count == 1:
# There is no point in having `for _ in range(1): ...` rather
                # than just `...`, and this lets us shave off a few background
# instructions.
loop_count = 0
remainder = number - block_size * loop_count
blocked_stmt = ""
if loop_count:
unrolled_stmts = textwrap.indent("\n".join([stmt] * block_size), " " * 4)
blocked_stmt += f"for _ in range({loop_count}):\n{unrolled_stmts}\n"
if remainder:
blocked_stmt += "\n".join([stmt] * remainder)
return textwrap.indent(blocked_stmt, " " * indent)
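        # Illustrative only: with number=250 and the block size of 100 above,
        # `block_stmt` generates
        #     for _ in range(2):
        #         <stmt repeated 100 times>
        #     <stmt repeated 50 times>
        # i.e. two blocks of 100 plus a remainder of 50 unrolled statements.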
pass_baseline = (
"callgrind_bindings._valgrind_toggle()\n"
f"{block_stmt('pass')}\n"
"callgrind_bindings._valgrind_toggle_and_dump_stats()"
)
return textwrap.dedent(r"""
import gc
import os
import pickle
import subprocess
import sys
import time
# Mitigate https://github.com/pytorch/pytorch/issues/37377
# which can sometimes cause the subprocess call to fail.
import numpy as np
import torch
torch.set_num_threads({num_threads})
{bindings_import}
PID = os.getpid()
def log_failure(msg):
with open({error_log_repr}, "wt") as f:
f.write(msg)
sys.exit(1)
def check_result(completed_process):
if completed_process.returncode:
log_failure(f"Command failed: {{' '.join(completed_process.args)}}")
return completed_process
# =============================================================================
# == Check that subprocess matches parent =====================================
# =============================================================================
if os.path.realpath(sys.executable) != "{parent_interpreter}":
log_failure(
"Interpreter mismatch:\n"
f" {{os.path.realpath(sys.executable)}}\n vs.\n {parent_interpreter}"
)
if torch.__file__ != "{torch_file}":
log_failure(
"PyTorch does not match expected file:\n"
f" {{torch.__file__}}\n vs.\n {torch_file}"
)
# =============================================================================
# == User specified setup =====================================================
# =============================================================================
# Load serialized globals
{load_globals}
# User setup str
{setup}
for _ in range({warmup_number}):
{indented_stmt}
# =============================================================================
# == Callgrind management =====================================================
# =============================================================================
with open("{stat_log}", "wb") as stat_file:
# If many instances of callgrind are running at once, the output of
# `callgrind_control` may exceed 16kb which would cause `subprocess.PIPE`
# to deadlock. So instead we use a file.
callgrind_stat = check_result(subprocess.run(
["callgrind_control", "--stat"],
stdout=stat_file,
stderr=subprocess.STDOUT,
))
with open("{stat_log}", "rt") as stat_file:
stat_lines = stat_file.read().splitlines()
if f"PID {{PID}}: python {{__file__}}" not in stat_lines:
log_failure("Process does not appear to be running callgrind.")
gc.collect()
time.sleep(0.01)
# =============================================================================
# == User code block ==========================================================
# =============================================================================
for _ in range({repeats}):
callgrind_bindings._valgrind_toggle()
{blocked_stmt}
callgrind_bindings._valgrind_toggle_and_dump_stats()
gc.collect()
{baseline}
""").strip().format(
indented_stmt=textwrap.indent(task_spec.stmt, " " * 4),
blocked_stmt=block_stmt(task_spec.stmt, indent=4),
baseline=(pass_baseline if collect_baseline else ""),
number=number,
repeats=repeats,
load_globals=globals.construct(),
setup=task_spec.setup,
warmup_number=min(number, 10),
num_threads=task_spec.num_threads,
error_log_repr=repr(error_log),
stat_log=stat_log,
parent_interpreter=os.path.realpath(sys.executable),
torch_file=torch.__file__,
bindings_import=(
"import torch._C as callgrind_bindings" if bindings is None
else f"import {bindings.__name__} as callgrind_bindings"),
)
CALLGRIND_SINGLETON: Optional[_ValgrindWrapper] = None
def wrapper_singleton() -> _ValgrindWrapper:
global CALLGRIND_SINGLETON
if CALLGRIND_SINGLETON is None:
CALLGRIND_SINGLETON = _ValgrindWrapper()
return CALLGRIND_SINGLETON
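if __name__ == "__main__":
    # A hedged, self-contained sketch, not part of the library API: the public
    # entry point is `torch.utils.benchmark.Timer.collect_callgrind`, which
    # routes through `wrapper_singleton()` above. Running it requires valgrind
    # and callgrind_annotate to be installed; the workload below is arbitrary.
    from torch.utils.benchmark import Timer

    example_stats = Timer(
        stmt="torch.ones((8, 8)) + torch.ones((8, 8))",
    ).collect_callgrind(number=10)
    print(example_stats.as_standardized().stats(inclusive=False))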

File diff suppressed because it is too large