I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,20 @@
from ctypes import c_void_p
from torch import Tensor
# Defined in torch/csrc/inductor/aoti_runner/pybind.cpp
# Tensor to AtenTensorHandle
def unsafe_alloc_void_ptrs_from_tensors(tensors: list[Tensor]) -> list[c_void_p]: ...
def unsafe_alloc_void_ptr_from_tensor(tensor: Tensor) -> c_void_p: ...
# AtenTensorHandle to Tensor
def alloc_tensors_by_stealing_from_void_ptrs(
handles: list[c_void_p],
) -> list[Tensor]: ...
def alloc_tensor_by_stealing_from_void_ptr(
handle: c_void_p,
) -> Tensor: ...
class AOTIModelContainerRunnerCpu: ...
class AOTIModelContainerRunnerCuda: ...
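
A hedged sketch of round-tripping tensors through AtenTensorHandle void pointers, assuming these bindings are importable as torch._C._aoti (an internal API that may change without notice):

import torch
from torch._C import _aoti  # assumption: internal module name

tensors = [torch.randn(2, 3), torch.ones(4)]
handles = _aoti.unsafe_alloc_void_ptrs_from_tensors(tensors)        # Tensor -> AtenTensorHandle
restored = _aoti.alloc_tensors_by_stealing_from_void_ptrs(handles)  # handles are consumed ("stolen")
print([t.shape for t in restored])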


@@ -0,0 +1,135 @@
# mypy: allow-untyped-defs
from enum import Enum
from typing import Any, Callable
import torch
from torch._C._profiler import (
_ProfilerEvent,
ActiveProfilerType,
ProfilerActivity,
ProfilerConfig,
)
# Defined in torch/csrc/autograd/init.cpp
class DeviceType(Enum):
CPU = ...
CUDA = ...
XPU = ...
MKLDNN = ...
OPENGL = ...
OPENCL = ...
IDEEP = ...
HIP = ...
FPGA = ...
MAIA = ...
XLA = ...
MTIA = ...
MPS = ...
HPU = ...
Meta = ...
Vulkan = ...
Metal = ...
PrivateUse1 = ...
class ProfilerEvent:
def cpu_elapsed_us(self, other: ProfilerEvent) -> float: ...
def cpu_memory_usage(self) -> int: ...
def cuda_elapsed_us(self, other: ProfilerEvent) -> float: ...
def privateuse1_elapsed_us(self, other: ProfilerEvent) -> float: ...
def cuda_memory_usage(self) -> int: ...
def device(self) -> int: ...
def handle(self) -> int: ...
def has_cuda(self) -> bool: ...
def is_remote(self) -> bool: ...
def kind(self) -> int: ...
def name(self) -> str: ...
def node_id(self) -> int: ...
def sequence_nr(self) -> int: ...
def shapes(self) -> list[list[int]]: ...
def thread_id(self) -> int: ...
def flops(self) -> float: ...
def is_async(self) -> bool: ...
class _KinetoEvent:
def name(self) -> str: ...
def device_index(self) -> int: ...
def device_resource_id(self) -> int: ...
def start_ns(self) -> int: ...
def end_ns(self) -> int: ...
def duration_ns(self) -> int: ...
def is_async(self) -> bool: ...
def linked_correlation_id(self) -> int: ...
def shapes(self) -> list[list[int]]: ...
def dtypes(self) -> list[str]: ...
def concrete_inputs(self) -> list[Any]: ...
def kwinputs(self) -> dict[str, Any]: ...
def device_type(self) -> DeviceType: ...
def start_thread_id(self) -> int: ...
def end_thread_id(self) -> int: ...
def correlation_id(self) -> int: ...
def fwd_thread_id(self) -> int: ...
def stack(self) -> list[str]: ...
def scope(self) -> int: ...
def sequence_nr(self) -> int: ...
def flops(self) -> int: ...
def cuda_elapsed_us(self) -> int: ...
def privateuse1_elapsed_us(self) -> int: ...
def is_user_annotation(self) -> bool: ...
class _ProfilerResult:
def events(self) -> list[_KinetoEvent]: ...
def legacy_events(self) -> list[list[ProfilerEvent]]: ...
def save(self, path: str) -> None: ...
def experimental_event_tree(self) -> list[_ProfilerEvent]: ...
def trace_start_ns(self) -> int: ...
class SavedTensor: ...
def _enable_profiler(
config: ProfilerConfig,
activities: set[ProfilerActivity],
) -> None: ...
def _prepare_profiler(
config: ProfilerConfig,
activities: set[ProfilerActivity],
) -> None: ...
def _toggle_collection_dynamic(
enable: bool,
activities: set[ProfilerActivity],
) -> None: ...
def _disable_profiler() -> _ProfilerResult: ...
def _profiler_enabled() -> bool: ...
def _add_metadata_json(key: str, value: str) -> None: ...
def _kineto_step() -> None: ...
def _get_current_graph_task_keep_graph() -> bool: ...
def _get_sequence_nr() -> int: ...
def kineto_available() -> bool: ...
def _record_function_with_args_enter(name: str, *args) -> torch.Tensor: ...
def _record_function_with_args_exit(handle: torch.Tensor) -> None: ...
def _supported_activities() -> set[ProfilerActivity]: ...
def _enable_record_function(enable: bool) -> None: ...
def _set_empty_test_observer(is_global: bool, sampling_prob: float) -> None: ...
def _push_saved_tensors_default_hooks(
pack_hook: Callable[[torch.Tensor], Any],
unpack_hook: Callable[[Any], torch.Tensor],
) -> None: ...
def _pop_saved_tensors_default_hooks() -> None: ...
def _unsafe_set_version_counter(t: torch.Tensor, prev_version: int) -> None: ...
def _enable_profiler_legacy(config: ProfilerConfig) -> None: ...
def _disable_profiler_legacy() -> list[list[ProfilerEvent]]: ...
def _profiler_type() -> ActiveProfilerType: ...
def _saved_tensors_hooks_enable() -> None: ...
def _saved_tensors_hooks_disable(message: str) -> None: ...
def _saved_tensors_hooks_get_disabled_error_message() -> str | None: ...
def _saved_tensors_hooks_set_tracing(is_tracing: bool) -> bool: ...
class CreationMeta(Enum):
DEFAULT = ...
IN_CUSTOM_FUNCTION = ...
MULTI_OUTPUT_NODE = ...
NO_GRAD_MODE = ...
INFERENCE_MODE = ...
def _set_creation_meta(t: torch.Tensor, creation_meta: CreationMeta) -> None: ...
def _get_creation_meta(t: torch.Tensor) -> CreationMeta: ...
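
These bindings are normally reached through the public torch.profiler wrapper rather than called directly; for example:

import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    torch.mm(torch.randn(64, 64), torch.randn(64, 64))

# The table is built from the _KinetoEvent records declared above.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))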


@@ -0,0 +1,12 @@
from torch.types import _bool, _int
# Defined in torch/csrc/cpu/Module.cpp
def _is_avx2_supported() -> _bool: ...
def _is_avx512_supported() -> _bool: ...
def _is_avx512_vnni_supported() -> _bool: ...
def _is_avx512_bf16_supported() -> _bool: ...
def _is_amx_tile_supported() -> _bool: ...
def _init_amx() -> _bool: ...
def _L1d_cache_size() -> _int: ...
def _L2_cache_size() -> _int: ...
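
An illustrative ISA query through these bindings, assuming they are exposed as torch._C._cpu (internal, subject to change):

import torch
from torch._C import _cpu  # assumption: internal module name

print("AVX2 supported:", _cpu._is_avx2_supported())
print("AVX-512 supported:", _cpu._is_avx512_supported())
print("L2 cache size (bytes):", _cpu._L2_cache_size())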


@@ -0,0 +1,17 @@
from enum import Enum
from torch.types import _bool, Tuple
# Defined in torch/csrc/cuda/shared/cudnn.cpp
is_cuda: _bool
def getRuntimeVersion() -> Tuple[int, int, int]: ...
def getCompileVersion() -> Tuple[int, int, int]: ...
def getVersionInt() -> int: ...
class RNNMode(int, Enum):
value: int
rnn_relu = ...
rnn_tanh = ...
lstm = ...
gru = ...
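
In practice these are consumed through torch.backends.cudnn, e.g.:

import torch

if torch.backends.cudnn.is_available():
    print("cuDNN version:", torch.backends.cudnn.version())  # single int, cf. getVersionInt()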


@@ -0,0 +1 @@
def getVersionInt() -> int: ...


@@ -0,0 +1,27 @@
# mypy: allow-untyped-defs
from typing import Any
import torch
# This module is defined in torch/csrc/distributed/autograd/init.cpp
class DistAutogradContext:
def _context_id(self) -> int: ...
def _recv_functions(self) -> dict[int, Any]: ...
def _send_functions(self) -> dict[int, Any]: ...
def _known_worker_ids(self) -> set[int]: ...
def _new_context() -> DistAutogradContext: ...
def _release_context(context_id: int) -> None: ...
def _get_max_id() -> int: ...
def _is_valid_context(worker_id: int) -> bool: ...
def _retrieve_context(context_id: int) -> DistAutogradContext: ...
def _current_context() -> DistAutogradContext: ...
def _init(worker_id: int) -> None: ...
def _get_debug_info() -> dict[str, str]: ...
def backward(
context_id: int,
roots: list[torch.Tensor],
retain_graph=False,
) -> None: ...
def get_gradients(context_id: int) -> dict[torch.Tensor, torch.Tensor]: ...
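
The public entry point for these bindings is torch.distributed.autograd; a minimal sketch (model and inputs are hypothetical placeholders, and an RPC agent must already be initialized):

import torch.distributed.autograd as dist_autograd

with dist_autograd.context() as context_id:          # creates and releases a DistAutogradContext
    loss = model(inputs).sum()                        # hypothetical model/inputs
    dist_autograd.backward(context_id, [loss])
    grads = dist_autograd.get_gradients(context_id)   # dict: Tensor -> gradient Tensor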


@@ -0,0 +1,699 @@
# mypy: allow-untyped-defs
# mypy: disable-error-code="type-arg"
from datetime import timedelta
from enum import Enum
from typing import Any, Optional, overload
import torch
from torch import Tensor
from torch._C import ScriptObject
from torch.futures import Future
# This module is defined in torch/csrc/distributed/c10d/init.cpp
_DEFAULT_FIRST_BUCKET_BYTES: int
_DEFAULT_NO_TIMEOUT: timedelta
_DEFAULT_PG_TIMEOUT: timedelta
_DEFAULT_PG_NCCL_TIMEOUT: timedelta
class BuiltinCommHookType(Enum):
ALLREDUCE = ...
FP16_COMPRESS = ...
def _register_comm_hook(reducer: Reducer, state: Any, comm_hook: Any): ...
def _register_builtin_comm_hook(
reducer: Reducer,
comm_hook_type: BuiltinCommHookType,
): ...
def _set_global_rank(rank: int) -> None: ...
def _hash_tensors(tensors: list[Tensor]) -> int: ...
class GradBucket:
def index(self) -> int: ...
def buffer(self) -> Tensor: ...
def gradients(self) -> list[Tensor]: ...
def is_last(self) -> bool: ...
def set_buffer(self, tensor: Tensor) -> None: ...
def parameters(self) -> list[Tensor]: ...
class Reducer:
def __init__(
self,
params: list[Tensor],
bucket_indices: list[list[int]],
per_bucket_size_limits: list[int],
process_group: ProcessGroup,
expect_sparse_gradients: list[bool] = ...,
bucket_bytes_cap: int = ..., # kDefaultBucketBytesCap in reducer.hpp
find_unused_parameters: bool = ...,
gradient_as_bucket_view: bool = ...,
param_to_name_mapping: dict[int, str] = ...,
first_bucket_bytes_cap: int = ..., # kDefaultFirstBucketBytes in reducer.hpp
) -> None: ...
def prepare_for_forward(self) -> None: ...
def prepare_for_backward(self, output: list[Tensor]) -> None: ...
def get_backward_stats(self) -> list[int]: ...
def _install_post_backward_futures(self, futures: list[Future]) -> None: ...
def _rebuild_buckets(self) -> bool: ...
def _get_zeros_like_grad_buckets(self) -> list[GradBucket]: ...
def _push_all_rebuilt_params(self) -> None: ...
def _set_forward_pass_work_handle(
self,
work: Work,
use_static_world_size: bool,
): ...
def _get_local_used_map(self) -> Tensor: ...
def _set_ddp_runtime_logging_sample_rate(self, sample_rate: int) -> None: ...
def _set_static_graph(self) -> None: ...
def _run_comm_hook(self, bucket: GradBucket) -> Future: ...
def set_logger(self, logger: Logger) -> None: ...
def _remove_autograd_hooks(self) -> None: ...
def _check_reducer_finalized(self) -> None: ...
def _set_sparse_metadata(self, global_unique_ids: dict[str, Tensor]) -> None: ...
def _reset_state(self) -> None: ...
def _update_process_group(self, new_process_group: ProcessGroup) -> None: ...
class DDPLoggingData:
strs_map: dict[str, str]
ints_map: dict[str, int]
class Logger:
def __init__(self, reducer: Reducer) -> None: ...
def set_construction_data_and_log(
self,
module_name: str,
device_ids: list[int],
output_device: int,
broadcast_buffers: bool,
has_sync_bn: bool,
static_graph: bool,
): ...
def set_runtime_stats_and_log(self) -> None: ...
def set_error_and_log(self, error: str) -> None: ...
def _get_ddp_logging_data(self) -> DDPLoggingData: ...
def _set_comm_hook_name(self, comm_hook: str) -> None: ...
def _set_uneven_input_join(self) -> None: ...
def _set_static_graph(self) -> None: ...
class _WorkerServer:
def __init__(self, socket_path: str) -> None: ...
def shutdown(self) -> None: ...
def get_debug_level(): ...
def set_debug_level(): ...
def set_debug_level_from_env(): ...
class DebugLevel(Enum):
OFF = ...
INFO = ...
DETAIL = ...
class ReduceOp:
def __init__(self, op: RedOpType) -> None: ...
SUM: RedOpType = ...
AVG: RedOpType = ...
PRODUCT: RedOpType = ...
MIN: RedOpType = ...
MAX: RedOpType = ...
BAND: RedOpType = ...
BOR: RedOpType = ...
BXOR: RedOpType = ...
PREMUL_SUM: RedOpType = ...
UNUSED: RedOpType = ...
class RedOpType(Enum): ...
class BroadcastOptions:
rootRank: int
rootTensor: int
timeout: timedelta
asyncOp: bool
class AllreduceOptions:
reduceOp: ReduceOp
timeout: timedelta
class AllreduceCoalescedOptions(AllreduceOptions): ...
class ReduceOptions:
reduceOp: ReduceOp
rootRank: int
rootTensor: int
timeout: timedelta
class AllgatherOptions:
timeout: timedelta
asyncOp: bool
class GatherOptions:
rootRank: int
timeout: timedelta
class ScatterOptions:
rootRank: int
timeout: timedelta
asyncOp: bool
class ReduceScatterOptions:
reduceOp: ReduceOp
timeout: timedelta
asyncOp: bool
class BarrierOptions:
device_ids: list[int]
device: torch.device
timeout: timedelta
class AllToAllOptions:
timeout: timedelta
class Store:
def set(self, key: str, value: str): ...
def get(self, key: str) -> bytes: ...
def add(self, key: str, value: int) -> int: ...
def compare_set(
self,
key: str,
expected_value: str,
desired_value: str,
) -> bytes: ...
def delete_key(self, key: str) -> bool: ...
def num_keys(self) -> int: ...
def set_timeout(self, timeout: timedelta): ...
@overload
def wait(self, keys: list[str]): ...
@overload
def wait(self, keys: list[str], timeout: timedelta): ...
class FileStore(Store):
def __init__(self, path: str, numWorkers: int = ...) -> None: ...
class HashStore(Store):
def __init__(self) -> None: ...
class TCPStore(Store):
def __init__(
self,
host_name: str,
port: int,
world_size: int | None = ...,
is_master: bool = ...,
timeout: timedelta = ...,
wait_for_workers: bool = ...,
multi_tenant: bool = ...,
master_listen_fd: int | None = ...,
use_libuv: bool | None = ...,
) -> None: ...
@property
def host(self) -> str: ...
@property
def port(self) -> int: ...
class PrefixStore(Store):
def __init__(self, prefix: str, store: Store) -> None: ...
@property
def underlying_store(self) -> Store: ...
class _ControlCollectives:
def barrier(self, key: str, timeout: timedelta, blocking: bool) -> None: ...
def broadcast_send(self, key: str, data: str, timeout: timedelta) -> None: ...
def broadcast_recv(self, key: str, timeout: timedelta) -> str: ...
def gather_send(self, key: str, data: str, timeout: timedelta) -> None: ...
def gather_recv(self, key: str, timeout: timedelta) -> str: ...
def scatter_send(self, key: str, data: str, timeout: timedelta) -> None: ...
def scatter_recv(self, key: str, timeout: timedelta) -> str: ...
def all_gather(self, key: str, data: str, timeout: timedelta) -> str: ...
def all_sum(self, key: str, data: int, timeout: timedelta) -> int: ...
class _StoreCollectives(_ControlCollectives):
def __init__(self, store: Store, rank: int, world_size: int) -> None: ...
class _DistributedBackendOptions:
def __init__(self) -> None: ...
@property
def store(self) -> Store: ...
@store.setter
def store(self, store: Store) -> None: ...
@property
def group_rank(self) -> int: ...
@group_rank.setter
def group_rank(self, rank: int) -> None: ...
@property
def group_size(self) -> int: ...
@group_size.setter
def group_size(self, size: int) -> None: ...
@property
def timeout(self) -> timedelta: ...
@timeout.setter
def timeout(self, timeout: timedelta) -> None: ...
@property
def group_id(self) -> str: ...
@group_id.setter
def group_id(self, group_id: str) -> None: ...
@property
def global_ranks_in_group(self) -> list[int]: ...
@global_ranks_in_group.setter
def global_ranks_in_group(self, ranks: list[int]) -> None: ...
class Work:
def is_completed(self) -> bool: ...
def is_success(self) -> bool: ...
def exception(self) -> Any: ...
def wait(self, timeout: timedelta = ...) -> bool: ...
def get_future(self) -> Future: ...
def source_rank(self) -> int: ...
def _source_rank(self) -> int: ...
def result(self) -> list[Tensor]: ...
def synchronize(self): ...
def boxed(self) -> ScriptObject: ...
@staticmethod
def unbox(obj: ScriptObject) -> Work: ...
class Backend:
class Options:
def __init__(self, backend: str, timeout: timedelta = ...) -> None: ...
@property
def backend(self) -> str: ...
@property
def _timeout(self) -> timedelta: ...
@_timeout.setter
def _timeout(self, val: timedelta) -> None: ...
def __init__(
self,
rank: int,
size: int,
) -> None: ...
@property
def supports_splitting(self) -> bool: ...
@property
def options(self) -> Options: ...
def rank(self) -> int: ...
def size(self) -> int: ...
def eager_connect_single_device(self, device: torch.device | None) -> None: ...
def _set_sequence_number_for_group(self) -> None: ...
def _set_default_timeout(self, timeout: timedelta) -> None: ...
class ProcessGroup:
class Options:
def __init__(self, backend: str, timeout: timedelta = ...) -> None: ...
@property
def backend(self) -> str: ...
@property
def _timeout(self) -> timedelta: ...
@_timeout.setter
def _timeout(self, val: timedelta) -> None: ...
class BackendType(Enum):
UNDEFINED = ...
GLOO = ...
NCCL = ...
UCC = ...
MPI = ...
CUSTOM = ...
def __init__(
self,
store: Store,
rank: int,
size: int,
options: Options,
) -> None: ...
def rank(self) -> int: ...
def size(self) -> int: ...
@overload
def broadcast(
self,
tensors: list[Tensor],
opts=...,
) -> Work: ...
@overload
def broadcast(
self,
tensor: Tensor,
root: int,
) -> Work: ...
@overload
def allreduce(
self,
tensors: list[Tensor],
opts: AllreduceOptions = ...,
) -> Work: ...
@overload
def allreduce(
self,
tensors: list[Tensor],
op=...,
) -> Work: ...
@overload
def allreduce(
self,
tensor: Tensor,
op=...,
) -> Work: ...
def allreduce_coalesced(
self,
tensors: list[Tensor],
opts=...,
) -> Work: ...
def reduce_scatter_tensor_coalesced(
self,
outputTensors: list[Tensor],
inputTensors: list[Tensor],
opts: ReduceScatterOptions | None = None,
) -> Work: ...
@overload
def reduce(
self,
tensors: list[Tensor],
opts=...,
) -> Work: ...
@overload
def reduce(
self,
tensor: Tensor,
root: int,
op=...,
) -> Work: ...
@overload
def allgather(
self,
output_tensors: list[list[Tensor]],
input_tensors: list[Tensor],
opts=...,
) -> Work: ...
@overload
def allgather(
self,
output_tensors: list[Tensor],
input_tensor: Tensor,
) -> Work: ...
def _allgather_base(
self,
output: Tensor,
input: Tensor,
opts=...,
) -> Work: ...
def allgather_coalesced(
self,
output_lists: list[list[Tensor]],
input_list: list[Tensor],
opts=...,
) -> Work: ...
def allgather_into_tensor_coalesced(
self,
output_lists: list[Tensor],
input_list: list[Tensor],
opts=...,
) -> Work: ...
@overload
def gather(
self,
output_tensors: list[list[Tensor]],
input_tensors: list[Tensor],
opts=...,
) -> Work: ...
@overload
def gather(
self,
output_tensors: list[Tensor],
input_tensor: Tensor,
root: int,
) -> Work: ...
@overload
def scatter(
self,
output_tensors: list[Tensor],
input_tensors: list[list[Tensor]],
opts=...,
) -> Work: ...
@overload
def scatter(
self,
output_tensor: Tensor,
input_tensors: list[Tensor],
root: int,
) -> Work: ...
@overload
def reduce_scatter(
self,
output_tensors: list[Tensor],
input_tensors: list[list[Tensor]],
opts=...,
) -> Work: ...
@overload
def reduce_scatter(
self,
output_tensors: Tensor,
input_tensor: list[Tensor],
) -> Work: ...
def _reduce_scatter_base(
self,
outputTensor: Tensor,
inputTensor: Tensor,
opts: ReduceScatterOptions | None,
) -> Work: ...
@overload
def alltoall_base(
self,
output_tensor: Tensor,
input_tensor: Tensor,
output_split_sizes: list[int],
input_split_sizes: list[int],
opts=...,
) -> Work: ...
@overload
def alltoall_base(
self,
output: Tensor,
input: Tensor,
output_split_sizes: list[int],
input_split_sizes: list[int],
) -> Work: ...
@overload
def alltoall(
self,
output_tensor: list[Tensor],
input_tensor: list[Tensor],
opts=...,
) -> Work: ...
@overload
def alltoall(
self,
output: list[Tensor],
input: list[Tensor],
) -> Work: ...
def send(
self,
tensors: list[Tensor],
dstRank: int,
tag: int,
) -> Work: ...
def recv(
self,
tensors: list[Tensor],
srcRank: int,
tag: int,
) -> Work: ...
def recv_anysource(self, tensors: list[Tensor], tag: int) -> Work: ...
def barrier(self, opts=...) -> Work: ...
def boxed(self) -> ScriptObject: ...
@staticmethod
def unbox(obj: ScriptObject) -> ProcessGroup: ...
def _start_coalescing(self, device: torch.device) -> None: ...
def _end_coalescing(self, device: torch.device) -> Work: ...
def _get_backend_name(self) -> str: ...
def _backend_id(self, backend_type: BackendType) -> int: ...
@property
def _device_types(self) -> list[torch.device]: ...
def _get_backend(self, device: torch.device) -> Backend: ...
def _register_backend(
self,
device: torch.device,
backend_type: BackendType,
backend: Backend | None,
) -> None: ...
def _set_group_name(self, name: str) -> None: ...
def _set_group_desc(self, desc: str) -> None: ...
def name(self) -> str: ...
def _has_hooks(self) -> bool: ...
def _wait_for_pending_works(self) -> None: ...
def _set_sequence_number_for_group(self) -> None: ...
@property
def bound_device_id(self) -> torch.device | None: ...
@bound_device_id.setter
def bound_device_id(self, device: torch.device | None) -> None: ...
@property
def group_name(self) -> str: ...
@property
def group_desc(self) -> str: ...
class ProcessGroupGloo(Backend):
class Device: ...
class Options(ProcessGroup.Options):
devices: list[ProcessGroupGloo.Device]
threads: int
def __init__(self): ...
def __init__(
self,
store: Store,
rank: int,
size: int,
timeout: timedelta,
) -> None: ...
@staticmethod
def create_device(hostname="", interface="") -> Device: ...
@staticmethod
def create_default_device() -> Device: ...
def _set_default_timeout(self, timeout) -> None: ...
class _ProcessGroupWrapper(Backend):
def __init__(self, pg: Backend, gloo_pg: ProcessGroupGloo) -> None: ...
wrapped_pg: Backend
class ProcessGroupNCCL(Backend):
class NCCLConfig:
blocking: int
cga_cluster_size: int
min_ctas: int
max_ctas: int
class Options(ProcessGroup.Options):
config: ProcessGroupNCCL.NCCLConfig
is_high_priority_stream: bool
split_from: ProcessGroupNCCL
split_color: int
global_ranks_in_group: list[int]
group_name: str
def __init__(self, is_high_priority_stream: bool = False): ...
def __init__(
self,
store: Store,
rank: int,
size: int,
options: Options,
) -> None: ...
def _group_start(self) -> None: ...
def _group_end(self) -> None: ...
def _set_default_timeout(self, timeout) -> None: ...
def _shutdown(self) -> None: ...
def perform_nocolor_split(self, device: torch.device) -> None: ...
def comm_split_count(self) -> int: ...
def _add_ephemeral_timeout(self, timeout: timedelta) -> None: ...
@property
def uid(self) -> int: ...
@property
def options(self) -> Options: ... # type: ignore[override]
class ProcessGroupUCC(Backend):
def __init__(
self,
store: Store,
rank: int,
size: int,
timeout: timedelta,
) -> None: ...
class ProcessGroupMPI(Backend):
def __init__(
self,
rank: int,
size: int,
pgComm: int,
) -> None: ...
@staticmethod
def create(ranks: list[int]) -> ProcessGroupMPI: ...
def _compute_bucket_assignment_by_size(
tensors: list[Tensor],
bucket_size_limits: list[int],
expect_sparse_gradient: list[bool] = ...,
tensor_indices: list[int] = ...,
) -> tuple[list[list[int]], list[int]]: ...
def _broadcast_coalesced(
process_group: ProcessGroup,
tensors: list[Tensor],
buffer_size: int,
src: int,
): ...
def _test_python_store(store: Store): ...
def _verify_params_across_processes(
process_group: ProcessGroup,
params: list[Tensor],
logger: Logger | None,
): ...
def _make_nccl_premul_sum(factor: float | list[Tensor]) -> ReduceOp: ...
def _register_process_group(
group_name: str,
process_group: ProcessGroup,
) -> None: ...
def _resolve_process_group(group_name: str) -> ProcessGroup: ...
def _register_work(tensor: torch.Tensor, work: Work) -> ProcessGroup: ...
def _unregister_all_process_groups() -> None: ...
def _unregister_process_group(group_name: str) -> None: ...
class _SymmetricMemory:
@staticmethod
def set_group_info(
group_name: str,
rank: int,
world_size: int,
store: Store,
) -> None: ...
@staticmethod
def empty_strided_p2p(
size: torch.types._size,
stride: torch.types._size,
dtype: torch.dtype,
device: torch.device,
group_name: str,
) -> torch.Tensor: ...
@property
def rank(self) -> int: ...
@property
def world_size(self) -> int: ...
@staticmethod
def rendezvous(tensor: torch.Tensor) -> _SymmetricMemory: ...
def get_buffer(
self,
rank: int,
sizes: torch.types._size,
dtype: torch.dtype,
storage_offset: int | None = 0,
) -> torch.Tensor: ...
def barrier(self, channel: int = 0) -> None: ...
def put_signal(self, dst_rank: int, channel: int = 0) -> None: ...
def wait_signal(self, src_rank: int, channel: int = 0) -> None: ...
class ProcessGroupCudaP2P(Backend):
class Options:
nccl_options: Optional[ProcessGroupNCCL.Options]
buffer_size: Optional[int]
def __init__(self) -> None: ...
def __init__(
self,
store: Store,
rank: int,
size: int,
options: ProcessGroupCudaP2P.Options,
) -> None: ...
def is_p2p_available(self) -> bool: ...
def get_buffer_size(self) -> int: ...
def stream(self) -> torch.cuda.Stream: ...
def intra_node_barrier(self) -> Work: ...
def get_p2p_buffer(
self,
rank: int,
sizes: torch.Size,
dtype: torch.dtype,
storage_offset: Optional[int] = 0,
) -> torch.Tensor: ...
def _shutdown(self) -> None: ...
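
Everyday use of these types goes through the torch.distributed frontend; a single-process gloo sketch:

import torch
import torch.distributed as dist

store = dist.TCPStore("127.0.0.1", 29500, world_size=1, is_master=True)
dist.init_process_group("gloo", store=store, rank=0, world_size=1)
t = torch.ones(4)
dist.all_reduce(t, op=dist.ReduceOp.SUM)   # dispatches to ProcessGroup.allreduce
dist.destroy_process_group()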


@@ -0,0 +1,188 @@
# mypy: allow-untyped-defs
# mypy: disable-error-code="type-arg"
from datetime import timedelta
from typing import Any, Generic, overload, TypeVar
import torch
from torch._C import Future
from torch._C._autograd import ProfilerEvent
from torch._C._distributed_c10d import Store
from torch._C._profiler import ProfilerConfig
# This module is defined in torch/csrc/distributed/rpc/init.cpp
_DEFAULT_INIT_METHOD: str
_DEFAULT_NUM_WORKER_THREADS: int
_UNSET_RPC_TIMEOUT: float
_DEFAULT_RPC_TIMEOUT_SEC: float
_T = TypeVar("_T")
class RpcBackendOptions:
rpc_timeout: float
init_method: str
def __init__(
self,
rpc_timeout: float = ...,
init_method: str = ...,
) -> None: ...
class WorkerInfo:
def __init__(self, name: str, worker_id: int) -> None: ...
@property
def name(self) -> str: ...
@property
def id(self) -> int: ...
def __eq__(self, other: object) -> bool: ...
class RpcAgent:
def join(self, shutdown: bool = False, timeout: float = 0): ...
def sync(self): ...
def shutdown(self): ...
@overload
def get_worker_info(self) -> WorkerInfo: ...
@overload
def get_worker_info(self, workerName: str) -> WorkerInfo: ...
def get_worker_infos(self) -> list[WorkerInfo]: ...
def _get_device_map(self, dst: WorkerInfo) -> dict[torch.device, torch.device]: ...
def get_debug_info(self) -> dict[str, str]: ...
def get_metrics(self) -> dict[str, str]: ...
class PyRRef(Generic[_T]):
def __init__(self, value: _T, type_hint: Any = None) -> None: ...
def is_owner(self) -> bool: ...
def confirmed_by_owner(self) -> bool: ...
def owner(self) -> WorkerInfo: ...
def owner_name(self) -> str: ...
def to_here(self, timeout: float = ...) -> _T: ...
def local_value(self) -> Any: ...
def rpc_sync(self, timeout: float = ...) -> Any: ...
def rpc_async(self, timeout: float = ...) -> Any: ...
def remote(self, timeout: float = ...) -> Any: ...
def _serialize(self) -> tuple: ...
@staticmethod
def _deserialize(tp: tuple) -> PyRRef: ...
def _get_type(self) -> type[_T]: ...
def _get_future(self) -> Future[_T]: ...
def _get_profiling_future(self) -> Future[_T]: ...
def _set_profiling_future(self, profilingFuture: Future[_T]): ...
class _TensorPipeRpcBackendOptionsBase(RpcBackendOptions):
num_worker_threads: int
device_maps: dict[str, dict[torch.device, torch.device]]
devices: list[torch.device]
def __init__(
self,
num_worker_threads: int,
_transports: list | None,
_channels: list | None,
rpc_timeout: float = ...,
init_method: str = ...,
device_maps: dict[str, dict[torch.device, torch.device]] = {}, # noqa: B006
devices: list[torch.device] = [], # noqa: B006
) -> None: ...
def _set_device_map(
self,
to: str,
device_map: dict[torch.device, torch.device],
): ...
class TensorPipeAgent(RpcAgent):
def __init__(
self,
store: Store,
name: str,
worker_id: int,
world_size: int | None,
opts: _TensorPipeRpcBackendOptionsBase,
reverse_device_maps: dict[str, dict[torch.device, torch.device]],
devices: list[torch.device],
) -> None: ...
def join(self, shutdown: bool = False, timeout: float = 0): ...
def shutdown(self): ...
@overload
def get_worker_info(self) -> WorkerInfo: ...
@overload
def get_worker_info(self, workerName: str) -> WorkerInfo: ...
@overload
def get_worker_info(self, id: int) -> WorkerInfo: ...
def get_worker_infos(self) -> list[WorkerInfo]: ...
def _get_device_map(self, dst: WorkerInfo) -> dict[torch.device, torch.device]: ...
def _update_group_membership(
self,
worker_info: WorkerInfo,
my_devices: list[torch.device],
reverse_device_map: dict[str, dict[torch.device, torch.device]],
is_join: bool,
): ...
def _get_backend_options(self) -> _TensorPipeRpcBackendOptionsBase: ...
@property
def is_static_group(self) -> bool: ...
@property
def store(self) -> Store: ...
def _is_current_rpc_agent_set() -> bool: ...
def _get_current_rpc_agent() -> RpcAgent: ...
def _set_and_start_rpc_agent(agent: RpcAgent): ...
def _reset_current_rpc_agent(): ...
def _delete_all_user_and_unforked_owner_rrefs(timeout: timedelta = ...): ...
def _destroy_rref_context(ignoreRRefLeak: bool): ...
def _rref_context_get_debug_info() -> dict[str, str]: ...
def _cleanup_python_rpc_handler(): ...
def _invoke_rpc_builtin(
dst: WorkerInfo,
opName: str,
rpcTimeoutSeconds: float,
*args: Any,
**kwargs: Any,
): ...
def _invoke_rpc_python_udf(
dst: WorkerInfo,
pickledPythonUDF: str,
tensors: list[torch.Tensor],
rpcTimeoutSeconds: float,
isAsyncExecution: bool,
): ...
def _invoke_rpc_torchscript(
dstWorkerName: str,
qualifiedNameStr: str,
argsTuple: tuple,
kwargsDict: dict,
rpcTimeoutSeconds: float,
isAsyncExecution: bool,
): ...
def _invoke_remote_builtin(
dst: WorkerInfo,
opName: str,
rpcTimeoutSeconds: float,
*args: Any,
**kwargs: Any,
): ...
def _invoke_remote_python_udf(
dst: WorkerInfo,
pickledPythonUDF: str,
tensors: list[torch.Tensor],
rpcTimeoutSeconds: float,
isAsyncExecution: bool,
): ...
def _invoke_remote_torchscript(
dstWorkerName: WorkerInfo,
qualifiedNameStr: str,
rpcTimeoutSeconds: float,
isAsyncExecution: bool,
*args: Any,
**kwargs: Any,
): ...
def get_rpc_timeout() -> float: ...
def enable_gil_profiling(flag: bool): ...
def _set_rpc_timeout(rpcTimeoutSeconds: float): ...
class RemoteProfilerManager:
@staticmethod
def set_current_profiling_key(key: str): ...
def _enable_server_process_global_profiler(new_config: ProfilerConfig): ...
def _disable_server_process_global_profiler() -> list[list[list[ProfilerEvent]]]: ...
def _set_profiler_node_id(default_node_id: int): ...
def _enable_jit_rref_pickle(): ...
def _disable_jit_rref_pickle(): ...
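
The public wrapper is torch.distributed.rpc; a minimal single-worker sketch:

import os
import torch
import torch.distributed.rpc as rpc

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
rpc.init_rpc("worker0", rank=0, world_size=1)                       # starts a TensorPipeAgent
fut = rpc.rpc_async("worker0", torch.add, args=(torch.ones(2), 1))
print(fut.wait())                                                    # tensor([2., 2.])
rpc.shutdown()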


@@ -0,0 +1,32 @@
import torch
from torch._C._distributed_c10d import Store
from torch._C._distributed_rpc import _TensorPipeRpcBackendOptionsBase, TensorPipeAgent
# This module is defined in torch/csrc/distributed/rpc/testing/init.cpp
class FaultyTensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
def __init__(
self,
num_worker_threads: int,
rpc_timeout: float,
init_method: str,
messages_to_fail: list[str],
messages_to_delay: dict[str, float],
num_fail_sends: int,
) -> None: ...
num_send_recv_threads: int
messages_to_fail: list[str]
messages_to_delay: dict[str, float]
num_fail_sends: int
class FaultyTensorPipeAgent(TensorPipeAgent):
def __init__(
self,
store: Store,
name: str,
rank: int,
world_size: int,
options: FaultyTensorPipeRpcBackendOptions,
reverse_device_maps: dict[str, dict[torch.device, torch.device]],
devices: list[torch.device],
) -> None: ...


@@ -0,0 +1,11 @@
from typing import AnyStr
from torch import Tensor
class UndefinedGrad:
def __init__(self) -> None: ...
def __call__(self, *inputs: Tensor) -> list[Tensor]: ...
class DelayedError:
def __init__(self, msg: AnyStr, num_inputs: int) -> None: ...
def __call__(self, inputs: list[Tensor]) -> list[Tensor]: ...


@@ -0,0 +1,83 @@
# mypy: allow-untyped-defs
from enum import Enum
from torch import Tensor
# Defined in torch/csrc/functorch/init.cpp
def _set_dynamic_layer_keys_included(included: bool) -> None: ...
def get_unwrapped(tensor: Tensor) -> Tensor: ...
def is_batchedtensor(tensor: Tensor) -> bool: ...
def is_functionaltensor(tensor: Tensor) -> bool: ...
def is_functorch_wrapped_tensor(tensor: Tensor) -> bool: ...
def is_gradtrackingtensor(tensor: Tensor) -> bool: ...
def is_legacy_batchedtensor(tensor: Tensor) -> bool: ...
def maybe_get_bdim(tensor: Tensor) -> int: ...
def maybe_get_level(tensor: Tensor) -> int: ...
def maybe_current_level() -> int | None: ...
def unwrap_if_dead(tensor: Tensor) -> Tensor: ...
def _unwrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
def _wrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
def _unwrap_batched(tensor: Tensor, level: int) -> tuple[Tensor, int | None]: ...
def current_level() -> int: ...
def count_jvp_interpreters() -> int: ...
def _add_batch_dim(tensor: Tensor, bdim: int, level: int) -> Tensor: ...
def set_single_level_autograd_function_allowed(allowed: bool) -> None: ...
def get_single_level_autograd_function_allowed() -> bool: ...
def _unwrap_functional_tensor(tensor: Tensor, reapply_views: bool) -> Tensor: ...
def _wrap_functional_tensor(tensor: Tensor, level: int) -> Tensor: ...
def _vmap_increment_nesting(batch_size: int, randomness: str) -> int: ...
def _vmap_decrement_nesting() -> int: ...
def _grad_increment_nesting() -> int: ...
def _grad_decrement_nesting() -> int: ...
def _jvp_increment_nesting() -> int: ...
def _jvp_decrement_nesting() -> int: ...
# Defined in aten/src/ATen/functorch/Interpreter.h
class TransformType(Enum):
Torch: TransformType = ...
Vmap: TransformType = ...
Grad: TransformType = ...
Jvp: TransformType = ...
Functionalize: TransformType = ...
class RandomnessType(Enum):
Error: RandomnessType = ...
Same: RandomnessType = ...
Different: RandomnessType = ...
class CInterpreter:
def key(self) -> TransformType: ...
def level(self) -> int: ...
class CGradInterpreterPtr:
def __init__(self, interpreter: CInterpreter) -> None: ...
def lift(self, tensor: Tensor) -> Tensor: ...
def prevGradMode(self) -> bool: ...
class CJvpInterpreterPtr:
def __init__(self, interpreter: CInterpreter) -> None: ...
def lift(self, tensor: Tensor) -> Tensor: ...
def prevFwdGradMode(self) -> bool: ...
class CFunctionalizeInterpreterPtr:
def __init__(self, interpreter: CInterpreter) -> None: ...
def key(self) -> TransformType: ...
def level(self) -> int: ...
def functionalizeAddBackViews(self) -> bool: ...
class CVmapInterpreterPtr:
def __init__(self, interpreter: CInterpreter) -> None: ...
def key(self) -> TransformType: ...
def level(self) -> int: ...
def batchSize(self) -> int: ...
def randomness(self) -> RandomnessType: ...
class DynamicLayer: ...
def get_dynamic_layer_stack_depth() -> int: ...
def get_interpreter_stack() -> list[CInterpreter]: ...
def peek_interpreter_stack() -> CInterpreter: ...
def pop_dynamic_layer_stack() -> DynamicLayer: ...
def pop_dynamic_layer_stack_and_undo_to_depth(depth: int) -> None: ...
def push_dynamic_layer_stack(dl: DynamicLayer) -> int: ...
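
These interpreter bindings back the public torch.func transforms; user code normally never touches them directly:

import torch
from torch.func import grad, vmap

f = lambda x: (x ** 2).sum()
per_sample_grads = vmap(grad(f))(torch.randn(5, 3))   # nests a Grad interpreter inside a Vmap interpreter
print(per_sample_grads.shape)                          # torch.Size([5, 3])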


@@ -0,0 +1,4 @@
# Defined in torch/csrc/instruction_counter/Module.cpp
def start() -> int: ...
def end(id: int) -> int: ...


@@ -0,0 +1,5 @@
# Defined in torch/csrc/itt.cpp
def is_available() -> bool: ...
def rangePush(message: str) -> None: ...
def rangePop() -> None: ...
def mark(message: str) -> None: ...
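
A thin public wrapper is assumed to live at torch.profiler.itt; a hedged sketch:

import torch
from torch.profiler import itt  # assumption: wrapper module location

if itt.is_available():
    itt.range_push("forward")   # -> rangePush
    torch.randn(64, 64).mm(torch.randn(64, 64))
    itt.range_pop()             # -> rangePop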


@@ -0,0 +1,27 @@
# mypy: allow-untyped-defs
from torch import Tensor
# defined in torch/csrc/lazy/python/init.cpp
def _mark_step(device: str, devices: list[str], wait: bool): ...
def _wait_device_ops(devices: list[str]): ...
def _reset_metrics(): ...
def _counter_names() -> list[str]: ...
def _counter_value(name: str) -> int: ...
def _metrics_report() -> str: ...
def _get_graph_hash(tensors: list[Tensor]) -> str: ...
def _sync_multi(
tensors: list[Tensor],
devices: list[str],
wait: bool = True,
sync_ltc_data: bool = True,
): ...
def _get_tensor_id(tensor: Tensor) -> int: ...
def _get_tensors_text(tensors: list[Tensor]) -> str: ...
def _get_tensors_dot(tensors: list[Tensor]) -> str: ...
def _get_tensors_backend(tensors: list[Tensor]) -> str: ...
def _get_force_fallback() -> str: ...
def _set_force_fallback(newval: str): ...
def _clear_ir_cache(): ...
def _dump_ir_cache(filename: str): ...
def _set_reuse_ir(val: bool): ...
def _get_default_device_type(): ...


@@ -0,0 +1,12 @@
# mypy: allow-untyped-defs
# defined in torch/csrc/lazy/python/init.cpp
from typing import Any
from torch import Tensor
def _init(): ...
def _get_tensors_ts_device_data_node(
tensors: list[Tensor],
) -> tuple[list[int], list[Any]]: ...
def _run_cached_graph(hash_str: str, graph_inputs: list[Any]) -> list[Tensor]: ...


@@ -0,0 +1,44 @@
# Defined in torch/csrc/monitor/python_init.cpp
import datetime
from enum import Enum
from typing import Callable
class Aggregation(Enum):
VALUE = ...
MEAN = ...
COUNT = ...
SUM = ...
MAX = ...
MIN = ...
class Stat:
name: str
count: int
def __init__(
self,
name: str,
aggregations: list[Aggregation],
window_size: int,
max_samples: int = -1,
) -> None: ...
def add(self, v: float) -> None: ...
def get(self) -> dict[Aggregation, float]: ...
class Event:
name: str
timestamp: datetime.datetime
data: dict[str, int | float | bool | str]
def __init__(
self,
name: str,
timestamp: datetime.datetime,
data: dict[str, int | float | bool | str],
) -> None: ...
def log_event(e: Event) -> None: ...
class EventHandlerHandle: ...
def register_event_handler(handler: Callable[[Event], None]) -> EventHandlerHandle: ...
def unregister_event_handler(handle: EventHandlerHandle) -> None: ...
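
torch.monitor exposes these classes directly; for example:

from torch.monitor import Aggregation, Stat

loss_stat = Stat("train.loss", [Aggregation.MEAN, Aggregation.MAX], window_size=100)
loss_stat.add(0.50)
loss_stat.add(0.25)
print(loss_stat.get())   # aggregations for the most recently closed window (may be empty early on)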


@@ -0,0 +1,89 @@
# @generated by tools/pyi/gen_pyi.py from torch/_C/_nn.pyi.in
# mypy: disable-error-code="type-arg"
from typing import List, Literal, Optional, overload, Sequence, Tuple, Union
from torch import memory_format, Tensor
from torch.types import _bool, _device, _dtype, _int, _size
# Defined in tools/autograd/templates/python_nn_functions.cpp
def adaptive_max_pool2d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ...
def adaptive_max_pool3d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ...
def avg_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Optional[Union[_int, _size]] = None, padding: Union[_int, _size] = 0, ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> Tensor: ...
def avg_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Optional[Union[_int, _size]] = None, padding: Union[_int, _size] = 0, ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> Tensor: ...
def elu_(input: Tensor, alpha: float = ...) -> Tensor: ...
def fractional_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], output_size: Union[_int, _size], _random_samples: Tensor) -> Tuple[Tensor, Tensor]: ...
def fractional_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], output_size: Union[_int, _size], _random_samples: Tensor) -> Tuple[Tensor, Tensor]: ...
def gelu(input: Tensor, approximate: str = ...) -> Tensor: ...
def hardsigmoid(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
def hardtanh(input: Tensor, min_val: float = ..., max_val: float = ..., *, out: Optional[Tensor] = None) -> Tensor: ...
def hardtanh_(input: Tensor, min_val: float = ..., max_val: float = ...) -> Tensor: ...
def leaky_relu(input: Tensor, negative_slope: float = ..., *, out: Optional[Tensor] = None) -> Tensor: ...
def leaky_relu_(input: Tensor, negative_slope: float = ...) -> Tensor: ...
def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: ...
def log_sigmoid(input: Tensor) -> Tensor: ...
def one_hot(tensor: Tensor, num_classes: int = ...) -> Tensor: ...
def pad(input: Tensor, pad: Sequence[int], mode: str = ..., value: Optional[float] = None) -> Tensor: ...
def scaled_dot_product_attention(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: float = 0.0, is_causal: bool = False, scale: Optional[float] = None, enable_gqa: bool = False) -> Tensor: ...
def softplus(input: Tensor, beta: float = ..., threshold: float = ...) -> Tensor: ...
def softshrink(input: Tensor, lambd: float = ...) -> Tensor: ...
# Defined in aten/src/ATen/native/mkldnn/Linear.cpp
def mkldnn_linear(input: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: ...
# Defined at aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
def mkldnn_reorder_conv2d_weight(
self: Tensor,
padding: List,
stride: List,
dilation: List,
groups: int,
) -> Tensor: ...
def mkldnn_reorder_conv3d_weight(
self: Tensor,
padding: List,
stride: List,
dilation: List,
groups: int,
) -> Tensor: ...
# Defined in aten/src/ATen/native/mkldnn/Prelu.cpp
def mkldnn_prelu(input: Tensor, weight: Tensor) -> Tensor: ...
# Defined at tools/autograd/templates/python_nn_functions.cpp
@overload
def _parse_to(
device: _device,
dtype: _dtype,
non_blocking: _bool,
copy: _bool,
*,
memory_format: memory_format,
) -> Tuple[_device, _dtype, _bool, memory_format]: ...
@overload
def _parse_to(
dtype: _dtype,
non_blocking: _bool,
copy: _bool,
*,
memory_format: memory_format,
) -> Tuple[_device, _dtype, _bool, memory_format]: ...
@overload
def _parse_to(
tensor: Tensor,
non_blocking: _bool,
copy: _bool,
*,
memory_format: memory_format,
) -> Tuple[_device, _dtype, _bool, memory_format]: ...
# Defined in aten/src/ATen/native/PackedSequence.cpp
def pad_sequence(
sequences: Union[List[Tensor], Tuple[Tensor, ...]],
batch_first: bool = False,
padding_value: float = 0.0,
padding_side: Union[Literal["left", "right"], str] = "right",
) -> Tensor: ...
def flatten_dense_tensors(tensors: List[Tensor]) -> Tensor: ...
def unflatten_dense_tensors(flat: Tensor, tensors: List[Tensor]) -> List[Tensor]: ...
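
These are the C bindings behind torch.nn.functional; the usual spelling goes through F:

import torch
import torch.nn.functional as F

x = torch.randn(8, 16)
w = torch.randn(4, 16)
print(F.linear(x, w).shape)                              # torch.Size([8, 4])
print(F.one_hot(torch.tensor([0, 2, 1]), num_classes=3))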


@@ -0,0 +1,7 @@
# mypy: allow-untyped-defs
# Defined in torch/csrc/cuda/shared/nvtx.cpp
def rangePushA(message: str) -> int: ...
def rangePop() -> int: ...
def rangeStartA(message: str) -> int: ...
def rangeEnd(range_id: int) -> None: ...
def markA(message: str) -> None: ...
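
The public wrapper is torch.cuda.nvtx (a CUDA build is required for the ranges to reach NVTX):

import torch

torch.cuda.nvtx.range_push("forward")   # -> rangePushA
torch.randn(256, 256).mm(torch.randn(256, 256))
torch.cuda.nvtx.range_pop()             # -> rangePop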


@@ -0,0 +1,39 @@
# Defined in torch/csrc/onnx/init.cpp
from enum import Enum
PRODUCER_VERSION: str
class TensorProtoDataType(Enum):
UNDEFINED = ...
FLOAT = ...
UINT8 = ...
INT8 = ...
UINT16 = ...
INT16 = ...
INT32 = ...
INT64 = ...
STRING = ...
BOOL = ...
FLOAT16 = ...
DOUBLE = ...
UINT32 = ...
UINT64 = ...
COMPLEX64 = ...
COMPLEX128 = ...
BFLOAT16 = ...
FLOAT8E5M2 = ...
FLOAT8E4M3FN = ...
FLOAT8E5M2FNUZ = ...
FLOAT8E4M3FNUZ = ...
class OperatorExportTypes(Enum):
ONNX = ...
ONNX_ATEN = ...
ONNX_ATEN_FALLBACK = ...
ONNX_FALLTHROUGH = ...
class TrainingMode(Enum):
EVAL = ...
PRESERVE = ...
TRAINING = ...
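
These enums surface as torch.onnx.TrainingMode and torch.onnx.OperatorExportTypes; for example:

import torch

model = torch.nn.Linear(4, 2)
torch.onnx.export(
    model,
    (torch.randn(1, 4),),
    "linear.onnx",
    training=torch.onnx.TrainingMode.EVAL,   # TrainingMode declared above
)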


@@ -0,0 +1,244 @@
from enum import Enum
from typing import Any, Literal
from typing_extensions import TypeAlias
from torch._C import device, dtype, layout
# defined in torch/csrc/profiler/python/init.cpp
class RecordScope(Enum):
FUNCTION = ...
BACKWARD_FUNCTION = ...
TORCHSCRIPT_FUNCTION = ...
KERNEL_FUNCTION_DTYPE = ...
CUSTOM_CLASS = ...
BUILD_FEATURE = ...
LITE_INTERPRETER = ...
USER_SCOPE = ...
STATIC_RUNTIME_OP = ...
STATIC_RUNTIME_MODEL = ...
class ProfilerState(Enum):
Disable = ...
CPU = ...
CUDA = ...
NVTX = ...
ITT = ...
KINETO = ...
KINETO_GPU_FALLBACK = ...
KINETO_PRIVATEUSE1_FALLBACK = ...
KINETO_PRIVATEUSE1 = ...
class ActiveProfilerType(Enum):
NONE = ...
LEGACY = ...
KINETO = ...
NVTX = ...
ITT = ...
class ProfilerActivity(Enum):
CPU = ...
CUDA = ...
XPU = ...
MTIA = ...
PrivateUse1 = ...
class _EventType(Enum):
TorchOp = ...
Backend = ...
Allocation = ...
OutOfMemory = ...
PyCall = ...
PyCCall = ...
Kineto = ...
class _ExperimentalConfig:
def __init__(
self,
profiler_metrics: list[str] = ...,
profiler_measure_per_kernel: bool = ...,
verbose: bool = ...,
performance_events: list[str] = ...,
enable_cuda_sync_events: bool = ...,
) -> None: ...
class ProfilerConfig:
def __init__(
self,
state: ProfilerState,
report_input_shapes: bool,
profile_memory: bool,
with_stack: bool,
with_flops: bool,
with_modules: bool,
experimental_config: _ExperimentalConfig,
) -> None: ...
class _ProfilerEvent:
start_tid: int
start_time_ns: int
children: list[_ProfilerEvent]
# TODO(robieta): remove in favor of `self.typed`
extra_fields: (
_ExtraFields_TorchOp
| _ExtraFields_Backend
| _ExtraFields_Allocation
| _ExtraFields_OutOfMemory
| _ExtraFields_PyCall
| _ExtraFields_PyCCall
| _ExtraFields_Kineto
)
@property
def typed(
self,
) -> (
tuple[Literal[_EventType.TorchOp], _ExtraFields_TorchOp]
| tuple[Literal[_EventType.Backend], _ExtraFields_Backend]
| tuple[Literal[_EventType.Allocation], _ExtraFields_Allocation]
| tuple[Literal[_EventType.OutOfMemory], _ExtraFields_OutOfMemory]
| tuple[Literal[_EventType.PyCall], _ExtraFields_PyCall]
| tuple[Literal[_EventType.PyCCall], _ExtraFields_PyCCall]
| tuple[Literal[_EventType.Kineto], _ExtraFields_Kineto]
): ...
@property
def name(self) -> str: ...
@property
def tag(self) -> _EventType: ...
@property
def id(self) -> int: ...
@property
def parent(self) -> _ProfilerEvent | None: ...
@property
def correlation_id(self) -> int: ...
@property
def end_time_ns(self) -> int: ...
@property
def duration_time_ns(self) -> int: ...
class _TensorMetadata:
impl_ptr: int | None
storage_data_ptr: int | None
id: int | None
@property
def allocation_id(self) -> int | None: ...
@property
def layout(self) -> layout: ...
@property
def device(self) -> device: ...
@property
def dtype(self) -> dtype: ...
@property
def sizes(self) -> list[int]: ...
@property
def strides(self) -> list[int]: ...
Scalar: TypeAlias = int | float | bool | complex
Input: TypeAlias = _TensorMetadata | list[_TensorMetadata] | Scalar | None
class _ExtraFields_TorchOp:
name: str
sequence_number: int
allow_tf32_cublas: bool
@property
def inputs(self) -> list[Input]: ...
@property
def scope(self) -> RecordScope: ...
class _ExtraFields_Backend: ...
class _ExtraFields_Allocation:
ptr: int
id: int | None
alloc_size: int
total_allocated: int
total_reserved: int
@property
def allocation_id(self) -> int | None: ...
@property
def device(self) -> device: ...
class _ExtraFields_OutOfMemory: ...
class _PyFrameState:
line_number: int
function_name: str
@property
def file_name(self) -> str: ...
class _NNModuleInfo:
@property
def self_ptr(self) -> int: ...
@property
def cls_ptr(self) -> int: ...
@property
def cls_name(self) -> str: ...
@property
def parameters(
self,
) -> list[tuple[str, _TensorMetadata, _TensorMetadata | None]]: ...
class _OptimizerInfo:
@property
def parameters(
self,
) -> list[
tuple[
# Parameter
_TensorMetadata,
#
# Gradient (if present during optimizer.step())
_TensorMetadata | None,
#
# Optimizer state for Parameter as (name, tensor) pairs
list[tuple[str, _TensorMetadata]],
]
]: ...
class _ExtraFields_PyCCall:
@property
def caller(self) -> _PyFrameState: ...
class _ExtraFields_PyCall:
@property
def callsite(self) -> _PyFrameState: ...
@property
def caller(self) -> _PyFrameState: ...
@property
def module(self) -> _NNModuleInfo | None: ...
@property
def optimizer(self) -> _OptimizerInfo | None: ...
class _ExtraFields_Kineto: ...
def _add_execution_trace_observer(output_file_path: str) -> bool: ...
def _remove_execution_trace_observer() -> None: ...
def _enable_execution_trace_observer() -> None: ...
def _disable_execution_trace_observer() -> None: ...
def _set_record_concrete_inputs_enabled_val(val: bool) -> None: ...
def _set_fwd_bwd_enabled_val(val: bool) -> None: ...
def _set_cuda_sync_enabled_val(val: bool) -> None: ...
class CapturedTraceback: ...
def gather_traceback(python: bool, script: bool, cpp: bool) -> CapturedTraceback: ...
# The Dict has name, filename, line
def symbolize_tracebacks(
to_symbolize: list[CapturedTraceback],
) -> list[list[dict[str, str]]]: ...
class _RecordFunctionFast:
def __init__(
self,
name: str,
input_values: list | tuple | None = None,
keyword_values: dict | None = None,
) -> None: ...
def __enter__(self) -> None: ...
def __exit__(self, *args: Any) -> None: ...
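
The ergonomic counterpart of _RecordFunctionFast is the public torch.profiler.record_function context manager:

import torch
from torch.profiler import profile, record_function

with profile() as prof:
    with record_function("my_block"):        # appears as a user-scope event in the trace
        torch.randn(128, 128).mm(torch.randn(128, 128))

print(prof.key_averages().table(row_limit=5))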


@@ -0,0 +1,3 @@
# Defined in torch/csrc/utils/verbose.cpp
def mkl_set_verbose(enable: int) -> int: ...
def mkldnn_set_verbose(level: int) -> int: ...
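
The public wrappers are context managers under torch.backends; e.g. oneDNN (mkldnn) verbosity:

import torch

with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
    torch.randn(64, 64).mm(torch.randn(64, 64))   # emits oneDNN verbose logs while enabled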