I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

@@ -0,0 +1,4 @@
load("//:tools/bazel.bzl", "rules")
load(":build.bzl", "define_targets")
define_targets(rules = rules)

@@ -0,0 +1,3 @@
If you add a file to this directory, you **MUST** update
`torch/CMakeLists.txt` and add the file as a dependency to
the `add_custom_command` call.

@@ -0,0 +1,14 @@
def define_targets(rules):
rules.py_library(
name = "autograd",
srcs = rules.glob(["*.py"]),
data = rules.glob([
"*.yaml",
"templates/*",
]),
visibility = ["//:__subpackages__"],
deps = [
rules.requirement("PyYAML"),
"//torchgen",
],
)

@@ -0,0 +1,31 @@
import functools
from typing import Callable
from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI
from torchgen.context import native_function_manager
from torchgen.utils import T
# Like torchgen.context.with_native_function, but for
# NativeFunctionWithDifferentiabilityInfo.
def with_native_function_with_differentiability_info(
func: Callable[[NFWDI], T]
) -> Callable[[NFWDI], T]:
@functools.wraps(func)
def wrapper(f: NFWDI) -> T:
with native_function_manager(f.func):
return func(f)
return wrapper
# Like the above but with an additional dispatch key string argument
def with_native_function_with_differentiability_info_and_key(
func: Callable[[NFWDI, str], T]
) -> Callable[[NFWDI, str], T]:
@functools.wraps(func)
def wrapper(f: NFWDI, key: str) -> T:
with native_function_manager(f.func):
return func(f, key)
return wrapper
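
A minimal usage sketch (editorial note, not part of the diff): a per-operator codegen helper can be wrapped with either decorator so it runs inside the matching native-function context. The helper name `emit_example` is hypothetical.

    # Hypothetical illustration; `emit_example` does not exist in this commit.
    @with_native_function_with_differentiability_info
    def emit_example(fn: NFWDI) -> str:
        # Inside the wrapper, diagnostics are attributed to fn.func.
        return str(fn.func.func.name)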

@@ -0,0 +1,134 @@
# Deprecated function signatures. These are exposed in Python, but not included
# in the error message suggestions.
- name: add(Tensor self, Scalar alpha, Tensor other) -> Tensor
aten: add(self, other, alpha)
- name: add_(Tensor(a!) self, Scalar alpha, Tensor other) -> Tensor(a!)
aten: add_(self, other, alpha)
- name: add(Tensor self, Scalar alpha, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
aten: add_out(out, self, other, alpha)
- name: addbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor
aten: addbmm(self, batch1, batch2, beta, alpha)
- name: addbmm_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: addbmm_(self, batch1, batch2, beta, alpha)
- name: addbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: addbmm_out(out, self, batch1, batch2, beta, alpha)
- name: addbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2) -> Tensor
aten: addbmm(self, batch1, batch2, beta, 1)
- name: addbmm_(Scalar beta, Tensor(a!) self, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: addbmm_(self, batch1, batch2, beta, 1)
- name: addbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: addbmm_out(out, self, batch1, batch2, beta, 1)
- name: addcdiv(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor
aten: addcdiv(self, tensor1, tensor2, value)
- name: addcdiv_(Tensor(a!) self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor(a!)
aten: addcdiv_(self, tensor1, tensor2, value)
- name: addcdiv(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2, *, Tensor(a!) out) -> Tensor(a!)
aten: addcdiv_out(out, self, tensor1, tensor2, value)
- name: addcmul(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor
aten: addcmul(self, tensor1, tensor2, value)
- name: addcmul_(Tensor(a!) self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor(a!)
aten: addcmul_(self, tensor1, tensor2, value)
- name: addcmul(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2, *, Tensor(a!) out) -> Tensor(a!)
aten: addcmul_out(out, self, tensor1, tensor2, value)
- name: addmm(Scalar beta, Tensor self, Scalar alpha, Tensor mat1, Tensor mat2) -> Tensor
aten: addmm(self, mat1, mat2, beta, alpha)
- name: addmm_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor mat1, Tensor mat2) -> Tensor(a!)
aten: addmm_(self, mat1, mat2, beta, alpha)
- name: addmm(Scalar beta, Tensor self, Scalar alpha, Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
aten: addmm_out(out, self, mat1, mat2, beta, alpha)
- name: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) -> Tensor
aten: addmm(self, mat1, mat2, beta, 1)
- name: addmm_(Scalar beta, Tensor(a!) self, Tensor mat1, Tensor mat2) -> Tensor(a!)
aten: addmm_(self, mat1, mat2, beta, 1)
- name: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
aten: addmm_out(out, self, mat1, mat2, beta, 1)
- name: sspaddmm(Scalar beta, Tensor self, Scalar alpha, Tensor mat1, Tensor mat2) -> Tensor
aten: sspaddmm(self, mat1, mat2, beta, alpha)
- name: sspaddmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) -> Tensor
aten: sspaddmm(self, mat1, mat2, beta, 1)
- name: addmv(Scalar beta, Tensor self, Scalar alpha, Tensor mat, Tensor vec) -> Tensor
aten: addmv(self, mat, vec, beta, alpha)
- name: addmv_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor mat, Tensor vec) -> Tensor(a!)
aten: addmv_(self, mat, vec, beta, alpha)
- name: addmv(Scalar beta, Tensor self, Scalar alpha, Tensor mat, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
aten: addmv_out(out, self, mat, vec, beta, alpha)
- name: addmv(Scalar beta, Tensor self, Tensor mat, Tensor vec) -> Tensor
aten: addmv(self, mat, vec, beta, 1)
- name: addmv_(Scalar beta, Tensor(a!) self, Tensor mat, Tensor vec) -> Tensor(a!)
aten: addmv_(self, mat, vec, beta, 1)
- name: addmv(Scalar beta, Tensor self, Tensor mat, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
aten: addmv_out(out, self, mat, vec, beta, 1)
- name: addr(Scalar beta, Tensor self, Scalar alpha, Tensor vec1, Tensor vec2) -> Tensor
aten: addr(self, vec1, vec2, beta, alpha)
- name: addr_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor vec1, Tensor vec2) -> Tensor(a!)
aten: addr_(self, vec1, vec2, beta, alpha)
- name: addr(Scalar beta, Tensor self, Scalar alpha, Tensor vec1, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
aten: addr_out(out, self, vec1, vec2, beta, alpha)
- name: addr(Scalar beta, Tensor self, Tensor vec1, Tensor vec2) -> Tensor
aten: addr(self, vec1, vec2, beta, 1)
- name: addr_(Scalar beta, Tensor(a!) self, Tensor vec1, Tensor vec2) -> Tensor(a!)
aten: addr_(self, vec1, vec2, beta, 1)
- name: addr(Scalar beta, Tensor self, Tensor vec1, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
aten: addr_out(out, self, vec1, vec2, beta, 1)
- name: baddbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor
aten: baddbmm(self, batch1, batch2, beta, alpha)
- name: baddbmm_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: baddbmm_(self, batch1, batch2, beta, alpha)
- name: baddbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: baddbmm_out(out, self, batch1, batch2, beta, alpha)
- name: baddbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2) -> Tensor
aten: baddbmm(self, batch1, batch2, beta, 1)
- name: baddbmm_(Scalar beta, Tensor(a!) self, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: baddbmm_(self, batch1, batch2, beta, 1)
- name: baddbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: baddbmm_out(out, self, batch1, batch2, beta, 1)
- name: sub(Tensor self, Scalar alpha, Tensor other) -> Tensor
aten: sub(self, other, alpha)
- name: sub_(Tensor(a!) self, Scalar alpha, Tensor other) -> Tensor(a!)
aten: sub_(self, other, alpha)
- name: sub(Tensor self, Scalar alpha, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
aten: sub_out(out, self, other, alpha)
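
For orientation (editorial note): each entry maps a deprecated Python-visible signature to the current ATen call, reordering arguments as needed. A minimal Python sketch of what the first `add` entry implies, assuming a standard torch install:

    # Sketch only: the current signature takes alpha as a keyword after `other`.
    import torch
    x = torch.ones(3)
    y = torch.ones(3)
    out = torch.add(x, y, alpha=2)
    # Per the first entry above, the deprecated positional form `torch.add(x, 2, y)`
    # is routed to the same aten::add(self, other, alpha) call.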

File diff suppressed because it is too large

@@ -0,0 +1,132 @@
"""
For procedural tests needed for __torch_function__, we use this function
to export method names and signatures as needed by the tests in
test/test_overrides.py.
python -m tools.autograd.gen_annotated_fn_args \
aten/src/ATen/native/native_functions.yaml \
aten/src/ATen/native/tags.yaml \
$OUTPUT_DIR \
tools/autograd
Where $OUTPUT_DIR is where you would like the files to be
generated. In the full build system, OUTPUT_DIR is
torch/testing/_internal/generated
"""
from __future__ import annotations
import argparse
import os
import textwrap
from collections import defaultdict
from typing import Any, Sequence, TYPE_CHECKING
import torchgen.api.python as python
from torchgen.context import with_native_function
from torchgen.gen import parse_native_yaml
from torchgen.utils import FileManager
from .gen_python_functions import (
is_py_fft_function,
is_py_linalg_function,
is_py_nn_function,
is_py_special_function,
is_py_torch_function,
is_py_variable_method,
should_generate_py_binding,
)
if TYPE_CHECKING:
from torchgen.model import Argument, BaseOperatorName, NativeFunction
def gen_annotated(
native_yaml_path: str, tags_yaml_path: str, out: str, autograd_dir: str
) -> None:
native_functions = parse_native_yaml(
native_yaml_path, tags_yaml_path
).native_functions
mappings = (
(is_py_torch_function, "torch._C._VariableFunctions"),
(is_py_nn_function, "torch._C._nn"),
(is_py_linalg_function, "torch._C._linalg"),
(is_py_special_function, "torch._C._special"),
(is_py_fft_function, "torch._C._fft"),
(is_py_variable_method, "torch.Tensor"),
)
annotated_args: list[str] = []
for pred, namespace in mappings:
groups: dict[BaseOperatorName, list[NativeFunction]] = defaultdict(list)
for f in native_functions:
if not should_generate_py_binding(f) or not pred(f):
continue
groups[f.func.name.name].append(f)
for group in groups.values():
for f in group:
annotated_args.append(f"{namespace}.{gen_annotated_args(f)}")
template_path = os.path.join(autograd_dir, "templates")
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_with_template(
"annotated_fn_args.py",
"annotated_fn_args.py.in",
lambda: {
"annotated_args": textwrap.indent("\n".join(annotated_args), " "),
},
)
@with_native_function
def gen_annotated_args(f: NativeFunction) -> str:
def _get_kwargs_func_exclusion_list() -> list[str]:
# functions that currently don't work with kwargs in test_overrides.py
return [
"diagonal",
"round_",
"round",
"scatter_",
]
def _add_out_arg(
out_args: list[dict[str, Any]], args: Sequence[Argument], *, is_kwarg_only: bool
) -> None:
for arg in args:
if arg.default is not None:
continue
out_arg: dict[str, Any] = {}
out_arg["is_kwarg_only"] = str(is_kwarg_only)
out_arg["name"] = arg.name
out_arg["simple_type"] = python.argument_type_str(
arg.type, simple_type=True
)
size_t = python.argument_type_size(arg.type)
if size_t:
out_arg["size"] = size_t
out_args.append(out_arg)
out_args: list[dict[str, Any]] = []
_add_out_arg(out_args, f.func.arguments.flat_positional, is_kwarg_only=False)
if f"{f.func.name.name}" not in _get_kwargs_func_exclusion_list():
_add_out_arg(out_args, f.func.arguments.flat_kwarg_only, is_kwarg_only=True)
return f"{f.func.name.name}: {repr(out_args)},"
def main() -> None:
parser = argparse.ArgumentParser(description="Generate annotated_fn_args script")
parser.add_argument(
"native_functions", metavar="NATIVE", help="path to native_functions.yaml"
)
parser.add_argument("tags", metavar="TAGS", help="path to tags.yaml")
parser.add_argument("out", metavar="OUT", help="path to output directory")
parser.add_argument(
"autograd", metavar="AUTOGRAD", help="path to template directory"
)
args = parser.parse_args()
gen_annotated(args.native_functions, args.tags, args.out, args.autograd)
if __name__ == "__main__":
main()
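
As a rough illustration (editorial note; the exact entries depend on native_functions.yaml), each line emitted into `annotated_fn_args.py` has the shape `<namespace>.<op>: <list of argument dicts>,`:

    # Hypothetical shape of one generated entry (field values are illustrative):
    #   torch._C._VariableFunctions.add: [{'is_kwarg_only': 'False', 'name': 'self',
    #       'simple_type': 'Tensor'}, {'is_kwarg_only': 'False', 'name': 'other',
    #       'simple_type': 'Tensor'}],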

@@ -0,0 +1,147 @@
"""
To run this file by hand from the root of the PyTorch
repository, run:
python -m tools.autograd.gen_autograd \
aten/src/ATen/native/native_functions.yaml \
aten/src/ATen/native/tags.yaml \
$OUTPUT_DIR \
tools/autograd
Where $OUTPUT_DIR is where you would like the files to be
generated. In the full build system, OUTPUT_DIR is
torch/csrc/autograd/generated/
"""
# gen_autograd.py generates C++ autograd functions and Python bindings.
#
# It delegates to the following scripts:
#
# gen_autograd_functions.py: generates subclasses of torch::autograd::Node
# gen_variable_type.py: generates VariableType.h which contains all tensor methods
# gen_python_functions.py: generates Python bindings to THPVariable
#
from __future__ import annotations
import argparse
import os
from torchgen.api import cpp
from torchgen.api.autograd import (
match_differentiability_info,
NativeFunctionWithDifferentiabilityInfo,
)
from torchgen.gen import parse_native_yaml
from torchgen.selective_build.selector import SelectiveBuilder
from . import gen_python_functions
from .gen_autograd_functions import (
gen_autograd_functions_lib,
gen_autograd_functions_python,
)
from .gen_inplace_or_view_type import gen_inplace_or_view_type
from .gen_trace_type import gen_trace_type
from .gen_variable_factories import gen_variable_factories
from .gen_variable_type import gen_variable_type
from .gen_view_funcs import gen_view_funcs
from .load_derivatives import load_derivatives
def gen_autograd(
native_functions_path: str,
tags_path: str,
out: str,
autograd_dir: str,
operator_selector: SelectiveBuilder,
disable_autograd: bool = False,
) -> None:
# Parse and load derivatives.yaml
differentiability_infos, used_dispatch_keys = load_derivatives(
os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path
)
template_path = os.path.join(autograd_dir, "templates")
native_funcs = parse_native_yaml(native_functions_path, tags_path).native_functions
fns = sorted(
filter(
operator_selector.is_native_function_selected_for_training, native_funcs
),
key=lambda f: cpp.name(f.func),
)
fns_with_diff_infos: list[
NativeFunctionWithDifferentiabilityInfo
] = match_differentiability_info(fns, differentiability_infos)
# Generate VariableType.h/cpp
if not disable_autograd:
gen_variable_type(
out,
native_functions_path,
tags_path,
fns_with_diff_infos,
template_path,
used_dispatch_keys,
)
gen_inplace_or_view_type(
out, native_functions_path, tags_path, fns_with_diff_infos, template_path
)
# operator filter not applied as tracing sources are excluded in selective build
gen_trace_type(out, native_funcs, template_path)
# Generate Functions.h/cpp
gen_autograd_functions_lib(out, differentiability_infos, template_path)
# Generate variable_factories.h
gen_variable_factories(out, native_functions_path, tags_path, template_path)
# Generate ViewFuncs.h/cpp
gen_view_funcs(out, fns_with_diff_infos, template_path)
def gen_autograd_python(
native_functions_path: str,
tags_path: str,
out: str,
autograd_dir: str,
) -> None:
differentiability_infos, _ = load_derivatives(
os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path
)
template_path = os.path.join(autograd_dir, "templates")
# Generate Functions.h/cpp
gen_autograd_functions_python(out, differentiability_infos, template_path)
# Generate Python bindings
deprecated_path = os.path.join(autograd_dir, "deprecated.yaml")
gen_python_functions.gen(
out, native_functions_path, tags_path, deprecated_path, template_path
)
def main() -> None:
parser = argparse.ArgumentParser(description="Generate autograd C++ files script")
parser.add_argument(
"native_functions", metavar="NATIVE", help="path to native_functions.yaml"
)
parser.add_argument("tags", metavar="NATIVE", help="path to tags.yaml")
parser.add_argument("out", metavar="OUT", help="path to output directory")
parser.add_argument(
"autograd", metavar="AUTOGRAD", help="path to autograd directory"
)
args = parser.parse_args()
gen_autograd(
args.native_functions,
args.tags,
args.out,
args.autograd,
SelectiveBuilder.get_nop_selector(),
)
if __name__ == "__main__":
main()
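
A hedged sketch (editorial note; the output directory is a placeholder) of driving the Python-binding half directly, mirroring what the full build does via `gen_autograd_python`:

    # Sketch only: yaml paths follow the docstring above; the output dir is hypothetical.
    from tools.autograd.gen_autograd import gen_autograd_python

    gen_autograd_python(
        "aten/src/ATen/native/native_functions.yaml",
        "aten/src/ATen/native/tags.yaml",
        "build/generated",  # hypothetical output directory
        "tools/autograd",
    )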

@@ -0,0 +1,925 @@
# Generates C++ autograd functions for the derivatives of ATen operations
#
# This writes two files:
# Functions.h/cpp: subclasses of autograd::Node
# python_functions.h/cpp: Python bindings for the above classes
#
from __future__ import annotations
from typing import Sequence
from torchgen.api.autograd import (
Derivative,
DifferentiabilityInfo,
SavedAttribute,
uses_retain_variables,
uses_single_grad,
)
from torchgen.api.types import (
ArrayRefCType,
BaseCppType,
BaseCType,
Binding,
boolT,
doubleT,
intArrayRefT,
iTensorListRefT,
ListCType,
longT,
MutRefCType,
OptionalCType,
optionalIntArrayRefT,
optionalSymIntArrayRefT,
scalarT,
stringT,
symIntArrayRefT,
SymIntT,
TENSOR_LIST_LIKE_CTYPES,
tensorListT,
tensorT,
VectorCType,
)
from torchgen.code_template import CodeTemplate
from torchgen.model import Argument, FunctionSchema
from torchgen.utils import FileManager
from .gen_inplace_or_view_type import VIEW_FUNCTIONS
FUNCTION_DECLARATION = CodeTemplate(
"""\
#ifdef _WIN32
struct ${op} : public ${superclass} {
TORCH_API ${op}() = default;
#else
struct TORCH_API ${op} : public ${superclass} {
#endif
using ${superclass}::${superclass};
variable_list apply(variable_list&& grads) override;
std::string name() const override { return "${op}"; }
void release_variables() override {
${thread_lock}
${release_variables}
}
${will_release_variables}
void compiled_args(CompiledNodeArgs& args) override;
variable_list apply_with_saved(const variable_list& inputs, SwapSavedVariables& saved) override;
${saved_variables}
${saved_list_sizes}
};
"""
)
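# Editorial illustration (hypothetical op): substituting op="MulBackward0" and
# superclass="TraceableFunction" into FUNCTION_DECLARATION above yields roughly:
#   struct TORCH_API MulBackward0 : public TraceableFunction {
#     using TraceableFunction::TraceableFunction;
#     variable_list apply(variable_list&& grads) override;
#     std::string name() const override { return "MulBackward0"; }
#     ...
#   };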
WILL_RELEASE_VARIABLES = CodeTemplate(
"""\
bool retain_variables = true;
void will_release_variables() override {
retain_variables = false;
}
"""
)
FUNCTION_DEFINITION = CodeTemplate(
"""\
variable_list ${op}::apply(variable_list&& grads) {
${thread_lock}
${asserts}
IndexRangeGenerator gen;
${compute_index_ranges}
variable_list grad_inputs(gen.size());
${body}
return grad_inputs;
}
void ${op}::compiled_args(CompiledNodeArgs& args) {
${compiled_args}
}
variable_list ${op}::apply_with_saved(const variable_list& grads, SwapSavedVariables& saved) {
${apply_with_saved_before}
variable_list result = apply(variable_list(grads));
${apply_with_saved_after}
return result;
}
"""
)
GRAD_INPUT_MASK = CodeTemplate(
"""\
auto grad_input_mask = std::array<bool, ${n}>{
${masks}
};\
"""
)
DERIVATIVE_SINGLE = CodeTemplate(
"""\
if (task_should_compute_output({ ${name}_ix })) {
auto grad_result = ${derivative};
copy_range(grad_inputs, ${name}_ix, grad_result);
}
"""
)
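# Editorial illustration (hypothetical formula): substituting name="self" and
# derivative="grad * 2" into DERIVATIVE_SINGLE above yields roughly:
#   if (task_should_compute_output({ self_ix })) {
#     auto grad_result = grad * 2;
#     copy_range(grad_inputs, self_ix, grad_result);
#   }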
# note(crcrpar): For foreach functions, `self` and the other optional positional
# arguments are essentially lists of n `Tensor`s, so we iterate over `grads` in order
# to reuse the existing per-`Tensor` derivative definitions for each element of `self`
# and of the other list arguments.
DERIVATIVE_SINGLE_FOREACH = CodeTemplate(
"""\
if (task_should_compute_output({ ${name}_ix })) {
std::vector<Tensor> grad_result;
grad_result.reserve(grads.size());
for (const auto & i : c10::irange(grads.size())) {
if (grads[i].defined()) {
grad_result.emplace_back(${derivative});
} else {
grad_result.emplace_back(Tensor());
}
}
copy_range(grad_inputs, ${name}_ix, grad_result);
}
"""
)
DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate(
"""\
if (task_should_compute_output({ ${name}_ix })) {
copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result));
}
"""
)
DERIVATIVE_MULTI = CodeTemplate(
"""\
if (task_should_compute_output({ ${idx_ranges} })) {
${grad_input_mask}
auto grad_result = ${derivative};
${copy_ranges}
}
"""
)
# Generates python bindings
#
# This generates the definitions for:
# (1) The PyTypeObject for each backward grad_fn subclassing Node
# (2) The entry for PyTypeObject's tp_getset slot (an array of PyGetSetDef structs)
# We generate one PyGetSetDef struct for each of grad_fn's saved inputs and outputs
# Each PyGetSetDef has a function ptr to a getter, also defined here (3).
# (3) Getters for each of grad_fn's saved inputs and outputs.
#
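# Editorial illustration (hypothetical op): for an op "MulBackward0" with one saved
# input "self", the templates below expand to roughly:
#   static PyTypeObject MulBackward0Class;
#   addClass<MulBackward0>(module, MulBackward0Class, "MulBackward0", MulBackward0_properties);
# and a properties entry such as:
#   {(char*)"_saved_self", (getter)THPMulBackward0_self_getter, nullptr, nullptr, nullptr},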
PY_FUNCTION_DEFINITION = CodeTemplate(
"""\
static PyTypeObject ${op}Class;
addClass<${op}>(module, ${op}Class, "${op}", ${op}_properties);
"""
)
PY_FUNCTION_PROPS_AND_GETTERS = CodeTemplate(
"""\
${all_getter_definitions}
static struct PyGetSetDef ${op}_properties[] = {
THP_FUNCTION_DEFAULT_PROPERTIES,
${all_getsetdef_structs}
{nullptr} /* sentinel */
};
"""
)
PY_GETSETDEF_STRUCT = CodeTemplate(
"""\
{(char*)"_saved_${name}", (getter)THP${op}_${name}_getter, nullptr, nullptr, nullptr}"""
)
PY_RAW_GETSETDEF_STRUCT = CodeTemplate(
"""\
{(char*)"_raw_saved_${name}", (getter)THP${op}_${name}_raw_getter, nullptr, nullptr, nullptr}"""
)
# Getter templates
GETTER_DEFINITION = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
auto prop = static_cast<${op}*>(self->cdata.get())->${name};
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_;
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_RAW_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_;
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_VEC_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto *node = static_cast<${op}*>(self->cdata.get());
const auto& prop = node->${name}_;
if (node->${name}_released_) {
PyErr_SetString(PyExc_RuntimeError, ERR_BACKWARD_TWICE);
return nullptr;
}
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_RAW_VEC_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto *node = static_cast<${op}*>(self->cdata.get());
const auto& prop = node->${name}_;
if (node->${name}_released_) {
PyErr_SetString(PyExc_RuntimeError, ERR_BACKWARD_TWICE);
return nullptr;
}
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_OPT = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name};
if (!opt_prop.has_value()) {
Py_RETURN_NONE;
}
auto prop = opt_prop.value();
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_OPT_ARRAYREF = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name};
if (!opt_prop.list.has_value()) {
Py_RETURN_NONE;
}
auto prop = opt_prop.list.value();
${body}
END_HANDLE_TH_ERRORS
}
"""
)
# Getter body
GETTER_BODY_SAVEDVAR = """\
return THPVariable_Wrap(prop.unpack(self->cdata));
"""
GETTER_BODY_RAW_SAVEDVAR = """\
pybind11::object obj = pybind11::cast(prop, pybind11::return_value_policy::reference);
return obj.release().ptr();
"""
GETTER_BODY_VEC_SAVEDVAR = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i: c10::irange(prop.size())) {
PyTuple_SetItem(tup, (Py_ssize_t) i, THPVariable_Wrap(prop[i].unpack(self->cdata)));
}
return tup;
"""
GETTER_BODY_RAW_VEC_SAVEDVAR = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
pybind11::object obj = pybind11::cast(prop[i], pybind11::return_value_policy::reference);
PyTuple_SetItem(tup, (Py_ssize_t) i, obj.release().ptr());
}
return tup;
"""
GETTER_BODY_ARRAYREF_LONG = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromUnsignedLong((uint64_t) prop[i]));
}
return tup;
"""
GETTER_BODY_ARRAYREF_SYMINT = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
auto si = prop[i];
if (auto m = si.maybe_as_int()) {
PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromUnsignedLong(*m));
} else {
auto py_symint = py::cast(si).release().ptr();
PyTuple_SetItem(tup, (Py_ssize_t) i, py_symint);
}
}
return tup;
"""
GETTER_BODY_ARRAYREF_DOUBLE = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
PyTuple_SetItem(tup, (Py_ssize_t) i, PyFloat_FromDouble((double) prop[i]));
}
return tup;
"""
GETTER_BODY_INT64_T = """\
return PyLong_FromUnsignedLong((int64_t) prop);
"""
GETTER_BODY_SYMINT = """\
if (auto m = prop.maybe_as_int()) {
return PyLong_FromUnsignedLong(*m);
} else {
return py::cast(prop).release().ptr();
}
"""
GETTER_BODY_DOUBLE = """\
return PyFloat_FromDouble((double) prop);
"""
GETTER_BODY_BOOL = """\
if (prop) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
"""
GETTER_BODY_STRING = """\
return PyUnicode_FromStringAndSize(prop.data(), prop.size());
"""
GETTER_BODY_SCALAR = """\
if (prop.isComplex()) {
auto cprop = prop.to<c10::complex<double>>();
return PyComplex_FromDoubles(cprop.real(), cprop.imag());
} else if (prop.isFloatingPoint()) {
return PyFloat_FromDouble(prop.to<double>());
} else if (prop.isIntegral(/*includeBool=*/false)) {
return PyLong_FromLong(prop.to<int64_t>());
} else if (prop.isBoolean()) {
if (prop.to<bool>()) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
} else {
PyErr_SetString(PyExc_RuntimeError, "Unknown scalar type");
return nullptr;
}
"""
GETTER_BODY_VEC_SCALAR = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i: c10::irange(prop.size())) {
if (prop[i].isComplex()) {
auto cprop = prop[i].to<c10::complex<double>>();
PyTuple_SetItem(tup, (Py_ssize_t) i, PyComplex_FromDoubles(cprop.real(), cprop.imag()));
} else if (prop[i].isFloatingPoint()) {
auto double_prop = prop[i].to<double>();
PyTuple_SetItem(tup, (Py_ssize_t) i, PyFloat_FromDouble(double_prop));
} else if (prop[i].isIntegral(/*includeBool=*/false)) {
auto long_prop = prop[i].to<int64_t>();
PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromLong(long_prop));
} else if (prop[i].isBoolean()) {
if (prop[i].to<bool>()) {
PyTuple_SetItem(tup, (Py_ssize_t) i, Py_True);
} else {
PyTuple_SetItem(tup, (Py_ssize_t) i, Py_False);
}
} else {
PyErr_SetString(PyExc_RuntimeError, "Unknown scalar type");
return nullptr;
}
}
return tup;
"""
MISC_GETTER_DEFS = {
OptionalCType(BaseCType(longT)): (GETTER_DEFINITION_OPT, GETTER_BODY_INT64_T),
OptionalCType(BaseCType(SymIntT)): (GETTER_DEFINITION_OPT, GETTER_BODY_SYMINT),
BaseCType(doubleT): (GETTER_DEFINITION, GETTER_BODY_DOUBLE),
OptionalCType(BaseCType(doubleT)): (GETTER_DEFINITION_OPT, GETTER_BODY_DOUBLE),
BaseCType(boolT): (GETTER_DEFINITION, GETTER_BODY_BOOL),
BaseCType(scalarT): (GETTER_DEFINITION, GETTER_BODY_SCALAR),
OptionalCType(BaseCType(scalarT)): (GETTER_DEFINITION_OPT, GETTER_BODY_SCALAR),
}
# These functions have backwards which cannot be traced, and so must have
# their backward functions traced opaquely.
# VIEW_FUNCTIONS are not traceable because they use as_strided, which
# has an untraceable backwards, see
# https://github.com/pytorch/pytorch/issues/4250
# TODO: This is probably not exhaustive, but it's a start
UNTRACEABLE_FUNCTIONS = VIEW_FUNCTIONS
def get_infos_with_derivatives_list(
differentiability_infos: dict[FunctionSchema, dict[str, DifferentiabilityInfo]]
) -> list[DifferentiabilityInfo]:
diff_info_list = [
info
for diffinfo_dict in differentiability_infos.values()
for info in diffinfo_dict.values()
]
return list(filter(lambda info: info.args_with_derivatives, diff_info_list))
def gen_autograd_functions_lib(
out: str,
differentiability_infos: dict[FunctionSchema, dict[str, DifferentiabilityInfo]],
template_path: str,
) -> None:
"""Functions.h and Functions.cpp body
These contain the auto-generated subclasses of torch::autograd::Node
for every differentiable torch function.
"""
# Get a flat (1D) list of diff infos; we do not need them keyed by FunctionSchema/DispatchKey here.
# Infos with different dispatch keys but the same name will still land in the same shard.
infos = get_infos_with_derivatives_list(differentiability_infos)
declarations = [process_function(f, FUNCTION_DECLARATION) for f in infos]
definitions = [process_function(f, FUNCTION_DEFINITION) for f in infos]
file_basename = "Functions"
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
for suffix in [".h", ".cpp"]:
fname = file_basename + suffix
fm.write_with_template(
fname,
fname,
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/"
+ fname,
"autograd_function_declarations": declarations,
"autograd_function_definitions": definitions,
},
)
def gen_autograd_functions_python(
out: str,
differentiability_infos: dict[FunctionSchema, dict[str, DifferentiabilityInfo]],
template_path: str,
) -> None:
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
num_shards = 5
fm.write(
"python_functions.h",
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/python_functions.h",
"shard_forward_declare": [
f"void initialize_autogenerated_functions_{i}(PyObject* module);"
for i in range(num_shards)
],
"shard_call": [
f"initialize_autogenerated_functions_{i}(module);"
for i in range(num_shards)
],
},
)
# Get a flat (1D) list of diff infos; we do not need them keyed by FunctionSchema/DispatchKey here.
# Infos with different dispatch keys but the same name will still land in the same shard.
infos = get_infos_with_derivatives_list(differentiability_infos)
fm.write_sharded(
"python_functions.cpp",
infos,
key_fn=lambda info: info.name,
base_env={
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/python_functions.cpp",
},
env_callable=lambda info: {
"py_function_initializers": [
process_function(info, PY_FUNCTION_DEFINITION)
],
"py_function_props_and_getters": [
process_function(info, PY_FUNCTION_PROPS_AND_GETTERS)
],
},
num_shards=num_shards,
sharded_keys={"py_function_initializers", "py_function_props_and_getters"},
)
def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str:
saved_variables: list[str] = []
release_variables: list[str] = []
saved_list_sizes: list[str] = []
unpack: list[str] = []
asserts: list[str] = []
compute_index_ranges: list[str] = []
getter_definitions: list[str] = []
py_getsetdef_structs: list[str] = []
compiled_args: list[str] = []
apply_with_saved_before: list[str] = []
apply_with_saved_after: list[str] = []
for arg in info.args_with_derivatives:
if arg.type in TENSOR_LIST_LIKE_CTYPES:
size = f"{arg.name}_size_"
saved_list_sizes.append(f"size_t {arg.name}_size_;")
else:
size = "1"
compute_index_ranges.append(f"auto {arg.name}_ix = gen.range({size});")
def save_var(var: SavedAttribute, is_output: bool) -> None:
name = var.nctype.name
type = var.nctype.type
should_append_getsetdef = True
should_append_raw_getsetdef = False
visit_name = name
uses_cpp_saved_variable_cls = False
if (
type == BaseCType(tensorT)
or type == OptionalCType(BaseCType(tensorT))
or type == MutRefCType(OptionalCType(BaseCType(tensorT)))
or (type == BaseCType(scalarT) and is_output)
):
uses_cpp_saved_variable_cls = True
saved_variables.append(f"SavedVariable {name}_;")
release_variables.append(f"{name}_.reset_data();")
ptr = "shared_from_this()" if is_output else ""
unpack.append(f"auto {name} = {name}_.unpack({ptr});")
getter_definitions.append(
GETTER_DEFINITION_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_SAVEDVAR
)
)
getter_definitions.append(
GETTER_DEFINITION_RAW_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_RAW_SAVEDVAR
)
)
should_append_raw_getsetdef = True
visit_name = f"{name}_"
elif (
type == BaseCType(tensorListT)
or type == BaseCType(iTensorListRefT)
or type == VectorCType(BaseCType(tensorT))
):
# note(crcrpar): [nuanced return type of out-of-place foreach functions]
# When an out-of-place foreach function whose return signature is `Tensor[]`
# spells out its backward definitions in `derivatives.yaml`, and some of them depend on
# `result`, `result`'s type is interpreted and treated as `std::vector<Tensor>`.
# An out-of-place foreach function whose backwards rely on its output does not suffer from
# this difference if those definitions are codegen'ed rather than spelled out manually.
# This special case is needed for `_foreach_pow.List` and `_foreach_pow.ScalarAndTensor`
# as of https://github.com/pytorch/pytorch/pull/105504.
if type == VectorCType(BaseCType(tensorT)):
assert (
info.func.func.name.name.base.startswith("_foreach") and is_output
)
uses_cpp_saved_variable_cls = True
saved_variables.append(f"std::vector<SavedVariable> {name}_;")
saved_variables.append(f"bool {name}_released_ = false;")
# Just clear() is sufficient, we don't need to loop and clear each variable.
# Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
release_variables.append(f"{name}_.clear();")
release_variables.append(f"{name}_released_ = true;")
ptr = "shared_from_this()" if is_output else "nullptr"
unpack.append(f"auto {name} = unpack_list({name}_, {ptr});")
asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);")
getter_definitions.append(
GETTER_DEFINITION_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR
)
)
getter_definitions.append(
GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR
)
)
should_append_raw_getsetdef = True
visit_name = f"{name}_"
elif type == ListCType(OptionalCType(BaseCType(tensorT))):
uses_cpp_saved_variable_cls = True
saved_variables.append(f"std::vector<SavedVariable> {name}_;")
saved_variables.append(f"bool {name}_released_ = false;")
# Just clear() is sufficient, we don't need to loop and clear each variable.
# Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
release_variables.append(f"{name}_.clear();")
release_variables.append(f"{name}_released_ = true;")
unpack.append(f"auto {name} = unpack_opt_list({name}_);")
asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);")
getter_definitions.append(
GETTER_DEFINITION_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR
)
)
getter_definitions.append(
GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR
)
)
should_append_raw_getsetdef = True
visit_name = f"{name}_"
elif type == BaseCType(intArrayRefT):
saved_variables.append(f"std::vector<int64_t> {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG
)
)
elif type == BaseCType(symIntArrayRefT):
saved_variables.append(f"std::vector<c10::SymInt> {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_SYMINT
)
)
elif type == BaseCType(optionalIntArrayRefT):
saved_variables.append(f"c10::OptionalArray<int64_t> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG
)
)
elif type == BaseCType(optionalSymIntArrayRefT):
saved_variables.append(f"c10::OptionalArray<c10::SymInt> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_SYMINT
)
)
elif type == OptionalCType(BaseCType(intArrayRefT)):
saved_variables.append(f"c10::OptionalArray<int64_t> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG
)
)
elif type == OptionalCType(BaseCType(symIntArrayRefT)):
saved_variables.append(f"c10::OptionalArray<c10::SymInt> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_SYMINT
)
)
elif type == OptionalCType(ArrayRefCType(BaseCType(doubleT))):
saved_variables.append(f"c10::OptionalArray<double> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_DOUBLE
)
)
elif type == BaseCType(longT):
saved_variables.append(f"{type.cpp_type()} {name} = 0;")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_INT64_T
)
)
elif type == BaseCType(SymIntT):
saved_variables.append(f"c10::SymInt {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_SYMINT
)
)
elif type == BaseCType(stringT):
saved_variables.append(f"std::string {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_STRING
)
)
elif type == OptionalCType(BaseCType(stringT)):
saved_variables.append(f"std::optional<std::string> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT.substitute(
op=info.op, name=name, body=GETTER_BODY_STRING
)
)
elif type == ArrayRefCType(
elem=BaseCType(type=BaseCppType(ns="at", name="Scalar"))
):
saved_variables.append(f"std::vector<at::Scalar> {name};")
saved_variables.append(f"bool {name}_released_ = false;")
# Just clear() is sufficient, we don't need to loop and clear each variable.
# Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
release_variables.append(f"{name}.clear();")
# release_variables.append(f"{name}_released_ = true;")
# unpack.append(f"auto {name} = unpack_list({name}_);")
# asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);")
getter_definitions.append(
CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto *node = static_cast<${op}*>(self->cdata.get());
const auto& prop = node->${name};
if (node->${name}_released_) {
PyErr_SetString(PyExc_RuntimeError, ERR_BACKWARD_TWICE);
return nullptr;
}
${body}
END_HANDLE_TH_ERRORS
}
"""
).substitute(
op=info.op,
name=name,
body=GETTER_BODY_VEC_SCALAR,
)
)
else:
# Check for indicators that you're putting a non-owning reference
# into the saved variable field. If this is spuriously firing,
# edit this field. Otherwise, you probably need to add a case
# above.
assert (
"ref" not in type.cpp_type().lower()
and "view" not in type.cpp_type().lower()
and "*" not in type.cpp_type()
and "&" not in type.cpp_type()
), f"{type.cpp_type()} looks like it contains a non-owning reference"
saved_variables.append(f"{type.cpp_type()} {name};")
if type in MISC_GETTER_DEFS:
getter_def, body = MISC_GETTER_DEFS[type]
getter_definitions.append(
getter_def.substitute(op=info.op, name=name, body=body)
)
else:
# Types we don't expose python bindings to yet:
# TypeAndSize, at::ScalarType, TensorOptions, TensorGeometry,
# std::vector<std::vector<int64_t>>, std::vector<at::ScalarType>
should_append_getsetdef = False
if should_append_getsetdef:
py_getsetdef_structs.append(
PY_GETSETDEF_STRUCT.substitute(op=info.op, name=name)
)
if should_append_raw_getsetdef:
py_getsetdef_structs.append(
PY_RAW_GETSETDEF_STRUCT.substitute(op=info.op, name=name)
)
if uses_cpp_saved_variable_cls:
compiled_args.append(
f"args.collect({visit_name}, {'true' if is_output else 'false'});"
)
else:
compiled_args.append(f"args.collect({visit_name});")
apply_with_saved_before.append(f"saved.before({visit_name});")
apply_with_saved_after.append(f"saved.after({visit_name});")
for var in sorted(info.all_saved_inputs, key=lambda sa: str(sa.nctype.name)):
save_var(var, is_output=False)
for var in sorted(info.all_saved_outputs, key=lambda sa: str(sa.nctype.name)):
save_var(var, is_output=True)
# lock the mutex when we release variables and in Node::apply to protect thread safety
# see Note [Thread Safety on Autograd Node]
if len(release_variables) > 0:
thread_lock = "std::lock_guard<std::mutex> lock(mutex_);"
else:
thread_lock = ""
if uses_retain_variables(info):
will_release_variables = WILL_RELEASE_VARIABLES.substitute()
else:
will_release_variables = ""
body: list[str] = []
if uses_single_grad(info):
body.append("const auto& grad = grads[0];")
else:
# Generate aliases for gradients named for returned values.
body.extend(
f"const auto& {name} = grads[{info.available_named_gradients.index(name)}];"
for name in sorted(info.used_named_gradients)
)
def emit_derivative(
derivative: Derivative,
args_with_derivatives: Sequence[Binding],
) -> tuple[bool, str]:
formula = derivative.formula
var_names = derivative.var_names
if len(var_names) == 1:
checks_any_grad_defined = False
if "not_implemented" not in formula:
matching_args = [
arg for arg in args_with_derivatives if arg.name == var_names[0]
]
if len(matching_args) == 1:
# We can add undefined grad support if the input variable is a Tensor
arg = matching_args[0]
if isinstance(arg.argument, Argument) and str(
arg.argument.type
) in ("Tensor", "Tensor?"):
formula = "any_grad_defined ? (" + formula + ") : Tensor()"
checks_any_grad_defined = True
if info.name.startswith("_foreach_"):
derivative_template = DERIVATIVE_SINGLE_FOREACH
else:
derivative_template = DERIVATIVE_SINGLE
return (
checks_any_grad_defined,
derivative_template.substitute(name=var_names[0], derivative=formula),
)
else:
if "grad_input_mask" in formula:
masks = [
f"task_should_compute_output({{ {n}_ix }})," for n in var_names
]
grad_input_mask = GRAD_INPUT_MASK.substitute(
masks=masks, n=len(var_names)
)
else:
grad_input_mask = ""
idx_ranges = ", ".join(f"{n}_ix" for n in var_names)
copy_ranges: list[str] = []
for i, n in enumerate(var_names):
copy_ranges.append(DERIVATIVE_MULTI_COPY_RANGE.substitute(name=n, i=i))
return False, DERIVATIVE_MULTI.substitute(
idx_ranges=idx_ranges,
copy_ranges=copy_ranges,
derivative=formula,
grad_input_mask=grad_input_mask,
)
body.extend(unpack)
need_any_grad_defined_var = False
for derivative in info.derivatives:
checks_any_grad_defined, derivative_text = emit_derivative(
derivative, info.args_with_derivatives
)
body.append(derivative_text)
need_any_grad_defined_var |= checks_any_grad_defined
# Since single-output derivative formulas need to check if grads are
# defined, only perform the check once, before all the formulas
if need_any_grad_defined_var:
body.insert(
-len(info.derivatives),
"bool any_grad_defined = any_variable_defined(grads);",
)
if info.name in UNTRACEABLE_FUNCTIONS:
superclass = "Node"
else:
superclass = "TraceableFunction"
all_getsetdef_structs = (
",\n".join(py_getsetdef_structs) + "," if len(py_getsetdef_structs) != 0 else ""
)
all_getter_definitions = "\n".join(getter_definitions)
return template.substitute(
op=info.op,
compute_index_ranges=compute_index_ranges,
saved_variables=saved_variables,
release_variables=release_variables,
saved_list_sizes=saved_list_sizes,
asserts=asserts,
thread_lock=thread_lock,
will_release_variables=will_release_variables,
body=body,
superclass=superclass,
all_getter_definitions=all_getter_definitions,
all_getsetdef_structs=all_getsetdef_structs,
compiled_args=compiled_args,
apply_with_saved_before=apply_with_saved_before,
apply_with_saved_after=apply_with_saved_after,
)

@@ -0,0 +1,675 @@
# Generates ADInplaceOrViewType.h/cpp
#
# NOTE: If any changes are being made to the ADInplaceOrView codegen please also check
# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp
# The fallback is expected to mimic this codegen, so we should keep the two in sync.
from __future__ import annotations
from torchgen.api import cpp
from torchgen.api.autograd import (
dispatch_strategy,
gen_differentiable_outputs,
NativeFunctionWithDifferentiabilityInfo,
)
from torchgen.api.types import (
BaseCType,
Binding,
boolT,
ConstRefCType,
CType,
DispatcherSignature,
intArrayRefT,
longT,
OptionalCType,
symIntArrayRefT,
SymIntT,
tensorT,
)
from torchgen.code_template import CodeTemplate
from torchgen.context import with_native_function
from torchgen.model import (
NativeFunction,
SchemaKind,
SelfArgument,
TensorOptionsArguments,
Type,
)
from torchgen.utils import FileManager
from .context import with_native_function_with_differentiability_info
from .gen_trace_type import (
get_return_value,
MANUAL_AUTOGRAD,
tie_return_values,
type_wrapper_name,
)
# See NOTE [ Autograd View Variables ] in variable.h for details.
# If you update list VIEW_FUNCTIONS or RETURNS_VIEWS_OF_INPUT,
# you **MUST** also update the public list of view ops accordingly in
# docs/source/tensor_view.rst. Note that not all ATen functions are exposed publicly,
# e.g. alias & sparse_coo_tensor_with_dims_and_tensors.
#
# A map: function name => name of the argument that all outputs are view of
VIEW_FUNCTIONS_WITH_METADATA_CHANGE = [
"view_as_complex",
"view_as_real",
"_conj",
"_neg_view",
"_nested_get_values",
"_nested_view_from_buffer",
"_nested_view_from_jagged",
]
VIEW_FUNCTIONS = {
"numpy_T": "self",
"alias": "self",
"as_strided": "self",
"diagonal": "self",
"expand": "self",
"permute": "self",
"select": "self",
"slice": "self",
"slice_inverse": "self",
"split": "self",
"split_with_sizes": "self",
"squeeze": "self",
"t": "self",
"transpose": "self",
"unfold": "self",
"unsqueeze": "self",
"flatten": "self",
"view": "self",
"unbind": "self",
"_indices": "self",
"_values": "self",
"indices": "self",
"values": "self",
"crow_indices": "self",
"col_indices": "self",
"ccol_indices": "self",
"row_indices": "self",
# sparse_coo ctor output should really be views of both indices and values,
# but we only support making a view of a single variable, and indices is
# discrete anyways.
# FIXME: clone indices on construction.
"sparse_coo_tensor_with_dims_and_tensors": "values",
"_reshape_alias": "self",
"_test_autograd_multiple_dispatch_view": "self",
}
for key in VIEW_FUNCTIONS_WITH_METADATA_CHANGE:
VIEW_FUNCTIONS[key] = "self"
# note: some VIEW_FUNCTIONS are just compositions of the view functions above
# this list contains both the root view functions and any that are purely composed
# of viewing functions, and is used by the JIT to determine when an operator
# may return a view of its inputs; however they may sometimes return a copy.
# (e.g. `contiguous`)
RETURNS_VIEWS_OF_INPUT = set(VIEW_FUNCTIONS.keys()).union(
{
"chunk",
"detach",
"contiguous",
"reshape",
"reshape_as",
"expand_as",
"view_as",
"real",
"imag",
"narrow",
"movedim",
"tensor_split",
"swapdims",
"swapaxes",
"mT",
"mH",
"adjoint",
"matrix_H",
}
)
# These are the functions we consider views for the purposes of validating
# StorageImpl and TensorImpl in gen_variable_type.
# `_unsafe_view` is not included in VIEW_FUNCTIONS above because it is not a
# view for the purposes of the ADInplaceOrView kernel, so we do not want to call as_view on it.
# See NOTE [Unsafe View] for more info.
ALL_VIEW_FUNCTIONS = {
**VIEW_FUNCTIONS,
"_unsafe_view": "self",
}
ARRAYREF_TO_VEC = CodeTemplate(
"""\
auto ${vec} = ${arg}.vec();
"""
)
OPTIONAL_TO_VAL = CodeTemplate(
"""\
auto ${val} = ${arg}.value_or(${default});
"""
)
CALL_DISPATCH = CodeTemplate(
"""\
at::_ops::${unambiguous_name}::call(${unpacked_args})"""
)
REVERSE_VIEW_DISPATCH = CodeTemplate(
"""\
${reverse_name}(${unpacked_args})"""
)
MULTI_OUTPUT_VIEW_ITERATION = CodeTemplate(
"""\
for (auto ${view_idx} : c10::irange(${var}.size())) {
${body}
}
"""
)
SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE = CodeTemplate(
"""\
std::unique_ptr<torch::autograd::ViewFunc> func(nullptr);
std::function<at::Tensor(const at::Tensor&)> rev_func=nullptr;
if (${is_view_with_metadata_change} ||
!self.unsafeGetTensorImpl()->support_as_strided() ||
self.unsafeGetTensorImpl()->is_python_dispatch() ||
c10::AutogradState::get_tls_state().get_view_replay_enabled()) {
${replay_view_func}
${reverse_replay_view_func}
}
"""
)
REPLAY_VIEW_FUNC = CodeTemplate(
"""\
func = std::make_unique<${view_func_name}>(${view_func_args});
"""
)
REVERSE_REPLAY_VIEW_LAMBDA_FUNC = CodeTemplate(
"""\
rev_func = [=](const at::Tensor& ${input_view}) {
return ${reverse_replay_view_call};
};
"""
)
METHOD_DEFINITION = CodeTemplate(
"""\
${return_type} ${type_wrapper_name}(${formals}) {
${type_definition_body}
}
"""
)
WRAPPER_REGISTRATION = CodeTemplate(
"""\
m.impl("${unqual_operator_name_with_overload}",
TORCH_FN(${class_type}::${type_wrapper_name})
);
"""
)
AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate(
"""\
m.impl("${unqual_operator_name_with_overload}", torch::autograd::autogradNotImplementedFallback());
"""
)
INPLACE_REDISPATCH = CodeTemplate(
"""\
{
at::AutoDispatchBelowADInplaceOrView guard;
at::_ops::${unambiguous_name}::redispatch(${unpacked_args});
}
"""
)
ASSIGN_RETURN_VALUE = CodeTemplate(
"""\
${return_values} = ${rhs_value};
"""
)
VIEW_REDISPATCH = CodeTemplate(
"""\
${assign_return_values} ([&]() {
at::AutoDispatchBelowADInplaceOrView guard;
return at::_ops::${unambiguous_name}::redispatch(${unpacked_args});
})();
"""
)
TMP_VAR = "_tmp"
# FIXME: Ideally these functions should be methods on Type class, but we have a
# comment in codegen/model.py there saying these concepts are not well defined.
# Thus we put a version that commonly used by autograd codegen here.
def is_tensor_type(t: Type) -> bool:
# TODO: Should handle optional here?
return t.is_tensor_like() and t.is_list_like() is None
def is_tensor_list_type(t: Type) -> bool:
# TODO: Should handle optional here?
return t.is_tensor_like() and t.is_list_like() is not None
UNPACK_TENSOR = CodeTemplate(
"""\
auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});"""
)
def unpacked_name(arg_name: str) -> str:
return arg_name + "_"
# e.g. select.int -> select_copy_int_inverse()
def inverse_view_name(f: NativeFunction) -> str:
copy_variant = f"{f.root_name}_copy"
overload = f"{f.func.name.overload_name}"
if overload != "":
overload = "_" + overload
return f"{copy_variant}{overload}_inverse"
def extract_bindings(f: NativeFunction) -> list[Binding]:
return [
r
for a in f.func.schema_order_arguments()
for r in cpp.argument(
a,
method=False,
symint=True,
cpp_no_default_args=set(),
faithful=False,
has_tensor_options=False,
)
]
@with_native_function
def unpack_args(f: NativeFunction) -> tuple[list[str], list[Binding]]:
body: list[str] = []
unpacked_bindings: list[Binding] = []
for i, binding in enumerate(extract_bindings(f)):
assert not isinstance(binding.argument, SelfArgument)
if isinstance(binding.argument, TensorOptionsArguments):
raise RuntimeError("VariableKernel shouldn't take TensorOptions")
is_nullable = binding.argument.type.is_nullable()
if not binding.argument.type.is_tensor_like() or is_nullable:
unpacked_bindings.append(binding)
continue
is_tensor_list = is_tensor_list_type(binding.argument.type)
ref = (not is_nullable) and not is_tensor_list
suffix = "_opt" if is_nullable and not is_tensor_list else ""
body.append(
UNPACK_TENSOR.substitute(
arg_name=binding.name,
arg_pos=i,
suffix=suffix,
ref="&" if ref else "",
)
)
unpacked_bindings.append(
Binding(
name=unpacked_name(binding.name),
nctype=binding.nctype,
argument=binding.argument,
default=binding.default,
)
)
return body, unpacked_bindings
def get_base_name(f: NativeFunction) -> str:
return f.func.name.name.base # TODO: should be str(f.func.name.name)?
def get_view_info(f: NativeFunction) -> str | None:
base_name = get_base_name(f)
view_info = VIEW_FUNCTIONS.get(base_name, None)
if view_info is None and base_name in RETURNS_VIEWS_OF_INPUT:
view_info = "self"
return view_info
def emit_view_func(
f: NativeFunction, bindings: list[Binding], view_idx: str | None = None
) -> str:
"""Generate an additional lambda function to recover views in backward when as_strided is not supported.
See Note [View + Inplace update for base tensor] and [View + Inplace update for view tensor] for more details.
"""
# TODO: Clean this logic up if we get rid of reverse view funcs or reify them.
input_base = "input_base"
replay_view_func = ""
updated_args: list[str] = []
known_view_arg_simple_types: list[CType] = [
BaseCType(longT),
OptionalCType(BaseCType(longT)),
BaseCType(SymIntT),
OptionalCType(BaseCType(SymIntT)),
BaseCType(boolT),
BaseCType(intArrayRefT),
BaseCType(symIntArrayRefT),
ConstRefCType(BaseCType(tensorT)),
ConstRefCType(OptionalCType(BaseCType(tensorT))),
]
for binding in bindings:
arg, arg_type = binding.name, binding.nctype.type
if arg == "self":
updated_args.append(input_base)
continue
if arg_type not in known_view_arg_simple_types:
known_types_str = ", ".join([str(t) for t in known_view_arg_simple_types])
raise TypeError(
f"You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: "
f"{known_types_str}. Please update the list or materialize it so that it can be closed "
"over by value, also add a test in pytorch/xla/test/test_operations.py where this code "
"is exercised."
)
if arg_type == BaseCType(intArrayRefT) or arg_type == BaseCType(
symIntArrayRefT
):
# It's not safe to close over IntArrayRef by value, since this is a
# reference type, so materialize a vector to close over by value
arg_vec = arg + "_vec"
replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec)
updated_args.append(arg_vec)
elif arg_type == OptionalCType(BaseCType(longT)):
# Materialize int64_t? to int64_t
arg_value = arg + "_val"
replay_view_func += OPTIONAL_TO_VAL.substitute(
arg=arg, val=arg_value, default="0"
)
updated_args.append(arg_value)
elif arg_type == ConstRefCType(BaseCType(tensorT)) or arg_type == ConstRefCType(
OptionalCType(BaseCType(tensorT))
):
# NB: Closing over a tensor. If a user modifies this tensor, this will be silently
# incorrect. The proper thing to do is to store the version counter and copy on write.
updated_args.append(arg)
else:
updated_args.append(arg)
from .gen_view_funcs import view_func_name
view_func_args = [b.name for b in bindings if b.name != "self"]
if view_idx is not None:
view_func_args.append(f"{view_idx}")
replay_view_func += REPLAY_VIEW_FUNC.substitute(
view_func_name=view_func_name(f, include_namespace=True),
view_func_args=view_func_args,
)
input_view = "input_view"
reverse_unpacked_args = [
"self",
f"{input_view}",
# inverse_return_mode=
"at::functionalization::InverseReturnMode::AlwaysView",
*(() if view_idx is None else (f"{view_idx}",)),
# skip input_base arg
*updated_args[1:],
]
from torchgen.api.functionalization import reverse_name
reverse_replay_view_call = REVERSE_VIEW_DISPATCH.substitute(
reverse_name=reverse_name(f, include_namespace=True),
unpacked_args=reverse_unpacked_args,
)
reverse_replay_view_func = REVERSE_REPLAY_VIEW_LAMBDA_FUNC.substitute(
input_view=input_view, reverse_replay_view_call=reverse_replay_view_call
)
is_view_with_metadata_change = (
"true" if cpp.name(f.func) in VIEW_FUNCTIONS_WITH_METADATA_CHANGE else "false"
)
return SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE.substitute(
is_view_with_metadata_change=is_view_with_metadata_change,
replay_view_func=replay_view_func,
reverse_replay_view_func=reverse_replay_view_func,
)
def emit_view_body(
fn: NativeFunctionWithDifferentiabilityInfo, var: str
) -> tuple[str, str]:
# See NOTE [ Autograd View Variables ] in variable.h for details.
f = fn.func
base_name = get_base_name(f)
view_info = get_view_info(f)
call = ""
differentiable_outputs = gen_differentiable_outputs(fn)
differentiable_output_vars = {r.name for r in differentiable_outputs}
if not isinstance(view_info, str):
raise TypeError(
f"The view info should be a string for {base_name}, but it is: {view_info}"
)
if len(differentiable_output_vars) == 0:
# no output is differentiable (.indices() for SparseTensors for example)
rhs_value = (
f"as_view({view_info}, {var}, "
f"/* is_bw_differentiable */ false, /* is_fw_differentiable */ false)"
)
elif len(differentiable_output_vars) == 1:
# Single differentiable output (Tensor or Tensor[])
return_info = differentiable_outputs[0]
# We only support simple Tensor or a TensorList for functions that return views
if not is_tensor_type(return_info.type) and not is_tensor_list_type(
return_info.type
):
raise RuntimeError(
f"{base_name} that return differentiable views can only return Tensor or Tensor[]"
)
# See Note [ View + Inplace detection]
def get_creation_meta_in_mode(original: str) -> str:
creation_meta_with_grad_mode = f"(at::GradMode::is_enabled() ? {original} : CreationMeta::NO_GRAD_MODE)"
return f"InferenceMode::is_enabled() ? CreationMeta::INFERENCE_MODE : {creation_meta_with_grad_mode}"
# Only allow rebasing of the history if we return a single Tensor
# If we are in a no grad block, raise a warning
# See NOTE [ View + Inplace detection ] for more details about this logic
if is_tensor_list_type(return_info.type):
creation_meta = get_creation_meta_in_mode("CreationMeta::MULTI_OUTPUT_NODE")
view_idx = "view_idx"
view_func = emit_view_func(
f, extract_bindings(f), view_idx=view_idx
).strip()
as_view_call = (
f"as_view(/* base */ {view_info}, /* output */ {var}[{view_idx}], "
"/* is_bw_differentiable */ true, /* is_fw_differentiable */ true, "
"/* view_func */ std::move(func), /* rev_view_func */ rev_func, "
f"/* creation_meta */ {creation_meta});"
)
call += MULTI_OUTPUT_VIEW_ITERATION.substitute(
var=var, view_idx=view_idx, body=f"{view_func}\n{as_view_call}"
)
rhs_value = f"std::move({var})"
else:
call += emit_view_func(f, extract_bindings(f), view_idx=None)
creation_meta = get_creation_meta_in_mode("CreationMeta::DEFAULT")
rhs_value = (
f"as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, "
"/* is_fw_differentiable */ true, "
f"/* view_func */ std::move(func), /* rev_view_func */ rev_func, /* creation_meta */ {creation_meta})"
)
else:
# This could be supported but we don't need it at the moment, so keeping things simple.
raise RuntimeError(
"Function that return multiple differentiable output "
"when at least one of them is view is not supported."
)
return call, rhs_value
def modifies_arguments(f: NativeFunction) -> bool:
return f.func.kind() in [SchemaKind.inplace, SchemaKind.out]
@with_native_function_with_differentiability_info
def emit_inplace_or_view_body(fn: NativeFunctionWithDifferentiabilityInfo) -> list[str]:
f = fn.func
inplace_view_body: list[str] = []
dispatcher_sig = DispatcherSignature.from_schema(f.func)
dispatcher_exprs = dispatcher_sig.exprs()
# code-generated ADInplaceOrView kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
dispatch_key_set = "ks & c10::after_ADInplaceOrView_keyset"
redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs])
# Note that this calls the slow, dispatching variants of manual_cpp_binding ops.
# We could probably work harder to ensure that the fast variants are called instead, but the perf benefit would be minimal.
if modifies_arguments(f): # inplace op
inplace_view_body.append(
INPLACE_REDISPATCH.substitute(
unambiguous_name=f.func.name.unambiguous_name(),
unpacked_args=redispatch_args,
)
)
for r in cpp.return_names(f):
inplace_view_body.append(f"increment_version({r});")
else:
assert get_view_info(f) is not None
inplace_view_body.append(
VIEW_REDISPATCH.substitute(
assign_return_values="auto " + TMP_VAR + " = ",
unambiguous_name=f.func.name.unambiguous_name(),
unpacked_args=redispatch_args,
)
)
call, rhs_value = emit_view_body(fn, TMP_VAR)
inplace_view_body.append(call)
assert rhs_value is not None
inplace_view_body.append(
ASSIGN_RETURN_VALUE.substitute(
return_values=tie_return_values(f), rhs_value=rhs_value
)
)
if f.func.returns:
inplace_view_body.append(f"return {get_return_value(f)};")
return inplace_view_body
@with_native_function
def gen_formals(f: NativeFunction) -> str:
return ", ".join(
# code-generated autograd kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
["c10::DispatchKeySet ks"]
+ [
f'{cpp.argument_type(a, binds="__placeholder__", symint=True).cpp_type()} {a.name}'
for a in f.func.schema_order_arguments()
]
)
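# A rough sketch of the formals string this produces for a hypothetical schema
# "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
# (the exact C++ types come from cpp.argument_type and may differ):
#   "c10::DispatchKeySet ks, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha"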
@with_native_function_with_differentiability_info
def inplace_or_view_method_definition(
fn: NativeFunctionWithDifferentiabilityInfo,
) -> str | None:
f = fn.func
if get_view_info(f) is None and (
# For functions that modify their inputs but don't return them,
# we can't give them autograd support.
# See https://github.com/pytorch/pytorch/issues/53796
not modifies_arguments(f)
or len(f.func.returns) == 0
):
return None
return METHOD_DEFINITION.substitute(
return_type=cpp.returns_type(f.func.returns, symint=True).cpp_type(),
type_wrapper_name=type_wrapper_name(f),
formals=gen_formals(f),
type_definition_body=emit_inplace_or_view_body(fn),
)
@with_native_function_with_differentiability_info
def inplace_or_view_method_registration(
fn: NativeFunctionWithDifferentiabilityInfo,
) -> str | None:
f = fn.func
if get_view_info(f) is None and (
not modifies_arguments(f) or len(f.func.returns) == 0
):
return None
return WRAPPER_REGISTRATION.substitute(
unqual_operator_name_with_overload=f.func.name,
type_wrapper_name=type_wrapper_name(f),
class_type="ADInplaceOrView",
)
def use_derived(fn: NativeFunctionWithDifferentiabilityInfo) -> bool:
f = fn.func
name = cpp.name(f.func)
return name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == "use_derived"
def gen_inplace_or_view_type_env(
fn: NativeFunctionWithDifferentiabilityInfo,
) -> dict[str, list[str]]:
definition = inplace_or_view_method_definition(fn)
registration = inplace_or_view_method_registration(fn)
return {
"ops_headers": (
[f"#include <ATen/ops/{fn.func.root_name}_ops.h>"]
if definition is not None
else []
),
"inplace_or_view_method_definitions": [definition]
if definition is not None
else [],
"inplace_or_view_wrapper_registrations": [registration]
if registration is not None
else [],
}
def gen_inplace_or_view_type(
out: str,
native_yaml_path: str,
tags_yaml_path: str,
fns_with_infos: list[NativeFunctionWithDifferentiabilityInfo],
template_path: str,
) -> None:
# NOTE: see Note [Sharded File] at the top of the VariableType.cpp
# template regarding sharding of the generated files.
num_shards = 2
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_sharded(
"ADInplaceOrViewType.cpp",
[fn for fn in fns_with_infos if use_derived(fn)],
key_fn=lambda fn: fn.func.root_name,
base_env={
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/ADInplaceOrViewType.cpp",
},
env_callable=gen_inplace_or_view_type_env,
num_shards=num_shards,
sharded_keys={
"ops_headers",
"inplace_or_view_method_definitions",
"inplace_or_view_wrapper_registrations",
},
)

File diff suppressed because it is too large

View File

@ -0,0 +1,536 @@
from __future__ import annotations
import itertools
from typing import Sequence
from torchgen.api import cpp
from torchgen.api.types import DispatcherSignature
from torchgen.code_template import CodeTemplate
from torchgen.context import with_native_function
from torchgen.model import Argument, NativeFunction, SchemaKind, TensorOptionsArguments
from torchgen.utils import FileManager
# Note [Manual Backend kernels]
# For these ops, we want to manually register to dispatch key Backend and
# skip codegen-ed registration to all keys before Backend.
# For codegen this means:
# - op set below must match ops with manual_kernel_registration=True in native_functions.yaml
# where we skip codegen backend kernels
# - all ops below are part of MANUAL_AUTOGRAD to skip codegen Autograd kernel registration
# - all ops below are part of MANUAL_TRACER to skip codegen Tracer kernel registration
# Note: we still register to dispatch key Profiler for these ops, keeping it untouched for now.
# You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
MANUAL_BACKEND = {
"options",
"data",
"set_data",
"is_leaf",
"output_nr",
"_version",
"retain_grad",
"_backward",
"requires_grad_",
}
# For these ops we want to skip the codegen-ed registration to both Autograd and Tracer keys.
# You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
MANUAL_AUTOGRAD_AND_TRACER = {
"resize_",
"resize_as_",
"detach",
"detach_",
"copy_",
"_fw_primal",
"_make_dual",
}
# Currently MANUAL_AUTOGRAD and MANUAL_TRACER share the same set of ops:
# union(MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER)
# You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
MANUAL_AUTOGRAD = MANUAL_TRACER = MANUAL_BACKEND | MANUAL_AUTOGRAD_AND_TRACER
# These functions we don't want to record for tracing, because we always want
# to trace their constituent parts. This is a temporary hack in lieu
# of proper scopes, where subsequent compilation passes can ask for the unfolding
# on demand. Only concrete ATen methods can be disabled this way; it will have
# NO EFFECT otherwise.
DONT_RECORD_TRACE = {
"convolution",
"conv1d",
"conv2d",
"conv3d",
"conv_transpose1d",
"conv_transpose2d",
"conv_transpose3d",
"lstm_cell",
"gru_cell",
"rnn_tanh_cell",
"rnn_relu_cell",
# FIXME: figure out a better way when we support sparse tensors in jit
"_coalesced",
}
def should_trace(f: NativeFunction) -> bool:
# Operations involving Storage or Type are not traceable at the moment
if any(
str(arg.type) in {"Storage", "Type", "ConstQuantizerPtr"}
for arg in f.func.schema_order_arguments()
):
return False
# We can't trace functions which don't have any Tensor or TensorList returns
if not any(r.type.is_tensor_like() for r in f.func.returns):
return False
return f.func.name.name.base not in DONT_RECORD_TRACE
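# Illustrative behavior (given the sets above): an op like "conv2d" is skipped
# because it is in DONT_RECORD_TRACE; an op whose returns contain no Tensor-like
# type (e.g. one returning only a bool) is skipped; most other ATen ops are traced.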
SELECT = CodeTemplate(
"""\
if (${cond}) {
${true}
} else {
${false}
}
"""
)
OP_NAME = CodeTemplate(
"""\
op_name = c10::Symbol::fromQualString("aten::${trace_name}");
"""
)
# These functions have their names renamed when recorded under trace:
RENAME_TRACE = {
"zero": "zeros_like", # replacing aten::zero_ with aten::zeros_like
"fill": "full_like", # replacing aten::fill_ with aten::full_like
}
def format_trace_op_name(f: NativeFunction) -> str:
# TODO: byte-for-byte compatible with old codegen behavior - should clean up
if (
f.func.kind() in (SchemaKind.functional, SchemaKind.out)
or f.func.name.name.dunder_method
):
# special case for *_out functions: the in-place and out-of-place ops
# are overloaded with the same name in the JIT
trace_name = str(f.func.name.name)
trace_name = RENAME_TRACE.get(trace_name, trace_name)
return OP_NAME.substitute(trace_name=trace_name)
# otherwise, this is an in-place op and we need to emit both in- and
# out-of-place versions
outplace_trace_name = f.func.name.name.base
inplace_trace_name = cpp.name(f.func)
outplace_trace_name = RENAME_TRACE.get(outplace_trace_name, outplace_trace_name)
inplace_trace_name = RENAME_TRACE.get(inplace_trace_name, inplace_trace_name)
return SELECT.substitute(
cond="tracer_state->force_outplace",
true=OP_NAME.substitute(trace_name=outplace_trace_name),
false=OP_NAME.substitute(trace_name=inplace_trace_name),
)
ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${name}", ${input});""")
def format_trace_inputs(f: NativeFunction) -> str:
def dispatch_trace_input(arg: Argument | TensorOptionsArguments) -> Sequence[str]:
if isinstance(arg, TensorOptionsArguments):
name = "options"
return [
ADD_TRACE_INPUT.substitute(
name=name, input="c10::optTypeMetaToScalarType(options.dtype_opt())"
),
ADD_TRACE_INPUT.substitute(name=name, input="options.layout()"),
ADD_TRACE_INPUT.substitute(name=name, input="options.device()"),
ADD_TRACE_INPUT.substitute(name=name, input="options.pinned_memory()"),
]
else:
name = arg.name
if str(arg.type) == "Tensor?[]":
return [f'jit::tracer::addInputs(node, "{name}", {name});']
else:
return [ADD_TRACE_INPUT.substitute(name=name, input=name)]
args: list[Argument | TensorOptionsArguments] = list(
f.func.schema_order_arguments()
)
if f.func.is_out_fn():
# *_out functions take the result as a separate argument, but we don't want to
# trace that argument directly. Instead, we trace its TensorOptions.
# So first, we need to remove the out argument from the list of arguments to trace.
num_out_args = len(f.func.arguments.out)
args = args[:-num_out_args]
trace_inputs = itertools.chain.from_iterable(
dispatch_trace_input(arg) for arg in args
)
if f.func.is_out_fn():
# for *_out functions, handle the result argument differently for inplace/outplace.
# For inplace: just add the input to the end to conform to the JIT schema
inplace = [
ADD_TRACE_INPUT.substitute(
name=f.func.arguments.out[i].name, input=f.func.arguments.out[i].name
)
for i in range(num_out_args)
]
# for outplace: do nothing, except if the function is a factory.
# Factories are a bit special because their out-of-place overloads
# take an extra TensorOptions argument, which is missing in the _out function
has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns)
has_tensor_input_arg = any(
a.type.is_tensor_like() for a in f.func.arguments.flat_non_out
)
is_factory_method = f.category_override == "factory" or (
has_tensor_return and not has_tensor_input_arg
)
# HACK: preserve old codegen behavior - the old codegen set the `is_factory_method`
# flag for the whole family of ops with the same basename if any of them is a
# factory method. In most cases the whole family of ops are indeed all factory
# methods - 'normal' is the only exception. So we handle it specially here to avoid
# cloning the old logic.
if f.func.name.name.base == "normal":
is_factory_method = True
if is_factory_method:
outplace = [
ADD_TRACE_INPUT.substitute(
name="out",
input="c10::optTypeMetaToScalarType(out.options().dtype_opt())",
),
ADD_TRACE_INPUT.substitute(name="out", input="out.options().layout()"),
ADD_TRACE_INPUT.substitute(name="out", input="out.options().device()"),
ADD_TRACE_INPUT.substitute(
name="out", input="out.options().pinned_memory()"
),
]
else:
outplace = []
trace_inputs = itertools.chain(
trace_inputs,
[
SELECT.substitute(
cond="tracer_state->force_outplace",
true="\n".join(outplace),
false="\n".join(inplace),
)
],
)
return "\n".join(trace_inputs)
# `torch.jit.trace` has an undocumented keyword argument `_force_outplace`,
# which forces jit to replace functions with their outplace variants (for
# example `aten::add_` becomes `aten::add`).
#
# This replacement is implemented in-place with minimal modification of the
# argument stack (it assumes that the outplace call has the same arguments
# as the inplace version).
#
# However, no such substitutions are available for the `aten::fill_`
# and `aten::zero_` operators, as we never implemented `aten::fill`
# and `aten::zero`. So the jit tracing hack replaces `aten::zero_` with
# `aten::zeros_like` and `aten::fill_` with `aten::full_like`.
#
# But since these can potentially have different arguments, we also have
# to hack into the stack and add the missing ones.
#
# A possible alternative would be:
#
# - Add `aten::fill` and `aten::zero`
#
# - Or keep `aten::zeros_like` arguments aligned with `aten::zero_`
# arguments (inside of the `native_functions.yaml`)
RENAME_TRACE_ADD_ARGS = {
"fill": """\
jit::tracer::addInputs(node, "options", ::std::optional<ScalarType>());
jit::tracer::addInputs(node, "options", layout_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", device_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", pinned_memory_or_default(::std::nullopt));
::std::optional<MemoryFormat> memory_format = c10::MemoryFormat::Preserve;
jit::tracer::addInputs(node, "memory_format", memory_format);
""",
"zero": """\
jit::tracer::addInputs(node, "options", ::std::optional<ScalarType>());
jit::tracer::addInputs(node, "options", layout_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", device_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", pinned_memory_or_default(::std::nullopt));
::std::optional<MemoryFormat> memory_format = c10::MemoryFormat::Preserve;
jit::tracer::addInputs(node, "memory_format", memory_format);
""",
}
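# Sketch of the effect (hypothetical trace, not generated verbatim): with
# _force_outplace, tracing `x.zero_()` records an `aten::zeros_like` node, and the
# snippet above appends the dtype/layout/device/pin_memory/memory_format inputs
# that `zeros_like` expects but `zero_` never had.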
INPLACE_GUARD = CodeTemplate(
"""\
jit::tracer::ensureUniqueIfOutOfPlaced("${name}", ${mutable_input});
"""
)
PRE_RECORD_TRACE = CodeTemplate(
"""\
torch::jit::Node* node = nullptr;
std::shared_ptr<jit::tracer::TracingState> tracer_state;
if (jit::tracer::isTracing()) {
tracer_state = jit::tracer::getTracingState();
at::Symbol op_name;
${set_op_name}
node = tracer_state->createNode(op_name, /*num_outputs=*/0);
jit::tracer::recordSourceLocation(node);
${add_trace_inputs}
tracer_state->insertNode(node);
${inplace_guard}
jit::tracer::setTracingState(nullptr);
}
"""
)
def format_prerecord_trace(f: NativeFunction) -> str:
if not should_trace(f):
return ""
# TODO: clean up old codegen behavior
is_inplace = (
f.func.kind() in (SchemaKind.inplace, SchemaKind.out)
and not f.func.name.name.dunder_method
)
add_args = (
RENAME_TRACE_ADD_ARGS.get(f.func.name.name.base, "") if is_inplace else ""
)
additional_inputs = (
SELECT.substitute(
cond="tracer_state->force_outplace",
true=add_args,
false="",
)
if add_args
else ""
)
return PRE_RECORD_TRACE.substitute(
set_op_name=format_trace_op_name(f),
add_trace_inputs=format_trace_inputs(f) + additional_inputs,
inplace_guard=INPLACE_GUARD.substitute(
name=cpp.name(f.func),
mutable_input=f.func.arguments.out[0].name
if f.func.arguments.out
else "self",
)
if is_inplace
else "",
)
POST_RECORD_TRACE = CodeTemplate(
"""\
if (tracer_state) {
jit::tracer::setTracingState(std::move(tracer_state));
${add_trace_outputs}
}
"""
)
def format_postrecord_trace(f: NativeFunction) -> str:
if not should_trace(f):
return ""
# For outplacing ops, *_out overloads require special handling to move the
# output *argument* to a return value
if f.func.is_out_fn():
output_names_outplace = [arg.name for arg in f.func.arguments.out]
output_names_inplace = cpp.return_names(f)
# Code size optimization: the common case is that the return value is
# the same for both variants
if output_names_outplace == output_names_inplace:
outputs = [
f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace
]
return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs)
selection = SELECT.substitute(
cond="force_outplace",
true="\n".join(
f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace
),
false="\n".join(
f"jit::tracer::addOutput(node, {n});" for n in output_names_inplace
),
)
return POST_RECORD_TRACE.substitute(add_trace_outputs=selection)
else:
output_names = cpp.return_names(f)
outputs = [f"jit::tracer::addOutput(node, {n});" for n in output_names]
return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs)
def tie_return_values(f: NativeFunction) -> str:
if len(f.func.returns) == 1:
return f'auto {f.func.returns[0].name or "result"}'
names = cpp.return_names(f)
return f'auto [{", ".join(names)}]'
def get_return_value(f: NativeFunction) -> str:
names = cpp.return_names(f)
if len(f.func.returns) == 1:
return names[0]
if f.func.kind() == SchemaKind.out:
return f'std::forward_as_tuple({", ".join(names)})'
else:
moved = ", ".join(f"std::move({name})" for name in names)
return f"std::make_tuple({moved})"
TRACE_DISPATCH = CodeTemplate(
"""\
${assign_return_values}at::_ops::${unambiguous_name}::redispatch(${unpacked_args});"""
)
def emit_trace_body(f: NativeFunction) -> list[str]:
trace_body: list[str] = []
trace_body.append(format_prerecord_trace(f))
dispatcher_sig = DispatcherSignature.from_schema(f.func)
dispatcher_exprs = dispatcher_sig.exprs()
# code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
dispatch_key_set = "ks & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Tracer)"
redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs])
assign_return_values = (
f"{tie_return_values(f)} = "
if f.func.kind() in [SchemaKind.functional, SchemaKind.mutable]
and f.func.returns
else ""
)
# Note that this calls the slow, dispatching variants of manual_cpp_binding ops.
# We could probably work harder to ensure that the fast variants are
# called instead, but the perf benefit would be minimal.
trace_body.append(
TRACE_DISPATCH.substitute(
assign_return_values=assign_return_values,
unambiguous_name=f.func.name.unambiguous_name(),
unpacked_args=redispatch_args,
)
)
trace_body.append(format_postrecord_trace(f))
if f.func.returns:
trace_body.append(f"return {get_return_value(f)};")
return trace_body
METHOD_DEFINITION = CodeTemplate(
"""\
${return_type} ${type_wrapper_name}(${formals}) {
${type_definition_body}
}
"""
)
def type_wrapper_name(f: NativeFunction, key: str = "Default") -> str:
if f.func.name.overload_name:
name = f"{cpp.name(f.func)}_{f.func.name.overload_name}"
else:
name = cpp.name(f.func)
# The key argument is only used in gen_variable_type where we need fns per autograd dispatch key.
# In gen_trace_type and gen_inplace_view_type where only one fn per native_fn must be generated,
# the key argument should not be passed.
# We do not append key if it is Default so that generated functions from
# before per-dispatch-key derivatives were added retain the same names.
if key != "Default":
name = name + f"_{key}"
return name
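# Illustrative names: for schema "add.Tensor" this yields "add_Tensor"; with a
# per-dispatch-key derivative and key="AutogradCUDA" it would yield
# "add_Tensor_AutogradCUDA" (assuming cpp.name resolves to "add" here).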
@with_native_function
def method_definition(f: NativeFunction) -> str:
assert cpp.name(f.func) not in MANUAL_TRACER
formals = ", ".join(
# code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
["c10::DispatchKeySet ks"]
+ [
f'{cpp.argument_type(a, binds="__placeholder__", symint=True).cpp_type()} {a.name}'
for a in f.func.schema_order_arguments()
]
)
return METHOD_DEFINITION.substitute(
return_type=cpp.returns_type(f.func.returns, symint=True).cpp_type(),
type_wrapper_name=type_wrapper_name(f),
formals=formals,
type_definition_body=emit_trace_body(f),
)
WRAPPER_REGISTRATION = CodeTemplate(
"""\
m.impl("${name}",
TORCH_FN(${class_type}::${type_wrapper_name})
);
"""
)
@with_native_function
def method_registration(f: NativeFunction) -> str:
assert cpp.name(f.func) not in MANUAL_TRACER
return WRAPPER_REGISTRATION.substitute(
name=f.func.name,
type_wrapper_name=type_wrapper_name(f),
class_type="TraceType",
)
def gen_trace_type_func(fn: NativeFunction) -> dict[str, list[str]]:
return {
"ops_headers": [f"#include <ATen/ops/{fn.root_name}_ops.h>"],
"trace_method_definitions": [method_definition(fn)],
"trace_wrapper_registrations": [method_registration(fn)],
}
def gen_trace_type(
out: str, native_functions: list[NativeFunction], template_path: str
) -> None:
# NOTE: see Note [Sharded File] at the top of the VariableType.cpp
# template regarding sharding of the generated files.
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_sharded(
"TraceType.cpp",
[fn for fn in native_functions if cpp.name(fn.func) not in MANUAL_TRACER],
key_fn=lambda fn: fn.root_name,
base_env={
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/TraceType.cpp",
},
env_callable=gen_trace_type_func,
num_shards=5,
sharded_keys={
"ops_headers",
"trace_method_definitions",
"trace_wrapper_registrations",
},
)

View File

@ -0,0 +1,116 @@
# Generates C++ functions that wrap ATen tensor factory methods to turn them into Variables.
#
# This writes one file: variable_factories.h
from __future__ import annotations
import re
import torchgen.api.python as python
from torchgen.api import cpp
from torchgen.api.types import CppSignatureGroup
from torchgen.context import with_native_function
from torchgen.gen import parse_native_yaml
from torchgen.model import NativeFunction, TensorOptionsArguments, Variant
from torchgen.utils import FileManager, mapMaybe
OPTIONAL_TYPE_PATTERN = re.compile(r"std::optional<(.+)>")
TYPE_PATTERN = re.compile(r"(?:const\s+)?([A-Z]\w+)")
# Add 'at::' to types defined in the ATen namespace, e.g. Tensor, TensorList, IntArrayRef, etc.
# TODO: maybe update the cpp argument API to take optional namespace argument?
def fully_qualified_type(argument_type: str) -> str:
def maybe_optional_type(type: str, is_opt: bool) -> str:
return f"std::optional<{type}>" if is_opt else type
opt_match = OPTIONAL_TYPE_PATTERN.match(argument_type)
is_opt = opt_match is not None
if opt_match:
argument_type = argument_type[opt_match.start(1) : opt_match.end(1)]
match = TYPE_PATTERN.match(argument_type)
if match is None:
return maybe_optional_type(argument_type, is_opt)
index = match.start(1)
qualified_type = f"{argument_type[:index]}at::{argument_type[index:]}"
return maybe_optional_type(qualified_type, is_opt)
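# Examples of what the helper above produces:
#   fully_qualified_type("Tensor")                     -> "at::Tensor"
#   fully_qualified_type("const Tensor &")             -> "const at::Tensor &"
#   fully_qualified_type("std::optional<IntArrayRef>") -> "std::optional<at::IntArrayRef>"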
def gen_variable_factories(
out: str, native_yaml_path: str, tags_yaml_path: str, template_path: str
) -> None:
native_functions = parse_native_yaml(
native_yaml_path, tags_yaml_path
).native_functions
factory_functions = [fn for fn in native_functions if is_factory_function(fn)]
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_with_template(
"variable_factories.h",
"variable_factories.h",
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/variable_factories.h",
"ops_headers": [
f"#include <ATen/ops/{fn.root_name}.h>" for fn in factory_functions
],
"function_definitions": list(mapMaybe(process_function, factory_functions)),
},
)
@with_native_function
def is_factory_function(f: NativeFunction) -> bool:
if Variant.function not in f.variants:
return False
name = cpp.name(f.func)
has_tensor_options = python.has_tensor_options(f)
return has_tensor_options or name.endswith("_like")
@with_native_function
def process_function(f: NativeFunction) -> str | None:
name = cpp.name(f.func)
has_tensor_options = python.has_tensor_options(f)
is_factory = has_tensor_options or name.endswith("_like")
if Variant.function not in f.variants or not is_factory:
return None
cpp_sigs = CppSignatureGroup.from_native_function(f, method=False)
sigs = [cpp_sigs.signature]
if cpp_sigs.symint_signature is not None:
sigs.append(cpp_sigs.symint_signature)
r = ""
for sig in sigs:
formals: list[str] = []
exprs: list[str] = []
requires_grad = "false"
for arg in sig.arguments():
qualified_type = fully_qualified_type(arg.type)
if arg.default:
formals.append(f"{qualified_type} {arg.name} = {arg.default}")
else:
formals.append(f"{qualified_type} {arg.name}")
if isinstance(arg.argument, TensorOptionsArguments):
# note: we remove the requires_grad setting from the TensorOptions because
# it is ignored anyway (and we actually have an assertion that it isn't set
# which would fail otherwise). We handle requires_grad explicitly here
# instead of passing it through to the kernel.
exprs.append(
f"at::TensorOptions({arg.name}).requires_grad(::std::nullopt)"
)
# Manually set the requires_grad bit on the result tensor.
requires_grad = f"{arg.name}.requires_grad()"
else:
exprs.append(arg.name)
r += f"""\
inline at::Tensor {sig.name()}({', '.join(formals)}) {{
at::AutoDispatchBelowADInplaceOrView guard;
return autograd::make_variable(at::{sig.name()}({', '.join(exprs)}), /*requires_grad=*/{requires_grad});
}}
"""
return r
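# A rough sketch of one generated wrapper (illustrative only; the real signature
# comes from CppSignatureGroup), e.g. for a factory like at::ones:
#   inline at::Tensor ones(at::IntArrayRef size, at::TensorOptions options = {}) {
#     at::AutoDispatchBelowADInplaceOrView guard;
#     return autograd::make_variable(
#         at::ones(size, at::TensorOptions(options).requires_grad(::std::nullopt)),
#         /*requires_grad=*/options.requires_grad());
#   }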

File diff suppressed because it is too large

View File

@ -0,0 +1,340 @@
# Generates ViewFuncs.h/cpp
#
# NOTE: If any changes are being made to the ViewFunc codegen please also check
# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp
# The fallback is expected to mimic this codegen, so we should keep the two in sync.
from __future__ import annotations
from typing import TYPE_CHECKING
import torchgen.api.dispatcher as dispatcher
from torchgen.api.translate import translate
from torchgen.api.types import (
BaseCType,
Binding,
NamedCType,
SymIntT,
tensorT,
VectorCType,
)
from torchgen.code_template import CodeTemplate
from torchgen.model import Argument, NativeFunction, OptionalType
from torchgen.utils import FileManager
from .gen_inplace_or_view_type import (
CALL_DISPATCH,
extract_bindings,
get_view_info,
modifies_arguments,
use_derived,
)
if TYPE_CHECKING:
from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo
FUNCTION_DECLARATION = CodeTemplate(
"""\
#define ${uppercase_op}_AVAILABLE
struct ${op} : public ${superclass} {
${op}(${constructor_args}) ${initializer_list}
{};
virtual ~${op}() override {};
virtual std::vector<c10::SymInt> get_symints() const override;
virtual size_t num_symints() const override;
virtual std::vector<at::Tensor> get_tensors() const override;
virtual size_t num_tensors() const override;
virtual at::Tensor operator()(const at::Tensor&) const override;
virtual std::unique_ptr<ViewFunc> clone_and_set(
std::optional<std::vector<c10::SymInt>> = ::std::nullopt,
std::optional<std::vector<at::Tensor>> = ::std::nullopt) const override;
protected:
virtual void set_symints(std::vector<c10::SymInt>) override;
virtual void set_tensors(std::vector<at::Tensor>) override;
private:
${state}
};
"""
)
FUNCTION_DEFINITION = CodeTemplate(
"""\
std::vector<c10::SymInt> ${op}::get_symints() const {
${get_symints}
}
size_t ${op}::num_symints() const {
return static_cast<size_t>(${num_symints});
}
void ${op}::set_symints(std::vector<c10::SymInt> ${symints_vec}) {
TORCH_INTERNAL_ASSERT(${symints_vec}.size() == num_symints());
${set_symints}
}
std::vector<at::Tensor> ${op}::get_tensors() const {
${get_tensors}
}
size_t ${op}::num_tensors() const {
return static_cast<size_t>(${num_tensors});
}
void ${op}::set_tensors(std::vector<at::Tensor> ${tensors_vec}) {
TORCH_INTERNAL_ASSERT(${tensors_vec}.size() == num_tensors());
${set_tensors}
}
at::Tensor ${op}::operator()(const at::Tensor& ${call_input_name}) const {
return ${op_call};
}
std::unique_ptr<ViewFunc> ${op}::clone_and_set(
std::optional<std::vector<c10::SymInt>> ${symints_vec},
std::optional<std::vector<at::Tensor>> ${tensors_vec}) const {
auto output = std::make_unique<${op}>(${clone_args});
if (${symints_vec}.has_value()) {
output->set_symints(std::move(*(${symints_vec})));
}
if (${tensors_vec}.has_value()) {
output->set_tensors(std::move(*(${tensors_vec})));
}
return output;
}
"""
)
# e.g. as_strided -> AsStridedViewFunc for camel case or
# as_strided_view_func otherwise
def view_func_name(
f: NativeFunction, include_namespace: bool = False, camel_case: bool = True
) -> str:
name = f.func.name.unambiguous_name()
view_func_name = f"{name.replace('.', '_')}_view_func"
if camel_case:
is_private = view_func_name.startswith("_")
view_func_name = "".join(
[p.title() for p in view_func_name.replace(".", "_").split("_")]
)
if is_private:
# put the leading underscore back in
view_func_name = f"_{view_func_name}"
namespace = "torch::autograd::generated::" if include_namespace else ""
return f"{namespace}{view_func_name}"
def is_symint_or_tensor(arg: Argument) -> bool:
return arg.type.is_tensor_like() or arg.type.is_symint_like()
def remove_const_ref(binding: Binding) -> Binding:
return Binding(
name=binding.name,
nctype=binding.nctype.remove_const_ref(),
argument=binding.argument,
default=binding.default,
)
def returns_multi_tensor(fn: NativeFunction) -> bool:
returns = fn.func.returns
assert len(returns) == 1
returns_list_like = returns[0].type.is_list_like() is not None
returns_tensor_like = returns[0].type.is_tensor_like()
return returns_list_like and returns_tensor_like
# Generates strings with logic for getting / setting state of a particular type.
#
# Args:
# bindings (list): List of state bindings of interest (may be empty)
# state_vec_type (NamedCType): Type of vector to either return or copy from
#
# Returns:
# tuple: (list of getter logic strings, list of setter logic strings, string
# with num items expression)
def generate_state_getter_setter(
bindings: list[Binding],
state_vec_type: NamedCType,
) -> tuple[list[str], list[str], str]:
getter_logic = []
setter_logic = []
state_vec = state_vec_type.name
getter_logic.append(f"{state_vec_type.cpp_type()} {state_vec};")
if len(bindings) > 0:
setter_logic.append("auto i = 0;")
num_exprs = []
for i, b in enumerate(bindings):
assert isinstance(b.argument, Argument)
if b.argument.type.is_list_like():
# Handle list-likes.
num_expr = f"{b.name}.size()"
num_exprs.append(num_expr)
getter = f"{state_vec}.insert({state_vec}.end(), {b.name}.begin(), {b.name}.end());"
setter = f"std::copy({state_vec}.begin() + i, {state_vec}.begin() + i + {b.name}.size(), {b.name}.begin());"
elif isinstance(b.argument.type, OptionalType):
# Handle optionals.
num_expr = f"({b.name}.has_value() ? 1 : 0)"
num_exprs.append(num_expr)
conditional = f"if({b.name}.has_value())"
getter = (
f"{conditional} {state_vec}.insert({state_vec}.end(), *({b.name}));"
)
setter = f"{conditional} {b.name} = {state_vec}[i];"
else:
num_expr = "1"
num_exprs.append(num_expr)
getter = f"{state_vec}.push_back({b.name});"
setter = f"{b.name} = {state_vec}[i];"
getter_logic.append(getter)
setter_logic.append(setter)
if i < len(bindings) - 1:
setter_logic.append(f"i += {num_expr};")
# Reserve / assert based on the total number of items expression.
num_items = "0" if len(num_exprs) == 0 else " + ".join(num_exprs)
if len(bindings) > 0:
getter_logic.insert(1, f"{state_vec}.reserve({num_items});")
getter_logic.append(f"return {state_vec};")
return getter_logic, setter_logic, num_items
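# Roughly, for a single plain SymInt binding named "storage_offset" this yields
# getters ["::std::vector<c10::SymInt> symints;", "symints.reserve(1);",
# "symints.push_back(storage_offset);", "return symints;"], setters
# ["auto i = 0;", "storage_offset = symints[i];"], and num items "1"
# (the exact vector spelling depends on the NamedCType passed in).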
def process_function(fn: NativeFunction, template: CodeTemplate) -> str:
bindings = extract_bindings(fn)
non_self_bindings = [b for b in bindings if b.name != "self"]
non_self_args = fn.func.arguments.flat_all[1:]
non_self_value_bindings = [
dispatcher.argument(a, remove_non_owning_ref_types=True) for a in non_self_args
]
# Generate constructor / clone args for the generated struct.
constructor_args = [b.defn() for b in non_self_bindings]
clone_args = [b.name for b in non_self_bindings]
# Generate state variable declarations for the generated struct.
state_variables = [
f"{remove_const_ref(b).defn()};" for b in non_self_value_bindings
]
# Generate initializer list expressions for the generated struct.
# allow_expensive_conversions=True because we need to store e.g. SymIntArrayRefs as
# vector<SymInt>s.
init_exprs = translate(
non_self_bindings, non_self_value_bindings, allow_expensive_conversions=True
)
initializers = []
for b, init_expr in zip(non_self_bindings, init_exprs):
name = b.nctype.name
assert isinstance(name, str)
initializers.append(f"{name}({init_expr.expr})")
# Generate call to underlying view op
call_input_name = "input_base"
op_call_args = [call_input_name, *(b.name for b in non_self_bindings)]
op_call = CALL_DISPATCH.substitute(
unambiguous_name=fn.func.name.unambiguous_name(),
unpacked_args=op_call_args,
)
# Multi-output views additionally require a view_idx for disambiguation.
if returns_multi_tensor(fn):
view_idx_name = "view_idx"
view_idx_typename = "int64_t"
view_idx_decl = f"{view_idx_typename} {view_idx_name}"
constructor_args.append(view_idx_decl)
clone_args.append(view_idx_name)
state_variables.append(f"{view_idx_decl};")
initializers.append(f"{view_idx_name}({view_idx_name})")
op_call += f"[{view_idx_name}]"
# Generate initializer list for the generated struct.
initializer_list = f": {', '.join(initializers)}" if len(initializers) > 0 else ""
# Generate getter / setter logic for any symints.
symint_bindings = [
b
for b in non_self_bindings
if isinstance(b.argument, Argument) and b.argument.type.is_symint_like()
]
symints_vec_type = NamedCType("symints", VectorCType(BaseCType(SymIntT)))
get_symints, set_symints, num_symints = generate_state_getter_setter(
symint_bindings, symints_vec_type
)
# Generate getter / setter logic for any tensors.
tensor_bindings = [
b
for b in non_self_bindings
if isinstance(b.argument, Argument) and b.argument.type.is_tensor_like()
]
tensors_vec_type = NamedCType("tensors", VectorCType(BaseCType(tensorT)))
get_tensors, set_tensors, num_tensors = generate_state_getter_setter(
tensor_bindings, tensors_vec_type
)
return template.substitute(
op=view_func_name(fn),
uppercase_op=view_func_name(fn, camel_case=False).upper(),
superclass="torch::autograd::ViewFunc",
initializer_list=initializer_list,
state=state_variables,
constructor_args=constructor_args,
clone_args=clone_args,
symints_vec=symints_vec_type.name,
get_symints=get_symints,
set_symints=set_symints,
num_symints=num_symints,
tensors_vec=tensors_vec_type.name,
get_tensors=get_tensors,
set_tensors=set_tensors,
num_tensors=num_tensors,
call_input_name=call_input_name,
op_call=op_call,
)
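# For example, a single-output view op like "slice.Tensor" turns into a generated
# SliceTensorViewFunc struct whose operator() replays the view by calling the op
# again (via CALL_DISPATCH) on the new base; multi-output view ops additionally
# carry and apply a view_idx, as handled above.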
def gen_view_funcs(
out: str,
fns_with_infos: list[NativeFunctionWithDifferentiabilityInfo],
template_path: str,
) -> None:
# don't need the info parts, just the function
fns = [fn.func for fn in fns_with_infos if use_derived(fn)]
# only want out-of-place views
view_fns = [
fn for fn in fns if get_view_info(fn) is not None and not modifies_arguments(fn)
]
declarations = [process_function(fn, FUNCTION_DECLARATION) for fn in view_fns]
definitions = [process_function(fn, FUNCTION_DEFINITION) for fn in view_fns]
ops_headers = [f"#include <ATen/ops/{fn.root_name}_ops.h>" for fn in view_fns]
file_basename = "ViewFuncs"
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
for suffix in [".h", ".cpp"]:
fname = file_basename + suffix
fm.write_with_template(
fname,
fname,
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/"
+ fname,
"view_func_declarations": declarations,
"view_func_definitions": definitions,
"ops_headers": ops_headers,
},
)

File diff suppressed because it is too large

View File

@ -0,0 +1,38 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include "torch/csrc/autograd/VariableTypeUtils.h"
#include "torch/csrc/autograd/generated/ViewFuncs.h"
#include <torch/library.h>
#include <ATen/FunctionalInverses.h>
#include <ATen/FunctionalTensorWrapper.h>
// ${generated_comment}
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
$ops_headers
#endif
using namespace at;
using torch::autograd::CreationMeta;
using torch::autograd::as_view;
using torch::autograd::increment_version;
namespace torch {
namespace ADInplaceOrView {
namespace {
${inplace_or_view_method_definitions}
} // namespace
} // namespace ADInplaceOrView
namespace {
TORCH_LIBRARY_IMPL(aten, ADInplaceOrView, m) {
${inplace_or_view_wrapper_registrations};
}
} // namespace
} // namespace torch

View File

@ -0,0 +1,20 @@
#include "torch/csrc/autograd/FunctionsManual.h"
#include "torch/csrc/dynamo/compiled_autograd.h"
// ${generated_comment}
// The manual function definitions that used to be here are now in torch/csrc/autograd/FunctionsManual.cpp
// This speeds up re-compilation and allows these implementations to be shared so that they can be
// used for forward mode AD formulas as well.
using namespace torch::autograd::generated::details;
using at::Tensor;
using at::Scalar;
using at::IntArrayRef;
using at::TensorList;
namespace torch::autograd::generated {
${autograd_function_definitions}
} // namespace torch::autograd::generated

View File

@ -0,0 +1,51 @@
#pragma once
// ${generated_comment}
#include <ATen/ATen.h>
#include <ATen/core/functional.h>
#include <ATen/TensorGeometry.h>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/autograd/saved_variable.h"
#include <torch/csrc/Export.h>
#include <c10/core/SymIntArrayRef.h>
namespace torch { namespace autograd { namespace generated {
using at::Scalar;
using at::Tensor;
using at::IntArrayRef;
using at::ArrayRef;
using at::Type;
using at::TensorGeometry;
using at::ScalarType;
using std::optional;
using c10::fmap;
inline std::vector<Tensor> unpack_list(at::ArrayRef<SavedVariable> xs, std::shared_ptr<Node> saved_for = nullptr) {
// NB: we must explicitly do the conversion in the lambda, otherwise template
// deduction will give a Tensor of Variable which is not convertible
return fmap(xs, [&saved_for](const SavedVariable& x) {
// TODO(crcrpar): Use `std::move(saved_for)` to avoid incrementing refcount, which would need refactoring.
return static_cast<Tensor>(x.unpack(saved_for));
});
}
inline c10::List<std::optional<Tensor>> unpack_opt_list(at::ArrayRef<SavedVariable> xs, std::shared_ptr<Node> saved_for = nullptr) {
torch::List<std::optional<Tensor>> result;
result.reserve(xs.size());
for (const SavedVariable& v : xs) {
auto var = v.unpack(saved_for);
result.push_back(var.defined() ? std::optional<Tensor>(var) : ::std::nullopt);
}
return result;
}
using torch::autograd::TypeAndSize;
${autograd_function_declarations}
}}} // namespace torch::autograd::generated

View File

@ -0,0 +1,40 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include "torch/csrc/jit/frontend/tracer.h"
#include <torch/library.h>
#include "torch/csrc/autograd/function.h"
#include "ATen/quantized/Quantizer.h"
// ${generated_comment}
// See the `Tracer` section in `torch/csrc/jit/OVERVIEW.md`.
// NOTE See [Sharded File] comment in VariableType
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
$ops_headers
#endif
using namespace at;
namespace torch {
namespace TraceType {
namespace {
${trace_method_definitions}
} // namespace
} // namespace TraceType
namespace {
TORCH_LIBRARY_IMPL(aten, Tracer, m) {
${trace_wrapper_registrations};
}
} // namespace
} // namespace torch

View File

@ -0,0 +1,65 @@
#include "torch/csrc/autograd/VariableTypeUtils.h"
#include "torch/csrc/autograd/generated/VariableType.h"
#include "torch/csrc/autograd/FunctionsManual.h"
#include <ATen/RedispatchFunctions.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <ATen/core/TorchDispatchUtils.h>
#include <torch/library.h>
#include <ATen/SparseCsrTensorUtils.h>
// ${generated_comment}
// NOTE [Sharded File]: on this file's split-into-shards state
//
// Back in the good old days, VariableType.cpp was generated as one
// file with every function in it, and everything was great and
// simple.
//
// However, this file was also very large (over 36,000 lines), and
// compiling it was very slow, and in fact was a significant
// bottleneck for incremental rebuilds. To address this, we now
// generate the file split across multiple shards, named
// VariableType_0.cpp and so on, which can be compiled in parallel.
//
// For ease of inspection and debugging, so that it's not necessary to
// go rooting around in multiple files, we also generate all the
// functions together in VariableTypeEverything.cpp. This generated
// file is only for convenience; it's not actually used in the
// build. If the file you're looking at now is one of the shards, you
// may want to switch over to the Everything variant to make your
// grepping smoother.
using namespace at;
using namespace torch::autograd::generated;
using namespace torch::autograd::generated::details;
namespace torch::autograd {
namespace VariableType {
namespace{
C10_UNUSED void reset_grad_accumulator(Variable & self) {
AutogradMeta* meta = torch::autograd::impl::get_autograd_meta(self);
if (meta != nullptr) {
meta->grad_accumulator_.reset();
}
}
}
namespace {
${type_derived_method_definitions}
}
}
namespace {
${wrapper_registrations}
}
} // namespace torch::autograd

View File

@ -0,0 +1,59 @@
#pragma once
// ${generated_comment}
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <c10/util/intrusive_ptr.h>
#include <torch/csrc/Export.h>
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
#include <cstdint> // for size_t
#include <functional> // for function
#include <memory> // for unique_ptr
#include <string>
#include <vector>
namespace at {
struct Quantizer;
};
namespace torch { namespace autograd {
using Variable = at::Tensor;
using at::Context;
using at::Device;
using at::Dimname;
using at::DimnameList;
using at::Generator;
using at::IntArrayRef;
using at::MemoryFormat;
using at::QScheme;
using at::Scalar;
using at::ScalarType;
using at::Storage;
using at::Tensor;
using at::TensorList;
using at::TensorOptions;
using at::Quantizer;
// This is a temporary typedef to enable Quantizer in the aten native function API;
// we'll remove it when we actually expose the Quantizer class
// to the frontend
using ConstQuantizerPtr = const c10::intrusive_ptr<Quantizer>&;
using std::optional;
namespace VariableType {
TORCH_API std::vector<at::DeprecatedTypeProperties*> allCUDATypes();
TORCH_API std::vector<at::DeprecatedTypeProperties*> allXPUTypes();
TORCH_API std::vector<at::DeprecatedTypeProperties*> allCPUTypes();
TORCH_API std::vector<at::DeprecatedTypeProperties*> allPrivateUser1Types();
at::Tensor & unpack(Tensor & t, const char * name, int pos);
const at::Tensor & unpack(const Tensor & t, const char * name, int pos);
at::Tensor unpack_opt(const Tensor & t, const char * name, int pos);
std::vector<at::Tensor> unpack(const at::ITensorListRef& tl, const char *name, int pos);
};
}} // namespace torch::autograd

View File

@ -0,0 +1,14 @@
#include <torch/csrc/autograd/generated/ViewFuncs.h>
// ${generated_comment}
using at::Tensor;
using at::Scalar;
using at::IntArrayRef;
using at::TensorList;
namespace torch::autograd::generated {
${view_func_definitions}
} // namespace torch::autograd::generated

View File

@ -0,0 +1,28 @@
#pragma once
// ${generated_comment}
#include <torch/library.h>
#include <torch/csrc/autograd/variable.h>
#include <c10/core/SymIntArrayRef.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
$ops_headers
#endif
namespace torch::autograd::generated {
using at::Scalar;
using at::Tensor;
using at::IntArrayRef;
using at::ArrayRef;
using at::Type;
using at::ScalarType;
using std::optional;
using c10::fmap;
${view_func_declarations}
} // namespace torch::autograd::generated

View File

@ -0,0 +1,11 @@
"""
This file is needed for generating procedural tests required for
testing __torch_function__. See tests/test_overrides.py.
"""
# flake8: noqa
import torch
annotated_args = {
${annotated_args}
}

View File

@ -0,0 +1,15 @@
#include <torch/csrc/autograd/python_enum_tag.h>
#include <torch/csrc/utils/pybind.h>
#include <pybind11/pybind11.h>
#include <ATen/core/enum_tag.h>
namespace py = pybind11;
namespace torch {
namespace autograd {
void initEnumTag(PyObject* module) {
auto m = py::handle(module).cast<py::module>();
py::enum_<at::Tag>(m, "Tag")
${enum_of_valid_tags};
m.doc() = "An Enum that contains tags that can be assigned to an operator registered in C++.";
}
}}

View File

@ -0,0 +1,81 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_fft_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include <ATen/core/Tensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using torch::utils::check_out_type_matches;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef fft_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPFFTVariableFunctionsModule = NULL;
void initFFTFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._fft",
NULL,
-1,
fft_functions
};
PyObject* fft = PyModule_Create(&def);
THPFFTVariableFunctionsModule = fft;
if (!fft) {
throw python_error();
}
// steals a reference to fft
if (PyModule_AddObject(module, "_fft", fft) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,37 @@
#include <torch/csrc/autograd/generated/python_functions.h>
// ${generated_comment}
#include <Python.h>
#include <ATen/ATen.h>
#include <c10/core/SymNodeImpl.h>
#include "torch/csrc/autograd/generated/Functions.h"
#include "torch/csrc/autograd/python_cpp_function.h"
#include <torch/csrc/autograd/python_variable.h>
#include <torch/csrc/autograd/saved_variable.h>
#include <torch/csrc/utils/pybind.h>
#include <pybind11/pybind11.h>
#include <torch/csrc/utils/pybind.h>
// NOTE: See [Sharded File] comment in VariableType
namespace torch::autograd::generated {
template<typename C>
static void addClass(PyObject* module, PyTypeObject& type, const char* name,
PyGetSetDef* function_properties=NULL, PyMethodDef* function_methods=NULL)
{
_initFunctionPyTypeObject(type, name, function_properties, function_methods);
Py_INCREF(&type);
PyModule_AddObject(module, name, (PyObject*)&type);
registerCppFunction(typeid(C), &type);
}
${py_function_props_and_getters}
void initialize_autogenerated_functions${shard_id}(PyObject* module) {
${py_function_initializers}
}
} // namespace torch::autograd::generated

View File

@ -0,0 +1,17 @@
#pragma once
#include <Python.h>
// ${generated_comment}
// Python bindings for automatically generated autograd functions
namespace torch { namespace autograd { namespace generated {
${shard_forward_declare}
inline void initialize_autogenerated_functions(PyObject* module) {
${shard_call}
}
}}} // namespace torch::autograd::generated

View File

@ -0,0 +1,68 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_linalg_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Scalar;
using at::ScalarType;
using at::MemoryFormat;
using at::Generator;
using at::IntArrayRef;
using at::TensorList;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef linalg_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPLinalgVariableFunctionsModule = NULL;
void initLinalgFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._linalg",
NULL,
-1,
linalg_functions
};
PyObject* linalg = PyModule_Create(&def);
THPLinalgVariableFunctionsModule = linalg;
if (!linalg) {
throw python_error();
}
// steals a reference to linalg
if (PyModule_AddObject(module, "_linalg", linalg) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,81 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_nested_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::OptionalIntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef nested_functions[] = {
{NULL, NULL, 0, NULL},
${py_method_defs}
{NULL}
};
static PyObject* THPNestedVariableFunctionsModule = NULL;
void initNestedFunctions(PyObject* module) {
nested_functions[0] = get_nested_functions_manual()[0];
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._nested",
NULL,
-1,
nested_functions
};
PyObject* nested = PyModule_Create(&def);
THPNestedVariableFunctionsModule = nested;
if (!nested) {
throw python_error();
}
// steals a reference to nested
if (PyModule_AddObject(module, "_nested", nested) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,113 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_nn_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/tensor_memoryformats.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Scalar;
using at::MemoryFormat;
using at::Generator;
using at::IntArrayRef;
using at::ArrayRef;
using namespace torch::autograd::utils;
namespace torch::autograd {
static PyObject* THPNNVariableFunctionsModule = NULL;
static PyObject * THPVariable__parse_to(PyObject* module, PyObject* args, PyObject* kwargs)
{
HANDLE_TH_ERRORS
static PythonArgParser parser({
"to(Device device=None, ScalarType dtype=None, bool non_blocking=False, bool copy=False, *, MemoryFormat? memory_format=None)",
"to(ScalarType dtype, bool non_blocking=False, bool copy=False, *, MemoryFormat? memory_format=None)",
"to(Tensor tensor, bool non_blocking=False, bool copy=False, *, MemoryFormat? memory_format=None)",
});
ParsedArgs<5> parsed_args;
auto r = parser.parse(args, kwargs, parsed_args);
if (r.has_torch_function()) {
return handle_torch_function(r, args, kwargs, THPNNVariableFunctionsModule, "torch.nn", "_parse_to");
}
auto parsed = parse_to_conversion(r, /*allow_copy*/ false); // we don't want copy for nn.Module.to
auto& device = std::get<0>(parsed);
auto& scalarType = std::get<1>(parsed);
auto non_blocking = std::get<2>(parsed);
auto opt_memory_format = std::get<4>(parsed);
auto tuple = THPObjectPtr{PyTuple_New(4)};
if (!tuple) throw python_error();
if (device) {
PyTuple_SET_ITEM(tuple.get(), 0, THPDevice_New(*device));
} else {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(tuple.get(), 0, Py_None);
}
if (scalarType) {
PyTuple_SET_ITEM(tuple.get(), 1, Py_NewRef(torch::getTHPDtype(*scalarType)));
} else {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(tuple.get(), 1, Py_None);
}
PyTuple_SET_ITEM(tuple.get(), 2, torch::autograd::utils::wrap(non_blocking));
if (opt_memory_format.has_value()) {
PyTuple_SET_ITEM(tuple.get(), 3, Py_NewRef(torch::utils::getTHPMemoryFormat(opt_memory_format.value())));
} else {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(tuple.get(), 3, Py_None);
}
return tuple.release();
END_HANDLE_TH_ERRORS
}
// generated forward declarations start here
${py_forwards}
static PyMethodDef nn_functions[] = {
{"_parse_to", castPyCFunctionWithKeywords(THPVariable__parse_to),
METH_VARARGS | METH_KEYWORDS, nullptr},
${py_method_defs}
{NULL}
};
void initNNFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._nn",
NULL,
-1,
nn_functions
};
PyObject* nn = PyModule_Create(&def);
THPNNVariableFunctionsModule = nn;
if (!nn) {
throw python_error();
}
// steals a reference to nn
if (PyModule_AddObject(module, "_nn", nn) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,52 @@
#include <Python.h>
#include <vector>
#include <map>
#include <string>
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/Exceptions.h"
namespace torch { namespace autograd { namespace generated {
${py_return_types}
}}}
namespace torch::autograd {
static void addReturnType(
PyObject* module,
const char* name,
PyTypeObject* type) {
// hold onto the TypeObject for the unlikely case of a user
// deleting or overriding it.
Py_INCREF(type);
if (PyModule_AddObject(
module,
name,
(PyObject*)type) != 0) {
Py_DECREF(type);
throw python_error();
}
}
void initReturnTypes(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT, "torch._C._return_types", nullptr, -1, {}};
PyObject* return_types_module = PyModule_Create(&def);
if (!return_types_module) {
throw python_error();
}
${py_return_types_registrations}
// steals a reference to return_types on success
if (PyModule_AddObject(module, "_return_types", return_types_module) != 0) {
Py_DECREF(return_types_module);
throw python_error();
}
}
} // namespace torch::autograd

View File

@ -0,0 +1,14 @@
#pragma once
namespace torch {
namespace autograd {
namespace generated {
${py_return_types_declarations}
}
void initReturnTypes(PyObject* module);
} // namespace autograd
} // namespace torch

View File

@ -0,0 +1,67 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_sparse_functions.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Scalar;
using at::ScalarType;
using at::MemoryFormat;
using at::Generator;
using at::IntArrayRef;
using at::TensorList;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef sparse_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPSparseVariableFunctionsModule = NULL;
void initSparseFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._sparse",
NULL,
-1,
sparse_functions
};
PyObject* sparse = PyModule_Create(&def);
THPSparseVariableFunctionsModule = sparse;
if (!sparse) {
throw python_error();
}
// steals a reference to sparse
if (PyModule_AddObject(module, "_sparse", sparse) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,79 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_special_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using torch::utils::check_out_type_matches;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef special_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPSpecialVariableFunctionsModule = NULL;
void initSpecialFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._special",
NULL,
-1,
special_functions
};
PyObject* special = PyModule_Create(&def);
THPSpecialVariableFunctionsModule = special;
if (!special) {
throw python_error();
}
// steals a reference to special
if (PyModule_AddObject(module, "_special", special) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,93 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
// Python bindings for torch.* functions implemented through ATen.
//
// The functions are bound as static methods on a class,
// torch._C._VariableFunctions, which is also aliased as Variable._torch
// and also copied into the 'torch' module.
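//
// For example, a Python call such as `torch.flatten(x)` resolves to the
// generated binding emitted into one of these shards (illustrative only; the
// exact set of functions generated here is determined by the code generator).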
#include <Python.h>
// Undefine the copysign macro so that at::copysign works as intended with MSVC
// https://github.com/python/cpython/blob/c60394c7fc9cc09b16e9675a3eeb5844b6d8523f/PC/pyconfig.h#L196
#ifdef _MSC_VER
#undef copysign
#endif // _MSC_VER
#include "torch/csrc/autograd/python_torch_functions.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/Dtype.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pybind.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/tensor_layouts.h"
#include "torch/csrc/utils/tensor_new.h"
#include "torch/csrc/utils/tensor_numpy.h"
#include "torch/csrc/jit/frontend/tracer.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include <ATen/core/Tensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
#include <functional>
#include <initializer_list>
#include <stdexcept>
#include <utility>
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using at::ArrayRef;
using torch::utils::check_out_type_matches;
using namespace torch::autograd::utils;
// NOTE: See [Sharded File] comment in VariableType
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef torch_functions_shard[] = {
${py_method_defs}
};
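// Appends this shard's generated method defs to the combined table; the full
// torch._C._VariableFunctions method table is assembled by collecting every
// generated shard (see the [Sharded File] note above).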
void gatherTorchFunctions${shard_id}(std::vector<PyMethodDef> &torch_functions) {
constexpr size_t num_functions = sizeof(torch_functions_shard) / sizeof(torch_functions_shard[0]);
torch_functions.insert(
torch_functions.end(),
torch_functions_shard,
torch_functions_shard + num_functions);
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,135 @@
#pragma once
// ${generated_comment}
#include <ATen/core/Tensor.h>
#include <ATen/TracerMode.h>
#include <ATen/core/grad_mode.h>
#include <c10/util/ArrayRef.h>
#include <c10/core/MemoryFormat.h>
#include <torch/csrc/api/include/torch/detail/TensorDataContainer.h>
#include <torch/csrc/autograd/variable.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/from_blob.h>
$ops_headers
#endif
#include <functional>
#include <initializer_list>
#include <utility>
namespace torch {
/// NOTE: Currently `torch::tensor(...)` doesn't support mixed data types
/// (e.g. `torch::tensor({{bool, 2.0}})` doesn't work). We might be able to
/// support it in the future by iterating over all sub-lists to find
/// the largest data type that can represent all of the elements, or by using
/// variadic templates.
///
/// NOTE: C++ `torch::tensor` with a floating-point type or an `at::ArrayRef` / `std::vector` /
/// (nested) braced-init-list of floating-point types always produces a tensor of dtype
/// `torch::get_default_dtype()`, matching Python `torch.tensor` behavior.
///
/// NOTE: C++ `torch::tensor` with an integer type or an `at::ArrayRef` / `std::vector` /
/// (nested) braced-init-list of integer types always produces a tensor of dtype `at::kLong`
/// (aka. int64_t), matching Python `torch.tensor` behavior.
///
/// NOTE: The following dtypes are not supported by `torch::tensor` currently:
/// - `unsigned int`
/// - `unsigned long int`
/// - `unsigned long long int`
/// - `long long int`
inline at::Tensor tensor(detail::TensorDataContainer tensor_data_container, const at::TensorOptions& options = {}) {
return autograd::make_variable(
  // note: we remove the requires_grad setting from the TensorOptions because
  // it is ignored anyway (and we actually have an assertion that it isn't set,
  // which would otherwise fail). We handle requires_grad explicitly here
  // instead of passing it through to the kernel.
tensor_data_container.convert_to_tensor(options.requires_grad(::std::nullopt)),
options.requires_grad());
}
/// A generic deleter function.
using Deleter = std::function<void(void*)>;
using at::MemoryFormat;
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor, `strides` the
/// stride in each dimension. The `deleter` function (a
/// `std::function<void(void*)>`) will be called on the `data` when the Tensor
/// data would normally be deallocated. The `TensorOptions` specify additional
/// configuration options for the returned tensor, such as what type to
/// interpret the `data` as.
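///
/// Example (an illustrative sketch; `buf` is a hypothetical caller-owned
/// buffer that outlives the tensor, so the deleter can be a no-op):
///
///   float buf[6] = {0, 1, 2, 3, 4, 5};
///   at::Tensor t = torch::from_blob(
///       buf, /*sizes=*/{2, 3}, /*strides=*/{3, 1},
///       [](void*) { /* no-op */ },
///       at::TensorOptions().dtype(at::kFloat));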
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
at::IntArrayRef strides,
const Deleter& deleter,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, strides, deleter, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor, `strides` the
/// stride in each dimension. The `TensorOptions`
/// specify additional configuration options for the returned tensor, such as
/// what type to interpret the `data` as.
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
at::IntArrayRef strides,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, strides, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor. The `deleter`
/// function (a `std::function<void(void*)>`) will be called on the `data` when
/// the Tensor data would normally be deallocated. The `TensorOptions` specify
/// additional configuration options for the returned tensor, such as what type
/// to interpret the `data` as.
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
const Deleter& deleter,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, deleter, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor. The
/// `TensorOptions` specify additional configuration options for the returned
/// tensor, such as what type to interpret the `data` as.
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
${function_definitions}
} // namespace torch