I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

File diff suppressed because it is too large

@@ -0,0 +1,74 @@
# This yaml file contains all the possible tags that can be defined in `tags` in `native_functions.yaml`
- tag: inplace_view
desc: |
This tag indicates if an operator *only* modifies the tensor metadata
- tag: pt2_compliant_tag
desc: |
This tag indicates if the operator is guaranteed to
work with the PT2 compilation APIs (torch.compile,
torch.export, etc). If you add this tag to an
operator, please use
`torch.testing._internal.optest.opcheck` to test that
the operator has been registered correctly and
works with torch.compile
- tag: view_copy
desc: |
This tag indicates operators that are *_copy* variants
of view/aliasing operators. If an operator has a view_copy tag,
then it should have the name {op}_copy, where {op} is a view operator.
- tag: dynamic_output_shape
desc: |
This tag indicates if an operator's output's shape depends on input Tensor
data.
- tag: data_dependent_output
desc: |
Operator has a non-Tensor output whose value is dependent on the data
of Tensor inputs. Among other things, this implies that this operator
cannot be run with meta tensor (since data is not available), nor
can it be symbolically traced.
- tag: generated
desc: |
This tag indicates that the operator doesn't have an explicit entry in
native_functions.yaml, and instead was generated automatically by the codegen.
- tag: nondeterministic_seeded
desc: |
This tag indicates if an operator is nondeterministically seeded
(i.e., is random) such that the operator intentionally produces
different results when run twice on the same inputs, but this randomness
is controlled by a Generator which, if reseeded, would give you the
same result.
- tag: nondeterministic_bitwise
desc: |
This tag indicates if an operator doesn't guarantee bitwise equivalence
across different runs of an operator with identical inputs.
- tag: needs_fixed_stride_order
desc: |
This tag indicates that the operator should be passed Tensors following
the same stride permutation as observed in eager when compiled in inductor.
Only one of {needs_fixed_stride_order, flexible_layout} can apply; if
multiple are assigned then we assume the most restrictive one.
- tag: flexible_layout
desc: |
This tag indicates that the custom operator can accept inputs with varying
strides/storage_offset and that when compiled, Inductor is allowed to change
the strides/storage_offset of inputs to the custom operator.
Only one of {needs_fixed_stride_order, flexible_layout} can apply; if
multiple are assigned then we assume the most restrictive one.
# NOTE [Core ATen Ops]
- tag: core
desc: |
Core aten ops is a subset of aten ops that remains after aten-to-aten decomposition and
the functionalization pass. Core aten ops are fully functional and adhere to single static
assignment (SSA): this implies there will be no `inplace` or `_out` variants in this opset.
This opset is designed to serve as the functional IR to interface with compiler backends.
In contrast to primTorch, the core aten opset doesn't decompose ops into explicit
type promotion and broadcasting ops.
Core aten ops is also effectively the opset produced by torchdynamo.export(aten_graph=True),
and thus can be used as an opset for export purposes.
- tag: pointwise
desc: |
Pointwise operators are operators where each element of the output is computed only by accessing
the corresponding element of all the broadcasted inputs. The output shape will be the broadcasted
shape of the inputs.
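As a hedged illustration of how these tags are consumed downstream (assuming the generated at::Tag enum and the tag-accepting overload of torch::Library::def; the mylib::my_sin operator is hypothetical), a custom operator can attach a tag at registration time:

#include <torch/library.h>

// Hypothetical extension library; the tag value mirrors an entry in this tags.yaml.
TORCH_LIBRARY(mylib, m) {
  m.def("my_sin(Tensor x) -> Tensor", {at::Tag::pt2_compliant_tag});
}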

@@ -0,0 +1,36 @@
#include <ATen/core/ATenOpList.h>
#include <string>
#include <cstring>
#include <utility>
#include <unordered_set>
#include <ATen/core/operator_name.h>
// ${generated_comment}
namespace at {
namespace {
struct OpNameEquals final {
bool operator()(const std::pair<const char*, const char*>& lhs, const std::pair<const char*, const char*>& rhs) const {
return 0 == strcmp(lhs.first, rhs.first) && 0 == strcmp(lhs.second, rhs.second);
}
};
struct OpNameHash final {
size_t operator()(const std::pair<const char*, const char*>& p) const {
// use std::hash<std::string> because std::hash<const char*> would hash pointers and not pointed-to strings
return std::hash<std::string>()(p.first) ^ (~ std::hash<std::string>()(p.second));
}
};
}
bool is_custom_op(const c10::OperatorName& opName) {
static std::unordered_set<std::pair<const char*, const char*>, OpNameHash, OpNameEquals> ops {
${aten_ops}
{"", ""}
};
return ops.count(std::make_pair(
opName.name.c_str(), opName.overload_name.c_str())) == 0;
}
}
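A minimal usage sketch of the generated is_custom_op helper (assuming the generated table stores fully qualified names such as "aten::add"; mylib::my_op is a hypothetical extension op):

#include <ATen/core/ATenOpList.h>
#include <ATen/core/operator_name.h>

bool example() {
  c10::OperatorName aten_add{"aten::add", "Tensor"};  // present in the generated table
  c10::OperatorName ext_op{"mylib::my_op", ""};       // absent, hence reported as custom
  return !at::is_custom_op(aten_add) && at::is_custom_op(ext_op);
}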

@@ -0,0 +1,73 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include <ATen/InferSize.h>
#include <ATen/Tensor.h>
#include <ATen/native/Resize.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
#include <ATen/ops/clone.h>
$ops_headers
#endif
namespace at {
namespace native {
// This file contains a number of kernels for aten functions that are fully code-generated.
// TODO: rename this file to something more generic.
namespace {
at::Tensor clone_arg(const at::Tensor& t) {
return t.clone();
}
std::vector<at::Tensor> clone_arg(const at::TensorList& t_list) {
std::vector<at::Tensor> out(t_list.size());
for (const auto& i : c10::irange(t_list.size())) {
out[i] = t_list[i].clone();
}
return out;
}
// duped with gen_resize_out_helper from structured kernels
void copy_arg(const at::Tensor& dst, const at::Tensor& src) {
TORCH_CHECK(src.dtype() == dst.dtype(),
"Expected out tensor to have dtype ", src.dtype(), ", but got ", dst.dtype(), " instead");
TORCH_CHECK(src.device() == dst.device(),
"Expected out tensor to have device ", src.device(), ", but got ", dst.device(), " instead");
dst.copy_(src);
}
void copy_arg(const at::TensorList& dst, const at::TensorList& src) {
TORCH_INTERNAL_ASSERT(dst.size() == src.size());
for (const auto& i : c10::irange(dst.size())) {
copy_arg(dst[i], src[i]);
}
}
// TODO: this doesn't handle restriding empty tensors correctly; see
// gen_resize_out_helper for the correct algorithm
void resize_out_helper(const at::Tensor& dst, const at::Tensor& src) {
at::native::resize_output(dst, src.sizes());
}
void resize_out_helper(const at::TensorList& dst, const at::TensorList& src) {
TORCH_INTERNAL_ASSERT(dst.size() == src.size());
for (const auto& i : c10::irange(dst.size())) {
at::native::resize_output(dst[i], src[i].sizes());
}
}
}
${CompositeViewCopyKernel_Definitions}
${GeneratedCompositeFunctional_Definitions}
${GeneratedCompositeOut_Definitions}
} // namespace native
} // namespace at

@@ -0,0 +1,23 @@
#pragma once
// ${generated_comment}
// NB: The implementing C++ file is RegisterDispatchKey.cpp
// The only #includes we need are for custom classes that have defaults in the C++ API
#include <c10/core/MemoryFormat.h>
#include <c10/core/Scalar.h>
#include <ATen/core/Reduction.h>
// Forward declarations of any types needed in the operator signatures.
// We can't directly include these classes because it will cause circular include dependencies.
// This file is included by TensorBody.h, which defines the Tensor class.
#include <ATen/core/ATen_fwd.h>
namespace at {
namespace ${dispatch_namespace} {
${dispatch_namespaced_declarations}
} // namespace ${dispatch_namespace}
} // namespace at

@@ -0,0 +1,29 @@
#include <ATen/core/TensorBody.h>
// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
// Code introduced to avoid cyclic dependency in static dispatch is no longer
// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
// to Operators.cpp for supporting multiple backends with multiple kernels.
//
// Note [Avoiding Include Cycles In Static Dispatch]
// In order to avoid #include cycles in the static dispatch build, we've carefully split out
// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
//
// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
// all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
// directly inlined into TensorBody.h.
// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
// which include functions that have defaultable std::optional<Tensor> arguments.
// That requires knowing the full Tensor class definition.
//
// We break the cycle by doing the following:
// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
// - CPUFunctions_inl.h includes everything else
// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
// and then it includes CPUFunctions_inl.h.
// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
// - This also means that in the static dispatch build, CPUFunctions.h only needs to
//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
${inline_headers}

@@ -0,0 +1,22 @@
#pragma once
// ${generated_comment}
// NB: The implementing C++ file is RegisterDispatchKey.cpp
// The only #includes we need are for custom classes that have defaults in the C++ API
#include <c10/core/MemoryFormat.h>
#include <c10/core/Scalar.h>
#include <ATen/core/Reduction.h>
#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
#error This change adds a dependency on all pytorch operators, meaning the \
file will need to be re-compiled every time an operator is changed or added. \
Consider including a specific operator from \
<ATen/ops/{my_operator}_${dispatch_namespace}_dispatch.h>. \
See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
#endif
${DispatchKeyFunctions_inl_includes}
${dispatch_namespaced_declarations}

@@ -0,0 +1,13 @@
// ${generated_comment}
${includes}
${native_functions_include}
namespace {
${helper_fns}
} // namespace
${namespace_prologue}
${native_function_definitions}
${namespace_epilogue}

@@ -0,0 +1,19 @@
#pragma once
// an external backend might generate files within its code tree
// and check all the source files within the tree with clang-format.
// so, disable it since the backend might have a different config.
// clang-format off
// ${generated_comment}
#include <ATen/Tensor.h>
${namespace_prologue}
struct ${class_name} {
${dispatch_declarations}
};
${namespace_epilogue}

@@ -0,0 +1,26 @@
#pragma once
// ${generated_comment}
#include <ATen/Context.h>
#include <ATen/DeviceGuard.h>
#include <ATen/TensorUtils.h>
#include <ATen/TracerMode.h>
#include <ATen/core/Generator.h>
#include <ATen/core/Reduction.h>
#include <ATen/core/Tensor.h>
#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <optional>
${static_dispatch_ops_headers}
${operator_includes}
namespace at {
${function_definitions}
}

@@ -0,0 +1,33 @@
#pragma once
// ${generated_comment}
#include <ATen/Tensor.h>
namespace at {
namespace functionalization {
enum class InverseReturnMode {
/// Specifies that functional inverses should always return a view.
AlwaysView,
/// Specifies that functional inverses should always return a non-view / copy.
NeverView,
/// Specifies that functional inverses should return a view unless a (copying) scatter
/// inverse exists, in which case that will be used instead.
/// This avoids as_strided() calls that can be difficult for subclasses to handle.
ViewOrScatterInverse,
};
struct FunctionalInverses {
${view_inverse_declarations}
// NB: These are not generated! They're manually implemented in the template.
// TODO: Change codegen to generate these. See the following link:
// https://github.com/pytorch/pytorch/blob/main/torchgen/model.py#L2583-L2585
static at::Tensor chunk_inverse(const at::Tensor & base, const at::Tensor & mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, int chunks, int dim);
static at::Tensor narrow_inverse(const at::Tensor & base, const at::Tensor & mutated_view, InverseReturnMode inverse_return_mode, int dim, c10::SymInt start, c10::SymInt length);
};
}
}

@@ -0,0 +1,103 @@
#include <array>
#include <ATen/Functions.h>
#include <ATen/Utils.h>
#include <c10/core/Allocator.h>
namespace at {
Tensor TensorMaker::make_tensor() {
AutoDispatchBelowADInplaceOrView guard{}; // TODO: Remove.
tracer::impl::NoTracerDispatchMode tracer_guard{};
check_size_nonnegative(sizes_);
TORCH_CHECK_VALUE(
!deleter_ || !ctx_,
"The deleter and context arguments are mutually exclusive.");
if (device_ == std::nullopt) {
device_ = globalContext().getDeviceFromPtr(data_, opts_.device().type());
}
if (opts_.device().has_index()) {
// clang-format off
TORCH_CHECK_VALUE(
opts_.device() == *device_,
"Specified device ", opts_.device(), " does not match device of data ", *device_);
// clang-format on
}
std::size_t size_bytes = computeStorageSize();
DataPtr data_ptr{};
if (deleter_) {
data_ptr = makeDataPtrFromDeleter();
} else {
data_ptr = makeDataPtrFromContext();
}
TORCH_CHECK(!resizeable_ || allocator_ != nullptr, "Must specify an allocator with allocator() if you want to use resizeable_storage()");
Storage storage{Storage::use_byte_size_t{}, size_bytes, std::move(data_ptr), /*allocator=*/allocator_, /*resizable=*/resizeable_};
Tensor tensor = detail::make_tensor<TensorImpl>(
std::move(storage), opts_.computeDispatchKey(), opts_.dtype());
TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl();
if (strides_) {
tensor_impl->set_sizes_and_strides(sizes_, *strides_);
} else {
tensor_impl->set_sizes_contiguous(sizes_);
}
if (storage_offset_) {
tensor_impl->set_storage_offset(*storage_offset_);
}
return tensor;
}
std::size_t TensorMaker::computeStorageSize() const noexcept {
std::size_t itemsize = opts_.dtype().itemsize();
if (strides_) {
auto storage_size = detail::computeStorageNbytes(sizes_, *strides_, itemsize);
if (storage_offset_) {
storage_size += storage_offset_.value();
}
return storage_size;
}
std::size_t size = 1;
for (std::int64_t s : sizes_) {
size *= static_cast<std::size_t>(s);
}
auto storage_size = size * itemsize;
if (storage_offset_) {
storage_size += storage_offset_.value();
}
return storage_size;
}
inline DataPtr TensorMaker::makeDataPtrFromDeleter() noexcept {
return InefficientStdFunctionContext::makeDataPtr(data_, std::move(deleter_), *device_);
}
inline DataPtr TensorMaker::makeDataPtrFromContext() noexcept {
return DataPtr{data_, ctx_.release(), ctx_.get_deleter(), *device_};
}
IntArrayRef TensorMaker::makeTempSizes() const noexcept {
static std::int64_t zeros[5] = {0, 0, 0, 0, 0};
if (opts_.has_memory_format()) {
MemoryFormat format = *opts_.memory_format_opt();
if (format == MemoryFormat::ChannelsLast) {
return IntArrayRef(zeros, 4);
}
if (format == MemoryFormat::ChannelsLast3d) {
return IntArrayRef(zeros, 5);
}
}
return IntArrayRef(zeros, 1);
}
} // namespace at
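A minimal sketch of the public entry point that drives TensorMaker::make_tensor above (assuming the at::for_blob builder API; error handling omitted). No deleter or context is supplied, so the caller retains ownership of the buffer:

#include <ATen/Functions.h>
#include <vector>

at::Tensor wrap_buffer(std::vector<float>& buf) {
  // Wraps caller-owned memory without copying.
  return at::for_blob(buf.data(), {static_cast<int64_t>(buf.size())})
      .options(at::TensorOptions().dtype(at::kFloat))
      .make_tensor();
}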

@@ -0,0 +1,143 @@
#pragma once
// ${generated_comment}
#ifdef TORCH_ASSERT_NO_OPERATORS
#error This change adds a dependency on native_functions.yaml, \
meaning the file will need to be re-compiled every time an operator \
is changed or added. Consider if your change would be better placed in \
another file, or if a more specific header might achieve the same goal. \
See NOTE: [Tensor vs. TensorBase]
#endif
#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
#error This change adds a dependency on all pytorch operators, meaning the \
file will need to be re-compiled every time an operator is changed or added. \
Consider including a specific operator from <ATen/ops/{my_operator}.h> and \
see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
#endif
// NOTE: [TORCH_ASSERT_ONLY_METHOD_OPERATORS]
//
// In ATen, certain generated headers files include the definitions of
// every single operator in PyTorch. Unfortunately this means every
// time an operator signature is updated or changed in
// native_functions.yaml, you (and every other PyTorch developer) need
// to recompile every source file that includes any of these headers.
//
// To break up these header dependencies, and improve incremental
// build times for all PyTorch developers, these headers are split
// into per-operator headers in the `ATen/ops` folder. This limits
// incremental builds to only changes to methods of `Tensor`, or files
// that use the specific operator being changed. With `at::sum` as an
// example, you should include
//
// <ATen/ops/sum.h> // instead of ATen/Functions.h
// <ATen/ops/sum_native.h> // instead of ATen/NativeFunctions.h
// <ATen/ops/sum_ops.h> // instead of ATen/Operators.h
// <ATen/ops/sum_cpu_dispatch.h> // instead of ATen/CPUFunctions.h
//
// However, even if you're careful to use this in your own code,
// `Functions.h` might still be included indirectly through another header
// without you realising. To avoid this, you can add
//
// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
//
// to the top of your source file. This way any time the non-specific
// headers are included, the compiler will error out.
//
// Also, be aware that `ops` are not available in all build
// configurations (namely fb-internal) so you must guard these
// includes with `#ifdef AT_PER_OPERATOR_HEADERS`. e.g.
//
// #ifndef AT_PER_OPERATOR_HEADERS
// #include <ATen/Functions.h>
// #else
// #include <ATen/ops/sum.h>
// #endif
#include <ATen/Context.h>
#include <ATen/DeviceGuard.h>
#include <ATen/TensorUtils.h>
#include <ATen/TracerMode.h>
#include <ATen/core/Generator.h>
#include <ATen/core/Reduction.h>
#include <c10/core/SymInt.h>
#include <ATen/core/Tensor.h>
#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <optional>
#include <c10/util/OptionalArrayRef.h>
#include <ATen/ops/from_blob.h>
#include <ATen/ops/tensor.h>
${Functions_includes}
namespace at {
${Functions_declarations}
// Special C++ only overloads for std()-like functions (See gh-40287)
// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
// So, for example std(0) would select the std(unbiased=False) overload
TORCH_API inline Tensor var(const Tensor& self, int dim) {
return at::var(self, IntArrayRef{dim});
}
TORCH_API inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {
return at::var_mean(self, IntArrayRef{dim});
}
TORCH_API inline Tensor std(const Tensor& self, int dim) {
return at::std(self, IntArrayRef{dim});
}
TORCH_API inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {
return at::std_mean(self, IntArrayRef{dim});
}
inline int64_t numel(const Tensor& tensor) {
return tensor.numel();
}
inline int64_t size(const Tensor& tensor, int64_t dim) {
return tensor.size(dim);
}
inline int64_t stride(const Tensor& tensor, int64_t dim) {
return tensor.stride(dim);
}
inline bool is_complex(const Tensor& tensor) {
return tensor.is_complex();
}
inline bool is_floating_point(const Tensor& tensor) {
return tensor.is_floating_point();
}
inline bool is_signed(const Tensor& tensor) {
return tensor.is_signed();
}
inline bool is_inference(const Tensor& tensor) {
return tensor.is_inference();
}
inline bool _is_zerotensor(const Tensor& tensor) {
return tensor._is_zerotensor();
}
inline bool is_conj(const Tensor& tensor) {
return tensor.is_conj();
}
inline Tensor conj(const Tensor& tensor) {
return tensor.conj();
}
inline bool is_neg(const Tensor& tensor) {
return tensor.is_neg();
}
}
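A hedged sketch of the special overloads and free functions declared above (the values in comments assume a 3x4 input):

#include <ATen/Functions.h>

void example() {
  at::Tensor t = at::rand({3, 4});
  // An int second argument selects the dim-reduction variant rather than
  // being converted to the bool `unbiased` flag (see the gh-40287 note above).
  at::Tensor v = at::var(t, 0);
  int64_t n = at::numel(t);     // 12
  int64_t s0 = at::size(t, 0);  // 3
  (void)v; (void)n; (void)s0;
}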

@@ -0,0 +1,19 @@
#pragma once
// This file contains autogenerated LazyTensor IR nodes
${lazy_ir_sysinc}
${lazy_ir_inc}
${namespace_prologue}
using at::operator<<;
// kNullValue is used to contribute a static hash value any time
// a node has an Optional<Value> input that is nullopt. It is important
// to differentiate between HASH(std::nullopt, something) and HASH(something, std::nullopt),
// and using kNullValue in the hash function in the order of arguments
// serves this purpose.
static const torch::lazy::Value kNullValue = torch::lazy::Value();
${ir_declarations}
${namespace_epilogue}

@@ -0,0 +1,11 @@
#pragma once
${lazy_non_native_ir_inc}
// This file contains autogenerated LazyTensor Non Native IR nodes
${namespace_prologue}
${non_native_ir_nodes}
${namespace_epilogue}

@@ -0,0 +1,24 @@
#pragma once
// ${generated_comment}
#ifdef TORCH_ASSERT_NO_OPERATORS
#error This change adds a dependency on native_functions.yaml, \
meaning the file will need to be re-compiled every time an operator \
is changed or added. Consider if your change would be better placed in \
another file, or if a more specific header might achieve the same goal. \
See NOTE: [Tensor vs. TensorBase]
#endif
// Forward declarations of any types needed in the operator signatures.
// We can't directly include these classes because it will cause circular include dependencies.
// This file is included by TensorBody.h, which defines the Tensor class.
#include <ATen/core/ATen_fwd.h>
${MethodOperators_includes}
namespace at {
namespace _ops {
${MethodOperators_declarations}
} // namespace _ops
} // namespace at

@@ -0,0 +1,17 @@
#pragma once
// ${generated_comment}
#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <optional>
#include <c10/core/QScheme.h>
#include <ATen/core/Reduction.h>
#include <ATen/core/Tensor.h>
#include <tuple>
#include <vector>
${extra_includes}
${native_function_declarations}

@@ -0,0 +1,33 @@
#pragma once
// ${generated_comment}
#ifdef TORCH_ASSERT_NO_OPERATORS
#error This change adds a dependency on native_functions.yaml, \
meaning the file will need to be re-compiled every time an operator \
is changed or added. Consider if your change would be better placed in \
another file, or if a more specific header might achieve the same goal. \
See NOTE: [Tensor vs. TensorBase]
#endif
#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
#error This change adds a dependency on all pytorch operators, meaning the \
file will need to be re-compiled every time an operator is changed or added. \
Consider including a specific operator from <ATen/ops/{my_operator}_native.h> \
and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
#endif
#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <optional>
#include <c10/core/QScheme.h>
#include <ATen/core/Reduction.h>
#include <ATen/core/Tensor.h>
#include <tuple>
#include <vector>
${NativeFunctions_includes}
${NativeFunctions_declarations}

@@ -0,0 +1,23 @@
#pragma once
// ${generated_comment}
#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <optional>
#include <c10/core/QScheme.h>
#include <ATen/core/Reduction.h>
#include <ATen/TensorIterator.h>
#include <ATen/TensorMeta.h>
#include <tuple>
#include <vector>
namespace at {
namespace meta {
${meta_function_declarations}
} // namespace meta
} // namespace at

@@ -0,0 +1,19 @@
#pragma once
// ${generated_comment}
#include <ATen/core/Tensor.h>
#include <ATen/core/IListRef.h>
#include <ATen/TensorMeta.h>
#include <ATen/TensorIterator.h>
${NativeMetaFunctions_includes}
namespace at {
namespace meta {
${NativeMetaFunctions_declarations}
} // namespace meta
} // namespace at

@@ -0,0 +1,18 @@
#pragma once
// ${generated_comment}
#include <tuple>
#include <vector>
// Forward declarations of any types needed in the operator signatures.
// We can't directly include these classes because it will cause circular include dependencies.
// This file is included by TensorBody.h, which defines the Tensor class.
#include <ATen/core/ATen_fwd.h>
namespace at {
namespace _ops {
${declarations}
}} // namespace at::_ops

@@ -0,0 +1,19 @@
#include <ATen/Tensor.h>
#include <ATen/core/dispatch/Dispatcher.h>
// ${generated_comment}
// NOTE See [Sharded File] comment in VariableType
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
${operator_headers}
#endif
${static_dispatch_extra_headers}
namespace at { namespace _ops {
${definitions}
}} // namespace at::_ops

@@ -0,0 +1,74 @@
#pragma once
// ${generated_comment}
#ifdef TORCH_ASSERT_NO_OPERATORS
#error This change adds a dependency on native_functions.yaml, \
meaning the file will need to be re-compiled every time an operator \
is changed or added. Consider if your change would be better placed in \
another file, or if a more specific header might achieve the same goal. \
See NOTE: [Tensor vs. TensorBase]
#endif
#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
#error This change adds a dependency on all pytorch operators, meaning the \
file will need to be re-compiled every time an operator is changed or added. \
Consider including a specific operator from <ATen/ops/{my_operator}_ops.h> \
and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
#endif
#include <c10/core/SymInt.h>
#include <c10/core/SymIntArrayRef.h>
#include <c10/core/Scalar.h>
#include <c10/core/TensorOptions.h>
#include <c10/core/QScheme.h>
#include <c10/util/OptionalArrayRef.h>
#include <tuple>
#include <vector>
${Operators_includes}
// Extension writers: do you write wrapper functions? Are you frustrated with
// resolving overloads of operators? Are you frustrated with dealing with
// pointer-to-methods and resolving overloads of pointer-to-methods?? Look no
// further, this is the utility for you.
//
// Given an operator schema: aten::op.overload(...
//
// Use ATEN_FN2(op, overload) to get a *function* version of the operator
// that is guaranteed to not be overloaded. This means that you can safely
// decltype(&ATEN_FN2(op, overload)) it. NB: the 2 means this macro takes 2 args.
//
// Given an operator schema without an overload name: aten::op(...
//
// Use ATEN_FN(op) to get an unambiguous *function* version of the operator.
//
// There is some interesting behavior for out= operations.
// ATEN_FN2(sin, out) gives a function that is *faithful* to the schema;
// that is, the order of arguments is exactly what it looks like in the schema.
#define ATEN_FN2(op_name, overload) at::_ops::op_name##_##overload::call
#define ATEN_FN(op_name) at::_ops::op_name::call
// Separately, ATEN_OP(op) and ATEN_OP2(op, overload) define a class containing compile-time
// metadata about a given aten operator.
// Notable data on the class includes:
// - ATEN_OP2(add, Tensor)::name // returns the string name: "add"
// - ATEN_OP2(add, Tensor)::overload_name // returns the string overload name: "Tensor"
// - ATEN_OP2(add, Tensor)::schema // returns the C++ schema type: at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &)
// - ATEN_OP2(add, Tensor)::schema_str // returns the string jit type: "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
#define ATEN_OP2(op_name, overload) at::_ops::op_name##_##overload
#define ATEN_OP(op_name) at::_ops::op_name
// WARNING: Please do not call any of the ops in the _ops namespace directly.
// Use the ATEN_FN macros. We do not guarantee stability of the naming
// scheme for the functions in at::_ops
// See Note [The ATen Operators API] for details of the at::_ops namespace
namespace at {
namespace _ops {
${Operators_declarations}
} // namespace _ops
} // namespace at
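A hedged usage sketch of the ATEN_FN2/ATEN_OP2 macros above (aten::sin.out and aten::add.Tensor are used as examples; exact metadata strings are not asserted here):

#include <ATen/core/Tensor.h>
#include <ATen/Operators.h>

void example(const at::Tensor& self, at::Tensor& out) {
  // ATEN_FN2 names a single, non-overloaded function, so its address can be taken.
  using sin_out_t = decltype(&ATEN_FN2(sin, out));
  sin_out_t f = &ATEN_FN2(sin, out);
  f(self, out);  // faithful to the schema: the out argument comes last
  // ATEN_OP2 names a struct carrying compile-time metadata for aten::add.Tensor.
  const char* overload = ATEN_OP2(add, Tensor)::overload_name;
  (void)overload;
}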

@@ -0,0 +1,15 @@
// ${generated_comment}
#include <ATen/RedispatchFunctions.h>
#include <ATen/Functions.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/op_registration/adaption.h>
namespace at {
namespace redispatch {
${function_redispatch_definitions}
} // namespace redispatch
} // namespace at

@@ -0,0 +1,32 @@
#pragma once
// ${generated_comment}
#ifdef TORCH_ASSERT_ONLY_METHOD_OPERATORS
#error This change adds a dependency on all pytorch operators, meaning the \
file will need to be re-compiled every time an operator is changed or added. \
Consider using the at::_ops::{name}::redispatch() interface by including \
the specific operator from <ATen/ops/{my_operator}_ops.h>
#endif
#include <c10/core/Scalar.h>
#include <ATen/Tensor.h>
#include <c10/core/Storage.h>
#include <ATen/core/Generator.h>
#include <c10/util/Deprecated.h>
#include <ATen/DeviceGuard.h>
#include <c10/core/TensorOptions.h>
#include <ATen/core/Reduction.h>
#include <optional>
#include <ATen/TensorUtils.h>
#include <ATen/Context.h>
#include <ATen/TracerMode.h>
#include <ATen/Operators.h>
namespace at {
namespace redispatch {
${function_redispatch_definitions}
} // namespace redispatch
}

@@ -0,0 +1,29 @@
// We register ops with a higher priority dispatch key (BackendSelect) than the usual backend-specific keys (e.g. CPU)
// which makes calls to the factory functions dispatch to here.
// We then 'manually' compute a lower-priority dispatch key to re-dispatch to (e.g. CPU) to get to the eventually correct backend.
// ${generated_comment}
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/core/dispatch/DispatchKeyExtractor.h>
#include <torch/library.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
${ops_headers}
#endif
namespace at {
namespace {
${backend_select_method_definitions}
TORCH_LIBRARY_IMPL(aten, BackendSelect, m) {
${backend_select_function_registrations};
}
} // namespace
} // at

@@ -0,0 +1,41 @@
#include <torch/csrc/jit/runtime/operator.h>
#include <torch/csrc/jit/runtime/custom_operator.h>
#include <torch/csrc/jit/runtime/register_ops_utils.h>
#include <ATen/UnboxingFunctions.h>
// ${generated_comment}
// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up
// incremental rebuilds. See the comment at the top of
// templates/VariableType.cpp for an analogous, in-depth discussion.
//
// Generated by tools/jit/gen_unboxing.py. This file registers all ATen ops into JIT op registry instead of c10
// dispatcher. JIT op registry only takes boxed kernels, so we are calling unboxing functions in UnboxingFunctions.h
// to cast arguments into C++ types (instead of IValue) and delegate to unboxed kernels.
namespace torch { namespace jit {
using autograd::Variable;
using autograd::variable_list;
using at::Scalar;
using at::ScalarType;
using at::Tensor;
using at::TensorOptions;
using at::DeviceGuard;
using ::c10::fmap;
using ::c10::filter;
namespace {
RegisterOperators reg({
// Generated operators
${unboxed_ops}
});
} // anon namespace
}} // namespace torch::jit

@@ -0,0 +1,24 @@
${ns_prologue}
// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid
// ambiguity with conflicting identifiers that may have been defined in
// at namespace already.
namespace {
${dispatch_helpers}
${dispatch_anonymous_definitions}
${static_init_dispatch_registrations}
} // anonymous namespace
${deferred_dispatch_registrations}
namespace ${dispatch_namespace} {
${dispatch_namespaced_definitions}
} // namespace ${dispatch_namespace}
${ns_epilogue}

@@ -0,0 +1,54 @@
// required for old g++ to compile PRId64 macros, see
// https://github.com/pytorch/pytorch/issues/3571
// for context
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
// an external backend might generate files within its code tree
// and check all the source files within the tree with clang-format.
// so, disable it since the backend might have a different config.
// clang-format off
// NOTE: This condition is true for all PyTorch internal libraries, it
// just excludes external projects such as torch_xla which
// re-use some of the PyTorch codegen machinery.
#if defined(CAFFE2_BUILD_MAIN_LIB) || \
defined(TORCH_CUDA_BUILD_MAIN_LIB) || \
defined(TORCH_HIP_BUILD_MAIN_LIB) || \
defined(TORCH_CUDA_CU_BUILD_MAIN_LIB) || \
defined(TORCH_CUDA_CPP_BUILD_MAIN_LIB)
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#endif
// ${generated_comment}
#include <c10/core/TensorImpl.h>
#include <c10/core/Allocator.h>
#include <ATen/DeviceGuard.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/Utils.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/Dispatch.h>
#include <c10/util/ExclusivelyOwned.h>
#include <c10/util/Half.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <optional>
#include <ATen/Tensor.h>
#include <ATen/native/Resize.h>
#include <cstddef>
#include <functional>
#include <memory>
#include <utility>
#include <ATen/Config.h>
#include <ATen/core/op_registration/adaption.h>
#include <torch/library.h>
$extra_cuda_headers
$external_backend_headers
$dispatch_headers
$ops_headers
// See template file RegisterDispatchDefinitions.ini
$dispatch_definitions

@@ -0,0 +1,110 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/EmptyTensor.h>
#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/FunctionalInverses.h>
#include <ATen/MemoryOverlap.h>
#include <torch/library.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#include <ATen/NativeFunctions.h>
#else
// needed for the meta tensor calls to get stride info in functionalization
#include <ATen/ops/empty_strided_native.h>
// needed for special handling of copy_().
// See Note [functionalizating copy_() and not preserving strides]
#include <ATen/ops/to_ops.h>
#include <ATen/ops/expand_copy_ops.h>
$ops_headers
#endif
namespace at {
namespace functionalization {
// This keyset is used by functionalization when it calls into meta kernels
// to accurately propagate stride metadata.
// Exclude any modes: the purpose of calling into meta kernels is only as an implementation
// detail to perform shape inference, and we don't want any modal keys to run.
// Specifically, we want to prevent functionalization and Python modes from running.
constexpr auto exclude_keys_for_meta_dispatch =
c10::functorch_transforms_ks |
c10::DispatchKeySet({
c10::DispatchKey::FuncTorchDynamicLayerBackMode,
c10::DispatchKey::FuncTorchDynamicLayerFrontMode,
c10::DispatchKey::Python,
c10::DispatchKey::PreDispatch,
});
// Helper around at::has_internal_overlap.
// The ATen util is used in hot-path eager mode: it's always fast,
// but might return TOO_HARD sometimes.
// During functionalization, we're ok taking a bit longer
// to detect memory overlap.
inline bool has_internal_overlap_helper(const at::Tensor t) {
auto has_overlap = at::has_internal_overlap(t);
if (has_overlap == at::MemOverlap::Yes) return true;
if (has_overlap == at::MemOverlap::No) return false;
return false;
}
inline Tensor to_meta(const Tensor& t) {
if (!t.defined()) return t;
return at::native::empty_strided_meta_symint(t.sym_sizes(), t.sym_strides(),
/*dtype=*/std::make_optional(t.scalar_type()), /*layout=*/std::make_optional(t.layout()),
/*device=*/std::make_optional(c10::Device(kMeta)), /*pin_memory=*/std::nullopt);
}
inline std::optional<Tensor> to_meta(const std::optional<Tensor>& t) {
if (t.has_value()) {
return std::make_optional<Tensor>(to_meta(*t));
}
return std::nullopt;
}
inline std::vector<Tensor> to_meta(at::ITensorListRef t_list) {
std::vector<Tensor> outputs;
outputs.reserve(t_list.size());
for (const auto& tensor : t_list) {
outputs.push_back(to_meta(tensor));
}
return outputs;
}
inline c10::List<Tensor> to_meta(const c10::List<Tensor>& t_list) {
c10::List<Tensor> outputs;
outputs.reserve(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs.push_back(to_meta(t_list[i]));
}
return outputs;
}
inline c10::List<::std::optional<Tensor>> to_meta(const c10::List<::std::optional<Tensor>>& t_list) {
c10::List<::std::optional<Tensor>> outputs;
outputs.reserve(t_list.size());
for (const auto i : c10::irange(t_list.size())) {
outputs.push_back(to_meta(t_list[i]));
}
return outputs;
}
${func_definitions}
} // namespace functionalization
namespace {
TORCH_LIBRARY_IMPL(aten, Functionalize, m) {
${func_registrations};
}
} // namespace
} // namespace at

@@ -0,0 +1,13 @@
// ${generated_comment}
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <torch/library.h>
namespace at {
TORCH_LIBRARY(aten, m) {
${aten_schema_registrations};
// Distributed Ops
// Implementations located in torch/csrc/jit/runtime/register_distributed_ops.cpp
m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)");
}
${schema_registrations}
} // namespace at

@@ -0,0 +1,4 @@
// This file contains all native_functions that can be registered to the dispatcher,
// along with the schema string that each should be registered with
${registration_declarations}

@@ -0,0 +1,753 @@
#pragma once
#ifdef TORCH_ASSERT_NO_OPERATORS
#error This change adds a dependency on native_functions.yaml, \
meaning the file will need to be re-compiled every time an operator \
is changed or added. Consider if your change would be better placed in \
another file, or if a more specific header might achieve the same goal. \
See NOTE: [Tensor vs. TensorBase]
#endif
#include <c10/core/Device.h>
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/QScheme.h>
#include <c10/core/Stream.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/core/ScalarTypeToTypeMeta.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <c10/core/WrapDimMinimal.h>
#include <c10/util/Exception.h>
#include <c10/util/ExclusivelyOwned.h>
#include <c10/util/Deprecated.h>
#include <c10/util/MaybeOwned.h>
#include <optional>
#include <c10/util/OptionalArrayRef.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/macros/Export.h>
#include <ATen/core/CheckMemoryFormat.h>
#include <ATen/core/DeprecatedTypePropertiesRegistry.h>
#include <ATen/core/DeprecatedTypeProperties.h>
#include <ATen/core/NamedTensor.h>
#include <ATen/core/QuantizerBase.h>
#include <c10/core/SymInt.h>
#include <ATen/core/TensorAccessor.h>
#include <ATen/core/TensorBase.h>
#include <ATen/MethodOperators.h>
namespace c10 {
template<class T> class List;
template<class T> class IListRef;
}
namespace at {
struct Generator;
struct Type;
class DeprecatedTypeProperties;
class Tensor;
} // namespace at
namespace at {
namespace indexing {
struct TensorIndex;
} // namespace indexing
} // namespace at
namespace torch { namespace autograd {
struct Node;
}} // namespace torch::autograd
namespace at {
class OptionalTensorRef;
class TensorRef;
class Tensor;
using TensorList = ArrayRef<Tensor>;
using ITensorList = c10::IListRef<Tensor>;
using Stream = c10::Stream;
// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which
// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr.
//
// For example:
//
// void func(Tensor a) {
// Tensor b = a;
// ...
// }
//
// In this example, when we say Tensor b = a, we are creating a new object that points to the
// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the
// destructor decrements the reference count by calling release() on the TensorImpl it points to.
// The existing constructors, operator overloads, etc. take care to implement the correct semantics.
//
// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and
// special care must be taken to handle this.
class TORCH_API Tensor: public TensorBase {
protected:
// Create a Tensor with a +0 reference count. Special care must be
// taken to avoid decrementing this reference count at destruction
// time. Intended to support MaybeOwnedTraits<Tensor>.
explicit Tensor(unsafe_borrow_t, const TensorBase& rhs): TensorBase(unsafe_borrow_t{}, rhs) {}
friend MaybeOwnedTraits<Tensor>;
friend OptionalTensorRef;
friend TensorRef;
public:
Tensor() = default;
// This constructor should not be used by end users and is an implementation
// detail invoked by autogenerated code.
explicit Tensor(
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
: TensorBase(std::move(tensor_impl)) {}
Tensor(const Tensor &tensor) = default;
Tensor(Tensor &&tensor) = default;
// Implicitly move-constructible from TensorBase, but must be explicit to increase refcount
explicit Tensor(const TensorBase &base): TensorBase(base) {}
/*implicit*/ Tensor(TensorBase &&base): TensorBase(std::move(base)) {}
// Creates a new wrapper from TensorImpl. Intentionally a free method because
// it should be used with care. Checks necessary invariants
static Tensor wrap_tensor_impl(
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl) {
return TensorBase::wrap_tensor_impl(std::move(tensor_impl));
}
Tensor contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const {
return TensorBase::contiguous(memory_format);
}
Tensor conj() const {
if (!this->is_complex()) {
return *this;
}
switch (this->layout()) {
case at::kSparse:
case at::kSparseCsr:
case at::kSparseCsc:
case at::kSparseBsr:
case at::kSparseBsc:
return this->conj_physical();
default:
return this->_conj();
}
}
// Aliased by Dimname overloads, so need explicit using
using TensorBase::size;
using TensorBase::sym_size;
using TensorBase::stride;
/// Should be used if *this can reasonably be expected to be contiguous and
/// performance is important.
/// Compared to contiguous, it saves a reference count
/// increment/decrement if *this is already contiguous, at the cost
/// in all cases of an extra pointer of stack usage, an extra branch
/// to access, and an extra branch at destruction time.
c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const &;
// Use .contiguous() instead. Trying to borrow from a prvalue Tensor
// will only lead to trouble and dangling references.
c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete;
// The following overloads are very intriguing. Consider the following
// program:
//
// x[1] = 3;
//
// We would expect that the first entry of x is written to 3. But how can we
// actually achieve this? x[1] evaluates to a tensor...
//
// The answer is, using a ref-qualifier. x[1] is an rvalue, which cannot be
// (profitably) assigned to in the traditional sense, so we overload
// assignment to mean, "Actually, copy 3 into the tensor data." This is done
// with an rvalue-reference ref-qualified overload (the methods with && at the
// end of their type.)
//
// There's one more fly in the ointment: We also want
//
// Tensor x = y;
//
// to work, and we want it NOT to copy. So we need a traditional operator=
// overload. But we MUST specify a mutable lvalue ref-qualifier, to
// disambiguate the traditional overload from the rvalue-reference
// ref-qualified overload. Otherwise, it will be ambiguous, because
// a non ref-qualified method is eligible for all situations.
// Unfortunately, we have to write these constructors out manually
// to work around an MSVC bug:
// error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &':
// multiple versions of a defaulted special member functions are not allowed
// Tensor& operator=(const Tensor&) & = default;
// Tensor& operator=(Tensor&&) & = default;
// Also MSVC will wrongly issue the following warning with the aforementioned fix
// warning C4522: 'at::Tensor': multiple assignment operators specified
// Let's just skip the warning.
//
// TODO: temporarily disabled
Tensor& operator=(const TensorBase& x) & {
impl_ = x.getIntrusivePtr();
return *this;
}
Tensor& operator=(TensorBase&& x) & noexcept {
impl_ = x.unsafeReleaseIntrusivePtr();
return *this;
}
Tensor& operator=(const Tensor &x) & {
return operator=(static_cast<const TensorBase&>(x));
}
Tensor& operator=(Tensor &&x) & noexcept {
return operator=(static_cast<TensorBase&&>(x));
}
Tensor& operator=(const Scalar &v) && {
return fill_(v);
}
Tensor& operator=(const Tensor &rhs) && {
return copy_(rhs);
}
Tensor& operator=(Tensor&& rhs) && {
return copy_(rhs);
}
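// Illustrative sketch of the ref-qualified overloads above (illustrative only,
// not part of the generated declarations):
//   Tensor x = at::zeros({3});
//   x[1] = 3;   // x[1] is an rvalue, so the && Scalar overload fills element 1 with 3
//   Tensor y;
//   y = x;      // y is an lvalue, so the & overload rebinds y to x's TensorImpl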
C10_DEPRECATED_MESSAGE("Tensor.type() is deprecated. Instead use Tensor.options(), which in many cases (e.g. in a constructor) is a drop-in replacement. If you were using data from type(), that is now available from Tensor itself, so instead of tensor.type().scalar_type(), use tensor.scalar_type() instead and instead of tensor.type().backend() use tensor.device().")
DeprecatedTypeProperties & type() const {
return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
dispatchKeyToBackend(legacyExtractDispatchKey(key_set())),
scalar_type());
}
Tensor toType(ScalarType t) const {
return to(options().dtype(t), /*non_blocking*/ false, /*copy*/ false);
}
// TODO: Deprecate me
Tensor toBackend(Backend b) const {
return to(options().device(backendToDeviceType(b)).layout(layout_from_backend(b)), /*non_blocking*/ false, /*copy*/ false);
}
C10_DEPRECATED_MESSAGE("Tensor.is_variable() is deprecated; everything is a variable now. (If you want to assert that variable has been appropriately handled already, use at::impl::variable_excluded_from_dispatch())")
bool is_variable() const noexcept {
return !at::impl::variable_excluded_from_dispatch();
}
template<typename T>
C10_DEPRECATED_MESSAGE("Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead.")
T * data() const {
return data_ptr<T>();
}
template <typename T>
T item() const;
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead")
GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {
return generic_packed_accessor<T,N,PtrTraits,index_t>();
}
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead")
GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() && = delete;
Tensor operator~() const {
return bitwise_not();
}
Tensor operator-() const {
return neg();
}
Tensor& operator+=(const Tensor & other) {
return add_(other);
}
Tensor& operator+=(const Scalar & other) {
return add_(other);
}
Tensor& operator-=(const Tensor & other) {
return sub_(other);
}
Tensor& operator-=(const Scalar & other) {
return sub_(other);
}
Tensor& operator*=(const Tensor & other) {
return mul_(other);
}
Tensor& operator*=(const Scalar & other) {
return mul_(other);
}
Tensor& operator/=(const Tensor & other) {
return div_(other);
}
Tensor& operator/=(const Scalar & other) {
return div_(other);
}
Tensor& operator&=(const Tensor & other) {
return bitwise_and_(other);
}
Tensor& operator|=(const Tensor & other) {
return bitwise_or_(other);
}
Tensor& operator^=(const Tensor & other) {
return bitwise_xor_(other);
}
Tensor operator[](const Scalar & index) const {
if (!index.isIntegral(false)) {
TORCH_CHECK_INDEX(false, "Can only index tensors with integral scalars");
}
return this->operator[](index.toLong());
}
Tensor operator[](const Tensor & index) const {
// These properties are checked in the Scalar constructor, but we also
// check them here to provide more useful diagnostics for the user.
if (!index.defined()) {
TORCH_CHECK_INDEX(false, "Can only index with tensors that are defined");
}
if (index.dim() != 0) {
TORCH_CHECK_INDEX(false,
"Can only index with tensors that are scalars (zero-dim)");
}
// The Scalar(Tensor) constructor is explicit, so we need to call it.
return this->operator[](index.item());
}
Tensor operator[](int64_t index) const {
return select(0, index);
}
Tensor index(ArrayRef<at::indexing::TensorIndex> indices) const;
Tensor index(std::initializer_list<at::indexing::TensorIndex> indices) const;
Tensor & index_put_(ArrayRef<at::indexing::TensorIndex> indices, Tensor const & rhs);
Tensor & index_put_(ArrayRef<at::indexing::TensorIndex> indices, const Scalar& v);
Tensor & index_put_(std::initializer_list<at::indexing::TensorIndex> indices, Tensor const & rhs);
Tensor & index_put_(std::initializer_list<at::indexing::TensorIndex> indices, const Scalar& v);
Tensor cpu() const {
return to(options().device(c10::DeviceType::CPU), /*non_blocking*/ false, /*copy*/ false);
}
// TODO: The Python version also accepts arguments
Tensor cuda() const {
return to(options().device(c10::DeviceType::CUDA), /*non_blocking*/ false, /*copy*/ false);
}
Tensor hip() const {
return to(options().device(c10::DeviceType::HIP), /*non_blocking*/ false, /*copy*/ false);
}
Tensor ve() const {
return to(options().device(c10::DeviceType::VE), /*non_blocking*/ false, /*copy*/ false);
}
Tensor vulkan() const {
return to(options().device(c10::DeviceType::Vulkan), /*non_blocking*/ false, /*copy*/ false);
}
Tensor metal() const {
return to(options().device(c10::DeviceType::Metal), /*non_blocking*/ false, /*copy*/ false);
}
Tensor meta() const {
return to(options().device(c10::DeviceType::Meta), /*non_blocking*/ false, /*copy*/ false);
}
// ~~~~~ Autograd API ~~~~~
/// \fn bool is_leaf() const;
///
/// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention.
///
/// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were
/// created by the user. This means that they are not the result of an operation and so
/// `grad_fn()` is `nullptr`.
///
/// Only leaf Tensors will have their `grad()` populated during a call to `backward()`.
/// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`.
///
/// Example:
/// @code
/// auto a = torch::rand(10, torch::requires_grad());
/// std::cout << a.is_leaf() << std::endl; // prints `true`
///
/// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA);
/// std::cout << b.is_leaf() << std::endl; // prints `false`
/// // b was created by the operation that cast a cpu Tensor into a cuda Tensor
///
/// auto c = torch::rand(10, torch::requires_grad()) + 2;
/// std::cout << c.is_leaf() << std::endl; // prints `false`
/// // c was created by the addition operation
///
/// auto d = torch::rand(10).cuda();
/// std::cout << d.is_leaf() << std::endl; // prints `true`
/// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
///
/// auto e = torch::rand(10).cuda().requires_grad_();
/// std::cout << e.is_leaf() << std::endl; // prints `true`
/// // e requires gradients and has no operations creating it
///
/// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true));
/// std::cout << f.is_leaf() << std::endl; // prints `true`
/// // f requires grad, has no operation creating it
/// @endcode
/// \fn void backward(const Tensor & gradient={}, std::optional<bool> retain_graph=std::nullopt, bool create_graph=false, std::optional<TensorList> inputs=std::nullopt) const;
///
/// Computes the gradient of current tensor with respect to graph leaves.
///
/// The graph is differentiated using the chain rule. If the tensor is
/// non-scalar (i.e. its data has more than one element) and requires
/// gradient, the function additionally requires specifying ``gradient``.
/// It should be a tensor of matching type and location, that contains
/// the gradient of the differentiated function w.r.t. this Tensor.
///
/// This function accumulates gradients in the leaves - you might need to
/// zero them before calling it.
///
/// \param gradient Gradient w.r.t. the
/// tensor. If it is a tensor, it will be automatically converted
/// to a Tensor that does not require grad unless ``create_graph`` is True.
/// None values can be specified for scalar Tensors or ones that
/// don't require grad. If a None value would be acceptable then
/// this argument is optional.
/// \param retain_graph If ``false``, the graph used to compute
/// the grads will be freed. Note that in nearly all cases setting
/// this option to True is not needed and often can be worked around
/// in a much more efficient way. Defaults to the value of
/// ``create_graph``.
/// \param create_graph If ``true``, graph of the derivative will
/// be constructed, allowing to compute higher order derivative
/// products. Defaults to ``false``.
/// \param inputs Inputs w.r.t. which the gradient will be accumulated into
/// ``at::Tensor::grad``. All other Tensors will be ignored. If not
/// provided, the gradient is accumulated into all the leaf Tensors
/// that were used to compute the current tensor.
/// When inputs are provided and a given input is not a leaf,
/// the current implementation will call its grad_fn (even though it is not strictly needed to get these gradients).
/// It is an implementation detail on which the user should not rely.
/// See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.
void backward(const Tensor & gradient={}, std::optional<bool> retain_graph=std::nullopt, bool create_graph=false, std::optional<TensorList> inputs=std::nullopt) const {
// NB: Adding this wrapper to _backward here because we'd like our
// 'backwards' api to accept the 'inputs' argument optionally. Since code gen
// currently does not support optional of TensorList our approach is to replace
// backward in native_functions.yaml with _backward and call it here instead.
if (inputs.has_value()) {
TORCH_CHECK(inputs.value().size() > 0, "'inputs' argument to backward cannot be empty")
this->_backward(inputs.value(), gradient, retain_graph, create_graph);
} else {
this->_backward({}, gradient, retain_graph, create_graph);
}
}
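// Hedged usage sketch (illustrative only, not part of the generated declarations):
//   auto x = torch::rand({2, 2}, torch::requires_grad());
//   auto y = (x * x).sum();
//   y.backward();                      // scalar output: no explicit gradient needed
//   auto z = x * x;
//   z.backward(torch::ones_like(z));   // non-scalar output: pass a gradient of matching shape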
/// \fn Tensor detach() const;
///
/// Returns a new Tensor, detached from the current graph.
/// The result will never require gradient.
/// \fn Tensor & detach_() const;
///
/// Detaches the Tensor from the graph that created it, making it a leaf.
/// Views cannot be detached in-place.
/// \fn void retain_grad() const;
///
/// Enables this Tensor to have its :attr:`grad` populated during
/// :func:`backward`. This is a no-op for leaf tensors.
/// \fn bool retains_grad() const;
///
/// Is ``true`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be
/// populated during :func:`backward`, ``false`` otherwise.
const Tensor& set_requires_grad(bool requires_grad) const {
TensorBase::set_requires_grad(requires_grad);
return *this;
}
/// Return a mutable reference to the gradient. This is conventionally
/// used as `t.grad() = x` to set a gradient to a completely new tensor.
/// Note that this function works with a non-const Tensor and is not
/// thread safe.
Tensor& mutable_grad() const {
return impl_->mutable_grad();
}
/// This function returns an undefined tensor by default and returns a defined tensor
/// the first time a call to `backward()` computes gradients for this Tensor.
/// The attribute will then contain the gradients computed and future calls
/// to `backward()` will accumulate (add) gradients into it.
const Tensor& grad() const {
const Tensor& maybe_grad = impl_->grad();
if (!is_leaf() && !retains_grad() && !maybe_grad.defined()) {
TORCH_WARN(
"The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad "
"attribute won't be populated during autograd.backward(). If you indeed want the .grad "
"field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. "
"If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor "
"instead. See github.com/pytorch/pytorch/pull/30531 for more informations.");
}
return maybe_grad;
}
// The Forward AD API functions below are low level and are not to be used by end
// users who should use the API provided in torch/csrc/autograd.h
/// This function returns the forward gradient for this Tensor at the given level.
const Tensor& _fw_grad(uint64_t level) const {
return impl_->_fw_grad(level, *this);
}
/// This function can be used to set the value of the forward grad.
/// Note that the given new_grad might not be used directly if it has different
/// metadata (size/stride/storage offset) compared to this Tensor. In that case,
/// new_grad content will be copied into a new Tensor
void _set_fw_grad(const TensorBase& new_grad, uint64_t level, bool is_inplace_op) const {
impl_->_set_fw_grad(new_grad, *this, level, is_inplace_op);
}
// STOP. Thinking of adding a method here, which only makes use
// of other ATen methods? Define it in native_functions.yaml.
// example:
// Tensor* add(Tensor& b);
${tensor_method_declarations}
// Special C++ only overloads for std()-like functions (See gh-40287)
// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
// So, for example std(0) would select the std(unbiased=False) overload
Tensor var(int dim) const {
return var(IntArrayRef{dim});
}
Tensor std(int dim) const {
return std(IntArrayRef{dim});
}
// We changed .dtype() to return a TypeMeta in #12766. Ideally, we want the
// at::kDouble and its friends to be TypeMeta's, but that hasn't happened yet.
// Until that happens, we provide this method to maintain BC for C++ usage like
// `x.to(y.dtype)`.
// TODO: remove following two after at::kDouble and its friends are TypeMeta's.
inline Tensor to(caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const {
return this->to(/*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy);
}
inline Tensor to(Device device, caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const {
return this->to(device, /*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy);
}
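// Convenience helper that invokes a callable as `func(*this, params...)`.
// A minimal usage sketch (the lambda is illustrative):
//   auto doubled = t.m([](const Tensor& self, double a) { return self * a; }, 2.0);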
template <typename F, typename... Args>
decltype(auto) m(F func, Args&&... params) const {
return func(*this, std::forward<Args>(params)...);
}
/// NOTE: This is similar to the legacy `.data()` function on `Variable`, and is intended
/// to be used from functions that need to access the `Variable`'s equivalent `Tensor`
/// (i.e. `Tensor` that shares the same storage and tensor metadata with the `Variable`).
///
/// One notable difference with the legacy `.data()` function is that changes to the
/// returned `Tensor`'s tensor metadata (e.g. sizes / strides / storage / storage_offset)
/// will not update the original `Variable`, due to the fact that this function
/// shallow-copies the `Variable`'s underlying TensorImpl.
at::Tensor tensor_data() const {
return TensorBase::tensor_data();
}
/// NOTE: `var.variable_data()` in C++ has the same semantics as `tensor.data`
/// in Python, which creates a new `Variable` that shares the same storage and
/// tensor metadata with the original `Variable`, but with a completely new
/// autograd history.
///
/// NOTE: If we change the tensor metadata (e.g. sizes / strides /
/// storage / storage_offset) of a variable created from `var.variable_data()`, those
/// changes will not update the original variable `var`. In `.variable_data()`, we set
/// `allow_tensor_metadata_change_` to false to make such changes explicitly illegal,
/// in order to prevent users from changing metadata of `var.variable_data()`
/// and expecting the original variable `var` to also be updated.
at::Tensor variable_data() const {
return TensorBase::variable_data();
}
// Hooks
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
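// SFINAE helpers that select the register_hook() overload below depending on
// whether the callable returns void or a new Tensor; both overloads return an
// unsigned index that can later be passed to remove_hook().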
template <typename T>
using hook_return_void_t = std::enable_if_t<std::is_void<typename std::invoke_result_t<T&, Tensor>>::value, unsigned>;
template <typename T>
using hook_return_var_t = std::enable_if_t<std::is_same<typename std::invoke_result_t<T&, Tensor>, Tensor>::value, unsigned>;
/// Registers a backward hook.
///
/// The hook will be called every time a gradient with respect to the Tensor is computed.
/// The hook should have one of the following signatures:
/// ```
/// hook(Tensor grad) -> Tensor
/// ```
/// ```
/// hook(Tensor grad) -> void
/// ```
/// The hook should not modify its argument, but it can optionally return a new gradient
/// which will be used in place of `grad`.
///
/// This function returns the index of the hook in the list, which can be used to remove the hook.
///
/// Example:
/// @code
/// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad());
/// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient
/// v.backward(torch::tensor({1., 2., 3.}));
/// // This prints:
/// // ```
/// // 2
/// // 4
/// // 6
/// // [ CPUFloatType{3} ]
/// // ```
/// std::cout << v.grad() << std::endl;
/// v.remove_hook(h); // removes the hook
/// @endcode
template <typename T>
hook_return_void_t<T> register_hook(T&& hook) const;
template <typename T>
hook_return_var_t<T> register_hook(T&& hook) const;
// Variable methods
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tensor data() const {
return TensorBase::data();
}
void _backward(TensorList inputs, const std::optional<Tensor>& gradient, std::optional<bool> keep_graph, bool create_graph) const;
const Tensor& requires_grad_(bool _requires_grad=true) const {
TensorBase::requires_grad_(_requires_grad);
return *this;
}
};
namespace detail {
// Helper creator for the Tensor class which doesn't require the user to pass
// in an intrusive_ptr; instead it just converts the arguments passed to the
// requested intrusive_ptr type.
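// A hypothetical call (the TensorImpl subclass and its constructor arguments
// are illustrative only):
//   auto t = at::detail::make_tensor<MyTensorImpl>(storage, key_set, dtype);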
template <typename T, typename... Args>
Tensor make_tensor(Args&&... args) {
return Tensor(c10::make_intrusive<T>(std::forward<Args>(args)...));
}
} // namespace detail
} // namespace at
namespace at {
${tensor_method_definitions}
} // namespace at
namespace c10 {
template <>
struct MaybeOwnedTraits<at::Tensor> {
using owned_type = at::Tensor;
using borrow_type = at::Tensor;
static borrow_type createBorrow(const owned_type& from) {
// NOTE: this can be implemented without the special
// unsafe_borrow_t Tensor constructor as
//
// return borrow_type(c10::intrusive_ptr<at::TensorImpl, at::UndefinedTensorImpl>::reclaim(from.unsafeGetTensorImpl()));
//
// but that hurts inlining due to the nullptr check in the
// Tensor(c10::intrusive_ptr<...>) constructor. We already know
// that from.impl_ isn't null because from is a valid Tensor, so
// we needn't do the check again. (using __builtin_assume can
// avoid this, but wouldn't be portable to MSVC.)
return borrow_type(borrow_type::unsafe_borrow_t{}, from);
}
static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
lhs.unsafeReleaseTensorImpl();
// See above note: this can be implemented with public API
// similarly to createBorrow(), but that would hurt inlining.
lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs);
}
static void destroyBorrow(borrow_type& toDestroy) {
toDestroy.unsafeReleaseTensorImpl(); // "leak" it, but it was already +0.
}
static const owned_type& referenceFromBorrow(const borrow_type& borrow) {
return borrow;
}
static const owned_type* pointerFromBorrow(const borrow_type& borrow) {
return &borrow;
}
static bool debugBorrowIsValid(const borrow_type& /*borrow*/) {
return true;
}
};
template <>
struct ExclusivelyOwnedTraits<at::Tensor> {
using repr_type = at::Tensor;
using pointer_type = at::Tensor*;
using const_pointer_type = const at::Tensor*;
static repr_type nullRepr() {
return at::Tensor();
}
template <class... Args>
static repr_type createInPlace(Args&&... args) {
return at::Tensor(std::forward<Args>(args)...);
}
static repr_type moveToRepr(at::Tensor&& x) {
return std::move(x);
}
static void destroyOwned(at::Tensor& x) {
return ExclusivelyOwnedTraits<at::TensorBase>::destroyOwned(x);
}
static at::Tensor take(at::Tensor& x) {
return std::move(x);
}
static pointer_type getImpl(repr_type& x) {
return &x;
}
static const_pointer_type getImpl(const repr_type& x) {
return &x;
}
};
} // namespace c10
namespace at {
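// Returns a borrowed MaybeOwned wrapping *opt when the optional is engaged, and
// an owned, default-constructed (undefined) Tensor otherwise. A minimal usage
// sketch (names are illustrative):
//   c10::MaybeOwned<Tensor> bias = at::borrow_from_optional_tensor(opt_bias);
//   if (bias->defined()) { /* use *bias without an extra refcount bump */ }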
inline c10::MaybeOwned<Tensor> borrow_from_optional_tensor(
const std::optional<Tensor>& opt) {
return opt.has_value()
? c10::MaybeOwned<Tensor>::borrowed(*opt)
: c10::MaybeOwned<Tensor>::owned(std::in_place);
}
inline c10::MaybeOwned<Tensor> Tensor::expect_contiguous(MemoryFormat memory_format) const & {
if (is_contiguous(memory_format)) {
return c10::MaybeOwned<Tensor>::borrowed(*this);
} else {
return c10::MaybeOwned<Tensor>::owned(__dispatch_contiguous(memory_format));
}
}
} // namespace at

View File

@ -0,0 +1,61 @@
#include <c10/core/Scalar.h>
#include <ATen/core/TensorBody.h>
#include <c10/util/string_view.h>
namespace at {
namespace {
// Verifies the requested type is the same as the Tensor's type.
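// For quantized tensors the check also succeeds when the requested type matches
// the underlying representation (e.g. accessing kQInt8 data as int8_t).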
void check_type(const TensorBase& tensor, ScalarType type, c10::string_view type_name) {
TORCH_CHECK(
tensor.scalar_type() == type
|| (isQIntType(tensor.scalar_type())
&& toUnderlying(tensor.scalar_type()) == type),
"expected scalar type ", type_name, " but found ", tensor.scalar_type());
}
} // namespace
#define DEFINE_CAST(T, name) \
template <> \
TORCH_API const T* TensorBase::const_data_ptr() const { \
check_type(*this, ScalarType::name, #name); \
return this->unsafeGetTensorImpl()->data_ptr_impl<T>(); \
} \
\
template <> \
TORCH_API const T* TensorBase::const_data_ptr<const T>() const { \
check_type(*this, ScalarType::name, #name); \
return this->unsafeGetTensorImpl()->data_ptr_impl<std::remove_const_t<T>>(); \
} \
\
template <> \
TORCH_API T* TensorBase::mutable_data_ptr() const { \
check_type(*this, ScalarType::name, #name); \
return this->unsafeGetTensorImpl()->mutable_data_ptr_impl<T>(); \
} \
\
template <> \
TORCH_API T* TensorBase::data_ptr() const { \
return mutable_data_ptr<T>(); \
}
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CAST)
AT_FORALL_QINT_TYPES(DEFINE_CAST)
DEFINE_CAST(uint16_t, UInt16)
DEFINE_CAST(uint32_t, UInt32)
DEFINE_CAST(uint64_t, UInt64)
#undef DEFINE_CAST
#define DEFINE_ITEM(T, name) \
template <> \
TORCH_API T Tensor::item() const { \
return item().to##name(); \
}
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ITEM)
#undef DEFINE_ITEM
} //namespace at

View File

@ -0,0 +1,19 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/DispatchStub.h>
#include <ATen/TensorIterator.h>
#include <ATen/TensorMeta.h>
namespace at {
// NB: this is explicitly copied here (via codegen) rather than
// included via NativeFunctions.h to avoid recompiling this file when
// NativeFunctions.h changes
namespace meta {
${meta_declaration}
}
namespace native {
${native_declaration}
${native_definitions}
}} // namespace at::native

View File

@ -0,0 +1,14 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/ufunc/${name}.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/Dispatch.h>
#include <c10/core/Scalar.h>
namespace at {
namespace native {
${native_definitions}
}} // namespace at::native

View File

@ -0,0 +1,21 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/ufunc/${name}.h>
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <c10/core/Scalar.h>
${cuda_headers}
namespace at {
// NB: this is explicitly copied here (via codegen) rather than
// included via NativeFunctions.h to avoid recompiling this file when
// NativeFunctions.h changes
namespace meta {
${meta_declaration}
}
namespace native {
${native_declaration}
${native_definitions}
}} // namespace at::native

View File

@ -0,0 +1,35 @@
#include <ATen/UnboxingFunctions.h>
#include <ATen/Functions.h>
#include <ATen/Tensor.h>
#include <ATen/core/functional.h>
#include <ATen/core/interned_strings.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/stack.h>
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstring>
#include <sstream>
#include <stdexcept>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
namespace at {
namespace unboxing {
using ::c10::fmap;
using ::c10::filter;
using torch::jit::peek;
using torch::jit::drop;
using torch::jit::pack;
using torch::jit::pop;
// Generated function declaration
${definitions}
} // namespace unboxing
} // namespace at

View File

@ -0,0 +1,32 @@
// ${generated_comment}
// Generated by tools/jit/gen_unboxing.py. This file declares code-generated boxed C++ functions for operators,
// based off of native_functions.yaml (or a similar yaml file with the same syntax). The definition of such a boxed
// function pops IValues off the stack and then converts them into the correct C++ types based on the given schema. This
// unboxing logic is an alternative to template-based metaprogramming unboxing.
#pragma once
#include <ATen/ATen.h>
namespace at {
namespace unboxing {
namespace {
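// Converts a c10::List of IValues whose length is statically known to be N into
// a std::array<T, N>, e.g. (illustratively) unboxing an `int[2]` schema argument
// into a std::array<int64_t, 2>.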
template<typename T, size_t N>
std::array<T, N> as_array(const c10::List<c10::IValue>& list) {
std::array<T, N> res;
AT_ASSERT(list.size() == N);
std::vector<T> vec;
for (c10::IValue elem : list) {
vec.push_back(elem.to<T>());
}
std::copy(vec.begin(), vec.end(), res.begin());
return res;
}
} // namespace <anonymous>
using Stack = std::vector<c10::IValue>;
// Generated function declaration
${declarations}
} // namespace unboxing
} // namespace at

View File

@ -0,0 +1,22 @@
#pragma once
// ${generated_comment}
#if defined(TORCH_ASSERT_NO_OPERATORS) || defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
#error This change adds a dependency on native_functions.yaml, \
meaning the file will need to be re-compiled every time an operator \
is changed or added. Consider if including <ATen/core/symbol.h> for \
the c10::Symbol class would be sufficient, or if your change would be \
better placed in another file.
#endif
// ATen symbols correspond exactly to operators defined in ATen. Every
// symbol here corresponds exactly to an ATen operation defined in
// native_functions.yaml; attributes are in one-to-one correspondence
// with their ATen name.
#define FORALL_ATEN_BASE_SYMBOLS(_) \
${aten_symbols}
#define FORALL_ATTR_BASE_SYMBOLS(_) \
${attr_symbols}

View File

@ -0,0 +1,10 @@
#pragma once
// ${generated_comment}
namespace at {
// Enum of valid tags obtained from the entries in tags.yaml
enum class Tag {
${enum_of_valid_tags}
};
}

View File

@ -0,0 +1,4 @@
load("//:tools/bazel.bzl", "rules")
load(":build.bzl", "define_targets")
define_targets(rules = rules)

View File

@ -0,0 +1,3 @@
If you add a file to this directory, you **MUST** update
`torch/CMakeLists.txt` and add the file as a dependency to
the `add_custom_command` call.

View File

@ -0,0 +1,14 @@
def define_targets(rules):
rules.py_library(
name = "autograd",
srcs = rules.glob(["*.py"]),
data = rules.glob([
"*.yaml",
"templates/*",
]),
visibility = ["//:__subpackages__"],
deps = [
rules.requirement("PyYAML"),
"//torchgen",
],
)

View File

@ -0,0 +1,31 @@
import functools
from typing import Callable
from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo as NFWDI
from torchgen.context import native_function_manager
from torchgen.utils import T
# Like tools.api.context.with_native_function, but for
# NativeFunctionWithDifferentiabilityInfo.
def with_native_function_with_differentiability_info(
func: Callable[[NFWDI], T]
) -> Callable[[NFWDI], T]:
@functools.wraps(func)
def wrapper(f: NFWDI) -> T:
with native_function_manager(f.func):
return func(f)
return wrapper
# Like the above but with an additional dispatch key string argument
def with_native_function_with_differentiability_info_and_key(
func: Callable[[NFWDI, str], T]
) -> Callable[[NFWDI, str], T]:
@functools.wraps(func)
def wrapper(f: NFWDI, key: str) -> T:
with native_function_manager(f.func):
return func(f, key)
return wrapper
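# A minimal usage sketch (the decorated function is illustrative):
#
#   @with_native_function_with_differentiability_info
#   def emit_body(fn: NFWDI) -> str:
#       # the native-function context for fn.func is active inside the wrapper
#       return f"// generated code for {fn.func.func.name}"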

View File

@ -0,0 +1,134 @@
# Deprecated function signatures. These are exposed in Python, but not included
# in the error message suggestions.
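# Each entry maps a deprecated signature (`name`) to the ATen call (`aten`) it
# dispatches to, re-ordering the arguments into the current schema order.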
- name: add(Tensor self, Scalar alpha, Tensor other) -> Tensor
aten: add(self, other, alpha)
- name: add_(Tensor(a!) self, Scalar alpha, Tensor other) -> Tensor(a!)
aten: add_(self, other, alpha)
- name: add(Tensor self, Scalar alpha, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
aten: add_out(out, self, other, alpha)
- name: addbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor
aten: addbmm(self, batch1, batch2, beta, alpha)
- name: addbmm_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: addbmm_(self, batch1, batch2, beta, alpha)
- name: addbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: addbmm_out(out, self, batch1, batch2, beta, alpha)
- name: addbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2) -> Tensor
aten: addbmm(self, batch1, batch2, beta, 1)
- name: addbmm_(Scalar beta, Tensor(a!) self, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: addbmm_(self, batch1, batch2, beta, 1)
- name: addbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: addbmm_out(out, self, batch1, batch2, beta, 1)
- name: addcdiv(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor
aten: addcdiv(self, tensor1, tensor2, value)
- name: addcdiv_(Tensor(a!) self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor(a!)
aten: addcdiv_(self, tensor1, tensor2, value)
- name: addcdiv(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2, *, Tensor(a!) out) -> Tensor(a!)
aten: addcdiv_out(out, self, tensor1, tensor2, value)
- name: addcmul(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor
aten: addcmul(self, tensor1, tensor2, value)
- name: addcmul_(Tensor(a!) self, Scalar value, Tensor tensor1, Tensor tensor2) -> Tensor(a!)
aten: addcmul_(self, tensor1, tensor2, value)
- name: addcmul(Tensor self, Scalar value, Tensor tensor1, Tensor tensor2, *, Tensor(a!) out) -> Tensor(a!)
aten: addcmul_out(out, self, tensor1, tensor2, value)
- name: addmm(Scalar beta, Tensor self, Scalar alpha, Tensor mat1, Tensor mat2) -> Tensor
aten: addmm(self, mat1, mat2, beta, alpha)
- name: addmm_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor mat1, Tensor mat2) -> Tensor(a!)
aten: addmm_(self, mat1, mat2, beta, alpha)
- name: addmm(Scalar beta, Tensor self, Scalar alpha, Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
aten: addmm_out(out, self, mat1, mat2, beta, alpha)
- name: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) -> Tensor
aten: addmm(self, mat1, mat2, beta, 1)
- name: addmm_(Scalar beta, Tensor(a!) self, Tensor mat1, Tensor mat2) -> Tensor(a!)
aten: addmm_(self, mat1, mat2, beta, 1)
- name: addmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
aten: addmm_out(out, self, mat1, mat2, beta, 1)
- name: sspaddmm(Scalar beta, Tensor self, Scalar alpha, Tensor mat1, Tensor mat2) -> Tensor
aten: sspaddmm(self, mat1, mat2, beta, alpha)
- name: sspaddmm(Scalar beta, Tensor self, Tensor mat1, Tensor mat2) -> Tensor
aten: sspaddmm(self, mat1, mat2, beta, 1)
- name: addmv(Scalar beta, Tensor self, Scalar alpha, Tensor mat, Tensor vec) -> Tensor
aten: addmv(self, mat, vec, beta, alpha)
- name: addmv_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor mat, Tensor vec) -> Tensor(a!)
aten: addmv_(self, mat, vec, beta, alpha)
- name: addmv(Scalar beta, Tensor self, Scalar alpha, Tensor mat, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
aten: addmv_out(out, self, mat, vec, beta, alpha)
- name: addmv(Scalar beta, Tensor self, Tensor mat, Tensor vec) -> Tensor
aten: addmv(self, mat, vec, beta, 1)
- name: addmv_(Scalar beta, Tensor(a!) self, Tensor mat, Tensor vec) -> Tensor(a!)
aten: addmv_(self, mat, vec, beta, 1)
- name: addmv(Scalar beta, Tensor self, Tensor mat, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
aten: addmv_out(out, self, mat, vec, beta, 1)
- name: addr(Scalar beta, Tensor self, Scalar alpha, Tensor vec1, Tensor vec2) -> Tensor
aten: addr(self, vec1, vec2, beta, alpha)
- name: addr_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor vec1, Tensor vec2) -> Tensor(a!)
aten: addr_(self, vec1, vec2, beta, alpha)
- name: addr(Scalar beta, Tensor self, Scalar alpha, Tensor vec1, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
aten: addr_out(out, self, vec1, vec2, beta, alpha)
- name: addr(Scalar beta, Tensor self, Tensor vec1, Tensor vec2) -> Tensor
aten: addr(self, vec1, vec2, beta, 1)
- name: addr_(Scalar beta, Tensor(a!) self, Tensor vec1, Tensor vec2) -> Tensor(a!)
aten: addr_(self, vec1, vec2, beta, 1)
- name: addr(Scalar beta, Tensor self, Tensor vec1, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
aten: addr_out(out, self, vec1, vec2, beta, 1)
- name: baddbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor
aten: baddbmm(self, batch1, batch2, beta, alpha)
- name: baddbmm_(Scalar beta, Tensor(a!) self, Scalar alpha, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: baddbmm_(self, batch1, batch2, beta, alpha)
- name: baddbmm(Scalar beta, Tensor self, Scalar alpha, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: baddbmm_out(out, self, batch1, batch2, beta, alpha)
- name: baddbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2) -> Tensor
aten: baddbmm(self, batch1, batch2, beta, 1)
- name: baddbmm_(Scalar beta, Tensor(a!) self, Tensor batch1, Tensor batch2) -> Tensor(a!)
aten: baddbmm_(self, batch1, batch2, beta, 1)
- name: baddbmm(Scalar beta, Tensor self, Tensor batch1, Tensor batch2, *, Tensor(a!) out) -> Tensor(a!)
aten: baddbmm_out(out, self, batch1, batch2, beta, 1)
- name: sub(Tensor self, Scalar alpha, Tensor other) -> Tensor
aten: sub(self, other, alpha)
- name: sub_(Tensor(a!) self, Scalar alpha, Tensor other) -> Tensor(a!)
aten: sub_(self, other, alpha)
- name: sub(Tensor self, Scalar alpha, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
aten: sub_out(out, self, other, alpha)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,132 @@
"""
For procedural tests needed for __torch_function__, we use this function
to export method names and signatures as needed by the tests in
test/test_overrides.py.
python -m tools.autograd.gen_annotated_fn_args \
aten/src/ATen/native/native_functions.yaml \
aten/src/ATen/native/tags.yaml \
$OUTPUT_DIR \
tools/autograd
Where $OUTPUT_DIR is where you would like the files to be
generated. In the full build system, OUTPUT_DIR is
torch/testing/_internal/generated
"""
from __future__ import annotations
import argparse
import os
import textwrap
from collections import defaultdict
from typing import Any, Sequence, TYPE_CHECKING
import torchgen.api.python as python
from torchgen.context import with_native_function
from torchgen.gen import parse_native_yaml
from torchgen.utils import FileManager
from .gen_python_functions import (
is_py_fft_function,
is_py_linalg_function,
is_py_nn_function,
is_py_special_function,
is_py_torch_function,
is_py_variable_method,
should_generate_py_binding,
)
if TYPE_CHECKING:
from torchgen.model import Argument, BaseOperatorName, NativeFunction
def gen_annotated(
native_yaml_path: str, tags_yaml_path: str, out: str, autograd_dir: str
) -> None:
native_functions = parse_native_yaml(
native_yaml_path, tags_yaml_path
).native_functions
mappings = (
(is_py_torch_function, "torch._C._VariableFunctions"),
(is_py_nn_function, "torch._C._nn"),
(is_py_linalg_function, "torch._C._linalg"),
(is_py_special_function, "torch._C._special"),
(is_py_fft_function, "torch._C._fft"),
(is_py_variable_method, "torch.Tensor"),
)
annotated_args: list[str] = []
for pred, namespace in mappings:
groups: dict[BaseOperatorName, list[NativeFunction]] = defaultdict(list)
for f in native_functions:
if not should_generate_py_binding(f) or not pred(f):
continue
groups[f.func.name.name].append(f)
for group in groups.values():
for f in group:
annotated_args.append(f"{namespace}.{gen_annotated_args(f)}")
template_path = os.path.join(autograd_dir, "templates")
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_with_template(
"annotated_fn_args.py",
"annotated_fn_args.py.in",
lambda: {
"annotated_args": textwrap.indent("\n".join(annotated_args), " "),
},
)
@with_native_function
def gen_annotated_args(f: NativeFunction) -> str:
def _get_kwargs_func_exclusion_list() -> list[str]:
# functions that currently don't work with kwargs in test_overrides.py
return [
"diagonal",
"round_",
"round",
"scatter_",
]
def _add_out_arg(
out_args: list[dict[str, Any]], args: Sequence[Argument], *, is_kwarg_only: bool
) -> None:
for arg in args:
if arg.default is not None:
continue
out_arg: dict[str, Any] = {}
out_arg["is_kwarg_only"] = str(is_kwarg_only)
out_arg["name"] = arg.name
out_arg["simple_type"] = python.argument_type_str(
arg.type, simple_type=True
)
size_t = python.argument_type_size(arg.type)
if size_t:
out_arg["size"] = size_t
out_args.append(out_arg)
out_args: list[dict[str, Any]] = []
_add_out_arg(out_args, f.func.arguments.flat_positional, is_kwarg_only=False)
if f"{f.func.name.name}" not in _get_kwargs_func_exclusion_list():
_add_out_arg(out_args, f.func.arguments.flat_kwarg_only, is_kwarg_only=True)
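# Each generated entry looks roughly like (illustrative):
#   add: [{'is_kwarg_only': 'False', 'name': 'self', 'simple_type': 'Tensor'}, ...],
# and the caller prefixes it with its namespace (e.g. torch._C._VariableFunctions).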
return f"{f.func.name.name}: {repr(out_args)},"
def main() -> None:
parser = argparse.ArgumentParser(description="Generate annotated_fn_args script")
parser.add_argument(
"native_functions", metavar="NATIVE", help="path to native_functions.yaml"
)
parser.add_argument("tags", metavar="TAGS", help="path to tags.yaml")
parser.add_argument("out", metavar="OUT", help="path to output directory")
parser.add_argument(
"autograd", metavar="AUTOGRAD", help="path to template directory"
)
args = parser.parse_args()
gen_annotated(args.native_functions, args.tags, args.out, args.autograd)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,147 @@
"""
To run this file by hand from the root of the PyTorch
repository, run:
python -m tools.autograd.gen_autograd \
aten/src/ATen/native/native_functions.yaml \
aten/src/ATen/native/tags.yaml \
$OUTPUT_DIR \
tools/autograd
Where $OUTPUT_DIR is where you would like the files to be
generated. In the full build system, OUTPUT_DIR is
torch/csrc/autograd/generated/
"""
# gen_autograd.py generates C++ autograd functions and Python bindings.
#
# It delegates to the following scripts:
#
# gen_autograd_functions.py: generates subclasses of torch::autograd::Node
# gen_variable_type.py: generates VariableType.h which contains all tensor methods
# gen_python_functions.py: generates Python bindings to THPVariable
#
from __future__ import annotations
import argparse
import os
from torchgen.api import cpp
from torchgen.api.autograd import (
match_differentiability_info,
NativeFunctionWithDifferentiabilityInfo,
)
from torchgen.gen import parse_native_yaml
from torchgen.selective_build.selector import SelectiveBuilder
from . import gen_python_functions
from .gen_autograd_functions import (
gen_autograd_functions_lib,
gen_autograd_functions_python,
)
from .gen_inplace_or_view_type import gen_inplace_or_view_type
from .gen_trace_type import gen_trace_type
from .gen_variable_factories import gen_variable_factories
from .gen_variable_type import gen_variable_type
from .gen_view_funcs import gen_view_funcs
from .load_derivatives import load_derivatives
def gen_autograd(
native_functions_path: str,
tags_path: str,
out: str,
autograd_dir: str,
operator_selector: SelectiveBuilder,
disable_autograd: bool = False,
) -> None:
# Parse and load derivatives.yaml
differentiability_infos, used_dispatch_keys = load_derivatives(
os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path
)
template_path = os.path.join(autograd_dir, "templates")
native_funcs = parse_native_yaml(native_functions_path, tags_path).native_functions
fns = sorted(
filter(
operator_selector.is_native_function_selected_for_training, native_funcs
),
key=lambda f: cpp.name(f.func),
)
fns_with_diff_infos: list[
NativeFunctionWithDifferentiabilityInfo
] = match_differentiability_info(fns, differentiability_infos)
# Generate VariableType.h/cpp
if not disable_autograd:
gen_variable_type(
out,
native_functions_path,
tags_path,
fns_with_diff_infos,
template_path,
used_dispatch_keys,
)
gen_inplace_or_view_type(
out, native_functions_path, tags_path, fns_with_diff_infos, template_path
)
# operator filter not applied as tracing sources are excluded in selective build
gen_trace_type(out, native_funcs, template_path)
# Generate Functions.h/cpp
gen_autograd_functions_lib(out, differentiability_infos, template_path)
# Generate variable_factories.h
gen_variable_factories(out, native_functions_path, tags_path, template_path)
# Generate ViewFuncs.h/cpp
gen_view_funcs(out, fns_with_diff_infos, template_path)
def gen_autograd_python(
native_functions_path: str,
tags_path: str,
out: str,
autograd_dir: str,
) -> None:
differentiability_infos, _ = load_derivatives(
os.path.join(autograd_dir, "derivatives.yaml"), native_functions_path, tags_path
)
template_path = os.path.join(autograd_dir, "templates")
# Generate Functions.h/cpp
gen_autograd_functions_python(out, differentiability_infos, template_path)
# Generate Python bindings
deprecated_path = os.path.join(autograd_dir, "deprecated.yaml")
gen_python_functions.gen(
out, native_functions_path, tags_path, deprecated_path, template_path
)
def main() -> None:
parser = argparse.ArgumentParser(description="Generate autograd C++ files script")
parser.add_argument(
"native_functions", metavar="NATIVE", help="path to native_functions.yaml"
)
parser.add_argument("tags", metavar="NATIVE", help="path to tags.yaml")
parser.add_argument("out", metavar="OUT", help="path to output directory")
parser.add_argument(
"autograd", metavar="AUTOGRAD", help="path to autograd directory"
)
args = parser.parse_args()
gen_autograd(
args.native_functions,
args.tags,
args.out,
args.autograd,
SelectiveBuilder.get_nop_selector(),
)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,925 @@
# Generates C++ autograd functions for the derivatives of ATen operations
#
# This writes two files:
# Functions.h/cpp: subclasses of autograd::Node
# python_functions.h/cpp: Python bindings for the above classes
#
from __future__ import annotations
from typing import Sequence
from torchgen.api.autograd import (
Derivative,
DifferentiabilityInfo,
SavedAttribute,
uses_retain_variables,
uses_single_grad,
)
from torchgen.api.types import (
ArrayRefCType,
BaseCppType,
BaseCType,
Binding,
boolT,
doubleT,
intArrayRefT,
iTensorListRefT,
ListCType,
longT,
MutRefCType,
OptionalCType,
optionalIntArrayRefT,
optionalSymIntArrayRefT,
scalarT,
stringT,
symIntArrayRefT,
SymIntT,
TENSOR_LIST_LIKE_CTYPES,
tensorListT,
tensorT,
VectorCType,
)
from torchgen.code_template import CodeTemplate
from torchgen.model import Argument, FunctionSchema
from torchgen.utils import FileManager
from .gen_inplace_or_view_type import VIEW_FUNCTIONS
FUNCTION_DECLARATION = CodeTemplate(
"""\
#ifdef _WIN32
struct ${op} : public ${superclass} {
TORCH_API ${op}() = default;
#else
struct TORCH_API ${op} : public ${superclass} {
#endif
using ${superclass}::${superclass};
variable_list apply(variable_list&& grads) override;
std::string name() const override { return "${op}"; }
void release_variables() override {
${thread_lock}
${release_variables}
}
${will_release_variables}
void compiled_args(CompiledNodeArgs& args) override;
variable_list apply_with_saved(const variable_list& inputs, SwapSavedVariables& saved) override;
${saved_variables}
${saved_list_sizes}
};
"""
)
WILL_RELEASE_VARIABLES = CodeTemplate(
"""\
bool retain_variables = true;
void will_release_variables() override {
retain_variables = false;
}
"""
)
FUNCTION_DEFINITION = CodeTemplate(
"""\
variable_list ${op}::apply(variable_list&& grads) {
${thread_lock}
${asserts}
IndexRangeGenerator gen;
${compute_index_ranges}
variable_list grad_inputs(gen.size());
${body}
return grad_inputs;
}
void ${op}::compiled_args(CompiledNodeArgs& args) {
${compiled_args}
}
variable_list ${op}::apply_with_saved(const variable_list& grads, SwapSavedVariables& saved) {
${apply_with_saved_before}
variable_list result = apply(variable_list(grads));
${apply_with_saved_after}
return result;
}
"""
)
GRAD_INPUT_MASK = CodeTemplate(
"""\
auto grad_input_mask = std::array<bool, ${n}>{
${masks}
};\
"""
)
DERIVATIVE_SINGLE = CodeTemplate(
"""\
if (task_should_compute_output({ ${name}_ix })) {
auto grad_result = ${derivative};
copy_range(grad_inputs, ${name}_ix, grad_result);
}
"""
)
# note(crcrpar): The `self` argument and the other optional positional arguments
# of foreach functions are basically lists of n `Tensor`s, so we iterate over
# `grads` in order to apply the existing derivative definitions
# to each `Tensor` of `self` and the others.
DERIVATIVE_SINGLE_FOREACH = CodeTemplate(
"""\
if (task_should_compute_output({ ${name}_ix })) {
std::vector<Tensor> grad_result;
grad_result.reserve(grads.size());
for (const auto & i : c10::irange(grads.size())) {
if (grads[i].defined()) {
grad_result.emplace_back(${derivative});
} else {
grad_result.emplace_back(Tensor());
}
}
copy_range(grad_inputs, ${name}_ix, grad_result);
}
"""
)
DERIVATIVE_MULTI_COPY_RANGE = CodeTemplate(
"""\
if (task_should_compute_output({ ${name}_ix })) {
copy_range(grad_inputs, ${name}_ix, std::get<${i}>(grad_result));
}
"""
)
DERIVATIVE_MULTI = CodeTemplate(
"""\
if (task_should_compute_output({ ${idx_ranges} })) {
${grad_input_mask}
auto grad_result = ${derivative};
${copy_ranges}
}
"""
)
# Generates python bindings
#
# This generates the definitions for:
# (1) The PyTypeObject for each backward grad_fn subclassing Node
# (2) The entry for PyTypeObject's tp_getset slot (an array of PyGetSetDef structs)
# We generate one PyGetSetDef struct for each of grad_fn's saved inputs and outputs
# Each PyGetSetDef has a function ptr to a getter, also defined here (3).
# (3) Getters for each of grad_fn's saved inputs and outputs.
#
PY_FUNCTION_DEFINITION = CodeTemplate(
"""\
static PyTypeObject ${op}Class;
addClass<${op}>(module, ${op}Class, "${op}", ${op}_properties);
"""
)
PY_FUNCTION_PROPS_AND_GETTERS = CodeTemplate(
"""\
${all_getter_definitions}
static struct PyGetSetDef ${op}_properties[] = {
THP_FUNCTION_DEFAULT_PROPERTIES,
${all_getsetdef_structs}
{nullptr} /* sentinel */
};
"""
)
PY_GETSETDEF_STRUCT = CodeTemplate(
"""\
{(char*)"_saved_${name}", (getter)THP${op}_${name}_getter, nullptr, nullptr, nullptr}"""
)
PY_RAW_GETSETDEF_STRUCT = CodeTemplate(
"""\
{(char*)"_raw_saved_${name}", (getter)THP${op}_${name}_raw_getter, nullptr, nullptr, nullptr}"""
)
# Getter templates
GETTER_DEFINITION = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
auto prop = static_cast<${op}*>(self->cdata.get())->${name};
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_;
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_RAW_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto& prop = static_cast<${op}*>(self->cdata.get())->${name}_;
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_VEC_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto *node = static_cast<${op}*>(self->cdata.get());
const auto& prop = node->${name}_;
if (node->${name}_released_) {
PyErr_SetString(PyExc_RuntimeError, ERR_BACKWARD_TWICE);
return nullptr;
}
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_RAW_VEC_SAVEDVAR = CodeTemplate(
"""\
PyObject* THP${op}_${name}_raw_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto *node = static_cast<${op}*>(self->cdata.get());
const auto& prop = node->${name}_;
if (node->${name}_released_) {
PyErr_SetString(PyExc_RuntimeError, ERR_BACKWARD_TWICE);
return nullptr;
}
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_OPT = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name};
if (!opt_prop.has_value()) {
Py_RETURN_NONE;
}
auto prop = opt_prop.value();
${body}
END_HANDLE_TH_ERRORS
}
"""
)
GETTER_DEFINITION_OPT_ARRAYREF = CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
auto opt_prop = static_cast<${op}*>(self->cdata.get())->${name};
if (!opt_prop.list.has_value()) {
Py_RETURN_NONE;
}
auto prop = opt_prop.list.value();
${body}
END_HANDLE_TH_ERRORS
}
"""
)
# Getter body
GETTER_BODY_SAVEDVAR = """\
return THPVariable_Wrap(prop.unpack(self->cdata));
"""
GETTER_BODY_RAW_SAVEDVAR = """\
pybind11::object obj = pybind11::cast(prop, pybind11::return_value_policy::reference);
return obj.release().ptr();
"""
GETTER_BODY_VEC_SAVEDVAR = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i: c10::irange(prop.size())) {
PyTuple_SetItem(tup, (Py_ssize_t) i, THPVariable_Wrap(prop[i].unpack(self->cdata)));
}
return tup;
"""
GETTER_BODY_RAW_VEC_SAVEDVAR = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
pybind11::object obj = pybind11::cast(prop[i], pybind11::return_value_policy::reference);
PyTuple_SetItem(tup, (Py_ssize_t) i, obj.release().ptr());
}
return tup;
"""
GETTER_BODY_ARRAYREF_LONG = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromUnsignedLong((uint64_t) prop[i]));
}
return tup;
"""
GETTER_BODY_ARRAYREF_SYMINT = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
auto si = prop[i];
if (auto m = si.maybe_as_int()) {
PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromUnsignedLong(*m));
} else {
auto py_symint = py::cast(si).release().ptr();
PyTuple_SetItem(tup, (Py_ssize_t) i, py_symint);
}
}
return tup;
"""
GETTER_BODY_ARRAYREF_DOUBLE = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i : c10::irange(prop.size())) {
PyTuple_SetItem(tup, (Py_ssize_t) i, PyFloat_FromDouble((double) prop[i]));
}
return tup;
"""
GETTER_BODY_INT64_T = """\
return PyLong_FromUnsignedLong((int64_t) prop);
"""
GETTER_BODY_SYMINT = """\
if (auto m = prop.maybe_as_int()) {
return PyLong_FromUnsignedLong(*m);
} else {
return py::cast(prop).release().ptr();
}
"""
GETTER_BODY_DOUBLE = """\
return PyFloat_FromDouble((double) prop);
"""
GETTER_BODY_BOOL = """\
if (prop) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
"""
GETTER_BODY_STRING = """\
return PyUnicode_FromStringAndSize(prop.data(), prop.size());
"""
GETTER_BODY_SCALAR = """\
if (prop.isComplex()) {
auto cprop = prop.to<c10::complex<double>>();
return PyComplex_FromDoubles(cprop.real(), cprop.imag());
} else if (prop.isFloatingPoint()) {
return PyFloat_FromDouble(prop.to<double>());
} else if (prop.isIntegral(/*includeBool=*/false)) {
return PyLong_FromLong(prop.to<int64_t>());
} else if (prop.isBoolean()) {
if (prop.to<bool>()) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
} else {
PyErr_SetString(PyExc_RuntimeError, "Unknown scalar type");
return nullptr;
}
"""
GETTER_BODY_VEC_SCALAR = """\
PyObject* tup = PyTuple_New((Py_ssize_t) prop.size());
for (auto i: c10::irange(prop.size())) {
if (prop[i].isComplex()) {
auto cprop = prop[i].to<c10::complex<double>>();
PyTuple_SetItem(tup, (Py_ssize_t) i, PyComplex_FromDoubles(cprop.real(), cprop.imag()));
} else if (prop[i].isFloatingPoint()) {
auto double_prop = prop[i].to<double>();
PyTuple_SetItem(tup, (Py_ssize_t) i, PyFloat_FromDouble(double_prop));
} else if (prop[i].isIntegral(/*includeBool=*/false)) {
auto long_prop = prop[i].to<int64_t>();
PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromLong(long_prop));
} else if (prop[i].isBoolean()) {
if (prop[i].to<bool>()) {
PyTuple_SetItem(tup, (Py_ssize_t) i, Py_True);
} else {
PyTuple_SetItem(tup, (Py_ssize_t) i, Py_False);
}
} else {
PyErr_SetString(PyExc_RuntimeError, "Unknown scalar type");
return nullptr;
}
}
return tup;
"""
MISC_GETTER_DEFS = {
OptionalCType(BaseCType(longT)): (GETTER_DEFINITION_OPT, GETTER_BODY_INT64_T),
OptionalCType(BaseCType(SymIntT)): (GETTER_DEFINITION_OPT, GETTER_BODY_SYMINT),
BaseCType(doubleT): (GETTER_DEFINITION, GETTER_BODY_DOUBLE),
OptionalCType(BaseCType(doubleT)): (GETTER_DEFINITION_OPT, GETTER_BODY_DOUBLE),
BaseCType(boolT): (GETTER_DEFINITION, GETTER_BODY_BOOL),
BaseCType(scalarT): (GETTER_DEFINITION, GETTER_BODY_SCALAR),
OptionalCType(BaseCType(scalarT)): (GETTER_DEFINITION_OPT, GETTER_BODY_SCALAR),
}
# These functions have backwards which cannot be traced, and so must have
# their backward functions traced opaquely.
# VIEW_FUNCTIONS are not traceable because they use as_strided, which
# has an untraceable backwards, see
# https://github.com/pytorch/pytorch/issues/4250
# TODO: This is probably not exhaustive, but it's a start
UNTRACEABLE_FUNCTIONS = VIEW_FUNCTIONS
def get_infos_with_derivatives_list(
differentiability_infos: dict[FunctionSchema, dict[str, DifferentiabilityInfo]]
) -> list[DifferentiabilityInfo]:
diff_info_list = [
info
for diffinfo_dict in differentiability_infos.values()
for info in diffinfo_dict.values()
]
return list(filter(lambda info: info.args_with_derivatives, diff_info_list))
def gen_autograd_functions_lib(
out: str,
differentiability_infos: dict[FunctionSchema, dict[str, DifferentiabilityInfo]],
template_path: str,
) -> None:
"""Functions.h and Functions.cpp body
These contain the auto-generated subclasses of torch::autograd::Node
for every differentiable torch function.
"""
# get a 1D list of diffinfos, we do not need them to be per FunctionSchema/DispatchKey here
# infos with the diff dispatchkeys but the same name will still be in the same shard.
infos = get_infos_with_derivatives_list(differentiability_infos)
declarations = [process_function(f, FUNCTION_DECLARATION) for f in infos]
definitions = [process_function(f, FUNCTION_DEFINITION) for f in infos]
file_basename = "Functions"
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
for suffix in [".h", ".cpp"]:
fname = file_basename + suffix
fm.write_with_template(
fname,
fname,
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/"
+ fname,
"autograd_function_declarations": declarations,
"autograd_function_definitions": definitions,
},
)
def gen_autograd_functions_python(
out: str,
differentiability_infos: dict[FunctionSchema, dict[str, DifferentiabilityInfo]],
template_path: str,
) -> None:
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
num_shards = 5
fm.write(
"python_functions.h",
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/python_functions.h",
"shard_forward_declare": [
f"void initialize_autogenerated_functions_{i}(PyObject* module);"
for i in range(num_shards)
],
"shard_call": [
f"initialize_autogenerated_functions_{i}(module);"
for i in range(num_shards)
],
},
)
# get a 1D list of diffinfos, we do not need them to be per FunctionSchema/DispatchKey here
# infos with the diff dispatchkeys but the same name will still be in the same shard.
infos = get_infos_with_derivatives_list(differentiability_infos)
fm.write_sharded(
"python_functions.cpp",
infos,
key_fn=lambda info: info.name,
base_env={
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/python_functions.cpp",
},
env_callable=lambda info: {
"py_function_initializers": [
process_function(info, PY_FUNCTION_DEFINITION)
],
"py_function_props_and_getters": [
process_function(info, PY_FUNCTION_PROPS_AND_GETTERS)
],
},
num_shards=num_shards,
sharded_keys={"py_function_initializers", "py_function_props_and_getters"},
)
def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str:
saved_variables: list[str] = []
release_variables: list[str] = []
saved_list_sizes: list[str] = []
unpack: list[str] = []
asserts: list[str] = []
compute_index_ranges: list[str] = []
getter_definitions: list[str] = []
py_getsetdef_structs: list[str] = []
compiled_args: list[str] = []
apply_with_saved_before: list[str] = []
apply_with_saved_after: list[str] = []
for arg in info.args_with_derivatives:
if arg.type in TENSOR_LIST_LIKE_CTYPES:
size = f"{arg.name}_size_"
saved_list_sizes.append(f"size_t {arg.name}_size_;")
else:
size = "1"
compute_index_ranges.append(f"auto {arg.name}_ix = gen.range({size});")
def save_var(var: SavedAttribute, is_output: bool) -> None:
name = var.nctype.name
type = var.nctype.type
should_append_getsetdef = True
should_append_raw_getsetdef = False
visit_name = name
uses_cpp_saved_variable_cls = False
if (
type == BaseCType(tensorT)
or type == OptionalCType(BaseCType(tensorT))
or type == MutRefCType(OptionalCType(BaseCType(tensorT)))
or (type == BaseCType(scalarT) and is_output)
):
uses_cpp_saved_variable_cls = True
saved_variables.append(f"SavedVariable {name}_;")
release_variables.append(f"{name}_.reset_data();")
ptr = "shared_from_this()" if is_output else ""
unpack.append(f"auto {name} = {name}_.unpack({ptr});")
getter_definitions.append(
GETTER_DEFINITION_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_SAVEDVAR
)
)
getter_definitions.append(
GETTER_DEFINITION_RAW_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_RAW_SAVEDVAR
)
)
should_append_raw_getsetdef = True
visit_name = f"{name}_"
elif (
type == BaseCType(tensorListT)
or type == BaseCType(iTensorListRefT)
or type == VectorCType(BaseCType(tensorT))
):
# note(crcrpar): [nuanced return type of out-of-place foreach functions]
# When an out-of-place foreach function whose return signature is `Tensor[]`
# spells out its backward definitions in `derivatives.yaml`, and some of them depend on
# `result`, `result`'s type is interpreted and treated as `std::vector<Tensor>`.
# An out-of-place foreach whose backwards rely on their output doesn't suffer from this
# difference if the definitions are codegen'ed.
# This special case is needed for `_foreach_pow.List` and `_foreach_pow.ScalarAndTensor`
# as of https://github.com/pytorch/pytorch/pull/105504.
if type == VectorCType(BaseCType(tensorT)):
assert (
info.func.func.name.name.base.startswith("_foreach") and is_output
)
uses_cpp_saved_variable_cls = True
saved_variables.append(f"std::vector<SavedVariable> {name}_;")
saved_variables.append(f"bool {name}_released_ = false;")
# Just clear() is sufficient, we don't need to loop and clear each variable.
# Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
release_variables.append(f"{name}_.clear();")
release_variables.append(f"{name}_released_ = true;")
ptr = "shared_from_this()" if is_output else "nullptr"
unpack.append(f"auto {name} = unpack_list({name}_, {ptr});")
asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);")
getter_definitions.append(
GETTER_DEFINITION_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR
)
)
getter_definitions.append(
GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR
)
)
should_append_raw_getsetdef = True
visit_name = f"{name}_"
elif type == ListCType(OptionalCType(BaseCType(tensorT))):
uses_cpp_saved_variable_cls = True
saved_variables.append(f"std::vector<SavedVariable> {name}_;")
saved_variables.append(f"bool {name}_released_ = false;")
# Just clear() is sufficient, we don't need to loop and clear each variable.
# Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
release_variables.append(f"{name}_.clear();")
release_variables.append(f"{name}_released_ = true;")
unpack.append(f"auto {name} = unpack_opt_list({name}_);")
asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);")
getter_definitions.append(
GETTER_DEFINITION_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_VEC_SAVEDVAR
)
)
getter_definitions.append(
GETTER_DEFINITION_RAW_VEC_SAVEDVAR.substitute(
op=info.op, name=name, body=GETTER_BODY_RAW_VEC_SAVEDVAR
)
)
should_append_raw_getsetdef = True
visit_name = f"{name}_"
elif type == BaseCType(intArrayRefT):
saved_variables.append(f"std::vector<int64_t> {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG
)
)
elif type == BaseCType(symIntArrayRefT):
saved_variables.append(f"std::vector<c10::SymInt> {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_SYMINT
)
)
elif type == BaseCType(optionalIntArrayRefT):
saved_variables.append(f"c10::OptionalArray<int64_t> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG
)
)
elif type == BaseCType(optionalSymIntArrayRefT):
saved_variables.append(f"c10::OptionalArray<c10::SymInt> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_SYMINT
)
)
elif type == OptionalCType(BaseCType(intArrayRefT)):
saved_variables.append(f"c10::OptionalArray<int64_t> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_LONG
)
)
elif type == OptionalCType(BaseCType(symIntArrayRefT)):
saved_variables.append(f"c10::OptionalArray<c10::SymInt> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_SYMINT
)
)
elif type == OptionalCType(ArrayRefCType(BaseCType(doubleT))):
saved_variables.append(f"c10::OptionalArray<double> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT_ARRAYREF.substitute(
op=info.op, name=name, body=GETTER_BODY_ARRAYREF_DOUBLE
)
)
elif type == BaseCType(longT):
saved_variables.append(f"{type.cpp_type()} {name} = 0;")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_INT64_T
)
)
elif type == BaseCType(SymIntT):
saved_variables.append(f"c10::SymInt {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_SYMINT
)
)
elif type == BaseCType(stringT):
saved_variables.append(f"std::string {name};")
getter_definitions.append(
GETTER_DEFINITION.substitute(
op=info.op, name=name, body=GETTER_BODY_STRING
)
)
elif type == OptionalCType(BaseCType(stringT)):
saved_variables.append(f"std::optional<std::string> {name};")
getter_definitions.append(
GETTER_DEFINITION_OPT.substitute(
op=info.op, name=name, body=GETTER_BODY_STRING
)
)
elif type == ArrayRefCType(
elem=BaseCType(type=BaseCppType(ns="at", name="Scalar"))
):
saved_variables.append(f"std::vector<at::Scalar> {name};")
saved_variables.append(f"bool {name}_released_ = false;")
# Just clear() is sufficient, we don't need to loop and clear each variable.
# Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
release_variables.append(f"{name}.clear();")
# release_variables.append(f"{name}_released_ = true;")
# unpack.append(f"auto {name} = unpack_list({name}_);")
# asserts.append(f"TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);")
getter_definitions.append(
CodeTemplate(
"""\
PyObject* THP${op}_${name}_getter(THPCppFunction *self, void *_unused) {
HANDLE_TH_ERRORS
const auto *node = static_cast<${op}*>(self->cdata.get());
const auto& prop = node->${name};
if (node->${name}_released_) {
PyErr_SetString(PyExc_RuntimeError, ERR_BACKWARD_TWICE);
return nullptr;
}
${body}
END_HANDLE_TH_ERRORS
}
"""
).substitute(
op=info.op,
name=name,
body=GETTER_BODY_VEC_SCALAR,
)
)
else:
# Check for indicators that you're putting a non-owning reference
# into the saved variable field. If this is spuriously firing,
# edit this field. Otherwise, you probably need to add a case
# above.
assert (
"ref" not in type.cpp_type().lower()
and "view" not in type.cpp_type().lower()
and "*" not in type.cpp_type()
and "&" not in type.cpp_type()
), f"{type.cpp_type()} looks like it contains a non-owning reference"
saved_variables.append(f"{type.cpp_type()} {name};")
if type in MISC_GETTER_DEFS:
getter_def, body = MISC_GETTER_DEFS[type]
getter_definitions.append(
getter_def.substitute(op=info.op, name=name, body=body)
)
else:
# Types we don't expose python bindings to yet:
# TypeAndSize, at::ScalarType, TensorOptions, TensorGeometry,
# std::vector<std::vector<int64_t>>, std::vector<at::ScalarType>
should_append_getsetdef = False
if should_append_getsetdef:
py_getsetdef_structs.append(
PY_GETSETDEF_STRUCT.substitute(op=info.op, name=name)
)
if should_append_raw_getsetdef:
py_getsetdef_structs.append(
PY_RAW_GETSETDEF_STRUCT.substitute(op=info.op, name=name)
)
if uses_cpp_saved_variable_cls:
compiled_args.append(
f"args.collect({visit_name}, {'true' if is_output else 'false'});"
)
else:
compiled_args.append(f"args.collect({visit_name});")
apply_with_saved_before.append(f"saved.before({visit_name});")
apply_with_saved_after.append(f"saved.after({visit_name});")
for var in sorted(info.all_saved_inputs, key=lambda sa: str(sa.nctype.name)):
save_var(var, is_output=False)
for var in sorted(info.all_saved_outputs, key=lambda sa: str(sa.nctype.name)):
save_var(var, is_output=True)
# lock the mutex when we release variables and in Node::apply to protect thread safety
# see Note [Thread Safety on Autograd Node]
if len(release_variables) > 0:
thread_lock = "std::lock_guard<std::mutex> lock(mutex_);"
else:
thread_lock = ""
if uses_retain_variables(info):
will_release_variables = WILL_RELEASE_VARIABLES.substitute()
else:
will_release_variables = ""
body: list[str] = []
if uses_single_grad(info):
body.append("const auto& grad = grads[0];")
else:
# Generate aliases for gradients named for returned values.
body.extend(
f"const auto& {name} = grads[{info.available_named_gradients.index(name)}];"
for name in sorted(info.used_named_gradients)
)
def emit_derivative(
derivative: Derivative,
args_with_derivatives: Sequence[Binding],
) -> tuple[bool, str]:
formula = derivative.formula
var_names = derivative.var_names
if len(var_names) == 1:
checks_any_grad_defined = False
if "not_implemented" not in formula:
matching_args = [
arg for arg in args_with_derivatives if arg.name == var_names[0]
]
if len(matching_args) == 1:
# We can add undefined grad support if the input variable is a Tensor
arg = matching_args[0]
if isinstance(arg.argument, Argument) and str(
arg.argument.type
) in ("Tensor", "Tensor?"):
formula = "any_grad_defined ? (" + formula + ") : Tensor()"
checks_any_grad_defined = True
if info.name.startswith("_foreach_"):
derivative_template = DERIVATIVE_SINGLE_FOREACH
else:
derivative_template = DERIVATIVE_SINGLE
return (
checks_any_grad_defined,
derivative_template.substitute(name=var_names[0], derivative=formula),
)
else:
if "grad_input_mask" in formula:
masks = [
f"task_should_compute_output({{ {n}_ix }})," for n in var_names
]
grad_input_mask = GRAD_INPUT_MASK.substitute(
masks=masks, n=len(var_names)
)
else:
grad_input_mask = ""
idx_ranges = ", ".join(f"{n}_ix" for n in var_names)
copy_ranges: list[str] = []
for i, n in enumerate(var_names):
copy_ranges.append(DERIVATIVE_MULTI_COPY_RANGE.substitute(name=n, i=i))
return False, DERIVATIVE_MULTI.substitute(
idx_ranges=idx_ranges,
copy_ranges=copy_ranges,
derivative=formula,
grad_input_mask=grad_input_mask,
)
body.extend(unpack)
need_any_grad_defined_var = False
for derivative in info.derivatives:
checks_any_grad_defined, derivative_text = emit_derivative(
derivative, info.args_with_derivatives
)
body.append(derivative_text)
need_any_grad_defined_var |= checks_any_grad_defined
# Since single-output derivative formulas need to check if grads are
# defined, only perform the check once, before all the formulas
if need_any_grad_defined_var:
body.insert(
-len(info.derivatives),
"bool any_grad_defined = any_variable_defined(grads);",
)
if info.name in UNTRACEABLE_FUNCTIONS:
superclass = "Node"
else:
superclass = "TraceableFunction"
all_getsetdef_structs = (
",\n".join(py_getsetdef_structs) + "," if len(py_getsetdef_structs) != 0 else ""
)
all_getter_definitions = "\n".join(getter_definitions)
return template.substitute(
op=info.op,
compute_index_ranges=compute_index_ranges,
saved_variables=saved_variables,
release_variables=release_variables,
saved_list_sizes=saved_list_sizes,
asserts=asserts,
thread_lock=thread_lock,
will_release_variables=will_release_variables,
body=body,
superclass=superclass,
all_getter_definitions=all_getter_definitions,
all_getsetdef_structs=all_getsetdef_structs,
compiled_args=compiled_args,
apply_with_saved_before=apply_with_saved_before,
apply_with_saved_after=apply_with_saved_after,
)

View File

@ -0,0 +1,675 @@
# Generates ADInplaceOrViewType.h/cpp
#
# NOTE: If any changes are being made to the ADInplaceOrView codegen please also check
# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp
# The fallback is expected to mimic this codegen, so we should keep the two in sync.
from __future__ import annotations
from torchgen.api import cpp
from torchgen.api.autograd import (
dispatch_strategy,
gen_differentiable_outputs,
NativeFunctionWithDifferentiabilityInfo,
)
from torchgen.api.types import (
BaseCType,
Binding,
boolT,
ConstRefCType,
CType,
DispatcherSignature,
intArrayRefT,
longT,
OptionalCType,
symIntArrayRefT,
SymIntT,
tensorT,
)
from torchgen.code_template import CodeTemplate
from torchgen.context import with_native_function
from torchgen.model import (
NativeFunction,
SchemaKind,
SelfArgument,
TensorOptionsArguments,
Type,
)
from torchgen.utils import FileManager
from .context import with_native_function_with_differentiability_info
from .gen_trace_type import (
get_return_value,
MANUAL_AUTOGRAD,
tie_return_values,
type_wrapper_name,
)
# See NOTE [ Autograd View Variables ] in variable.h for details.
# If you update the list VIEW_FUNCTIONS or RETURNS_VIEWS_OF_INPUT,
# you **MUST** also update the public list of view ops accordingly in
# docs/source/tensor_view.rst. Note that not all ATen functions are exposed to
# the public, e.g. alias & sparse_coo_tensor_with_dims_and_tensors.
#
# A map: function name => name of the argument that all outputs are view of
VIEW_FUNCTIONS_WITH_METADATA_CHANGE = [
"view_as_complex",
"view_as_real",
"_conj",
"_neg_view",
"_nested_get_values",
"_nested_view_from_buffer",
"_nested_view_from_jagged",
]
VIEW_FUNCTIONS = {
"numpy_T": "self",
"alias": "self",
"as_strided": "self",
"diagonal": "self",
"expand": "self",
"permute": "self",
"select": "self",
"slice": "self",
"slice_inverse": "self",
"split": "self",
"split_with_sizes": "self",
"squeeze": "self",
"t": "self",
"transpose": "self",
"unfold": "self",
"unsqueeze": "self",
"flatten": "self",
"view": "self",
"unbind": "self",
"_indices": "self",
"_values": "self",
"indices": "self",
"values": "self",
"crow_indices": "self",
"col_indices": "self",
"ccol_indices": "self",
"row_indices": "self",
# sparse_coo ctor output should really be views of both indices and values,
# but we only support making a view of a single variable, and indices is
# discrete anyway.
# FIXME: clone indices on construction.
"sparse_coo_tensor_with_dims_and_tensors": "values",
"_reshape_alias": "self",
"_test_autograd_multiple_dispatch_view": "self",
}
for key in VIEW_FUNCTIONS_WITH_METADATA_CHANGE:
VIEW_FUNCTIONS[key] = "self"
# Note: some of the functions below are just compositions of the view functions
# above. This list contains both the root view functions and any that are
# purely composed of viewing functions; it is used by the JIT to determine when
# an operator may return a view of its inputs (though it may sometimes return a
# copy instead, e.g. `contiguous`).
RETURNS_VIEWS_OF_INPUT = set(VIEW_FUNCTIONS.keys()).union(
{
"chunk",
"detach",
"contiguous",
"reshape",
"reshape_as",
"expand_as",
"view_as",
"real",
"imag",
"narrow",
"movedim",
"tensor_split",
"swapdims",
"swapaxes",
"mT",
"mH",
"adjoint",
"matrix_H",
}
)
# These are the functions we consider views for the purposes of validating
# StorageImpl and TensorImpl in gen_variable_type.
# `_unsafe_view` is not included in VIEW_FUNCTIONS above because it is not a
# view for the purposes of the ADInplaceOrView kernel, so we do not want to
# call as_view on it. See NOTE [Unsafe View] for more info.
ALL_VIEW_FUNCTIONS = {
**VIEW_FUNCTIONS,
"_unsafe_view": "self",
}
ARRAYREF_TO_VEC = CodeTemplate(
"""\
auto ${vec} = ${arg}.vec();
"""
)
OPTIONAL_TO_VAL = CodeTemplate(
"""\
auto ${val} = ${arg}.value_or(${default});
"""
)
CALL_DISPATCH = CodeTemplate(
"""\
at::_ops::${unambiguous_name}::call(${unpacked_args})"""
)
REVERSE_VIEW_DISPATCH = CodeTemplate(
"""\
${reverse_name}(${unpacked_args})"""
)
MULTI_OUTPUT_VIEW_ITERATION = CodeTemplate(
"""\
for (auto ${view_idx} : c10::irange(${var}.size())) {
${body}
}
"""
)
SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE = CodeTemplate(
"""\
std::unique_ptr<torch::autograd::ViewFunc> func(nullptr);
std::function<at::Tensor(const at::Tensor&)> rev_func=nullptr;
if (${is_view_with_metadata_change} ||
!self.unsafeGetTensorImpl()->support_as_strided() ||
self.unsafeGetTensorImpl()->is_python_dispatch() ||
c10::AutogradState::get_tls_state().get_view_replay_enabled()) {
${replay_view_func}
${reverse_replay_view_func}
}
"""
)
REPLAY_VIEW_FUNC = CodeTemplate(
"""\
func = std::make_unique<${view_func_name}>(${view_func_args});
"""
)
REVERSE_REPLAY_VIEW_LAMBDA_FUNC = CodeTemplate(
"""\
rev_func = [=](const at::Tensor& ${input_view}) {
return ${reverse_replay_view_call};
};
"""
)
METHOD_DEFINITION = CodeTemplate(
"""\
${return_type} ${type_wrapper_name}(${formals}) {
${type_definition_body}
}
"""
)
WRAPPER_REGISTRATION = CodeTemplate(
"""\
m.impl("${unqual_operator_name_with_overload}",
TORCH_FN(${class_type}::${type_wrapper_name})
);
"""
)
AUTOGRAD_NOT_IMPLEMENTED_REGISTRATION = CodeTemplate(
"""\
m.impl("${unqual_operator_name_with_overload}", torch::autograd::autogradNotImplementedFallback());
"""
)
INPLACE_REDISPATCH = CodeTemplate(
"""\
{
at::AutoDispatchBelowADInplaceOrView guard;
at::_ops::${unambiguous_name}::redispatch(${unpacked_args});
}
"""
)
ASSIGN_RETURN_VALUE = CodeTemplate(
"""\
${return_values} = ${rhs_value};
"""
)
VIEW_REDISPATCH = CodeTemplate(
"""\
${assign_return_values} ([&]() {
at::AutoDispatchBelowADInplaceOrView guard;
return at::_ops::${unambiguous_name}::redispatch(${unpacked_args});
})();
"""
)
TMP_VAR = "_tmp"
# FIXME: Ideally these functions should be methods on the Type class, but there
# is a comment in codegen/model.py saying these concepts are not well defined.
# Thus we put a version that is commonly used by autograd codegen here.
def is_tensor_type(t: Type) -> bool:
# TODO: Should handle optional here?
return t.is_tensor_like() and t.is_list_like() is None
def is_tensor_list_type(t: Type) -> bool:
# TODO: Should handle optional here?
return t.is_tensor_like() and t.is_list_like() is not None
UNPACK_TENSOR = CodeTemplate(
"""\
auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});"""
)
def unpacked_name(arg_name: str) -> str:
return arg_name + "_"
# e.g. select.int -> select_copy_int_inverse()
def inverse_view_name(f: NativeFunction) -> str:
copy_variant = f"{f.root_name}_copy"
overload = f"{f.func.name.overload_name}"
if overload != "":
overload = "_" + overload
return f"{copy_variant}{overload}_inverse"
def extract_bindings(f: NativeFunction) -> list[Binding]:
return [
r
for a in f.func.schema_order_arguments()
for r in cpp.argument(
a,
method=False,
symint=True,
cpp_no_default_args=set(),
faithful=False,
has_tensor_options=False,
)
]
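# Emits the `unpack*` statements for a kernel's tensor arguments and returns
# those lines together with the bindings the kernel body should use: tensor-like,
# non-nullable arguments are replaced by their unpacked `<name>_` counterparts,
# everything else is passed through unchanged.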
@with_native_function
def unpack_args(f: NativeFunction) -> tuple[list[str], list[Binding]]:
body: list[str] = []
unpacked_bindings: list[Binding] = []
for i, binding in enumerate(extract_bindings(f)):
assert not isinstance(binding.argument, SelfArgument)
if isinstance(binding.argument, TensorOptionsArguments):
raise RuntimeError("VariableKernel shouldn't take TensorOptions")
is_nullable = binding.argument.type.is_nullable()
if not binding.argument.type.is_tensor_like() or is_nullable:
unpacked_bindings.append(binding)
continue
is_tensor_list = is_tensor_list_type(binding.argument.type)
ref = (not is_nullable) and not is_tensor_list
suffix = "_opt" if is_nullable and not is_tensor_list else ""
body.append(
UNPACK_TENSOR.substitute(
arg_name=binding.name,
arg_pos=i,
suffix=suffix,
ref="&" if ref else "",
)
)
unpacked_bindings.append(
Binding(
name=unpacked_name(binding.name),
nctype=binding.nctype,
argument=binding.argument,
default=binding.default,
)
)
return body, unpacked_bindings
def get_base_name(f: NativeFunction) -> str:
return f.func.name.name.base # TODO: should be str(f.func.name.name)?
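# Returns the name of the argument that the op's outputs are a view of
# (typically "self"), or None if the op is not treated as returning a view.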
def get_view_info(f: NativeFunction) -> str | None:
base_name = get_base_name(f)
view_info = VIEW_FUNCTIONS.get(base_name, None)
if view_info is None and base_name in RETURNS_VIEWS_OF_INPUT:
view_info = "self"
return view_info
def emit_view_func(
f: NativeFunction, bindings: list[Binding], view_idx: str | None = None
) -> str:
"""Generate an additional lambda function to recover views in backward when as_strided is not supported.
See Note [View + Inplace update for base tensor] and [View + Inplace update for view tensor] for more details.
"""
# TODO: Clean this logic up if we get rid of reverse view funcs or reify them.
input_base = "input_base"
replay_view_func = ""
updated_args: list[str] = []
known_view_arg_simple_types: list[CType] = [
BaseCType(longT),
OptionalCType(BaseCType(longT)),
BaseCType(SymIntT),
OptionalCType(BaseCType(SymIntT)),
BaseCType(boolT),
BaseCType(intArrayRefT),
BaseCType(symIntArrayRefT),
ConstRefCType(BaseCType(tensorT)),
ConstRefCType(OptionalCType(BaseCType(tensorT))),
]
for binding in bindings:
arg, arg_type = binding.name, binding.nctype.type
if arg == "self":
updated_args.append(input_base)
continue
if arg_type not in known_view_arg_simple_types:
known_types_str = ", ".join([str(t) for t in known_view_arg_simple_types])
raise TypeError(
f"You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: "
f"{known_types_str}. Please update the list or materialize it so that it can be closed "
"over by value, also add a test in pytorch/xla/test/test_operations.py where this code "
"is exercised."
)
if arg_type == BaseCType(intArrayRefT) or arg_type == BaseCType(
symIntArrayRefT
):
# It's not safe to close over IntArrayRef by value, since this is a
# reference type, so materialize a vector to close over by value
arg_vec = arg + "_vec"
replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec)
updated_args.append(arg_vec)
elif arg_type == OptionalCType(BaseCType(longT)):
# Materialize int64_t? to int64_t
arg_value = arg + "_val"
replay_view_func += OPTIONAL_TO_VAL.substitute(
arg=arg, val=arg_value, default="0"
)
updated_args.append(arg_value)
elif arg_type == ConstRefCType(BaseCType(tensorT)) or arg_type == ConstRefCType(
OptionalCType(BaseCType(tensorT))
):
# NB: Closing over a tensor. If a user modifies this tensor, this will be silently
# incorrect. The proper thing to do is to store the version counter and copy on write.
updated_args.append(arg)
else:
updated_args.append(arg)
from .gen_view_funcs import view_func_name
view_func_args = [b.name for b in bindings if b.name != "self"]
if view_idx is not None:
view_func_args.append(f"{view_idx}")
replay_view_func += REPLAY_VIEW_FUNC.substitute(
view_func_name=view_func_name(f, include_namespace=True),
view_func_args=view_func_args,
)
input_view = "input_view"
reverse_unpacked_args = [
"self",
f"{input_view}",
# inverse_return_mode=
"at::functionalization::InverseReturnMode::AlwaysView",
*(() if view_idx is None else (f"{view_idx}",)),
# skip input_base arg
*updated_args[1:],
]
from torchgen.api.functionalization import reverse_name
reverse_replay_view_call = REVERSE_VIEW_DISPATCH.substitute(
reverse_name=reverse_name(f, include_namespace=True),
unpacked_args=reverse_unpacked_args,
)
reverse_replay_view_func = REVERSE_REPLAY_VIEW_LAMBDA_FUNC.substitute(
input_view=input_view, reverse_replay_view_call=reverse_replay_view_call
)
is_view_with_metadata_change = (
"true" if cpp.name(f.func) in VIEW_FUNCTIONS_WITH_METADATA_CHANGE else "false"
)
return SETUP_REPLAY_VIEW_IF_NOT_SUPPORT_AS_STRIDED_OR_VIEW_WITH_METADATA_CHANGE.substitute(
is_view_with_metadata_change=is_view_with_metadata_change,
replay_view_func=replay_view_func,
reverse_replay_view_func=reverse_replay_view_func,
)
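# Builds the view-handling code for an ADInplaceOrView kernel: returns a pair
# (setup code, rhs expression), where the rhs wraps the redispatched result in
# as_view(...) with the appropriate CreationMeta; TensorList (multi-output)
# views get a per-element loop indexed by view_idx.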
def emit_view_body(
fn: NativeFunctionWithDifferentiabilityInfo, var: str
) -> tuple[str, str]:
# See NOTE [ Autograd View Variables ] in variable.h for details.
f = fn.func
base_name = get_base_name(f)
view_info = get_view_info(f)
call = ""
differentiable_outputs = gen_differentiable_outputs(fn)
differentiable_output_vars = {r.name for r in differentiable_outputs}
if not isinstance(view_info, str):
raise TypeError(
f"The view info should be a string for {base_name}, but it is: {view_info}"
)
if len(differentiable_output_vars) == 0:
# no output is differentiable (.indices() for SparseTensors for example)
rhs_value = (
f"as_view({view_info}, {var}, "
f"/* is_bw_differentiable */ false, /* is_fw_differentiable */ false)"
)
elif len(differentiable_output_vars) == 1:
# Single differentiable output (Tensor or Tensor[])
return_info = differentiable_outputs[0]
# We only support simple Tensor or a TensorList for functions that return views
if not is_tensor_type(return_info.type) and not is_tensor_list_type(
return_info.type
):
raise RuntimeError(
f"{base_name} that return differentiable views can only return Tensor or Tensor[]"
)
# See Note [ View + Inplace detection]
def get_creation_meta_in_mode(original: str) -> str:
creation_meta_with_grad_mode = f"(at::GradMode::is_enabled() ? {original} : CreationMeta::NO_GRAD_MODE)"
return f"InferenceMode::is_enabled() ? CreationMeta::INFERENCE_MODE : {creation_meta_with_grad_mode}"
# Only allow rebasing of the history if we return a single Tensor
# If we are in a no grad block, raise a warning
# See NOTE [ View + Inplace detection ] for more details about this logic
if is_tensor_list_type(return_info.type):
creation_meta = get_creation_meta_in_mode("CreationMeta::MULTI_OUTPUT_NODE")
view_idx = "view_idx"
view_func = emit_view_func(
f, extract_bindings(f), view_idx=view_idx
).strip()
as_view_call = (
f"as_view(/* base */ {view_info}, /* output */ {var}[{view_idx}], "
"/* is_bw_differentiable */ true, /* is_fw_differentiable */ true, "
"/* view_func */ std::move(func), /* rev_view_func */ rev_func, "
f"/* creation_meta */ {creation_meta});"
)
call += MULTI_OUTPUT_VIEW_ITERATION.substitute(
var=var, view_idx=view_idx, body=f"{view_func}\n{as_view_call}"
)
rhs_value = f"std::move({var})"
else:
call += emit_view_func(f, extract_bindings(f), view_idx=None)
creation_meta = get_creation_meta_in_mode("CreationMeta::DEFAULT")
rhs_value = (
f"as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, "
"/* is_fw_differentiable */ true, "
f"/* view_func */ std::move(func), /* rev_view_func */ rev_func, /* creation_meta */ {creation_meta})"
)
else:
# This could be supported but we don't need it at the moment, so keeping things simple.
raise RuntimeError(
"Function that return multiple differentiable output "
"when at least one of them is view is not supported."
)
return call, rhs_value
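# In-place and out= overloads are the ones that mutate their arguments.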
def modifies_arguments(f: NativeFunction) -> bool:
return f.func.kind() in [SchemaKind.inplace, SchemaKind.out]
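# Emits the body of an ADInplaceOrView kernel: redispatch below the
# ADInplaceOrView key, then either bump the version counters of the mutated
# outputs (in-place/out ops) or wrap the result with view tracking (view ops).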
@with_native_function_with_differentiability_info
def emit_inplace_or_view_body(fn: NativeFunctionWithDifferentiabilityInfo) -> list[str]:
f = fn.func
inplace_view_body: list[str] = []
dispatcher_sig = DispatcherSignature.from_schema(f.func)
dispatcher_exprs = dispatcher_sig.exprs()
# code-generated ADInplaceOrView kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
dispatch_key_set = "ks & c10::after_ADInplaceOrView_keyset"
redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs])
# Note that this calls the slow, dispatching variants of manual_cpp_binding ops.
# We could probably work harder to ensure that the fast variants are called instead, but the perf benefit would be minimal.
if modifies_arguments(f): # inplace op
inplace_view_body.append(
INPLACE_REDISPATCH.substitute(
unambiguous_name=f.func.name.unambiguous_name(),
unpacked_args=redispatch_args,
)
)
for r in cpp.return_names(f):
inplace_view_body.append(f"increment_version({r});")
else:
assert get_view_info(f) is not None
inplace_view_body.append(
VIEW_REDISPATCH.substitute(
assign_return_values="auto " + TMP_VAR + " = ",
unambiguous_name=f.func.name.unambiguous_name(),
unpacked_args=redispatch_args,
)
)
call, rhs_value = emit_view_body(fn, TMP_VAR)
inplace_view_body.append(call)
assert rhs_value is not None
inplace_view_body.append(
ASSIGN_RETURN_VALUE.substitute(
return_values=tie_return_values(f), rhs_value=rhs_value
)
)
if f.func.returns:
inplace_view_body.append(f"return {get_return_value(f)};")
return inplace_view_body
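# Formats the C++ formal parameter list of a generated kernel, prepending the
# DispatchKeySet that is plumbed through the kernel.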
@with_native_function
def gen_formals(f: NativeFunction) -> str:
return ", ".join(
# code-generated autograd kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
["c10::DispatchKeySet ks"]
+ [
f'{cpp.argument_type(a, binds="__placeholder__", symint=True).cpp_type()} {a.name}'
for a in f.func.schema_order_arguments()
]
)
@with_native_function_with_differentiability_info
def inplace_or_view_method_definition(
fn: NativeFunctionWithDifferentiabilityInfo,
) -> str | None:
f = fn.func
if get_view_info(f) is None and (
# For functions that modify their inputs but don't return them,
# we can't give them autograd support.
# See https://github.com/pytorch/pytorch/issues/53796
not modifies_arguments(f)
or len(f.func.returns) == 0
):
return None
return METHOD_DEFINITION.substitute(
return_type=cpp.returns_type(f.func.returns, symint=True).cpp_type(),
type_wrapper_name=type_wrapper_name(f),
formals=gen_formals(f),
type_definition_body=emit_inplace_or_view_body(fn),
)
@with_native_function_with_differentiability_info
def inplace_or_view_method_registration(
fn: NativeFunctionWithDifferentiabilityInfo,
) -> str | None:
f = fn.func
if get_view_info(f) is None and (
not modifies_arguments(f) or len(f.func.returns) == 0
):
return None
return WRAPPER_REGISTRATION.substitute(
unqual_operator_name_with_overload=f.func.name,
type_wrapper_name=type_wrapper_name(f),
class_type="ADInplaceOrView",
)
def use_derived(fn: NativeFunctionWithDifferentiabilityInfo) -> bool:
f = fn.func
name = cpp.name(f.func)
return name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == "use_derived"
def gen_inplace_or_view_type_env(
fn: NativeFunctionWithDifferentiabilityInfo,
) -> dict[str, list[str]]:
definition = inplace_or_view_method_definition(fn)
registration = inplace_or_view_method_registration(fn)
return {
"ops_headers": (
[f"#include <ATen/ops/{fn.func.root_name}_ops.h>"]
if definition is not None
else []
),
"inplace_or_view_method_definitions": [definition]
if definition is not None
else [],
"inplace_or_view_wrapper_registrations": [registration]
if registration is not None
else [],
}
def gen_inplace_or_view_type(
out: str,
native_yaml_path: str,
tags_yaml_path: str,
fns_with_infos: list[NativeFunctionWithDifferentiabilityInfo],
template_path: str,
) -> None:
# NOTE: see Note [Sharded File] at the top of the VariableType.cpp
# template regarding sharding of the generated files.
num_shards = 2
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_sharded(
"ADInplaceOrViewType.cpp",
[fn for fn in fns_with_infos if use_derived(fn)],
key_fn=lambda fn: fn.func.root_name,
base_env={
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/ADInplaceOrViewType.cpp",
},
env_callable=gen_inplace_or_view_type_env,
num_shards=2,
sharded_keys={
"ops_headers",
"inplace_or_view_method_definitions",
"inplace_or_view_wrapper_registrations",
},
)

File diff suppressed because it is too large

@ -0,0 +1,536 @@
from __future__ import annotations
import itertools
from typing import Sequence
from torchgen.api import cpp
from torchgen.api.types import DispatcherSignature
from torchgen.code_template import CodeTemplate
from torchgen.context import with_native_function
from torchgen.model import Argument, NativeFunction, SchemaKind, TensorOptionsArguments
from torchgen.utils import FileManager
# Note [Manual Backend kernels]
# For these ops, we want to manually register to dispatch key Backend and
# skip codegen-ed registration to all keys before Backend.
# For codegen this means:
# - op set below must match ops with manual_kernel_registration=True in native_functions.yaml
# where we skip codegen backend kernels
# - all ops below are part of MANUAL_AUTOGRAD to skip codegen Autograd kernel registration
# - all ops below are part of MANUAL_TRACER to skip codegen Tracer kernel registration
# Note: we still register to dispatch key Profiler for these ops, keeping it untouched for now.
# You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
MANUAL_BACKEND = {
"options",
"data",
"set_data",
"is_leaf",
"output_nr",
"_version",
"retain_grad",
"_backward",
"requires_grad_",
}
# For these ops we want to skip the codegen-ed registration to both Autograd and Tracer keys.
# You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
MANUAL_AUTOGRAD_AND_TRACER = {
"resize_",
"resize_as_",
"detach",
"detach_",
"copy_",
"_fw_primal",
"_make_dual",
}
# Currently MANUAL_AUTOGRAD and MANUAL_TRACER share the same set of ops:
# union(MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER)
# You can find the manual registration in torch/csrc/autograd/VariableTypeManual.cpp
MANUAL_AUTOGRAD = MANUAL_TRACER = MANUAL_BACKEND | MANUAL_AUTOGRAD_AND_TRACER
# These functions we don't want to record for tracing, because we always want
# to trace their constituent parts. This is a temporary hack in lieu
# of proper scopes, where subsequent compilation passes can ask for the unfolding
# on demand. Only concrete ATen methods can be disabled this way; it will have
# NO EFFECT otherwise.
DONT_RECORD_TRACE = {
"convolution",
"conv1d",
"conv2d",
"conv3d",
"conv_transpose1d",
"conv_transpose2d",
"conv_transpose3d",
"lstm_cell",
"gru_cell",
"rnn_tanh_cell",
"rnn_relu_cell",
# FIXME: figure out a better way when we support sparse tensors in jit
"_coalesced",
}
def should_trace(f: NativeFunction) -> bool:
# Operations involving Storage or Type are not traceable at the moment
if any(
str(arg.type) in {"Storage", "Type", "ConstQuantizerPtr"}
for arg in f.func.schema_order_arguments()
):
return False
# We can't trace functions which don't have any Tensor or TensorList returns
if not any(r.type.is_tensor_like() for r in f.func.returns):
return False
return f.func.name.name.base not in DONT_RECORD_TRACE
SELECT = CodeTemplate(
"""\
if (${cond}) {
${true}
} else {
${false}
}
"""
)
OP_NAME = CodeTemplate(
"""\
op_name = c10::Symbol::fromQualString("aten::${trace_name}");
"""
)
# These functions have their names renamed when they are recorded in the trace:
RENAME_TRACE = {
"zero": "zeros_like", # replacing aten::zero_ with aten::zeros_like
"fill": "full_like", # replacing aten::fill_ with aten::full_like
}
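# Picks the aten op name to record for the traced node; for in-place ops this
# selects between the outplace and inplace names at runtime based on
# tracer_state->force_outplace.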
def format_trace_op_name(f: NativeFunction) -> str:
# TODO: byte-for-byte compatible with old codegen behavior - should clean up
if (
f.func.kind() in (SchemaKind.functional, SchemaKind.out)
or f.func.name.name.dunder_method
):
# special case for *_out functions: the in-place and out-of-place ops
# are overloaded with the same name in the JIT
trace_name = str(f.func.name.name)
trace_name = RENAME_TRACE.get(trace_name, trace_name)
return OP_NAME.substitute(trace_name=trace_name)
# otherwise, this is an in-place op and we need to emit both in- and
# out-of-place versions
outplace_trace_name = f.func.name.name.base
inplace_trace_name = cpp.name(f.func)
outplace_trace_name = RENAME_TRACE.get(outplace_trace_name, outplace_trace_name)
inplace_trace_name = RENAME_TRACE.get(inplace_trace_name, inplace_trace_name)
return SELECT.substitute(
cond="tracer_state->force_outplace",
true=OP_NAME.substitute(trace_name=outplace_trace_name),
false=OP_NAME.substitute(trace_name=inplace_trace_name),
)
ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${name}", ${input});""")
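# Emits jit::tracer::addInputs calls for every schema argument, expanding
# TensorOptions into dtype/layout/device/pin_memory and handling the out
# arguments of *_out overloads specially (traced as regular inputs for the
# inplace variant, or as TensorOptions when the outplace variant is a factory).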
def format_trace_inputs(f: NativeFunction) -> str:
def dispatch_trace_input(arg: Argument | TensorOptionsArguments) -> Sequence[str]:
if isinstance(arg, TensorOptionsArguments):
name = "options"
return [
ADD_TRACE_INPUT.substitute(
name=name, input="c10::optTypeMetaToScalarType(options.dtype_opt())"
),
ADD_TRACE_INPUT.substitute(name=name, input="options.layout()"),
ADD_TRACE_INPUT.substitute(name=name, input="options.device()"),
ADD_TRACE_INPUT.substitute(name=name, input="options.pinned_memory()"),
]
else:
name = arg.name
if str(arg.type) == "Tensor?[]":
return [f'jit::tracer::addInputs(node, "{name}", {name});']
else:
return [ADD_TRACE_INPUT.substitute(name=name, input=name)]
args: list[Argument | TensorOptionsArguments] = list(
f.func.schema_order_arguments()
)
if f.func.is_out_fn():
# *_out functions take the result as a separate argument, but we don't want to
# trace that argument directly. Instead, we trace its TensorOptions.
# So first, we need to remove the out argument from the list of arguments to trace.
num_out_args = len(f.func.arguments.out)
args = args[:-num_out_args]
trace_inputs = itertools.chain.from_iterable(
dispatch_trace_input(arg) for arg in args
)
if f.func.is_out_fn():
# for *_out functions, handle the result argument differently for inplace/outplace.
# For inplace: just add the input to the end to conform to the JIT schema
inplace = [
ADD_TRACE_INPUT.substitute(
name=f.func.arguments.out[i].name, input=f.func.arguments.out[i].name
)
for i in range(num_out_args)
]
# for outplace: do nothing, except if the function is a factory.
# Factories are a bit special because their out-of-place overloads
# take an extra TensorOptions argument, which is missing in the _out function
has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns)
has_tensor_input_arg = any(
a.type.is_tensor_like() for a in f.func.arguments.flat_non_out
)
is_factory_method = f.category_override == "factory" or (
has_tensor_return and not has_tensor_input_arg
)
# HACK: preserve old codegen behavior - the old codegen set the `is_factory_method`
# flag for the whole family of ops with the same basename if any of them is a
# factory method. In most cases the whole family of ops are indeed all factory
# methods - 'normal' is the only exception. So we handle it specially here to avoid
# cloning the old logic.
if f.func.name.name.base == "normal":
is_factory_method = True
if is_factory_method:
outplace = [
ADD_TRACE_INPUT.substitute(
name="out",
input="c10::optTypeMetaToScalarType(out.options().dtype_opt())",
),
ADD_TRACE_INPUT.substitute(name="out", input="out.options().layout()"),
ADD_TRACE_INPUT.substitute(name="out", input="out.options().device()"),
ADD_TRACE_INPUT.substitute(
name="out", input="out.options().pinned_memory()"
),
]
else:
outplace = []
trace_inputs = itertools.chain(
trace_inputs,
[
SELECT.substitute(
cond="tracer_state->force_outplace",
true="\n".join(outplace),
false="\n".join(inplace),
)
],
)
return "\n".join(trace_inputs)
# `torch.jit.trace` has an undocumented keyword argument `_force_outplace`,
# which forces the jit to replace functions with their outplace variants (for
# example `aten::add_` becomes `aten::add`).
#
# This replacement is implemented in-place with minimal modification of the
# argument stack (it assumes that the outplace call takes the same arguments
# as the inplace version).
#
# However, there are no such substitutions available for the `aten::fill_`
# and `aten::zero_` operators, as we never implemented `aten::fill`
# and `aten::zero`. So the jit tracing hack replaces `aten::zero_` with
# `aten::zeros_like` and `aten::fill_` with `aten::full_like`.
#
# But as these can potentially take different arguments, we also have
# to hack into the stack and add the missing ones.
#
# A possible alternative would be:
#
# - Add `aten::fill` and `aten::zero`
#
# - Or keep `aten::zeros_like` arguments aligned with `aten::zero_`
# arguments (inside of the `native_functions.yaml`)
RENAME_TRACE_ADD_ARGS = {
"fill": """\
jit::tracer::addInputs(node, "options", ::std::optional<ScalarType>());
jit::tracer::addInputs(node, "options", layout_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", device_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", pinned_memory_or_default(::std::nullopt));
::std::optional<MemoryFormat> memory_format = c10::MemoryFormat::Preserve;
jit::tracer::addInputs(node, "memory_format", memory_format);
""",
"zero": """\
jit::tracer::addInputs(node, "options", ::std::optional<ScalarType>());
jit::tracer::addInputs(node, "options", layout_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", device_or_default(::std::nullopt));
jit::tracer::addInputs(node, "options", pinned_memory_or_default(::std::nullopt));
::std::optional<MemoryFormat> memory_format = c10::MemoryFormat::Preserve;
jit::tracer::addInputs(node, "memory_format", memory_format);
""",
}
INPLACE_GUARD = CodeTemplate(
"""\
jit::tracer::ensureUniqueIfOutOfPlaced("${name}", ${mutable_input});
"""
)
PRE_RECORD_TRACE = CodeTemplate(
"""\
torch::jit::Node* node = nullptr;
std::shared_ptr<jit::tracer::TracingState> tracer_state;
if (jit::tracer::isTracing()) {
tracer_state = jit::tracer::getTracingState();
at::Symbol op_name;
${set_op_name}
node = tracer_state->createNode(op_name, /*num_outputs=*/0);
jit::tracer::recordSourceLocation(node);
${add_trace_inputs}
tracer_state->insertNode(node);
${inplace_guard}
jit::tracer::setTracingState(nullptr);
}
"""
)
def format_prerecord_trace(f: NativeFunction) -> str:
if not should_trace(f):
return ""
# TODO: clean up old codegen behavior
is_inplace = (
f.func.kind() in (SchemaKind.inplace, SchemaKind.out)
and not f.func.name.name.dunder_method
)
add_args = (
RENAME_TRACE_ADD_ARGS.get(f.func.name.name.base, "") if is_inplace else ""
)
additional_inputs = (
SELECT.substitute(
cond="tracer_state->force_outplace",
true=add_args,
false="",
)
if add_args
else ""
)
return PRE_RECORD_TRACE.substitute(
set_op_name=format_trace_op_name(f),
add_trace_inputs=format_trace_inputs(f) + additional_inputs,
inplace_guard=INPLACE_GUARD.substitute(
name=cpp.name(f.func),
mutable_input=f.func.arguments.out[0].name
if f.func.arguments.out
else "self",
)
if is_inplace
else "",
)
POST_RECORD_TRACE = CodeTemplate(
"""\
if (tracer_state) {
jit::tracer::setTracingState(std::move(tracer_state));
${add_trace_outputs}
}
"""
)
def format_postrecord_trace(f: NativeFunction) -> str:
if not should_trace(f):
return ""
# For outplacing ops, *_out overloads require special handling to move the
# output *argument* to a return value
if f.func.is_out_fn():
output_names_outplace = [arg.name for arg in f.func.arguments.out]
output_names_inplace = cpp.return_names(f)
# Code size optimization: the common case is that the return value is
# the same for both variants
if output_names_outplace == output_names_inplace:
outputs = [
f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace
]
return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs)
selection = SELECT.substitute(
cond="force_outplace",
true="\n".join(
f"jit::tracer::addOutput(node, {n});" for n in output_names_outplace
),
false="\n".join(
f"jit::tracer::addOutput(node, {n});" for n in output_names_inplace
),
)
return POST_RECORD_TRACE.substitute(add_trace_outputs=selection)
else:
output_names = cpp.return_names(f)
outputs = [f"jit::tracer::addOutput(node, {n});" for n in output_names]
return POST_RECORD_TRACE.substitute(add_trace_outputs=outputs)
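# Helpers for binding and forwarding the redispatched return value(s), e.g.
# `auto result = ...` for a single return and a structured binding
# `auto [out0, out1] = ...` for multiple returns.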
def tie_return_values(f: NativeFunction) -> str:
if len(f.func.returns) == 1:
return f'auto {f.func.returns[0].name or "result"}'
names = cpp.return_names(f)
return f'auto [{", ".join(names)}]'
def get_return_value(f: NativeFunction) -> str:
names = cpp.return_names(f)
if len(f.func.returns) == 1:
return names[0]
if f.func.kind() == SchemaKind.out:
return f'std::forward_as_tuple({", ".join(names)})'
else:
moved = ", ".join(f"std::move({name})" for name in names)
return f"std::make_tuple({moved})"
TRACE_DISPATCH = CodeTemplate(
"""\
${assign_return_values}at::_ops::${unambiguous_name}::redispatch(${unpacked_args});"""
)
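# Assembles a TraceType kernel body: record the trace node and its inputs,
# redispatch below the Tracer key, then record the outputs and return.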
def emit_trace_body(f: NativeFunction) -> list[str]:
trace_body: list[str] = []
trace_body.append(format_prerecord_trace(f))
dispatcher_sig = DispatcherSignature.from_schema(f.func)
dispatcher_exprs = dispatcher_sig.exprs()
# code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
dispatch_key_set = "ks & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::Tracer)"
redispatch_args = ", ".join([dispatch_key_set] + [a.expr for a in dispatcher_exprs])
assign_return_values = (
f"{tie_return_values(f)} = "
if f.func.kind() in [SchemaKind.functional, SchemaKind.mutable]
and f.func.returns
else ""
)
# Note that this calls the slow, dispatching variants of manual_cpp_binding ops.
# We could probably work harder to ensure that the fast variants are
# called instead, but the perf benefit would be minimal.
trace_body.append(
TRACE_DISPATCH.substitute(
assign_return_values=assign_return_values,
unambiguous_name=f.func.name.unambiguous_name(),
unpacked_args=redispatch_args,
)
)
trace_body.append(format_postrecord_trace(f))
if f.func.returns:
trace_body.append(f"return {get_return_value(f)};")
return trace_body
METHOD_DEFINITION = CodeTemplate(
"""\
${return_type} ${type_wrapper_name}(${formals}) {
${type_definition_body}
}
"""
)
def type_wrapper_name(f: NativeFunction, key: str = "Default") -> str:
if f.func.name.overload_name:
name = f"{cpp.name(f.func)}_{f.func.name.overload_name}"
else:
name = cpp.name(f.func)
# The key argument is only used in gen_variable_type where we need fns per autograd dispatch key.
# In gen_trace_type and gen_inplace_view_type where only one fn per native_fn must be generated,
# the key argument should not be passed.
# We do not append key if it is Default so that generated functions from
# before per-dispatch-key derivatives were added retain the same names.
if key != "Default":
name = name + f"_{key}"
return name
@with_native_function
def method_definition(f: NativeFunction) -> str:
assert cpp.name(f.func) not in MANUAL_TRACER
formals = ", ".join(
# code-generated tracing kernels plumb and recompute dispatch keys directly through the kernel for performance.
# See Note [Plumbing Keys Through The Dispatcher] for details.
["c10::DispatchKeySet ks"]
+ [
f'{cpp.argument_type(a, binds="__placeholder__", symint=True).cpp_type()} {a.name}'
for a in f.func.schema_order_arguments()
]
)
return METHOD_DEFINITION.substitute(
return_type=cpp.returns_type(f.func.returns, symint=True).cpp_type(),
type_wrapper_name=type_wrapper_name(f),
formals=formals,
type_definition_body=emit_trace_body(f),
)
WRAPPER_REGISTRATION = CodeTemplate(
"""\
m.impl("${name}",
TORCH_FN(${class_type}::${type_wrapper_name})
);
"""
)
@with_native_function
def method_registration(f: NativeFunction) -> str:
assert cpp.name(f.func) not in MANUAL_TRACER
return WRAPPER_REGISTRATION.substitute(
name=f.func.name,
type_wrapper_name=type_wrapper_name(f),
class_type="TraceType",
)
def gen_trace_type_func(fn: NativeFunction) -> dict[str, list[str]]:
return {
"ops_headers": [f"#include <ATen/ops/{fn.root_name}_ops.h>"],
"trace_method_definitions": [method_definition(fn)],
"trace_wrapper_registrations": [method_registration(fn)],
}
def gen_trace_type(
out: str, native_functions: list[NativeFunction], template_path: str
) -> None:
# NOTE: see Note [Sharded File] at the top of the VariableType.cpp
# template regarding sharding of the generated files.
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_sharded(
"TraceType.cpp",
[fn for fn in native_functions if cpp.name(fn.func) not in MANUAL_TRACER],
key_fn=lambda fn: fn.root_name,
base_env={
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/TraceType.cpp",
},
env_callable=gen_trace_type_func,
num_shards=5,
sharded_keys={
"ops_headers",
"trace_method_definitions",
"trace_wrapper_registrations",
},
)


@ -0,0 +1,116 @@
# Generates C++ functions that wrap ATen tensor factory methods to turn them into Variables.
#
# This writes one file: variable_factories.h
from __future__ import annotations
import re
import torchgen.api.python as python
from torchgen.api import cpp
from torchgen.api.types import CppSignatureGroup
from torchgen.context import with_native_function
from torchgen.gen import parse_native_yaml
from torchgen.model import NativeFunction, TensorOptionsArguments, Variant
from torchgen.utils import FileManager, mapMaybe
OPTIONAL_TYPE_PATTERN = re.compile(r"std::optional<(.+)>")
TYPE_PATTERN = re.compile(r"(?:const\s+)?([A-Z]\w+)")
# Add 'at::' to types defined in the ATen namespace, e.g. Tensor, TensorList, IntArrayRef, etc.
# TODO: maybe update the cpp argument API to take optional namespace argument?
def fully_qualified_type(argument_type: str) -> str:
def maybe_optional_type(type: str, is_opt: bool) -> str:
return f"std::optional<{type}>" if is_opt else type
opt_match = OPTIONAL_TYPE_PATTERN.match(argument_type)
is_opt = opt_match is not None
if opt_match:
argument_type = argument_type[opt_match.start(1) : opt_match.end(1)]
match = TYPE_PATTERN.match(argument_type)
if match is None:
return maybe_optional_type(argument_type, is_opt)
index = match.start(1)
qualified_type = f"{argument_type[:index]}at::{argument_type[index:]}"
return maybe_optional_type(qualified_type, is_opt)
def gen_variable_factories(
out: str, native_yaml_path: str, tags_yaml_path: str, template_path: str
) -> None:
native_functions = parse_native_yaml(
native_yaml_path, tags_yaml_path
).native_functions
factory_functions = [fn for fn in native_functions if is_factory_function(fn)]
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
fm.write_with_template(
"variable_factories.h",
"variable_factories.h",
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/variable_factories.h",
"ops_headers": [
f"#include <ATen/ops/{fn.root_name}.h>" for fn in factory_functions
],
"function_definitions": list(mapMaybe(process_function, factory_functions)),
},
)
@with_native_function
def is_factory_function(f: NativeFunction) -> bool:
if Variant.function not in f.variants:
return False
name = cpp.name(f.func)
has_tensor_options = python.has_tensor_options(f)
return has_tensor_options or name.endswith("_like")
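# Emits the variable_factories.h wrapper(s) for a factory op: one inline
# function per C++ signature (plus the SymInt signature when present) that
# strips requires_grad from the TensorOptions, calls the ATen op below the
# ADInplaceOrView key, and sets requires_grad on the resulting Variable.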
@with_native_function
def process_function(f: NativeFunction) -> str | None:
name = cpp.name(f.func)
has_tensor_options = python.has_tensor_options(f)
is_factory = has_tensor_options or name.endswith("_like")
if Variant.function not in f.variants or not is_factory:
return None
cpp_sigs = CppSignatureGroup.from_native_function(f, method=False)
sigs = [cpp_sigs.signature]
if cpp_sigs.symint_signature is not None:
sigs.append(cpp_sigs.symint_signature)
r = ""
for sig in sigs:
formals: list[str] = []
exprs: list[str] = []
requires_grad = "false"
for arg in sig.arguments():
qualified_type = fully_qualified_type(arg.type)
if arg.default:
formals.append(f"{qualified_type} {arg.name} = {arg.default}")
else:
formals.append(f"{qualified_type} {arg.name}")
if isinstance(arg.argument, TensorOptionsArguments):
# note: we remove the requires_grad setting from the TensorOptions because
# it is ignored anyway (and we actually have an assertion that it isn't set
# which would fail otherwise). We handle requires_grad explicitly here
# instead of passing it through to the kernel.
exprs.append(
f"at::TensorOptions({arg.name}).requires_grad(::std::nullopt)"
)
# Manually set the requires_grad bit on the result tensor.
requires_grad = f"{arg.name}.requires_grad()"
else:
exprs.append(arg.name)
r += f"""\
inline at::Tensor {sig.name()}({', '.join(formals)}) {{
at::AutoDispatchBelowADInplaceOrView guard;
return autograd::make_variable(at::{sig.name()}({', '.join(exprs)}), /*requires_grad=*/{requires_grad});
}}
"""
return r

File diff suppressed because it is too large

@ -0,0 +1,340 @@
# Generates ViewFuncs.h/cpp
#
# NOTE: If any changes are being made to the ViewFunc codegen please also check
# if updates are needed in torch/csrc/autograd/autograd_not_implemented_fallback.cpp
# The fallback is expected to mimic this codegen, so we should keep the two in sync.
from __future__ import annotations
from typing import TYPE_CHECKING
import torchgen.api.dispatcher as dispatcher
from torchgen.api.translate import translate
from torchgen.api.types import (
BaseCType,
Binding,
NamedCType,
SymIntT,
tensorT,
VectorCType,
)
from torchgen.code_template import CodeTemplate
from torchgen.model import Argument, NativeFunction, OptionalType
from torchgen.utils import FileManager
from .gen_inplace_or_view_type import (
CALL_DISPATCH,
extract_bindings,
get_view_info,
modifies_arguments,
use_derived,
)
if TYPE_CHECKING:
from torchgen.api.autograd import NativeFunctionWithDifferentiabilityInfo
FUNCTION_DECLARATION = CodeTemplate(
"""\
#define ${uppercase_op}_AVAILABLE
struct ${op} : public ${superclass} {
${op}(${constructor_args}) ${initializer_list}
{};
virtual ~${op}() override {};
virtual std::vector<c10::SymInt> get_symints() const override;
virtual size_t num_symints() const override;
virtual std::vector<at::Tensor> get_tensors() const override;
virtual size_t num_tensors() const override;
virtual at::Tensor operator()(const at::Tensor&) const override;
virtual std::unique_ptr<ViewFunc> clone_and_set(
std::optional<std::vector<c10::SymInt>> = ::std::nullopt,
std::optional<std::vector<at::Tensor>> = ::std::nullopt) const override;
protected:
virtual void set_symints(std::vector<c10::SymInt>) override;
virtual void set_tensors(std::vector<at::Tensor>) override;
private:
${state}
};
"""
)
FUNCTION_DEFINITION = CodeTemplate(
"""\
std::vector<c10::SymInt> ${op}::get_symints() const {
${get_symints}
}
size_t ${op}::num_symints() const {
return static_cast<size_t>(${num_symints});
}
void ${op}::set_symints(std::vector<c10::SymInt> ${symints_vec}) {
TORCH_INTERNAL_ASSERT(${symints_vec}.size() == num_symints());
${set_symints}
}
std::vector<at::Tensor> ${op}::get_tensors() const {
${get_tensors}
}
size_t ${op}::num_tensors() const {
return static_cast<size_t>(${num_tensors});
}
void ${op}::set_tensors(std::vector<at::Tensor> ${tensors_vec}) {
TORCH_INTERNAL_ASSERT(${tensors_vec}.size() == num_tensors());
${set_tensors}
}
at::Tensor ${op}::operator()(const at::Tensor& ${call_input_name}) const {
return ${op_call};
}
std::unique_ptr<ViewFunc> ${op}::clone_and_set(
std::optional<std::vector<c10::SymInt>> ${symints_vec},
std::optional<std::vector<at::Tensor>> ${tensors_vec}) const {
auto output = std::make_unique<${op}>(${clone_args});
if (${symints_vec}.has_value()) {
output->set_symints(std::move(*(${symints_vec})));
}
if (${tensors_vec}.has_value()) {
output->set_tensors(std::move(*(${tensors_vec})));
}
return output;
}
"""
)
# e.g. as_strided -> AsStridedViewFunc for camel case or
# as_strided_view_func otherwise
def view_func_name(
f: NativeFunction, include_namespace: bool = False, camel_case: bool = True
) -> str:
name = f.func.name.unambiguous_name()
view_func_name = f"{name.replace('.', '_')}_view_func"
if camel_case:
is_private = view_func_name.startswith("_")
view_func_name = "".join(
[p.title() for p in view_func_name.replace(".", "_").split("_")]
)
if is_private:
# put the leading underscore back in
view_func_name = f"_{view_func_name}"
namespace = "torch::autograd::generated::" if include_namespace else ""
return f"{namespace}{view_func_name}"
def is_symint_or_tensor(arg: Argument) -> bool:
return arg.type.is_tensor_like() or arg.type.is_symint_like()
def remove_const_ref(binding: Binding) -> Binding:
return Binding(
name=binding.name,
nctype=binding.nctype.remove_const_ref(),
argument=binding.argument,
default=binding.default,
)
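# True if the op's single return is a list of tensors, i.e. a multi-output
# view such as split or unbind.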
def returns_multi_tensor(fn: NativeFunction) -> bool:
returns = fn.func.returns
assert len(returns) == 1
returns_list_like = returns[0].type.is_list_like() is not None
returns_tensor_like = returns[0].type.is_tensor_like()
return returns_list_like and returns_tensor_like
# Generates strings with logic for getting / setting state of a particular type.
#
# Args:
# bindings (list): List of state bindings of interest (may be empty)
# state_vec_type (NamedCType): Type of vector to either return or copy from
#
# Returns:
# tuple: (list of getter logic strings, list of setter logic strings, string
# with num items expression)
def generate_state_getter_setter(
bindings: list[Binding],
state_vec_type: NamedCType,
) -> tuple[list[str], list[str], str]:
getter_logic = []
setter_logic = []
state_vec = state_vec_type.name
getter_logic.append(f"{state_vec_type.cpp_type()} {state_vec};")
if len(bindings) > 0:
setter_logic.append("auto i = 0;")
num_exprs = []
for i, b in enumerate(bindings):
assert isinstance(b.argument, Argument)
if b.argument.type.is_list_like():
# Handle list-likes.
num_expr = f"{b.name}.size()"
num_exprs.append(num_expr)
getter = f"{state_vec}.insert({state_vec}.end(), {b.name}.begin(), {b.name}.end());"
setter = f"std::copy({state_vec}.begin() + i, {state_vec}.begin() + i + {b.name}.size(), {b.name}.begin());"
elif isinstance(b.argument.type, OptionalType):
# Handle optionals.
num_expr = f"({b.name}.has_value() ? 1 : 0)"
num_exprs.append(num_expr)
conditional = f"if({b.name}.has_value())"
getter = (
f"{conditional} {state_vec}.insert({state_vec}.end(), *({b.name}));"
)
setter = f"{conditional} {b.name} = {state_vec}[i];"
else:
num_expr = "1"
num_exprs.append(num_expr)
getter = f"{state_vec}.push_back({b.name});"
setter = f"{b.name} = {state_vec}[i];"
getter_logic.append(getter)
setter_logic.append(setter)
if i < len(bindings) - 1:
setter_logic.append(f"i += {num_expr};")
# Reserve / assert based on the total number of items expression.
num_items = "0" if len(num_exprs) == 0 else " + ".join(num_exprs)
if len(bindings) > 0:
getter_logic.insert(1, f"{state_vec}.reserve({num_items});")
getter_logic.append(f"return {state_vec};")
return getter_logic, setter_logic, num_items
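# Instantiates FUNCTION_DECLARATION / FUNCTION_DEFINITION for one view op: the
# generated ViewFunc subclass stores all non-self arguments by value, exposes
# its SymInt / Tensor state through the getters and setters generated above,
# and replays the view by redispatching to the original op (indexing the result
# with view_idx for multi-output views).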
def process_function(fn: NativeFunction, template: CodeTemplate) -> str:
bindings = extract_bindings(fn)
non_self_bindings = [b for b in bindings if b.name != "self"]
non_self_args = fn.func.arguments.flat_all[1:]
non_self_value_bindings = [
dispatcher.argument(a, remove_non_owning_ref_types=True) for a in non_self_args
]
# Generate constructor / clone args for the generated struct.
constructor_args = [b.defn() for b in non_self_bindings]
clone_args = [b.name for b in non_self_bindings]
# Generate state variable declarations for the generated struct.
state_variables = [
f"{remove_const_ref(b).defn()};" for b in non_self_value_bindings
]
# Generate initializer list expressions for the generated struct.
# allow_expensive_conversions=True because we need to store e.g. SymIntArrayRefs as
# vector<SymInt>s.
init_exprs = translate(
non_self_bindings, non_self_value_bindings, allow_expensive_conversions=True
)
initializers = []
for b, init_expr in zip(non_self_bindings, init_exprs):
name = b.nctype.name
assert isinstance(name, str)
initializers.append(f"{name}({init_expr.expr})")
# Generate call to underlying view op
call_input_name = "input_base"
op_call_args = [call_input_name, *(b.name for b in non_self_bindings)]
op_call = CALL_DISPATCH.substitute(
unambiguous_name=fn.func.name.unambiguous_name(),
unpacked_args=op_call_args,
)
# Multi-output views additionally require a view_idx for disambiguation.
if returns_multi_tensor(fn):
view_idx_name = "view_idx"
view_idx_typename = "int64_t"
view_idx_decl = f"{view_idx_typename} {view_idx_name}"
constructor_args.append(view_idx_decl)
clone_args.append(view_idx_name)
state_variables.append(f"{view_idx_decl};")
initializers.append(f"{view_idx_name}({view_idx_name})")
op_call += f"[{view_idx_name}]"
# Generate initializer list for the generated struct.
initializer_list = f": {', '.join(initializers)}" if len(initializers) > 0 else ""
# Generate getter / setter logic for any symints.
symint_bindings = [
b
for b in non_self_bindings
if isinstance(b.argument, Argument) and b.argument.type.is_symint_like()
]
symints_vec_type = NamedCType("symints", VectorCType(BaseCType(SymIntT)))
get_symints, set_symints, num_symints = generate_state_getter_setter(
symint_bindings, symints_vec_type
)
# Generate getter / setter logic for any tensors.
tensor_bindings = [
b
for b in non_self_bindings
if isinstance(b.argument, Argument) and b.argument.type.is_tensor_like()
]
tensors_vec_type = NamedCType("tensors", VectorCType(BaseCType(tensorT)))
get_tensors, set_tensors, num_tensors = generate_state_getter_setter(
tensor_bindings, tensors_vec_type
)
return template.substitute(
op=view_func_name(fn),
uppercase_op=view_func_name(fn, camel_case=False).upper(),
superclass="torch::autograd::ViewFunc",
initializer_list=initializer_list,
state=state_variables,
constructor_args=constructor_args,
clone_args=clone_args,
symints_vec=symints_vec_type.name,
get_symints=get_symints,
set_symints=set_symints,
num_symints=num_symints,
tensors_vec=tensors_vec_type.name,
get_tensors=get_tensors,
set_tensors=set_tensors,
num_tensors=num_tensors,
call_input_name=call_input_name,
op_call=op_call,
)
def gen_view_funcs(
out: str,
fns_with_infos: list[NativeFunctionWithDifferentiabilityInfo],
template_path: str,
) -> None:
# don't need the info parts, just the function
fns = [fn.func for fn in fns_with_infos if use_derived(fn)]
# only want out-of-place views
view_fns = [
fn for fn in fns if get_view_info(fn) is not None and not modifies_arguments(fn)
]
declarations = [process_function(fn, FUNCTION_DECLARATION) for fn in view_fns]
definitions = [process_function(fn, FUNCTION_DEFINITION) for fn in view_fns]
ops_headers = [f"#include <ATen/ops/{fn.root_name}_ops.h>" for fn in view_fns]
file_basename = "ViewFuncs"
fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False)
for suffix in [".h", ".cpp"]:
fname = file_basename + suffix
fm.write_with_template(
fname,
fname,
lambda: {
"generated_comment": "@"
+ f"generated from {fm.template_dir_for_comments()}/"
+ fname,
"view_func_declarations": declarations,
"view_func_definitions": definitions,
"ops_headers": ops_headers,
},
)

File diff suppressed because it is too large

@ -0,0 +1,38 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include "torch/csrc/autograd/VariableTypeUtils.h"
#include "torch/csrc/autograd/generated/ViewFuncs.h"
#include <torch/library.h>
#include <ATen/FunctionalInverses.h>
#include <ATen/FunctionalTensorWrapper.h>
// ${generated_comment}
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
$ops_headers
#endif
using namespace at;
using torch::autograd::CreationMeta;
using torch::autograd::as_view;
using torch::autograd::increment_version;
namespace torch {
namespace ADInplaceOrView {
namespace {
${inplace_or_view_method_definitions}
} // namespace
} // namespace ADInplaceOrView
namespace {
TORCH_LIBRARY_IMPL(aten, ADInplaceOrView, m) {
${inplace_or_view_wrapper_registrations};
}
} // namespace
} // namespace torch


@ -0,0 +1,20 @@
#include "torch/csrc/autograd/FunctionsManual.h"
#include "torch/csrc/dynamo/compiled_autograd.h"
// ${generated_comment}
// The manual function definitions that used to be here are now in torch/csrc/autograd/FunctionsManual.cpp
// This speeds up re-compilation and allows sharing these implementations so that they can be
// used for forward mode AD formulas as well.
using namespace torch::autograd::generated::details;
using at::Tensor;
using at::Scalar;
using at::IntArrayRef;
using at::TensorList;
namespace torch::autograd::generated {
${autograd_function_definitions}
} // namespace torch::autograd::generated


@ -0,0 +1,51 @@
#pragma once
// ${generated_comment}
#include <ATen/ATen.h>
#include <ATen/core/functional.h>
#include <ATen/TensorGeometry.h>
#include "torch/csrc/autograd/function.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/autograd/saved_variable.h"
#include <torch/csrc/Export.h>
#include <c10/core/SymIntArrayRef.h>
namespace torch { namespace autograd { namespace generated {
using at::Scalar;
using at::Tensor;
using at::IntArrayRef;
using at::ArrayRef;
using at::Type;
using at::TensorGeometry;
using at::ScalarType;
using std::optional;
using c10::fmap;
inline std::vector<Tensor> unpack_list(at::ArrayRef<SavedVariable> xs, std::shared_ptr<Node> saved_for = nullptr) {
// NB: we must explicitly do the conversion in the lambda, otherwise template
// deduction will give a Tensor of Variable which is not convertible
return fmap(xs, [&saved_for](const SavedVariable& x) {
// TODO(crcrpar): Use `std::move(saved_for)` to avoid incrementing refcount, which would need refactoring.
return static_cast<Tensor>(x.unpack(saved_for));
});
}
inline c10::List<std::optional<Tensor>> unpack_opt_list(at::ArrayRef<SavedVariable> xs, std::shared_ptr<Node> saved_for = nullptr) {
torch::List<std::optional<Tensor>> result;
result.reserve(xs.size());
for (const SavedVariable& v : xs) {
auto var = v.unpack(saved_for);
result.push_back(var.defined() ? std::optional<Tensor>(var) : ::std::nullopt);
}
return result;
}
using torch::autograd::TypeAndSize;
${autograd_function_declarations}
}}} // namespace torch::autograd::generated


@ -0,0 +1,40 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include "torch/csrc/jit/frontend/tracer.h"
#include <torch/library.h>
#include "torch/csrc/autograd/function.h"
#include "ATen/quantized/Quantizer.h"
// ${generated_comment}
// See the `Tracer` section in `torch/csrc/jit/OVERVIEW.md`.
// NOTE See [Sharded File] comment in VariableType
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
$ops_headers
#endif
using namespace at;
namespace torch {
namespace TraceType {
namespace {
${trace_method_definitions}
} // namespace
} // namespace TraceType
namespace {
TORCH_LIBRARY_IMPL(aten, Tracer, m) {
${trace_wrapper_registrations};
}
} // namespace
} // namespace torch


@ -0,0 +1,65 @@
#include "torch/csrc/autograd/VariableTypeUtils.h"
#include "torch/csrc/autograd/generated/VariableType.h"
#include "torch/csrc/autograd/FunctionsManual.h"
#include <ATen/RedispatchFunctions.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <ATen/core/TorchDispatchUtils.h>
#include <torch/library.h>
#include <ATen/SparseCsrTensorUtils.h>
// ${generated_comment}
// NOTE [Sharded File]: on this file's split-into-shards state
//
// Back in the good old days, VariableType.cpp was generated as one
// file with every function in it, and everything was great and
// simple.
//
// However, this file was also very large (over 36,000 lines), and
// compiling it was very slow, and in fact was a significant
// bottleneck for incremental rebuilds. To address this, we now
// generate the file split across multiple shards, named
// VariableType_0.cpp and so on, which can be compiled in parallel.
//
// For ease of inspection and debugging, so that it's not necessary to
// go rooting around in multiple files, we also generate all the
// functions together in VariableTypeEverything.cpp. This generated
// file is only for convenience; it's not actually used in the
// build. If the file you're looking at now is one of the shards, you
// may want to switch over to the Everything variant to make your
// grepping smoother.
using namespace at;
using namespace torch::autograd::generated;
using namespace torch::autograd::generated::details;
namespace torch::autograd {
namespace VariableType {
namespace{
C10_UNUSED void reset_grad_accumulator(Variable & self) {
AutogradMeta* meta = torch::autograd::impl::get_autograd_meta(self);
if (meta != nullptr) {
meta->grad_accumulator_.reset();
}
}
}
namespace {
${type_derived_method_definitions}
}
}
namespace {
${wrapper_registrations}
}
} // namespace torch::autograd


@ -0,0 +1,59 @@
#pragma once
// ${generated_comment}
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <c10/util/intrusive_ptr.h>
#include <torch/csrc/Export.h>
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
#include <cstdint> // for size_t
#include <functional> // for function
#include <memory> // for unique_ptr
#include <string>
#include <vector>
namespace at {
struct Quantizer;
};
namespace torch { namespace autograd {
using Variable = at::Tensor;
using at::Context;
using at::Device;
using at::Dimname;
using at::DimnameList;
using at::Generator;
using at::IntArrayRef;
using at::MemoryFormat;
using at::QScheme;
using at::Scalar;
using at::ScalarType;
using at::Storage;
using at::Tensor;
using at::TensorList;
using at::TensorOptions;
using at::Quantizer;
// This is a temporary typedef to enable Quantizer in the aten native function API;
// we'll remove it once we actually expose the Quantizer class
// to the frontend
using ConstQuantizerPtr = const c10::intrusive_ptr<Quantizer>&;
using std::optional;
namespace VariableType {
TORCH_API std::vector<at::DeprecatedTypeProperties*> allCUDATypes();
TORCH_API std::vector<at::DeprecatedTypeProperties*> allXPUTypes();
TORCH_API std::vector<at::DeprecatedTypeProperties*> allCPUTypes();
TORCH_API std::vector<at::DeprecatedTypeProperties*> allPrivateUser1Types();
at::Tensor & unpack(Tensor & t, const char * name, int pos);
const at::Tensor & unpack(const Tensor & t, const char * name, int pos);
at::Tensor unpack_opt(const Tensor & t, const char * name, int pos);
std::vector<at::Tensor> unpack(const at::ITensorListRef& tl, const char *name, int pos);
};
}} // namespace torch::autograd


@ -0,0 +1,14 @@
#include <torch/csrc/autograd/generated/ViewFuncs.h>
// ${generated_comment}
using at::Tensor;
using at::Scalar;
using at::IntArrayRef;
using at::TensorList;
namespace torch::autograd::generated {
${view_func_definitions}
} // namespace torch::autograd::generated


@ -0,0 +1,28 @@
#pragma once
// ${generated_comment}
#include <torch/library.h>
#include <torch/csrc/autograd/variable.h>
#include <c10/core/SymIntArrayRef.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Operators.h>
#else
$ops_headers
#endif
namespace torch::autograd::generated {
using at::Scalar;
using at::Tensor;
using at::IntArrayRef;
using at::ArrayRef;
using at::Type;
using at::ScalarType;
using std::optional;
using c10::fmap;
${view_func_declarations}
} // namespace torch::autograd::generated

View File

@ -0,0 +1,11 @@
"""
This file is needed for generating procedural tests required for
testing __torch_function__. See tests/test_overrides.py.
"""
# flake8: noqa
import torch
annotated_args = {
${annotated_args}
}

View File

@ -0,0 +1,15 @@
#include <torch/csrc/autograd/python_enum_tag.h>
#include <torch/csrc/utils/pybind.h>
#include <pybind11/pybind11.h>
#include <ATen/core/enum_tag.h>
namespace py = pybind11;
namespace torch {
namespace autograd {
void initEnumTag(PyObject* module) {
auto m = py::handle(module).cast<py::module>();
py::enum_<at::Tag>(m, "Tag")
${enum_of_valid_tags};
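    // Note: ${enum_of_valid_tags} is filled in by codegen with one registration
    // per tag defined in tags.yaml; in Python these surface as members of
    // torch.Tag (e.g. torch.Tag.core) and can be inspected on op overloads.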
m.doc() = "An Enum that contains tags that can be assigned to an operator registered in C++.";
}
}}

View File

@ -0,0 +1,81 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_fft_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include <ATen/core/Tensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using torch::utils::check_out_type_matches;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef fft_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPFFTVariableFunctionsModule = NULL;
void initFFTFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._fft",
NULL,
-1,
fft_functions
};
PyObject* fft = PyModule_Create(&def);
THPFFTVariableFunctionsModule = fft;
if (!fft) {
throw python_error();
}
// steals a reference to fft
if (PyModule_AddObject(module, "_fft", fft) != 0) {
throw python_error();
}
}
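// Orientation sketch (assumption, not part of the template): the bindings
// registered here live on torch._C._fft, and the public torch.fft namespace
// re-exports them, so e.g. torch.fft.fft(t) ends up calling the generated
// fft_fft binding defined below.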
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,37 @@
#include <torch/csrc/autograd/generated/python_functions.h>
// ${generated_comment}
#include <Python.h>
#include <ATen/ATen.h>
#include <c10/core/SymNodeImpl.h>
#include "torch/csrc/autograd/generated/Functions.h"
#include "torch/csrc/autograd/python_cpp_function.h"
#include <torch/csrc/autograd/python_variable.h>
#include <torch/csrc/autograd/saved_variable.h>
#include <torch/csrc/utils/pybind.h>
#include <pybind11/pybind11.h>
// NOTE: See [Sharded File] comment in VariableType
namespace torch::autograd::generated {
template<typename C>
static void addClass(PyObject* module, PyTypeObject& type, const char* name,
PyGetSetDef* function_properties=NULL, PyMethodDef* function_methods=NULL)
{
_initFunctionPyTypeObject(type, name, function_properties, function_methods);
Py_INCREF(&type);
PyModule_AddObject(module, name, (PyObject*)&type);
registerCppFunction(typeid(C), &type);
}
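// The generated initializer below calls addClass once per autograd node type.
// Illustrative (not verbatim) expansion:
//   addClass<AddBackward0>(module, AddBackward0Class, "AddBackward0",
//                          AddBackward0_properties);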
${py_function_props_and_getters}
void initialize_autogenerated_functions${shard_id}(PyObject* module) {
${py_function_initializers}
}
} // namespace torch::autograd::generated

View File

@ -0,0 +1,17 @@
#pragma once
#include <Python.h>
// ${generated_comment}
// Python bindings for automatically generated autograd functions
namespace torch { namespace autograd { namespace generated {
${shard_forward_declare}
inline void initialize_autogenerated_functions(PyObject* module) {
${shard_call}
}
}}} // namespace torch::autograd::generated

View File

@ -0,0 +1,68 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_linalg_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Scalar;
using at::ScalarType;
using at::MemoryFormat;
using at::Generator;
using at::IntArrayRef;
using at::TensorList;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef linalg_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPLinalgVariableFunctionsModule = NULL;
void initLinalgFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._linalg",
NULL,
-1,
linalg_functions
};
PyObject* linalg = PyModule_Create(&def);
THPLinalgVariableFunctionsModule = linalg;
if (!linalg) {
throw python_error();
}
// steals a reference to linalg
if (PyModule_AddObject(module, "_linalg", linalg) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,81 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_nested_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::OptionalIntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef nested_functions[] = {
{NULL, NULL, 0, NULL},
${py_method_defs}
{NULL}
};
static PyObject* THPNestedVariableFunctionsModule = NULL;
void initNestedFunctions(PyObject* module) {
nested_functions[0] = get_nested_functions_manual()[0];
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._nested",
NULL,
-1,
nested_functions
};
PyObject* nested = PyModule_Create(&def);
THPNestedVariableFunctionsModule = nested;
if (!nested) {
throw python_error();
}
// steals a reference to nested
if (PyModule_AddObject(module, "_nested", nested) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,113 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_nn_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/tensor_memoryformats.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Scalar;
using at::MemoryFormat;
using at::Generator;
using at::IntArrayRef;
using at::ArrayRef;
using namespace torch::autograd::utils;
namespace torch::autograd {
static PyObject* THPNNVariableFunctionsModule = NULL;
static PyObject * THPVariable__parse_to(PyObject* module, PyObject* args, PyObject* kwargs)
{
HANDLE_TH_ERRORS
static PythonArgParser parser({
"to(Device device=None, ScalarType dtype=None, bool non_blocking=False, bool copy=False, *, MemoryFormat? memory_format=None)",
"to(ScalarType dtype, bool non_blocking=False, bool copy=False, *, MemoryFormat? memory_format=None)",
"to(Tensor tensor, bool non_blocking=False, bool copy=False, *, MemoryFormat? memory_format=None)",
});
ParsedArgs<5> parsed_args;
auto r = parser.parse(args, kwargs, parsed_args);
if (r.has_torch_function()) {
return handle_torch_function(r, args, kwargs, THPNNVariableFunctionsModule, "torch.nn", "_parse_to");
}
auto parsed = parse_to_conversion(r, /*allow_copy*/ false); // we don't want copy for nn.Module.to
auto& device = std::get<0>(parsed);
auto& scalarType = std::get<1>(parsed);
auto non_blocking = std::get<2>(parsed);
auto opt_memory_format = std::get<4>(parsed);
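  // Build the 4-tuple (device or None, dtype or None, non_blocking,
  // memory_format or None) that the Python-side nn.Module.to() helper unpacks.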
auto tuple = THPObjectPtr{PyTuple_New(4)};
if (!tuple) throw python_error();
if (device) {
PyTuple_SET_ITEM(tuple.get(), 0, THPDevice_New(*device));
} else {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(tuple.get(), 0, Py_None);
}
if (scalarType) {
PyTuple_SET_ITEM(tuple.get(), 1, Py_NewRef(torch::getTHPDtype(*scalarType)));
} else {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(tuple.get(), 1, Py_None);
}
PyTuple_SET_ITEM(tuple.get(), 2, torch::autograd::utils::wrap(non_blocking));
if (opt_memory_format.has_value()) {
PyTuple_SET_ITEM(tuple.get(), 3, Py_NewRef(torch::utils::getTHPMemoryFormat(opt_memory_format.value())));
} else {
Py_INCREF(Py_None);
PyTuple_SET_ITEM(tuple.get(), 3, Py_None);
}
return tuple.release();
END_HANDLE_TH_ERRORS
}
// generated forward declarations start here
${py_forwards}
static PyMethodDef nn_functions[] = {
{"_parse_to", castPyCFunctionWithKeywords(THPVariable__parse_to),
METH_VARARGS | METH_KEYWORDS, nullptr},
${py_method_defs}
{NULL}
};
void initNNFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._nn",
NULL,
-1,
nn_functions
};
PyObject* nn = PyModule_Create(&def);
THPNNVariableFunctionsModule = nn;
if (!nn) {
throw python_error();
}
// steals a reference to nn
if (PyModule_AddObject(module, "_nn", nn) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,52 @@
#include <Python.h>
#include <vector>
#include <map>
#include <string>
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/Exceptions.h"
namespace torch { namespace autograd { namespace generated {
${py_return_types}
}}}
namespace torch::autograd {
static void addReturnType(
PyObject* module,
const char* name,
PyTypeObject* type) {
  // hold onto the TypeObject in the unlikely case that a user
  // deletes or overrides it.
Py_INCREF(type);
if (PyModule_AddObject(
module,
name,
(PyObject*)type) != 0) {
Py_DECREF(type);
throw python_error();
}
}
void initReturnTypes(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT, "torch._C._return_types", nullptr, -1, {}};
PyObject* return_types_module = PyModule_Create(&def);
if (!return_types_module) {
throw python_error();
}
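  // The generated registrations below add one structseq type per named-tuple
  // return, exposed in Python as e.g. torch.return_types.max with fields
  // (values, indices).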
${py_return_types_registrations}
// steals a reference to return_types on success
if (PyModule_AddObject(module, "_return_types", return_types_module) != 0) {
Py_DECREF(return_types_module);
throw python_error();
}
}
} // namespace torch::autograd

View File

@ -0,0 +1,14 @@
#pragma once
namespace torch {
namespace autograd {
namespace generated {
${py_return_types_declarations}
}
void initReturnTypes(PyObject* module);
} // namespace autograd
} // namespace torch

View File

@ -0,0 +1,67 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_sparse_functions.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Scalar;
using at::ScalarType;
using at::MemoryFormat;
using at::Generator;
using at::IntArrayRef;
using at::TensorList;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef sparse_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPSparseVariableFunctionsModule = NULL;
void initSparseFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._sparse",
NULL,
-1,
sparse_functions
};
PyObject* sparse = PyModule_Create(&def);
THPSparseVariableFunctionsModule = sparse;
if (!sparse) {
throw python_error();
}
// steals a reference to sparse
if (PyModule_AddObject(module, "_sparse", sparse) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,79 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
#include "torch/csrc/Device.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/autograd/python_special_functions.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/autograd/utils/python_arg_parsing.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using torch::utils::check_out_type_matches;
using namespace torch::autograd::utils;
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef special_functions[] = {
${py_method_defs}
{NULL}
};
static PyObject* THPSpecialVariableFunctionsModule = NULL;
void initSpecialFunctions(PyObject* module) {
static struct PyModuleDef def = {
PyModuleDef_HEAD_INIT,
"torch._C._special",
NULL,
-1,
special_functions
};
PyObject* special = PyModule_Create(&def);
THPSpecialVariableFunctionsModule = special;
if (!special) {
throw python_error();
}
// steals a reference to special
if (PyModule_AddObject(module, "_special", special) != 0) {
throw python_error();
}
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,93 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// ${generated_comment}
// Python bindings for torch.* functions implemented through ATen.
//
// The functions are bound as static methods on a class
// torch._C._VariableFunctions which is also aliased as Variable._torch
// and also copied into 'torch' module.
#include <Python.h>
// Undefine the copysign macro so that at::copysign works as intended with MSVC
// https://github.com/python/cpython/blob/c60394c7fc9cc09b16e9675a3eeb5844b6d8523f/PC/pyconfig.h#L196
#ifdef _MSC_VER
#undef copysign
#endif // _MSC_VER
#include "torch/csrc/autograd/python_torch_functions.h"
#include "torch/csrc/autograd/python_variable.h"
#include "torch/csrc/autograd/utils/wrap_outputs.h"
#include "torch/csrc/Dtype.h"
#include "torch/csrc/DynamicTypes.h"
#include "torch/csrc/Exceptions.h"
#include "torch/csrc/utils/out_types.h"
#include "torch/csrc/utils/pybind.h"
#include "torch/csrc/utils/pycfunction_helpers.h"
#include "torch/csrc/utils/python_arg_parser.h"
#include "torch/csrc/utils/tensor_layouts.h"
#include "torch/csrc/utils/tensor_new.h"
#include "torch/csrc/utils/tensor_numpy.h"
#include "torch/csrc/jit/frontend/tracer.h"
#include "torch/csrc/autograd/generated/variable_factories.h"
#include "torch/csrc/utils/structseq.h"
#include "torch/csrc/utils/device_lazy_init.h"
#include "torch/csrc/autograd/generated/python_return_types.h"
#include <ATen/core/Tensor.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
$ops_headers
#endif
#include <functional>
#include <initializer_list>
#include <stdexcept>
#include <utility>
using at::Tensor;
using at::Device;
using at::Layout;
using at::Scalar;
using at::ScalarType;
using at::Backend;
using at::OptionalDeviceGuard;
using at::DeviceGuard;
using at::TensorOptions;
using at::IntArrayRef;
using at::Generator;
using at::TensorList;
using at::Dimname;
using at::DimnameList;
using at::ArrayRef;
using torch::utils::check_out_type_matches;
using namespace torch::autograd::utils;
// NOTE: See [Sharded File] comment in VariableType
namespace torch::autograd {
// generated forward declarations start here
${py_forwards}
static PyMethodDef torch_functions_shard[] = {
${py_method_defs}
};
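// Each shard contributes its PyMethodDef entries via these gatherTorchFunctions*
// functions; the hand-written registration code concatenates all shards into
// the method table of torch._C._VariableFunctions.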
void gatherTorchFunctions${shard_id}(std::vector<PyMethodDef> &torch_functions) {
constexpr size_t num_functions = sizeof(torch_functions_shard) / sizeof(torch_functions_shard[0]);
torch_functions.insert(
torch_functions.end(),
torch_functions_shard,
torch_functions_shard + num_functions);
}
// generated methods start here
${py_methods}
} // namespace torch::autograd

View File

@ -0,0 +1,135 @@
#pragma once
// ${generated_comment}
#include <ATen/core/Tensor.h>
#include <ATen/TracerMode.h>
#include <ATen/core/grad_mode.h>
#include <c10/util/ArrayRef.h>
#include <c10/core/MemoryFormat.h>
#include <torch/csrc/api/include/torch/detail/TensorDataContainer.h>
#include <torch/csrc/autograd/variable.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/from_blob.h>
$ops_headers
#endif
#include <functional>
#include <initializer_list>
#include <utility>
namespace torch {
/// NOTE: Currently `torch::tensor(...)` doesn't support mixed data types
/// (e.g. `torch::tensor({{bool, 2.0}})` doesn't work). We might be able to
/// support it in the future by iterating over all sub-lists to find
/// the largest data type that can represent all of the elements, or by using
/// variadic templates.
///
/// NOTE: C++ `torch::tensor` with a floating-point type or an `at::ArrayRef` / `std::vector` /
/// (nested) braced-init-list of floating-point types always produces a tensor of dtype
/// `torch::get_default_dtype()`, matching Python `torch.tensor` behavior.
///
/// NOTE: C++ `torch::tensor` with an integer type or an `at::ArrayRef` / `std::vector` /
/// (nested) braced-init-list of integer types always produces a tensor of dtype `at::kLong`
/// (aka. int64_t), matching Python `torch.tensor` behavior.
///
/// NOTE: The following dtypes are not supported by `torch::tensor` currently:
/// - `unsigned int`
/// - `unsigned long int`
/// - `unsigned long long int`
/// - `long long int`
inline at::Tensor tensor(detail::TensorDataContainer tensor_data_container, const at::TensorOptions& options = {}) {
return autograd::make_variable(
// note: we remove the requires_grad setting from the TensorOptions because
  // it is ignored anyway (and we actually have an assertion that it isn't set
// which would fail otherwise). We handle requires_grad explicitly here
// instead of passing it through to the kernel.
tensor_data_container.convert_to_tensor(options.requires_grad(::std::nullopt)),
options.requires_grad());
}
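/// Illustrative usage (a sketch; dtype inference follows the notes above):
///   auto a = torch::tensor({1, 2, 3});      // integer list -> kLong
///   auto b = torch::tensor({1.0, 2.0});     // floating list -> default dtype (usually kFloat)
///   auto c = torch::tensor({1, 2}, torch::dtype(torch::kFloat64).requires_grad(true));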
/// A generic deleter function.
using Deleter = std::function<void(void*)>;
using at::MemoryFormat;
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor, `strides` the
/// stride in each dimension. The `deleter` function (a
/// `std::function<void(void*)>`) will be called on the `data` when the Tensor
/// data would normally be deallocated. The `TensorOptions` specify additional
/// configuration options for the returned tensor, such as what type to
/// interpret the `data` as.
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
at::IntArrayRef strides,
const Deleter& deleter,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, strides, deleter, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor, `strides` the
/// stride in each dimension. The `TensorOptions`
/// specify additional configuration options for the returned tensor, such as
/// what type to interpret the `data` as.
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
at::IntArrayRef strides,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, strides, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor. The `deleter`
/// function (a `std::function<void(void*)>`) will be called on the `data` when
/// the Tensor data would normally be deallocated. The `TensorOptions` specify
/// additional configuration options for the returned tensor, such as what type
/// to interpret the `data` as.
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
const Deleter& deleter,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, deleter, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
/// Exposes the given `data` as a `Tensor` without taking ownership of the
/// original data. `sizes` should specify the shape of the tensor. The
/// `TensorOptions` specify additional configuration options for the returned
/// tensor, such as what type to interpret the `data` as.
inline at::Tensor from_blob(
void* data,
at::IntArrayRef sizes,
const at::TensorOptions& options = at::TensorOptions()) {
at::Tensor tensor = ([&]() {
at::AutoDispatchBelowAutograd guard; // TODO: remove
at::tracer::impl::NoTracerDispatchMode tracer_guard;
return at::from_blob(data, sizes, options.requires_grad(::std::nullopt));
})();
return autograd::make_variable(tensor, options.requires_grad());
}
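/// Illustrative usage (a sketch): wrap an existing buffer without copying.
///   std::vector<float> buf(6, 1.0f);
///   auto t = torch::from_blob(buf.data(), {2, 3}, torch::dtype(torch::kFloat));
///   // `buf` must outlive `t` here, since no deleter owns the storage.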
${function_definitions}
} // namespace torch