I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions
--- a/rl/Lib/site-packages/gymnasium/init.py
+++ b/rl/Lib/site-packages/gymnasium/init.py
@ -0,0 +1,77 @@
+"""Root `__init__` of the gymnasium module setting the `__all__` of gymnasium modules."""
+# isort: skip_file
+
+from gymnasium.core import (
+    Env,
+    Wrapper,
+    ObservationWrapper,
+    ActionWrapper,
+    RewardWrapper,
+)
+from gymnasium.spaces.space import Space
+from gymnasium.envs.registration import (
+    make,
+    spec,
+    register,
+    registry,
+    pprint_registry,
+    make_vec,
+    register_envs,
+)
+
+# necessary for `envs.__init__` which registers all gymnasium environments and loads plugins
+from gymnasium import envs
+from gymnasium import spaces, utils, vector, wrappers, error, logger
+from gymnasium import experimental
+
+
+__all__ = [
+    # core classes
+    "Env",
+    "Wrapper",
+    "ObservationWrapper",
+    "ActionWrapper",
+    "RewardWrapper",
+    "Space",
+    # registration
+    "make",
+    "make_vec",
+    "spec",
+    "register",
+    "registry",
+    "pprint_registry",
+    "register_envs",
+    # module folders
+    "envs",
+    "experimental",
+    "spaces",
+    "utils",
+    "vector",
+    "wrappers",
+    "error",
+    "logger",
+]
+__version__ = "0.29.1"
+
+
+# Initializing pygame initializes audio connections through SDL. SDL uses alsa by default on all Linux systems
+# SDL connecting to alsa frequently create these giant lists of warnings every time you import an environment using
+#   pygame
+# DSP is far more benign (and should probably be the default in SDL anyways)
+
+import os
+import sys
+
+if sys.platform.startswith("linux"):
+    os.environ["SDL_AUDIODRIVER"] = "dsp"
+
+os.environ["PYGAME_HIDE_SUPPORT_PROMPT"] = "hide"
+
+try:
+    from farama_notifications import notifications
+
+    if "gymnasium" in notifications and __version__ in notifications["gymnasium"]:
+        print(notifications["gymnasium"][__version__], file=sys.stderr)
+
+except Exception:  # nosec
+    pass
--- a/rl/Lib/site-packages/gymnasium/pycache/init.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/pycache/init.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/pycache/core.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/pycache/core.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/pycache/error.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/pycache/error.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/pycache/logger.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/pycache/logger.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/core.py
+++ b/rl/Lib/site-packages/gymnasium/core.py
@ -0,0 +1,602 @@
+"""Core API for Environment, Wrapper, ActionWrapper, RewardWrapper and ObservationWrapper."""
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import TYPE_CHECKING, Any, Generic, SupportsFloat, TypeVar
+
+import numpy as np
+
+from gymnasium import logger, spaces
+from gymnasium.utils import RecordConstructorArgs, seeding
+
+
+if TYPE_CHECKING:
+    from gymnasium.envs.registration import EnvSpec, WrapperSpec
+
+
+__all__ = [
+    "Env",
+    "Wrapper",
+    "ObservationWrapper",
+    "RewardWrapper",
+    "ActionWrapper",
+    "ObsType",
+    "ActType",
+    "RenderFrame",
+    "WrapperObsType",
+    "WrapperActType",
+]
+
+ObsType = TypeVar("ObsType")
+ActType = TypeVar("ActType")
+RenderFrame = TypeVar("RenderFrame")
+
+
+class Env(Generic[ObsType, ActType]):
+    r"""The main Gymnasium class for implementing Reinforcement Learning Agents environments.
+
+    The class encapsulates an environment with arbitrary behind-the-scenes dynamics through the :meth:`step` and :meth:`reset` functions.
+    An environment can be partially or fully observed by single agents. For multi-agent environments, see PettingZoo.
+
+    The main API methods that users of this class need to know are:
+
+    - :meth:`step` - Updates an environment with actions returning the next agent observation, the reward for taking that actions,
+      if the environment has terminated or truncated due to the latest action and information from the environment about the step, i.e. metrics, debug info.
+    - :meth:`reset` - Resets the environment to an initial state, required before calling step.
+      Returns the first agent observation for an episode and information, i.e. metrics, debug info.
+    - :meth:`render` - Renders the environments to help visualise what the agent see, examples modes are "human", "rgb_array", "ansi" for text.
+    - :meth:`close` - Closes the environment, important when external software is used, i.e. pygame for rendering, databases
+
+    Environments have additional attributes for users to understand the implementation
+
+    - :attr:`action_space` - The Space object corresponding to valid actions, all valid actions should be contained within the space.
+    - :attr:`observation_space` - The Space object corresponding to valid observations, all valid observations should be contained within the space.
+    - :attr:`reward_range` - A tuple corresponding to the minimum and maximum possible rewards for an agent over an episode.
+      The default reward range is set to :math:`(-\infty,+\infty)`.
+    - :attr:`spec` - An environment spec that contains the information used to initialize the environment from :meth:`gymnasium.make`
+    - :attr:`metadata` - The metadata of the environment, i.e. render modes, render fps
+    - :attr:`np_random` - The random number generator for the environment. This is automatically assigned during
+      ``super().reset(seed=seed)`` and when assessing ``self.np_random``.
+
+    .. seealso:: For modifying or extending environments use the :py:class:`gymnasium.Wrapper` class
+
+    Note:
+        To get reproducible sampling of actions, a seed can be set with ``env.action_space.seed(123)``.
+    """
+
+    # Set this in SOME subclasses
+    metadata: dict[str, Any] = {"render_modes": []}
+    # define render_mode if your environment supports rendering
+    render_mode: str | None = None
+    reward_range = (-float("inf"), float("inf"))
+    spec: EnvSpec | None = None
+
+    # Set these in ALL subclasses
+    action_space: spaces.Space[ActType]
+    observation_space: spaces.Space[ObsType]
+
+    # Created
+    _np_random: np.random.Generator | None = None
+
+    def step(
+        self, action: ActType
+    ) -> tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]:
+        """Run one timestep of the environment's dynamics using the agent actions.
+
+        When the end of an episode is reached (``terminated or truncated``), it is necessary to call :meth:`reset` to
+        reset this environment's state for the next episode.
+
+        .. versionchanged:: 0.26
+
+            The Step API was changed removing ``done`` in favor of ``terminated`` and ``truncated`` to make it clearer
+            to users when the environment had terminated or truncated which is critical for reinforcement learning
+            bootstrapping algorithms.
+
+        Args:
+            action (ActType): an action provided by the agent to update the environment state.
+
+        Returns:
+            observation (ObsType): An element of the environment's :attr:`observation_space` as the next observation due to the agent actions.
+                An example is a numpy array containing the positions and velocities of the pole in CartPole.
+            reward (SupportsFloat): The reward as a result of taking the action.
+            terminated (bool): Whether the agent reaches the terminal state (as defined under the MDP of the task)
+                which can be positive or negative. An example is reaching the goal state or moving into the lava from
+                the Sutton and Barton, Gridworld. If true, the user needs to call :meth:`reset`.
+            truncated (bool): Whether the truncation condition outside the scope of the MDP is satisfied.
+                Typically, this is a timelimit, but could also be used to indicate an agent physically going out of bounds.
+                Can be used to end the episode prematurely before a terminal state is reached.
+                If true, the user needs to call :meth:`reset`.
+            info (dict): Contains auxiliary diagnostic information (helpful for debugging, learning, and logging).
+                This might, for instance, contain: metrics that describe the agent's performance state, variables that are
+                hidden from observations, or individual reward terms that are combined to produce the total reward.
+                In OpenAI Gym <v26, it contains "TimeLimit.truncated" to distinguish truncation and termination,
+                however this is deprecated in favour of returning terminated and truncated variables.
+            done (bool): (Deprecated) A boolean value for if the episode has ended, in which case further :meth:`step` calls will
+                return undefined results. This was removed in OpenAI Gym v26 in favor of terminated and truncated attributes.
+                A done signal may be emitted for different reasons: Maybe the task underlying the environment was solved successfully,
+                a certain timelimit was exceeded, or the physics simulation has entered an invalid state.
+        """
+        raise NotImplementedError
+
+    def reset(
+        self,
+        *,
+        seed: int | None = None,
+        options: dict[str, Any] | None = None,
+    ) -> tuple[ObsType, dict[str, Any]]:  # type: ignore
+        """Resets the environment to an initial internal state, returning an initial observation and info.
+
+        This method generates a new starting state often with some randomness to ensure that the agent explores the
+        state space and learns a generalised policy about the environment. This randomness can be controlled
+        with the ``seed`` parameter otherwise if the environment already has a random number generator and
+        :meth:`reset` is called with ``seed=None``, the RNG is not reset.
+
+        Therefore, :meth:`reset` should (in the typical use case) be called with a seed right after initialization and then never again.
+
+        For Custom environments, the first line of :meth:`reset` should be ``super().reset(seed=seed)`` which implements
+        the seeding correctly.
+
+        .. versionchanged:: v0.25
+
+            The ``return_info`` parameter was removed and now info is expected to be returned.
+
+        Args:
+            seed (optional int): The seed that is used to initialize the environment's PRNG (`np_random`).
+                If the environment does not already have a PRNG and ``seed=None`` (the default option) is passed,
+                a seed will be chosen from some source of entropy (e.g. timestamp or /dev/urandom).
+                However, if the environment already has a PRNG and ``seed=None`` is passed, the PRNG will *not* be reset.
+                If you pass an integer, the PRNG will be reset even if it already exists.
+                Usually, you want to pass an integer *right after the environment has been initialized and then never again*.
+                Please refer to the minimal example above to see this paradigm in action.
+            options (optional dict): Additional information to specify how the environment is reset (optional,
+                depending on the specific environment)
+
+        Returns:
+            observation (ObsType): Observation of the initial state. This will be an element of :attr:`observation_space`
+                (typically a numpy array) and is analogous to the observation returned by :meth:`step`.
+            info (dictionary):  This dictionary contains auxiliary information complementing ``observation``. It should be analogous to
+                the ``info`` returned by :meth:`step`.
+        """
+        # Initialize the RNG if the seed is manually passed
+        if seed is not None:
+            self._np_random, seed = seeding.np_random(seed)
+
+    def render(self) -> RenderFrame | list[RenderFrame] | None:
+        """Compute the render frames as specified by :attr:`render_mode` during the initialization of the environment.
+
+        The environment's :attr:`metadata` render modes (`env.metadata["render_modes"]`) should contain the possible
+        ways to implement the render modes. In addition, list versions for most render modes is achieved through
+        `gymnasium.make` which automatically applies a wrapper to collect rendered frames.
+
+        Note:
+            As the :attr:`render_mode` is known during ``__init__``, the objects used to render the environment state
+            should be initialised in ``__init__``.
+
+        By convention, if the :attr:`render_mode` is:
+
+        - None (default): no render is computed.
+        - "human": The environment is continuously rendered in the current display or terminal, usually for human consumption.
+          This rendering should occur during :meth:`step` and :meth:`render` doesn't need to be called. Returns ``None``.
+        - "rgb_array": Return a single frame representing the current state of the environment.
+          A frame is a ``np.ndarray`` with shape ``(x, y, 3)`` representing RGB values for an x-by-y pixel image.
+        - "ansi": Return a strings (``str``) or ``StringIO.StringIO`` containing a terminal-style text representation
+          for each time step. The text can include newlines and ANSI escape sequences (e.g. for colors).
+        - "rgb_array_list" and "ansi_list": List based version of render modes are possible (except Human) through the
+          wrapper, :py:class:`gymnasium.wrappers.RenderCollection` that is automatically applied during ``gymnasium.make(..., render_mode="rgb_array_list")``.
+          The frames collected are popped after :meth:`render` is called or :meth:`reset`.
+
+        Note:
+            Make sure that your class's :attr:`metadata` ``"render_modes"`` key includes the list of supported modes.
+
+        .. versionchanged:: 0.25.0
+
+            The render function was changed to no longer accept parameters, rather these parameters should be specified
+            in the environment initialised, i.e., ``gymnasium.make("CartPole-v1", render_mode="human")``
+        """
+        raise NotImplementedError
+
+    def close(self):
+        """After the user has finished using the environment, close contains the code necessary to "clean up" the environment.
+
+        This is critical for closing rendering windows, database or HTTP connections.
+        Calling ``close`` on an already closed environment has no effect and won't raise an error.
+        """
+        pass
+
+    @property
+    def unwrapped(self) -> Env[ObsType, ActType]:
+        """Returns the base non-wrapped environment.
+
+        Returns:
+            Env: The base non-wrapped :class:`gymnasium.Env` instance
+        """
+        return self
+
+    @property
+    def np_random(self) -> np.random.Generator:
+        """Returns the environment's internal :attr:`_np_random` that if not set will initialise with a random seed.
+
+        Returns:
+            Instances of `np.random.Generator`
+        """
+        if self._np_random is None:
+            self._np_random, _ = seeding.np_random()
+        return self._np_random
+
+    @np_random.setter
+    def np_random(self, value: np.random.Generator):
+        self._np_random = value
+
+    def __str__(self):
+        """Returns a string of the environment with :attr:`spec` id's if :attr:`spec.
+
+        Returns:
+            A string identifying the environment
+        """
+        if self.spec is None:
+            return f"<{type(self).__name__} instance>"
+        else:
+            return f"<{type(self).__name__}<{self.spec.id}>>"
+
+    def __enter__(self):
+        """Support with-statement for the environment."""
+        return self
+
+    def __exit__(self, *args: Any):
+        """Support with-statement for the environment and closes the environment."""
+        self.close()
+        # propagate exception
+        return False
+
+    def get_wrapper_attr(self, name: str) -> Any:
+        """Gets the attribute `name` from the environment."""
+        return getattr(self, name)
+
+
+WrapperObsType = TypeVar("WrapperObsType")
+WrapperActType = TypeVar("WrapperActType")
+
+
+class Wrapper(
+    Env[WrapperObsType, WrapperActType],
+    Generic[WrapperObsType, WrapperActType, ObsType, ActType],
+):
+    """Wraps a :class:`gymnasium.Env` to allow a modular transformation of the :meth:`step` and :meth:`reset` methods.
+
+    This class is the base class of all wrappers to change the behavior of the underlying environment.
+    Wrappers that inherit from this class can modify the :attr:`action_space`, :attr:`observation_space`,
+    :attr:`reward_range` and :attr:`metadata` attributes, without changing the underlying environment's attributes.
+    Moreover, the behavior of the :meth:`step` and :meth:`reset` methods can be changed by these wrappers.
+
+    Some attributes (:attr:`spec`, :attr:`render_mode`, :attr:`np_random`) will point back to the wrapper's environment
+    (i.e. to the corresponding attributes of :attr:`env`).
+
+    Note:
+        If you inherit from :class:`Wrapper`, don't forget to call ``super().__init__(env)``
+    """
+
+    def __init__(self, env: Env[ObsType, ActType]):
+        """Wraps an environment to allow a modular transformation of the :meth:`step` and :meth:`reset` methods.
+
+        Args:
+            env: The environment to wrap
+        """
+        self.env = env
+
+        self._action_space: spaces.Space[WrapperActType] | None = None
+        self._observation_space: spaces.Space[WrapperObsType] | None = None
+        self._reward_range: tuple[SupportsFloat, SupportsFloat] | None = None
+        self._metadata: dict[str, Any] | None = None
+
+        self._cached_spec: EnvSpec | None = None
+
+    def __getattr__(self, name: str) -> Any:
+        """Returns an attribute with ``name``, unless ``name`` starts with an underscore.
+
+        Args:
+            name: The variable name
+
+        Returns:
+            The value of the variable in the wrapper stack
+
+        Warnings:
+            This feature is deprecated and removed in v1.0 and replaced with `env.get_attr(name})`
+        """
+        if name == "_np_random":
+            raise AttributeError(
+                "Can't access `_np_random` of a wrapper, use `self.unwrapped._np_random` or `self.np_random`."
+            )
+        elif name.startswith("_"):
+            raise AttributeError(f"accessing private attribute '{name}' is prohibited")
+        logger.warn(
+            f"env.{name} to get variables from other wrappers is deprecated and will be removed in v1.0, "
+            f"to get this variable you can do `env.unwrapped.{name}` for environment variables or `env.get_wrapper_attr('{name}')` that will search the reminding wrappers."
+        )
+        return getattr(self.env, name)
+
+    def get_wrapper_attr(self, name: str) -> Any:
+        """Gets an attribute from the wrapper and lower environments if `name` doesn't exist in this object.
+
+        Args:
+            name: The variable name to get
+
+        Returns:
+            The variable with name in wrapper or lower environments
+        """
+        if name in self.__dir__():  # todo change in v1.0.0 to `hasattr`
+            return getattr(self, name)
+        else:
+            try:
+                return self.env.get_wrapper_attr(name)
+            except AttributeError as e:
+                raise AttributeError(
+                    f"wrapper {self.class_name()} has no attribute {name!r}"
+                ) from e
+
+    @property
+    def spec(self) -> EnvSpec | None:
+        """Returns the :attr:`Env` :attr:`spec` attribute with the `WrapperSpec` if the wrapper inherits from `EzPickle`."""
+        if self._cached_spec is not None:
+            return self._cached_spec
+
+        env_spec = self.env.spec
+        if env_spec is not None:
+            # See if the wrapper inherits from `RecordConstructorArgs` then add the kwargs otherwise use `None` for the wrapper kwargs. This will raise an error in `make`
+            if isinstance(self, RecordConstructorArgs):
+                kwargs = getattr(self, "_saved_kwargs")
+                if "env" in kwargs:
+                    kwargs = deepcopy(kwargs)
+                    kwargs.pop("env")
+            else:
+                kwargs = None
+
+            from gymnasium.envs.registration import WrapperSpec
+
+            wrapper_spec = WrapperSpec(
+                name=self.class_name(),
+                entry_point=f"{self.__module__}:{type(self).__name__}",
+                kwargs=kwargs,
+            )
+
+            # to avoid reference issues we deepcopy the prior environments spec and add the new information
+            env_spec = deepcopy(env_spec)
+            env_spec.additional_wrappers += (wrapper_spec,)
+
+        self._cached_spec = env_spec
+        return env_spec
+
+    @classmethod
+    def wrapper_spec(cls, **kwargs: Any) -> WrapperSpec:
+        """Generates a `WrapperSpec` for the wrappers."""
+        from gymnasium.envs.registration import WrapperSpec
+
+        return WrapperSpec(
+            name=cls.class_name(),
+            entry_point=f"{cls.__module__}:{cls.__name__}",
+            kwargs=kwargs,
+        )
+
+    @classmethod
+    def class_name(cls) -> str:
+        """Returns the class name of the wrapper."""
+        return cls.__name__
+
+    @property
+    def action_space(
+        self,
+    ) -> spaces.Space[ActType] | spaces.Space[WrapperActType]:
+        """Return the :attr:`Env` :attr:`action_space` unless overwritten then the wrapper :attr:`action_space` is used."""
+        if self._action_space is None:
+            return self.env.action_space
+        return self._action_space
+
+    @action_space.setter
+    def action_space(self, space: spaces.Space[WrapperActType]):
+        self._action_space = space
+
+    @property
+    def observation_space(
+        self,
+    ) -> spaces.Space[ObsType] | spaces.Space[WrapperObsType]:
+        """Return the :attr:`Env` :attr:`observation_space` unless overwritten then the wrapper :attr:`observation_space` is used."""
+        if self._observation_space is None:
+            return self.env.observation_space
+        return self._observation_space
+
+    @observation_space.setter
+    def observation_space(self, space: spaces.Space[WrapperObsType]):
+        self._observation_space = space
+
+    @property
+    def reward_range(self) -> tuple[SupportsFloat, SupportsFloat]:
+        """Return the :attr:`Env` :attr:`reward_range` unless overwritten then the wrapper :attr:`reward_range` is used."""
+        if self._reward_range is None:
+            return self.env.reward_range
+        logger.warn("The `reward_range` is deprecated and will be removed in v1.0")
+        return self._reward_range
+
+    @reward_range.setter
+    def reward_range(self, value: tuple[SupportsFloat, SupportsFloat]):
+        self._reward_range = value
+
+    @property
+    def metadata(self) -> dict[str, Any]:
+        """Returns the :attr:`Env` :attr:`metadata`."""
+        if self._metadata is None:
+            return self.env.metadata
+        return self._metadata
+
+    @metadata.setter
+    def metadata(self, value: dict[str, Any]):
+        self._metadata = value
+
+    @property
+    def render_mode(self) -> str | None:
+        """Returns the :attr:`Env` :attr:`render_mode`."""
+        return self.env.render_mode
+
+    @property
+    def np_random(self) -> np.random.Generator:
+        """Returns the :attr:`Env` :attr:`np_random` attribute."""
+        return self.env.np_random
+
+    @np_random.setter
+    def np_random(self, value: np.random.Generator):
+        self.env.np_random = value
+
+    @property
+    def _np_random(self):
+        """This code will never be run due to __getattr__ being called prior this.
+
+        It seems that @property overwrites the variable (`_np_random`) meaning that __getattr__ gets called with the missing variable.
+        """
+        raise AttributeError(
+            "Can't access `_np_random` of a wrapper, use `.unwrapped._np_random` or `.np_random`."
+        )
+
+    def step(
+        self, action: WrapperActType
+    ) -> tuple[WrapperObsType, SupportsFloat, bool, bool, dict[str, Any]]:
+        """Uses the :meth:`step` of the :attr:`env` that can be overwritten to change the returned data."""
+        return self.env.step(action)
+
+    def reset(
+        self, *, seed: int | None = None, options: dict[str, Any] | None = None
+    ) -> tuple[WrapperObsType, dict[str, Any]]:
+        """Uses the :meth:`reset` of the :attr:`env` that can be overwritten to change the returned data."""
+        return self.env.reset(seed=seed, options=options)
+
+    def render(self) -> RenderFrame | list[RenderFrame] | None:
+        """Uses the :meth:`render` of the :attr:`env` that can be overwritten to change the returned data."""
+        return self.env.render()
+
+    def close(self):
+        """Closes the wrapper and :attr:`env`."""
+        return self.env.close()
+
+    def __str__(self):
+        """Returns the wrapper name and the :attr:`env` representation string."""
+        return f"<{type(self).__name__}{self.env}>"
+
+    def __repr__(self):
+        """Returns the string representation of the wrapper."""
+        return str(self)
+
+    @property
+    def unwrapped(self) -> Env[ObsType, ActType]:
+        """Returns the base environment of the wrapper.
+
+        This will be the bare :class:`gymnasium.Env` environment, underneath all layers of wrappers.
+        """
+        return self.env.unwrapped
+
+
+class ObservationWrapper(Wrapper[WrapperObsType, ActType, ObsType, ActType]):
+    """Superclass of wrappers that can modify observations using :meth:`observation` for :meth:`reset` and :meth:`step`.
+
+    If you would like to apply a function to only the observation before
+    passing it to the learning code, you can simply inherit from :class:`ObservationWrapper` and overwrite the method
+    :meth:`observation` to implement that transformation. The transformation defined in that method must be
+    reflected by the :attr:`env` observation space. Otherwise, you need to specify the new observation space of the
+    wrapper by setting :attr:`self.observation_space` in the :meth:`__init__` method of your wrapper.
+
+    Among others, Gymnasium provides the observation wrapper :class:`TimeAwareObservation`, which adds information about the
+    index of the timestep to the observation.
+    """
+
+    def __init__(self, env: Env[ObsType, ActType]):
+        """Constructor for the observation wrapper."""
+        Wrapper.__init__(self, env)
+
+    def reset(
+        self, *, seed: int | None = None, options: dict[str, Any] | None = None
+    ) -> tuple[WrapperObsType, dict[str, Any]]:
+        """Modifies the :attr:`env` after calling :meth:`reset`, returning a modified observation using :meth:`self.observation`."""
+        obs, info = self.env.reset(seed=seed, options=options)
+        return self.observation(obs), info
+
+    def step(
+        self, action: ActType
+    ) -> tuple[WrapperObsType, SupportsFloat, bool, bool, dict[str, Any]]:
+        """Modifies the :attr:`env` after calling :meth:`step` using :meth:`self.observation` on the returned observations."""
+        observation, reward, terminated, truncated, info = self.env.step(action)
+        return self.observation(observation), reward, terminated, truncated, info
+
+    def observation(self, observation: ObsType) -> WrapperObsType:
+        """Returns a modified observation.
+
+        Args:
+            observation: The :attr:`env` observation
+
+        Returns:
+            The modified observation
+        """
+        raise NotImplementedError
+
+
+class RewardWrapper(Wrapper[ObsType, ActType, ObsType, ActType]):
+    """Superclass of wrappers that can modify the returning reward from a step.
+
+    If you would like to apply a function to the reward that is returned by the base environment before
+    passing it to learning code, you can simply inherit from :class:`RewardWrapper` and overwrite the method
+    :meth:`reward` to implement that transformation.
+    This transformation might change the :attr:`reward_range`; to specify the :attr:`reward_range` of your wrapper,
+    you can simply define :attr:`self.reward_range` in :meth:`__init__`.
+    """
+
+    def __init__(self, env: Env[ObsType, ActType]):
+        """Constructor for the Reward wrapper."""
+        Wrapper.__init__(self, env)
+
+    def step(
+        self, action: ActType
+    ) -> tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]:
+        """Modifies the :attr:`env` :meth:`step` reward using :meth:`self.reward`."""
+        observation, reward, terminated, truncated, info = self.env.step(action)
+        return observation, self.reward(reward), terminated, truncated, info
+
+    def reward(self, reward: SupportsFloat) -> SupportsFloat:
+        """Returns a modified environment ``reward``.
+
+        Args:
+            reward: The :attr:`env` :meth:`step` reward
+
+        Returns:
+            The modified `reward`
+        """
+        raise NotImplementedError
+
+
+class ActionWrapper(Wrapper[ObsType, WrapperActType, ObsType, ActType]):
+    """Superclass of wrappers that can modify the action before :meth:`env.step`.
+
+    If you would like to apply a function to the action before passing it to the base environment,
+    you can simply inherit from :class:`ActionWrapper` and overwrite the method :meth:`action` to implement
+    that transformation. The transformation defined in that method must take values in the base environment’s
+    action space. However, its domain might differ from the original action space.
+    In that case, you need to specify the new action space of the wrapper by setting :attr:`self.action_space` in
+    the :meth:`__init__` method of your wrapper.
+
+    Among others, Gymnasium provides the action wrappers :class:`ClipAction` and :class:`RescaleAction` for clipping and rescaling actions.
+    """
+
+    def __init__(self, env: Env[ObsType, ActType]):
+        """Constructor for the action wrapper."""
+        Wrapper.__init__(self, env)
+
+    def step(
+        self, action: WrapperActType
+    ) -> tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]:
+        """Runs the :attr:`env` :meth:`env.step` using the modified ``action`` from :meth:`self.action`."""
+        return self.env.step(self.action(action))
+
+    def action(self, action: WrapperActType) -> ActType:
+        """Returns a modified action before :meth:`env.step` is called.
+
+        Args:
+            action: The original :meth:`step` actions
+
+        Returns:
+            The modified actions
+        """
+        raise NotImplementedError
--- a/rl/Lib/site-packages/gymnasium/envs/init.py
+++ b/rl/Lib/site-packages/gymnasium/envs/init.py
@ -0,0 +1,387 @@
+"""Registers the internal gym envs then loads the env plugins for module using the entry point."""
+from typing import Any
+
+from gymnasium.envs.registration import (
+    load_plugin_envs,
+    make,
+    pprint_registry,
+    register,
+    registry,
+    spec,
+)
+
+
+# Classic
+# ----------------------------------------
+
+register(
+    id="CartPole-v0",
+    entry_point="gymnasium.envs.classic_control.cartpole:CartPoleEnv",
+    vector_entry_point="gymnasium.envs.classic_control.cartpole:CartPoleVectorEnv",
+    max_episode_steps=200,
+    reward_threshold=195.0,
+)
+
+register(
+    id="CartPole-v1",
+    entry_point="gymnasium.envs.classic_control.cartpole:CartPoleEnv",
+    vector_entry_point="gymnasium.envs.classic_control.cartpole:CartPoleVectorEnv",
+    max_episode_steps=500,
+    reward_threshold=475.0,
+)
+
+register(
+    id="MountainCar-v0",
+    entry_point="gymnasium.envs.classic_control.mountain_car:MountainCarEnv",
+    max_episode_steps=200,
+    reward_threshold=-110.0,
+)
+
+register(
+    id="MountainCarContinuous-v0",
+    entry_point="gymnasium.envs.classic_control.continuous_mountain_car:Continuous_MountainCarEnv",
+    max_episode_steps=999,
+    reward_threshold=90.0,
+)
+
+register(
+    id="Pendulum-v1",
+    entry_point="gymnasium.envs.classic_control.pendulum:PendulumEnv",
+    max_episode_steps=200,
+)
+
+register(
+    id="Acrobot-v1",
+    entry_point="gymnasium.envs.classic_control.acrobot:AcrobotEnv",
+    reward_threshold=-100.0,
+    max_episode_steps=500,
+)
+
+
+# Phys2d (jax classic control)
+# ----------------------------------------
+
+register(
+    id="phys2d/CartPole-v0",
+    entry_point="gymnasium.envs.phys2d.cartpole:CartPoleJaxEnv",
+    vector_entry_point="gymnasium.envs.phys2d.cartpole:CartPoleJaxVectorEnv",
+    max_episode_steps=200,
+    reward_threshold=195.0,
+)
+
+register(
+    id="phys2d/CartPole-v1",
+    entry_point="gymnasium.envs.phys2d.cartpole:CartPoleJaxEnv",
+    vector_entry_point="gymnasium.envs.phys2d.cartpole:CartPoleJaxVectorEnv",
+    max_episode_steps=500,
+    reward_threshold=475.0,
+)
+
+register(
+    id="phys2d/Pendulum-v0",
+    entry_point="gymnasium.envs.phys2d.pendulum:PendulumJaxEnv",
+    vector_entry_point="gymnasium.envs.phys2d.pendulum:PendulumJaxVectorEnv",
+    max_episode_steps=200,
+)
+
+# Box2d
+# ----------------------------------------
+
+register(
+    id="LunarLander-v2",
+    entry_point="gymnasium.envs.box2d.lunar_lander:LunarLander",
+    max_episode_steps=1000,
+    reward_threshold=200,
+)
+
+register(
+    id="LunarLanderContinuous-v2",
+    entry_point="gymnasium.envs.box2d.lunar_lander:LunarLander",
+    kwargs={"continuous": True},
+    max_episode_steps=1000,
+    reward_threshold=200,
+)
+
+register(
+    id="BipedalWalker-v3",
+    entry_point="gymnasium.envs.box2d.bipedal_walker:BipedalWalker",
+    max_episode_steps=1600,
+    reward_threshold=300,
+)
+
+register(
+    id="BipedalWalkerHardcore-v3",
+    entry_point="gymnasium.envs.box2d.bipedal_walker:BipedalWalker",
+    kwargs={"hardcore": True},
+    max_episode_steps=2000,
+    reward_threshold=300,
+)
+
+register(
+    id="CarRacing-v2",
+    entry_point="gymnasium.envs.box2d.car_racing:CarRacing",
+    max_episode_steps=1000,
+    reward_threshold=900,
+)
+
+# Toy Text
+# ----------------------------------------
+
+register(
+    id="Blackjack-v1",
+    entry_point="gymnasium.envs.toy_text.blackjack:BlackjackEnv",
+    kwargs={"sab": True, "natural": False},
+)
+
+register(
+    id="FrozenLake-v1",
+    entry_point="gymnasium.envs.toy_text.frozen_lake:FrozenLakeEnv",
+    kwargs={"map_name": "4x4"},
+    max_episode_steps=100,
+    reward_threshold=0.70,  # optimum = 0.74
+)
+
+register(
+    id="FrozenLake8x8-v1",
+    entry_point="gymnasium.envs.toy_text.frozen_lake:FrozenLakeEnv",
+    kwargs={"map_name": "8x8"},
+    max_episode_steps=200,
+    reward_threshold=0.85,  # optimum = 0.91
+)
+
+register(
+    id="CliffWalking-v0",
+    entry_point="gymnasium.envs.toy_text.cliffwalking:CliffWalkingEnv",
+)
+
+register(
+    id="Taxi-v3",
+    entry_point="gymnasium.envs.toy_text.taxi:TaxiEnv",
+    reward_threshold=8,  # optimum = 8.46
+    max_episode_steps=200,
+)
+
+
+# Tabular
+# ----------------------------------------
+
+register(
+    id="tabular/Blackjack-v0",
+    entry_point="gymnasium.envs.tabular.blackjack:BlackJackJaxEnv",
+    kwargs={"sutton_and_barto": True, "natural": False},
+)
+
+register(
+    id="tabular/CliffWalking-v0",
+    entry_point="gymnasium.envs.tabular.cliffwalking:CliffWalkingJaxEnv",
+)
+
+
+# Mujoco
+# ----------------------------------------
+
+# 2D
+
+register(
+    id="Reacher-v2",
+    entry_point="gymnasium.envs.mujoco:ReacherEnv",
+    max_episode_steps=50,
+    reward_threshold=-3.75,
+)
+
+register(
+    id="Reacher-v4",
+    entry_point="gymnasium.envs.mujoco.reacher_v4:ReacherEnv",
+    max_episode_steps=50,
+    reward_threshold=-3.75,
+)
+
+register(
+    id="Pusher-v2",
+    entry_point="gymnasium.envs.mujoco:PusherEnv",
+    max_episode_steps=100,
+    reward_threshold=0.0,
+)
+
+register(
+    id="Pusher-v4",
+    entry_point="gymnasium.envs.mujoco.pusher_v4:PusherEnv",
+    max_episode_steps=100,
+    reward_threshold=0.0,
+)
+
+register(
+    id="InvertedPendulum-v2",
+    entry_point="gymnasium.envs.mujoco:InvertedPendulumEnv",
+    max_episode_steps=1000,
+    reward_threshold=950.0,
+)
+
+register(
+    id="InvertedPendulum-v4",
+    entry_point="gymnasium.envs.mujoco.inverted_pendulum_v4:InvertedPendulumEnv",
+    max_episode_steps=1000,
+    reward_threshold=950.0,
+)
+
+register(
+    id="InvertedDoublePendulum-v2",
+    entry_point="gymnasium.envs.mujoco:InvertedDoublePendulumEnv",
+    max_episode_steps=1000,
+    reward_threshold=9100.0,
+)
+
+register(
+    id="InvertedDoublePendulum-v4",
+    entry_point="gymnasium.envs.mujoco.inverted_double_pendulum_v4:InvertedDoublePendulumEnv",
+    max_episode_steps=1000,
+    reward_threshold=9100.0,
+)
+
+register(
+    id="HalfCheetah-v2",
+    entry_point="gymnasium.envs.mujoco:HalfCheetahEnv",
+    max_episode_steps=1000,
+    reward_threshold=4800.0,
+)
+
+register(
+    id="HalfCheetah-v3",
+    entry_point="gymnasium.envs.mujoco.half_cheetah_v3:HalfCheetahEnv",
+    max_episode_steps=1000,
+    reward_threshold=4800.0,
+)
+
+register(
+    id="HalfCheetah-v4",
+    entry_point="gymnasium.envs.mujoco.half_cheetah_v4:HalfCheetahEnv",
+    max_episode_steps=1000,
+    reward_threshold=4800.0,
+)
+
+register(
+    id="Hopper-v2",
+    entry_point="gymnasium.envs.mujoco:HopperEnv",
+    max_episode_steps=1000,
+    reward_threshold=3800.0,
+)
+
+register(
+    id="Hopper-v3",
+    entry_point="gymnasium.envs.mujoco.hopper_v3:HopperEnv",
+    max_episode_steps=1000,
+    reward_threshold=3800.0,
+)
+
+register(
+    id="Hopper-v4",
+    entry_point="gymnasium.envs.mujoco.hopper_v4:HopperEnv",
+    max_episode_steps=1000,
+    reward_threshold=3800.0,
+)
+
+register(
+    id="Swimmer-v2",
+    entry_point="gymnasium.envs.mujoco:SwimmerEnv",
+    max_episode_steps=1000,
+    reward_threshold=360.0,
+)
+
+register(
+    id="Swimmer-v3",
+    entry_point="gymnasium.envs.mujoco.swimmer_v3:SwimmerEnv",
+    max_episode_steps=1000,
+    reward_threshold=360.0,
+)
+
+register(
+    id="Swimmer-v4",
+    entry_point="gymnasium.envs.mujoco.swimmer_v4:SwimmerEnv",
+    max_episode_steps=1000,
+    reward_threshold=360.0,
+)
+
+register(
+    id="Walker2d-v2",
+    max_episode_steps=1000,
+    entry_point="gymnasium.envs.mujoco:Walker2dEnv",
+)
+
+register(
+    id="Walker2d-v3",
+    max_episode_steps=1000,
+    entry_point="gymnasium.envs.mujoco.walker2d_v3:Walker2dEnv",
+)
+
+register(
+    id="Walker2d-v4",
+    max_episode_steps=1000,
+    entry_point="gymnasium.envs.mujoco.walker2d_v4:Walker2dEnv",
+)
+
+register(
+    id="Ant-v2",
+    entry_point="gymnasium.envs.mujoco:AntEnv",
+    max_episode_steps=1000,
+    reward_threshold=6000.0,
+)
+
+register(
+    id="Ant-v3",
+    entry_point="gymnasium.envs.mujoco.ant_v3:AntEnv",
+    max_episode_steps=1000,
+    reward_threshold=6000.0,
+)
+
+register(
+    id="Ant-v4",
+    entry_point="gymnasium.envs.mujoco.ant_v4:AntEnv",
+    max_episode_steps=1000,
+    reward_threshold=6000.0,
+)
+
+register(
+    id="Humanoid-v2",
+    entry_point="gymnasium.envs.mujoco:HumanoidEnv",
+    max_episode_steps=1000,
+)
+
+register(
+    id="Humanoid-v3",
+    entry_point="gymnasium.envs.mujoco.humanoid_v3:HumanoidEnv",
+    max_episode_steps=1000,
+)
+
+register(
+    id="Humanoid-v4",
+    entry_point="gymnasium.envs.mujoco.humanoid_v4:HumanoidEnv",
+    max_episode_steps=1000,
+)
+
+register(
+    id="HumanoidStandup-v2",
+    entry_point="gymnasium.envs.mujoco:HumanoidStandupEnv",
+    max_episode_steps=1000,
+)
+
+register(
+    id="HumanoidStandup-v4",
+    entry_point="gymnasium.envs.mujoco.humanoidstandup_v4:HumanoidStandupEnv",
+    max_episode_steps=1000,
+)
+
+
+# --- For shimmy compatibility
+def _raise_shimmy_error(*args: Any, **kwargs: Any):
+    raise ImportError(
+        "To use the gym compatibility environments, run `pip install shimmy[gym-v21]` or `pip install shimmy[gym-v26]`"
+    )
+
+
+# When installed, shimmy will re-register these environments with the correct entry_point
+register(id="GymV21Environment-v0", entry_point=_raise_shimmy_error)
+register(id="GymV26Environment-v0", entry_point=_raise_shimmy_error)
+
+
+# Hook to load plugins from entry points
+load_plugin_envs()
--- a/rl/Lib/site-packages/gymnasium/envs/pycache/init.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/pycache/init.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/pycache/registration.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/pycache/registration.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/init.py
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/init.py
@ -0,0 +1,3 @@
+from gymnasium.envs.box2d.bipedal_walker import BipedalWalker, BipedalWalkerHardcore
+from gymnasium.envs.box2d.car_racing import CarRacing
+from gymnasium.envs.box2d.lunar_lander import LunarLander, LunarLanderContinuous
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/init.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/init.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/bipedal_walker.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/bipedal_walker.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/car_dynamics.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/car_dynamics.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/car_racing.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/car_racing.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/lunar_lander.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/pycache/lunar_lander.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/bipedal_walker.py
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/bipedal_walker.py
@ -0,0 +1,861 @@
+__credits__ = ["Andrea PIERRÉ"]
+
+import math
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import error, spaces
+from gymnasium.error import DependencyNotInstalled
+from gymnasium.utils import EzPickle
+
+
+try:
+    import Box2D
+    from Box2D.b2 import (
+        circleShape,
+        contactListener,
+        edgeShape,
+        fixtureDef,
+        polygonShape,
+        revoluteJointDef,
+    )
+except ImportError as e:
+    raise DependencyNotInstalled(
+        "Box2D is not installed, run `pip install gymnasium[box2d]`"
+    ) from e
+
+
+if TYPE_CHECKING:
+    import pygame
+
+FPS = 50
+SCALE = 30.0  # affects how fast-paced the game is, forces should be adjusted as well
+
+MOTORS_TORQUE = 80
+SPEED_HIP = 4
+SPEED_KNEE = 6
+LIDAR_RANGE = 160 / SCALE
+
+INITIAL_RANDOM = 5
+
+HULL_POLY = [(-30, +9), (+6, +9), (+34, +1), (+34, -8), (-30, -8)]
+LEG_DOWN = -8 / SCALE
+LEG_W, LEG_H = 8 / SCALE, 34 / SCALE
+
+VIEWPORT_W = 600
+VIEWPORT_H = 400
+
+TERRAIN_STEP = 14 / SCALE
+TERRAIN_LENGTH = 200  # in steps
+TERRAIN_HEIGHT = VIEWPORT_H / SCALE / 4
+TERRAIN_GRASS = 10  # low long are grass spots, in steps
+TERRAIN_STARTPAD = 20  # in steps
+FRICTION = 2.5
+
+HULL_FD = fixtureDef(
+    shape=polygonShape(vertices=[(x / SCALE, y / SCALE) for x, y in HULL_POLY]),
+    density=5.0,
+    friction=0.1,
+    categoryBits=0x0020,
+    maskBits=0x001,  # collide only with ground
+    restitution=0.0,
+)  # 0.99 bouncy
+
+LEG_FD = fixtureDef(
+    shape=polygonShape(box=(LEG_W / 2, LEG_H / 2)),
+    density=1.0,
+    restitution=0.0,
+    categoryBits=0x0020,
+    maskBits=0x001,
+)
+
+LOWER_FD = fixtureDef(
+    shape=polygonShape(box=(0.8 * LEG_W / 2, LEG_H / 2)),
+    density=1.0,
+    restitution=0.0,
+    categoryBits=0x0020,
+    maskBits=0x001,
+)
+
+
+class ContactDetector(contactListener):
+    def __init__(self, env):
+        contactListener.__init__(self)
+        self.env = env
+
+    def BeginContact(self, contact):
+        if (
+            self.env.hull == contact.fixtureA.body
+            or self.env.hull == contact.fixtureB.body
+        ):
+            self.env.game_over = True
+        for leg in [self.env.legs[1], self.env.legs[3]]:
+            if leg in [contact.fixtureA.body, contact.fixtureB.body]:
+                leg.ground_contact = True
+
+    def EndContact(self, contact):
+        for leg in [self.env.legs[1], self.env.legs[3]]:
+            if leg in [contact.fixtureA.body, contact.fixtureB.body]:
+                leg.ground_contact = False
+
+
+class BipedalWalker(gym.Env, EzPickle):
+    """
+    ## Description
+    This is a simple 4-joint walker robot environment.
+    There are two versions:
+    - Normal, with slightly uneven terrain.
+    - Hardcore, with ladders, stumps, pitfalls.
+
+    To solve the normal version, you need to get 300 points in 1600 time steps.
+    To solve the hardcore version, you need 300 points in 2000 time steps.
+
+    A heuristic is provided for testing. It's also useful to get demonstrations
+    to learn from. To run the heuristic:
+    ```
+    python gymnasium/envs/box2d/bipedal_walker.py
+    ```
+
+    ## Action Space
+    Actions are motor speed values in the [-1, 1] range for each of the
+    4 joints at both hips and knees.
+
+    ## Observation Space
+    State consists of hull angle speed, angular velocity, horizontal speed,
+    vertical speed, position of joints and joints angular speed, legs contact
+    with ground, and 10 lidar rangefinder measurements. There are no coordinates
+    in the state vector.
+
+    ## Rewards
+    Reward is given for moving forward, totaling 300+ points up to the far end.
+    If the robot falls, it gets -100. Applying motor torque costs a small
+    amount of points. A more optimal agent will get a better score.
+
+    ## Starting State
+    The walker starts standing at the left end of the terrain with the hull
+    horizontal, and both legs in the same position with a slight knee angle.
+
+    ## Episode Termination
+    The episode will terminate if the hull gets in contact with the ground or
+    if the walker exceeds the right end of the terrain length.
+
+    ## Arguments
+    To use to the _hardcore_ environment, you need to specify the
+    `hardcore=True` argument like below:
+    ```python
+    import gymnasium as gym
+    env = gym.make("BipedalWalker-v3", hardcore=True)
+    ```
+
+    ## Version History
+    - v3: Returns the closest lidar trace instead of furthest;
+        faster video recording
+    - v2: Count energy spent
+    - v1: Legs now report contact with ground; motors have higher torque and
+        speed; ground has higher friction; lidar rendered less nervously.
+    - v0: Initial version
+
+
+    <!-- ## References -->
+
+    ## Credits
+    Created by Oleg Klimov
+
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": FPS,
+    }
+
+    def __init__(self, render_mode: Optional[str] = None, hardcore: bool = False):
+        EzPickle.__init__(self, render_mode, hardcore)
+        self.isopen = True
+
+        self.world = Box2D.b2World()
+        self.terrain: List[Box2D.b2Body] = []
+        self.hull: Optional[Box2D.b2Body] = None
+
+        self.prev_shaping = None
+
+        self.hardcore = hardcore
+
+        self.fd_polygon = fixtureDef(
+            shape=polygonShape(vertices=[(0, 0), (1, 0), (1, -1), (0, -1)]),
+            friction=FRICTION,
+        )
+
+        self.fd_edge = fixtureDef(
+            shape=edgeShape(vertices=[(0, 0), (1, 1)]),
+            friction=FRICTION,
+            categoryBits=0x0001,
+        )
+
+        # we use 5.0 to represent the joints moving at maximum
+        # 5 x the rated speed due to impulses from ground contact etc.
+        low = np.array(
+            [
+                -math.pi,
+                -5.0,
+                -5.0,
+                -5.0,
+                -math.pi,
+                -5.0,
+                -math.pi,
+                -5.0,
+                -0.0,
+                -math.pi,
+                -5.0,
+                -math.pi,
+                -5.0,
+                -0.0,
+            ]
+            + [-1.0] * 10
+        ).astype(np.float32)
+        high = np.array(
+            [
+                math.pi,
+                5.0,
+                5.0,
+                5.0,
+                math.pi,
+                5.0,
+                math.pi,
+                5.0,
+                5.0,
+                math.pi,
+                5.0,
+                math.pi,
+                5.0,
+                5.0,
+            ]
+            + [1.0] * 10
+        ).astype(np.float32)
+        self.action_space = spaces.Box(
+            np.array([-1, -1, -1, -1]).astype(np.float32),
+            np.array([1, 1, 1, 1]).astype(np.float32),
+        )
+        self.observation_space = spaces.Box(low, high)
+
+        # state = [
+        #     self.hull.angle,  # Normal angles up to 0.5 here, but sure more is possible.
+        #     2.0 * self.hull.angularVelocity / FPS,
+        #     0.3 * vel.x * (VIEWPORT_W / SCALE) / FPS,  # Normalized to get -1..1 range
+        #     0.3 * vel.y * (VIEWPORT_H / SCALE) / FPS,
+        #     self.joints[
+        #         0
+        #     ].angle,  # This will give 1.1 on high up, but it's still OK (and there should be spikes on hiting the ground, that's normal too)
+        #     self.joints[0].speed / SPEED_HIP,
+        #     self.joints[1].angle + 1.0,
+        #     self.joints[1].speed / SPEED_KNEE,
+        #     1.0 if self.legs[1].ground_contact else 0.0,
+        #     self.joints[2].angle,
+        #     self.joints[2].speed / SPEED_HIP,
+        #     self.joints[3].angle + 1.0,
+        #     self.joints[3].speed / SPEED_KNEE,
+        #     1.0 if self.legs[3].ground_contact else 0.0,
+        # ]
+        # state += [l.fraction for l in self.lidar]
+
+        self.render_mode = render_mode
+        self.screen: Optional[pygame.Surface] = None
+        self.clock = None
+
+    def _destroy(self):
+        if not self.terrain:
+            return
+        self.world.contactListener = None
+        for t in self.terrain:
+            self.world.DestroyBody(t)
+        self.terrain = []
+        self.world.DestroyBody(self.hull)
+        self.hull = None
+        for leg in self.legs:
+            self.world.DestroyBody(leg)
+        self.legs = []
+        self.joints = []
+
+    def _generate_terrain(self, hardcore):
+        GRASS, STUMP, STAIRS, PIT, _STATES_ = range(5)
+        state = GRASS
+        velocity = 0.0
+        y = TERRAIN_HEIGHT
+        counter = TERRAIN_STARTPAD
+        oneshot = False
+        self.terrain = []
+        self.terrain_x = []
+        self.terrain_y = []
+
+        stair_steps, stair_width, stair_height = 0, 0, 0
+        original_y = 0
+        for i in range(TERRAIN_LENGTH):
+            x = i * TERRAIN_STEP
+            self.terrain_x.append(x)
+
+            if state == GRASS and not oneshot:
+                velocity = 0.8 * velocity + 0.01 * np.sign(TERRAIN_HEIGHT - y)
+                if i > TERRAIN_STARTPAD:
+                    velocity += self.np_random.uniform(-1, 1) / SCALE  # 1
+                y += velocity
+
+            elif state == PIT and oneshot:
+                counter = self.np_random.integers(3, 5)
+                poly = [
+                    (x, y),
+                    (x + TERRAIN_STEP, y),
+                    (x + TERRAIN_STEP, y - 4 * TERRAIN_STEP),
+                    (x, y - 4 * TERRAIN_STEP),
+                ]
+                self.fd_polygon.shape.vertices = poly
+                t = self.world.CreateStaticBody(fixtures=self.fd_polygon)
+                t.color1, t.color2 = (255, 255, 255), (153, 153, 153)
+                self.terrain.append(t)
+
+                self.fd_polygon.shape.vertices = [
+                    (p[0] + TERRAIN_STEP * counter, p[1]) for p in poly
+                ]
+                t = self.world.CreateStaticBody(fixtures=self.fd_polygon)
+                t.color1, t.color2 = (255, 255, 255), (153, 153, 153)
+                self.terrain.append(t)
+                counter += 2
+                original_y = y
+
+            elif state == PIT and not oneshot:
+                y = original_y
+                if counter > 1:
+                    y -= 4 * TERRAIN_STEP
+
+            elif state == STUMP and oneshot:
+                counter = self.np_random.integers(1, 3)
+                poly = [
+                    (x, y),
+                    (x + counter * TERRAIN_STEP, y),
+                    (x + counter * TERRAIN_STEP, y + counter * TERRAIN_STEP),
+                    (x, y + counter * TERRAIN_STEP),
+                ]
+                self.fd_polygon.shape.vertices = poly
+                t = self.world.CreateStaticBody(fixtures=self.fd_polygon)
+                t.color1, t.color2 = (255, 255, 255), (153, 153, 153)
+                self.terrain.append(t)
+
+            elif state == STAIRS and oneshot:
+                stair_height = +1 if self.np_random.random() > 0.5 else -1
+                stair_width = self.np_random.integers(4, 5)
+                stair_steps = self.np_random.integers(3, 5)
+                original_y = y
+                for s in range(stair_steps):
+                    poly = [
+                        (
+                            x + (s * stair_width) * TERRAIN_STEP,
+                            y + (s * stair_height) * TERRAIN_STEP,
+                        ),
+                        (
+                            x + ((1 + s) * stair_width) * TERRAIN_STEP,
+                            y + (s * stair_height) * TERRAIN_STEP,
+                        ),
+                        (
+                            x + ((1 + s) * stair_width) * TERRAIN_STEP,
+                            y + (-1 + s * stair_height) * TERRAIN_STEP,
+                        ),
+                        (
+                            x + (s * stair_width) * TERRAIN_STEP,
+                            y + (-1 + s * stair_height) * TERRAIN_STEP,
+                        ),
+                    ]
+                    self.fd_polygon.shape.vertices = poly
+                    t = self.world.CreateStaticBody(fixtures=self.fd_polygon)
+                    t.color1, t.color2 = (255, 255, 255), (153, 153, 153)
+                    self.terrain.append(t)
+                counter = stair_steps * stair_width
+
+            elif state == STAIRS and not oneshot:
+                s = stair_steps * stair_width - counter - stair_height
+                n = s / stair_width
+                y = original_y + (n * stair_height) * TERRAIN_STEP
+
+            oneshot = False
+            self.terrain_y.append(y)
+            counter -= 1
+            if counter == 0:
+                counter = self.np_random.integers(TERRAIN_GRASS / 2, TERRAIN_GRASS)
+                if state == GRASS and hardcore:
+                    state = self.np_random.integers(1, _STATES_)
+                    oneshot = True
+                else:
+                    state = GRASS
+                    oneshot = True
+
+        self.terrain_poly = []
+        for i in range(TERRAIN_LENGTH - 1):
+            poly = [
+                (self.terrain_x[i], self.terrain_y[i]),
+                (self.terrain_x[i + 1], self.terrain_y[i + 1]),
+            ]
+            self.fd_edge.shape.vertices = poly
+            t = self.world.CreateStaticBody(fixtures=self.fd_edge)
+            color = (76, 255 if i % 2 == 0 else 204, 76)
+            t.color1 = color
+            t.color2 = color
+            self.terrain.append(t)
+            color = (102, 153, 76)
+            poly += [(poly[1][0], 0), (poly[0][0], 0)]
+            self.terrain_poly.append((poly, color))
+        self.terrain.reverse()
+
+    def _generate_clouds(self):
+        # Sorry for the clouds, couldn't resist
+        self.cloud_poly = []
+        for i in range(TERRAIN_LENGTH // 20):
+            x = self.np_random.uniform(0, TERRAIN_LENGTH) * TERRAIN_STEP
+            y = VIEWPORT_H / SCALE * 3 / 4
+            poly = [
+                (
+                    x
+                    + 15 * TERRAIN_STEP * math.sin(3.14 * 2 * a / 5)
+                    + self.np_random.uniform(0, 5 * TERRAIN_STEP),
+                    y
+                    + 5 * TERRAIN_STEP * math.cos(3.14 * 2 * a / 5)
+                    + self.np_random.uniform(0, 5 * TERRAIN_STEP),
+                )
+                for a in range(5)
+            ]
+            x1 = min(p[0] for p in poly)
+            x2 = max(p[0] for p in poly)
+            self.cloud_poly.append((poly, x1, x2))
+
+    def reset(
+        self,
+        *,
+        seed: Optional[int] = None,
+        options: Optional[dict] = None,
+    ):
+        super().reset(seed=seed)
+        self._destroy()
+        self.world.contactListener_bug_workaround = ContactDetector(self)
+        self.world.contactListener = self.world.contactListener_bug_workaround
+        self.game_over = False
+        self.prev_shaping = None
+        self.scroll = 0.0
+        self.lidar_render = 0
+
+        self._generate_terrain(self.hardcore)
+        self._generate_clouds()
+
+        init_x = TERRAIN_STEP * TERRAIN_STARTPAD / 2
+        init_y = TERRAIN_HEIGHT + 2 * LEG_H
+        self.hull = self.world.CreateDynamicBody(
+            position=(init_x, init_y), fixtures=HULL_FD
+        )
+        self.hull.color1 = (127, 51, 229)
+        self.hull.color2 = (76, 76, 127)
+        self.hull.ApplyForceToCenter(
+            (self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), 0), True
+        )
+
+        self.legs: List[Box2D.b2Body] = []
+        self.joints: List[Box2D.b2RevoluteJoint] = []
+        for i in [-1, +1]:
+            leg = self.world.CreateDynamicBody(
+                position=(init_x, init_y - LEG_H / 2 - LEG_DOWN),
+                angle=(i * 0.05),
+                fixtures=LEG_FD,
+            )
+            leg.color1 = (153 - i * 25, 76 - i * 25, 127 - i * 25)
+            leg.color2 = (102 - i * 25, 51 - i * 25, 76 - i * 25)
+            rjd = revoluteJointDef(
+                bodyA=self.hull,
+                bodyB=leg,
+                localAnchorA=(0, LEG_DOWN),
+                localAnchorB=(0, LEG_H / 2),
+                enableMotor=True,
+                enableLimit=True,
+                maxMotorTorque=MOTORS_TORQUE,
+                motorSpeed=i,
+                lowerAngle=-0.8,
+                upperAngle=1.1,
+            )
+            self.legs.append(leg)
+            self.joints.append(self.world.CreateJoint(rjd))
+
+            lower = self.world.CreateDynamicBody(
+                position=(init_x, init_y - LEG_H * 3 / 2 - LEG_DOWN),
+                angle=(i * 0.05),
+                fixtures=LOWER_FD,
+            )
+            lower.color1 = (153 - i * 25, 76 - i * 25, 127 - i * 25)
+            lower.color2 = (102 - i * 25, 51 - i * 25, 76 - i * 25)
+            rjd = revoluteJointDef(
+                bodyA=leg,
+                bodyB=lower,
+                localAnchorA=(0, -LEG_H / 2),
+                localAnchorB=(0, LEG_H / 2),
+                enableMotor=True,
+                enableLimit=True,
+                maxMotorTorque=MOTORS_TORQUE,
+                motorSpeed=1,
+                lowerAngle=-1.6,
+                upperAngle=-0.1,
+            )
+            lower.ground_contact = False
+            self.legs.append(lower)
+            self.joints.append(self.world.CreateJoint(rjd))
+
+        self.drawlist = self.terrain + self.legs + [self.hull]
+
+        class LidarCallback(Box2D.b2.rayCastCallback):
+            def ReportFixture(self, fixture, point, normal, fraction):
+                if (fixture.filterData.categoryBits & 1) == 0:
+                    return -1
+                self.p2 = point
+                self.fraction = fraction
+                return fraction
+
+        self.lidar = [LidarCallback() for _ in range(10)]
+        if self.render_mode == "human":
+            self.render()
+        return self.step(np.array([0, 0, 0, 0]))[0], {}
+
+    def step(self, action: np.ndarray):
+        assert self.hull is not None
+
+        # self.hull.ApplyForceToCenter((0, 20), True) -- Uncomment this to receive a bit of stability help
+        control_speed = False  # Should be easier as well
+        if control_speed:
+            self.joints[0].motorSpeed = float(SPEED_HIP * np.clip(action[0], -1, 1))
+            self.joints[1].motorSpeed = float(SPEED_KNEE * np.clip(action[1], -1, 1))
+            self.joints[2].motorSpeed = float(SPEED_HIP * np.clip(action[2], -1, 1))
+            self.joints[3].motorSpeed = float(SPEED_KNEE * np.clip(action[3], -1, 1))
+        else:
+            self.joints[0].motorSpeed = float(SPEED_HIP * np.sign(action[0]))
+            self.joints[0].maxMotorTorque = float(
+                MOTORS_TORQUE * np.clip(np.abs(action[0]), 0, 1)
+            )
+            self.joints[1].motorSpeed = float(SPEED_KNEE * np.sign(action[1]))
+            self.joints[1].maxMotorTorque = float(
+                MOTORS_TORQUE * np.clip(np.abs(action[1]), 0, 1)
+            )
+            self.joints[2].motorSpeed = float(SPEED_HIP * np.sign(action[2]))
+            self.joints[2].maxMotorTorque = float(
+                MOTORS_TORQUE * np.clip(np.abs(action[2]), 0, 1)
+            )
+            self.joints[3].motorSpeed = float(SPEED_KNEE * np.sign(action[3]))
+            self.joints[3].maxMotorTorque = float(
+                MOTORS_TORQUE * np.clip(np.abs(action[3]), 0, 1)
+            )
+
+        self.world.Step(1.0 / FPS, 6 * 30, 2 * 30)
+
+        pos = self.hull.position
+        vel = self.hull.linearVelocity
+
+        for i in range(10):
+            self.lidar[i].fraction = 1.0
+            self.lidar[i].p1 = pos
+            self.lidar[i].p2 = (
+                pos[0] + math.sin(1.5 * i / 10.0) * LIDAR_RANGE,
+                pos[1] - math.cos(1.5 * i / 10.0) * LIDAR_RANGE,
+            )
+            self.world.RayCast(self.lidar[i], self.lidar[i].p1, self.lidar[i].p2)
+
+        state = [
+            self.hull.angle,  # Normal angles up to 0.5 here, but sure more is possible.
+            2.0 * self.hull.angularVelocity / FPS,
+            0.3 * vel.x * (VIEWPORT_W / SCALE) / FPS,  # Normalized to get -1..1 range
+            0.3 * vel.y * (VIEWPORT_H / SCALE) / FPS,
+            self.joints[0].angle,
+            # This will give 1.1 on high up, but it's still OK (and there should be spikes on hiting the ground, that's normal too)
+            self.joints[0].speed / SPEED_HIP,
+            self.joints[1].angle + 1.0,
+            self.joints[1].speed / SPEED_KNEE,
+            1.0 if self.legs[1].ground_contact else 0.0,
+            self.joints[2].angle,
+            self.joints[2].speed / SPEED_HIP,
+            self.joints[3].angle + 1.0,
+            self.joints[3].speed / SPEED_KNEE,
+            1.0 if self.legs[3].ground_contact else 0.0,
+        ]
+        state += [l.fraction for l in self.lidar]
+        assert len(state) == 24
+
+        self.scroll = pos.x - VIEWPORT_W / SCALE / 5
+
+        shaping = (
+            130 * pos[0] / SCALE
+        )  # moving forward is a way to receive reward (normalized to get 300 on completion)
+        shaping -= 5.0 * abs(
+            state[0]
+        )  # keep head straight, other than that and falling, any behavior is unpunished
+
+        reward = 0
+        if self.prev_shaping is not None:
+            reward = shaping - self.prev_shaping
+        self.prev_shaping = shaping
+
+        for a in action:
+            reward -= 0.00035 * MOTORS_TORQUE * np.clip(np.abs(a), 0, 1)
+            # normalized to about -50.0 using heuristic, more optimal agent should spend less
+
+        terminated = False
+        if self.game_over or pos[0] < 0:
+            reward = -100
+            terminated = True
+        if pos[0] > (TERRAIN_LENGTH - TERRAIN_GRASS) * TERRAIN_STEP:
+            terminated = True
+
+        if self.render_mode == "human":
+            self.render()
+        return np.array(state, dtype=np.float32), reward, terminated, False, {}
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError as e:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[box2d]`"
+            ) from e
+
+        if self.screen is None and self.render_mode == "human":
+            pygame.init()
+            pygame.display.init()
+            self.screen = pygame.display.set_mode((VIEWPORT_W, VIEWPORT_H))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        self.surf = pygame.Surface(
+            (VIEWPORT_W + max(0.0, self.scroll) * SCALE, VIEWPORT_H)
+        )
+
+        pygame.transform.scale(self.surf, (SCALE, SCALE))
+
+        pygame.draw.polygon(
+            self.surf,
+            color=(215, 215, 255),
+            points=[
+                (self.scroll * SCALE, 0),
+                (self.scroll * SCALE + VIEWPORT_W, 0),
+                (self.scroll * SCALE + VIEWPORT_W, VIEWPORT_H),
+                (self.scroll * SCALE, VIEWPORT_H),
+            ],
+        )
+
+        for poly, x1, x2 in self.cloud_poly:
+            if x2 < self.scroll / 2:
+                continue
+            if x1 > self.scroll / 2 + VIEWPORT_W / SCALE:
+                continue
+            pygame.draw.polygon(
+                self.surf,
+                color=(255, 255, 255),
+                points=[
+                    (p[0] * SCALE + self.scroll * SCALE / 2, p[1] * SCALE) for p in poly
+                ],
+            )
+            gfxdraw.aapolygon(
+                self.surf,
+                [(p[0] * SCALE + self.scroll * SCALE / 2, p[1] * SCALE) for p in poly],
+                (255, 255, 255),
+            )
+        for poly, color in self.terrain_poly:
+            if poly[1][0] < self.scroll:
+                continue
+            if poly[0][0] > self.scroll + VIEWPORT_W / SCALE:
+                continue
+            scaled_poly = []
+            for coord in poly:
+                scaled_poly.append([coord[0] * SCALE, coord[1] * SCALE])
+            pygame.draw.polygon(self.surf, color=color, points=scaled_poly)
+            gfxdraw.aapolygon(self.surf, scaled_poly, color)
+
+        self.lidar_render = (self.lidar_render + 1) % 100
+        i = self.lidar_render
+        if i < 2 * len(self.lidar):
+            single_lidar = (
+                self.lidar[i]
+                if i < len(self.lidar)
+                else self.lidar[len(self.lidar) - i - 1]
+            )
+            if hasattr(single_lidar, "p1") and hasattr(single_lidar, "p2"):
+                pygame.draw.line(
+                    self.surf,
+                    color=(255, 0, 0),
+                    start_pos=(single_lidar.p1[0] * SCALE, single_lidar.p1[1] * SCALE),
+                    end_pos=(single_lidar.p2[0] * SCALE, single_lidar.p2[1] * SCALE),
+                    width=1,
+                )
+
+        for obj in self.drawlist:
+            for f in obj.fixtures:
+                trans = f.body.transform
+                if type(f.shape) is circleShape:
+                    pygame.draw.circle(
+                        self.surf,
+                        color=obj.color1,
+                        center=trans * f.shape.pos * SCALE,
+                        radius=f.shape.radius * SCALE,
+                    )
+                    pygame.draw.circle(
+                        self.surf,
+                        color=obj.color2,
+                        center=trans * f.shape.pos * SCALE,
+                        radius=f.shape.radius * SCALE,
+                    )
+                else:
+                    path = [trans * v * SCALE for v in f.shape.vertices]
+                    if len(path) > 2:
+                        pygame.draw.polygon(self.surf, color=obj.color1, points=path)
+                        gfxdraw.aapolygon(self.surf, path, obj.color1)
+                        path.append(path[0])
+                        pygame.draw.polygon(
+                            self.surf, color=obj.color2, points=path, width=1
+                        )
+                        gfxdraw.aapolygon(self.surf, path, obj.color2)
+                    else:
+                        pygame.draw.aaline(
+                            self.surf,
+                            start_pos=path[0],
+                            end_pos=path[1],
+                            color=obj.color1,
+                        )
+
+        flagy1 = TERRAIN_HEIGHT * SCALE
+        flagy2 = flagy1 + 50
+        x = TERRAIN_STEP * 3 * SCALE
+        pygame.draw.aaline(
+            self.surf, color=(0, 0, 0), start_pos=(x, flagy1), end_pos=(x, flagy2)
+        )
+        f = [
+            (x, flagy2),
+            (x, flagy2 - 10),
+            (x + 25, flagy2 - 5),
+        ]
+        pygame.draw.polygon(self.surf, color=(230, 51, 0), points=f)
+        pygame.draw.lines(
+            self.surf, color=(0, 0, 0), points=f + [f[0]], width=1, closed=False
+        )
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+
+        if self.render_mode == "human":
+            assert self.screen is not None
+            self.screen.blit(self.surf, (-self.scroll * SCALE, 0))
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.surf)), axes=(1, 0, 2)
+            )[:, -VIEWPORT_W:]
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
+
+
+class BipedalWalkerHardcore:
+    def __init__(self):
+        raise error.Error(
+            "Error initializing BipedalWalkerHardcore Environment.\n"
+            "Currently, we do not support initializing this mode of environment by calling the class directly.\n"
+            "To use this environment, instead create it by specifying the hardcore keyword in gym.make, i.e.\n"
+            'gym.make("BipedalWalker-v3", hardcore=True)'
+        )
+
+
+if __name__ == "__main__":
+    # Heurisic: suboptimal, have no notion of balance.
+    env = BipedalWalker()
+    env.reset()
+    steps = 0
+    total_reward = 0
+    a = np.array([0.0, 0.0, 0.0, 0.0])
+    STAY_ON_ONE_LEG, PUT_OTHER_DOWN, PUSH_OFF = 1, 2, 3
+    SPEED = 0.29  # Will fall forward on higher speed
+    state = STAY_ON_ONE_LEG
+    moving_leg = 0
+    supporting_leg = 1 - moving_leg
+    SUPPORT_KNEE_ANGLE = +0.1
+    supporting_knee_angle = SUPPORT_KNEE_ANGLE
+    while True:
+        s, r, terminated, truncated, info = env.step(a)
+        total_reward += r
+        if steps % 20 == 0 or terminated or truncated:
+            print("\naction " + str([f"{x:+0.2f}" for x in a]))
+            print(f"step {steps} total_reward {total_reward:+0.2f}")
+            print("hull " + str([f"{x:+0.2f}" for x in s[0:4]]))
+            print("leg0 " + str([f"{x:+0.2f}" for x in s[4:9]]))
+            print("leg1 " + str([f"{x:+0.2f}" for x in s[9:14]]))
+        steps += 1
+
+        contact0 = s[8]
+        contact1 = s[13]
+        moving_s_base = 4 + 5 * moving_leg
+        supporting_s_base = 4 + 5 * supporting_leg
+
+        hip_targ = [None, None]  # -0.8 .. +1.1
+        knee_targ = [None, None]  # -0.6 .. +0.9
+        hip_todo = [0.0, 0.0]
+        knee_todo = [0.0, 0.0]
+
+        if state == STAY_ON_ONE_LEG:
+            hip_targ[moving_leg] = 1.1
+            knee_targ[moving_leg] = -0.6
+            supporting_knee_angle += 0.03
+            if s[2] > SPEED:
+                supporting_knee_angle += 0.03
+            supporting_knee_angle = min(supporting_knee_angle, SUPPORT_KNEE_ANGLE)
+            knee_targ[supporting_leg] = supporting_knee_angle
+            if s[supporting_s_base + 0] < 0.10:  # supporting leg is behind
+                state = PUT_OTHER_DOWN
+        if state == PUT_OTHER_DOWN:
+            hip_targ[moving_leg] = +0.1
+            knee_targ[moving_leg] = SUPPORT_KNEE_ANGLE
+            knee_targ[supporting_leg] = supporting_knee_angle
+            if s[moving_s_base + 4]:
+                state = PUSH_OFF
+                supporting_knee_angle = min(s[moving_s_base + 2], SUPPORT_KNEE_ANGLE)
+        if state == PUSH_OFF:
+            knee_targ[moving_leg] = supporting_knee_angle
+            knee_targ[supporting_leg] = +1.0
+            if s[supporting_s_base + 2] > 0.88 or s[2] > 1.2 * SPEED:
+                state = STAY_ON_ONE_LEG
+                moving_leg = 1 - moving_leg
+                supporting_leg = 1 - moving_leg
+
+        if hip_targ[0]:
+            hip_todo[0] = 0.9 * (hip_targ[0] - s[4]) - 0.25 * s[5]
+        if hip_targ[1]:
+            hip_todo[1] = 0.9 * (hip_targ[1] - s[9]) - 0.25 * s[10]
+        if knee_targ[0]:
+            knee_todo[0] = 4.0 * (knee_targ[0] - s[6]) - 0.25 * s[7]
+        if knee_targ[1]:
+            knee_todo[1] = 4.0 * (knee_targ[1] - s[11]) - 0.25 * s[12]
+
+        hip_todo[0] -= 0.9 * (0 - s[0]) - 1.5 * s[1]  # PID to keep head strait
+        hip_todo[1] -= 0.9 * (0 - s[0]) - 1.5 * s[1]
+        knee_todo[0] -= 15.0 * s[3]  # vertical speed, to damp oscillations
+        knee_todo[1] -= 15.0 * s[3]
+
+        a[0] = hip_todo[0]
+        a[1] = knee_todo[0]
+        a[2] = hip_todo[1]
+        a[3] = knee_todo[1]
+        a = np.clip(0.5 * a, -1.0, 1.0)
+
+        if terminated or truncated:
+            break
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/car_dynamics.py
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/car_dynamics.py
@ -0,0 +1,356 @@
+"""
+Top-down car dynamics simulation.
+
+Some ideas are taken from this great tutorial http://www.iforce2d.net/b2dtut/top-down-car by Chris Campbell.
+This simulation is a bit more detailed, with wheels rotation.
+
+Created by Oleg Klimov
+"""
+
+import math
+
+import Box2D
+import numpy as np
+
+from gymnasium.error import DependencyNotInstalled
+
+
+try:
+    from Box2D.b2 import fixtureDef, polygonShape, revoluteJointDef
+except ImportError as e:
+    raise DependencyNotInstalled(
+        "Box2D is not installed, run `pip install gymnasium[box2d]`"
+    ) from e
+
+
+SIZE = 0.02
+ENGINE_POWER = 100000000 * SIZE * SIZE
+WHEEL_MOMENT_OF_INERTIA = 4000 * SIZE * SIZE
+FRICTION_LIMIT = (
+    1000000 * SIZE * SIZE
+)  # friction ~= mass ~= size^2 (calculated implicitly using density)
+WHEEL_R = 27
+WHEEL_W = 14
+WHEELPOS = [(-55, +80), (+55, +80), (-55, -82), (+55, -82)]
+HULL_POLY1 = [(-60, +130), (+60, +130), (+60, +110), (-60, +110)]
+HULL_POLY2 = [(-15, +120), (+15, +120), (+20, +20), (-20, 20)]
+HULL_POLY3 = [
+    (+25, +20),
+    (+50, -10),
+    (+50, -40),
+    (+20, -90),
+    (-20, -90),
+    (-50, -40),
+    (-50, -10),
+    (-25, +20),
+]
+HULL_POLY4 = [(-50, -120), (+50, -120), (+50, -90), (-50, -90)]
+WHEEL_COLOR = (0, 0, 0)
+WHEEL_WHITE = (77, 77, 77)
+MUD_COLOR = (102, 102, 0)
+
+
+class Car:
+    def __init__(self, world, init_angle, init_x, init_y):
+        self.world: Box2D.b2World = world
+        self.hull: Box2D.b2Body = self.world.CreateDynamicBody(
+            position=(init_x, init_y),
+            angle=init_angle,
+            fixtures=[
+                fixtureDef(
+                    shape=polygonShape(
+                        vertices=[(x * SIZE, y * SIZE) for x, y in HULL_POLY1]
+                    ),
+                    density=1.0,
+                ),
+                fixtureDef(
+                    shape=polygonShape(
+                        vertices=[(x * SIZE, y * SIZE) for x, y in HULL_POLY2]
+                    ),
+                    density=1.0,
+                ),
+                fixtureDef(
+                    shape=polygonShape(
+                        vertices=[(x * SIZE, y * SIZE) for x, y in HULL_POLY3]
+                    ),
+                    density=1.0,
+                ),
+                fixtureDef(
+                    shape=polygonShape(
+                        vertices=[(x * SIZE, y * SIZE) for x, y in HULL_POLY4]
+                    ),
+                    density=1.0,
+                ),
+            ],
+        )
+        self.hull.color = (0.8, 0.0, 0.0)
+        self.wheels = []
+        self.fuel_spent = 0.0
+        WHEEL_POLY = [
+            (-WHEEL_W, +WHEEL_R),
+            (+WHEEL_W, +WHEEL_R),
+            (+WHEEL_W, -WHEEL_R),
+            (-WHEEL_W, -WHEEL_R),
+        ]
+        for wx, wy in WHEELPOS:
+            front_k = 1.0 if wy > 0 else 1.0
+            w = self.world.CreateDynamicBody(
+                position=(init_x + wx * SIZE, init_y + wy * SIZE),
+                angle=init_angle,
+                fixtures=fixtureDef(
+                    shape=polygonShape(
+                        vertices=[
+                            (x * front_k * SIZE, y * front_k * SIZE)
+                            for x, y in WHEEL_POLY
+                        ]
+                    ),
+                    density=0.1,
+                    categoryBits=0x0020,
+                    maskBits=0x001,
+                    restitution=0.0,
+                ),
+            )
+            w.wheel_rad = front_k * WHEEL_R * SIZE
+            w.color = WHEEL_COLOR
+            w.gas = 0.0
+            w.brake = 0.0
+            w.steer = 0.0
+            w.phase = 0.0  # wheel angle
+            w.omega = 0.0  # angular velocity
+            w.skid_start = None
+            w.skid_particle = None
+            rjd = revoluteJointDef(
+                bodyA=self.hull,
+                bodyB=w,
+                localAnchorA=(wx * SIZE, wy * SIZE),
+                localAnchorB=(0, 0),
+                enableMotor=True,
+                enableLimit=True,
+                maxMotorTorque=180 * 900 * SIZE * SIZE,
+                motorSpeed=0,
+                lowerAngle=-0.4,
+                upperAngle=+0.4,
+            )
+            w.joint = self.world.CreateJoint(rjd)
+            w.tiles = set()
+            w.userData = w
+            self.wheels.append(w)
+        self.drawlist = self.wheels + [self.hull]
+        self.particles = []
+
+    def gas(self, gas):
+        """control: rear wheel drive
+
+        Args:
+            gas (float): How much gas gets applied. Gets clipped between 0 and 1.
+        """
+        gas = np.clip(gas, 0, 1)
+        for w in self.wheels[2:4]:
+            diff = gas - w.gas
+            if diff > 0.1:
+                diff = 0.1  # gradually increase, but stop immediately
+            w.gas += diff
+
+    def brake(self, b):
+        """control: brake
+
+        Args:
+            b (0..1): Degree to which the brakes are applied. More than 0.9 blocks the wheels to zero rotation
+        """
+        for w in self.wheels:
+            w.brake = b
+
+    def steer(self, s):
+        """control: steer
+
+        Args:
+            s (-1..1): target position, it takes time to rotate steering wheel from side-to-side
+        """
+        self.wheels[0].steer = s
+        self.wheels[1].steer = s
+
+    def step(self, dt):
+        for w in self.wheels:
+            # Steer each wheel
+            dir = np.sign(w.steer - w.joint.angle)
+            val = abs(w.steer - w.joint.angle)
+            w.joint.motorSpeed = dir * min(50.0 * val, 3.0)
+
+            # Position => friction_limit
+            grass = True
+            friction_limit = FRICTION_LIMIT * 0.6  # Grass friction if no tile
+            for tile in w.tiles:
+                friction_limit = max(
+                    friction_limit, FRICTION_LIMIT * tile.road_friction
+                )
+                grass = False
+
+            # Force
+            forw = w.GetWorldVector((0, 1))
+            side = w.GetWorldVector((1, 0))
+            v = w.linearVelocity
+            vf = forw[0] * v[0] + forw[1] * v[1]  # forward speed
+            vs = side[0] * v[0] + side[1] * v[1]  # side speed
+
+            # WHEEL_MOMENT_OF_INERTIA*np.square(w.omega)/2 = E -- energy
+            # WHEEL_MOMENT_OF_INERTIA*w.omega * domega/dt = dE/dt = W -- power
+            # domega = dt*W/WHEEL_MOMENT_OF_INERTIA/w.omega
+
+            # add small coef not to divide by zero
+            w.omega += (
+                dt
+                * ENGINE_POWER
+                * w.gas
+                / WHEEL_MOMENT_OF_INERTIA
+                / (abs(w.omega) + 5.0)
+            )
+            self.fuel_spent += dt * ENGINE_POWER * w.gas
+
+            if w.brake >= 0.9:
+                w.omega = 0
+            elif w.brake > 0:
+                BRAKE_FORCE = 15  # radians per second
+                dir = -np.sign(w.omega)
+                val = BRAKE_FORCE * w.brake
+                if abs(val) > abs(w.omega):
+                    val = abs(w.omega)  # low speed => same as = 0
+                w.omega += dir * val
+            w.phase += w.omega * dt
+
+            vr = w.omega * w.wheel_rad  # rotating wheel speed
+            f_force = -vf + vr  # force direction is direction of speed difference
+            p_force = -vs
+
+            # Physically correct is to always apply friction_limit until speed is equal.
+            # But dt is finite, that will lead to oscillations if difference is already near zero.
+
+            # Random coefficient to cut oscillations in few steps (have no effect on friction_limit)
+            f_force *= 205000 * SIZE * SIZE
+            p_force *= 205000 * SIZE * SIZE
+            force = np.sqrt(np.square(f_force) + np.square(p_force))
+
+            # Skid trace
+            if abs(force) > 2.0 * friction_limit:
+                if (
+                    w.skid_particle
+                    and w.skid_particle.grass == grass
+                    and len(w.skid_particle.poly) < 30
+                ):
+                    w.skid_particle.poly.append((w.position[0], w.position[1]))
+                elif w.skid_start is None:
+                    w.skid_start = w.position
+                else:
+                    w.skid_particle = self._create_particle(
+                        w.skid_start, w.position, grass
+                    )
+                    w.skid_start = None
+            else:
+                w.skid_start = None
+                w.skid_particle = None
+
+            if abs(force) > friction_limit:
+                f_force /= force
+                p_force /= force
+                force = friction_limit  # Correct physics here
+                f_force *= force
+                p_force *= force
+
+            w.omega -= dt * f_force * w.wheel_rad / WHEEL_MOMENT_OF_INERTIA
+
+            w.ApplyForceToCenter(
+                (
+                    p_force * side[0] + f_force * forw[0],
+                    p_force * side[1] + f_force * forw[1],
+                ),
+                True,
+            )
+
+    def draw(self, surface, zoom, translation, angle, draw_particles=True):
+        import pygame.draw
+
+        if draw_particles:
+            for p in self.particles:
+                poly = [pygame.math.Vector2(c).rotate_rad(angle) for c in p.poly]
+                poly = [
+                    (
+                        coords[0] * zoom + translation[0],
+                        coords[1] * zoom + translation[1],
+                    )
+                    for coords in poly
+                ]
+                pygame.draw.lines(
+                    surface, color=p.color, points=poly, width=2, closed=False
+                )
+
+        for obj in self.drawlist:
+            for f in obj.fixtures:
+                trans = f.body.transform
+                path = [trans * v for v in f.shape.vertices]
+                path = [(coords[0], coords[1]) for coords in path]
+                path = [pygame.math.Vector2(c).rotate_rad(angle) for c in path]
+                path = [
+                    (
+                        coords[0] * zoom + translation[0],
+                        coords[1] * zoom + translation[1],
+                    )
+                    for coords in path
+                ]
+                color = [int(c * 255) for c in obj.color]
+
+                pygame.draw.polygon(surface, color=color, points=path)
+
+                if "phase" not in obj.__dict__:
+                    continue
+                a1 = obj.phase
+                a2 = obj.phase + 1.2  # radians
+                s1 = math.sin(a1)
+                s2 = math.sin(a2)
+                c1 = math.cos(a1)
+                c2 = math.cos(a2)
+                if s1 > 0 and s2 > 0:
+                    continue
+                if s1 > 0:
+                    c1 = np.sign(c1)
+                if s2 > 0:
+                    c2 = np.sign(c2)
+                white_poly = [
+                    (-WHEEL_W * SIZE, +WHEEL_R * c1 * SIZE),
+                    (+WHEEL_W * SIZE, +WHEEL_R * c1 * SIZE),
+                    (+WHEEL_W * SIZE, +WHEEL_R * c2 * SIZE),
+                    (-WHEEL_W * SIZE, +WHEEL_R * c2 * SIZE),
+                ]
+                white_poly = [trans * v for v in white_poly]
+
+                white_poly = [(coords[0], coords[1]) for coords in white_poly]
+                white_poly = [
+                    pygame.math.Vector2(c).rotate_rad(angle) for c in white_poly
+                ]
+                white_poly = [
+                    (
+                        coords[0] * zoom + translation[0],
+                        coords[1] * zoom + translation[1],
+                    )
+                    for coords in white_poly
+                ]
+                pygame.draw.polygon(surface, color=WHEEL_WHITE, points=white_poly)
+
+    def _create_particle(self, point1, point2, grass):
+        class Particle:
+            pass
+
+        p = Particle()
+        p.color = WHEEL_COLOR if not grass else MUD_COLOR
+        p.ttl = 1
+        p.poly = [(point1[0], point1[1]), (point2[0], point2[1])]
+        p.grass = grass
+        self.particles.append(p)
+        while len(self.particles) > 30:
+            self.particles.pop(0)
+        return p
+
+    def destroy(self):
+        self.world.DestroyBody(self.hull)
+        self.hull = None
+        for w in self.wheels:
+            self.world.DestroyBody(w)
+        self.wheels = []
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/car_racing.py
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/car_racing.py
@ -0,0 +1,841 @@
+__credits__ = ["Andrea PIERRÉ"]
+
+import math
+from typing import Optional, Union
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import spaces
+from gymnasium.envs.box2d.car_dynamics import Car
+from gymnasium.error import DependencyNotInstalled, InvalidAction
+from gymnasium.utils import EzPickle
+
+
+try:
+    import Box2D
+    from Box2D.b2 import contactListener, fixtureDef, polygonShape
+except ImportError as e:
+    raise DependencyNotInstalled(
+        "Box2D is not installed, run `pip install gymnasium[box2d]`"
+    ) from e
+
+try:
+    # As pygame is necessary for using the environment (reset and step) even without a render mode
+    #   therefore, pygame is a necessary import for the environment.
+    import pygame
+    from pygame import gfxdraw
+except ImportError as e:
+    raise DependencyNotInstalled(
+        "pygame is not installed, run `pip install gymnasium[box2d]`"
+    ) from e
+
+
+STATE_W = 96  # less than Atari 160x192
+STATE_H = 96
+VIDEO_W = 600
+VIDEO_H = 400
+WINDOW_W = 1000
+WINDOW_H = 800
+
+SCALE = 6.0  # Track scale
+TRACK_RAD = 900 / SCALE  # Track is heavily morphed circle with this radius
+PLAYFIELD = 2000 / SCALE  # Game over boundary
+FPS = 50  # Frames per second
+ZOOM = 2.7  # Camera zoom
+ZOOM_FOLLOW = True  # Set to False for fixed view (don't use zoom)
+
+
+TRACK_DETAIL_STEP = 21 / SCALE
+TRACK_TURN_RATE = 0.31
+TRACK_WIDTH = 40 / SCALE
+BORDER = 8 / SCALE
+BORDER_MIN_COUNT = 4
+GRASS_DIM = PLAYFIELD / 20.0
+MAX_SHAPE_DIM = (
+    max(GRASS_DIM, TRACK_WIDTH, TRACK_DETAIL_STEP) * math.sqrt(2) * ZOOM * SCALE
+)
+
+
+class FrictionDetector(contactListener):
+    def __init__(self, env, lap_complete_percent):
+        contactListener.__init__(self)
+        self.env = env
+        self.lap_complete_percent = lap_complete_percent
+
+    def BeginContact(self, contact):
+        self._contact(contact, True)
+
+    def EndContact(self, contact):
+        self._contact(contact, False)
+
+    def _contact(self, contact, begin):
+        tile = None
+        obj = None
+        u1 = contact.fixtureA.body.userData
+        u2 = contact.fixtureB.body.userData
+        if u1 and "road_friction" in u1.__dict__:
+            tile = u1
+            obj = u2
+        if u2 and "road_friction" in u2.__dict__:
+            tile = u2
+            obj = u1
+        if not tile:
+            return
+
+        # inherit tile color from env
+        tile.color[:] = self.env.road_color
+        if not obj or "tiles" not in obj.__dict__:
+            return
+        if begin:
+            obj.tiles.add(tile)
+            if not tile.road_visited:
+                tile.road_visited = True
+                self.env.reward += 1000.0 / len(self.env.track)
+                self.env.tile_visited_count += 1
+
+                # Lap is considered completed if enough % of the track was covered
+                if (
+                    tile.idx == 0
+                    and self.env.tile_visited_count / len(self.env.track)
+                    > self.lap_complete_percent
+                ):
+                    self.env.new_lap = True
+        else:
+            obj.tiles.remove(tile)
+
+
+class CarRacing(gym.Env, EzPickle):
+    """
+    ## Description
+    The easiest control task to learn from pixels - a top-down
+    racing environment. The generated track is random every episode.
+
+    Some indicators are shown at the bottom of the window along with the
+    state RGB buffer. From left to right: true speed, four ABS sensors,
+    steering wheel position, and gyroscope.
+    To play yourself (it's rather fast for humans), type:
+    ```
+    python gymnasium/envs/box2d/car_racing.py
+    ```
+    Remember: it's a powerful rear-wheel drive car - don't press the accelerator
+    and turn at the same time.
+
+    ## Action Space
+    If continuous there are 3 actions :
+    - 0: steering, -1 is full left, +1 is full right
+    - 1: gas
+    - 2: breaking
+
+    If discrete there are 5 actions:
+    - 0: do nothing
+    - 1: steer left
+    - 2: steer right
+    - 3: gas
+    - 4: brake
+
+    ## Observation Space
+
+    A top-down 96x96 RGB image of the car and race track.
+
+    ## Rewards
+    The reward is -0.1 every frame and +1000/N for every track tile visited,
+    where N is the total number of tiles visited in the track. For example,
+    if you have finished in 732 frames, your reward is
+    1000 - 0.1*732 = 926.8 points.
+
+    ## Starting State
+    The car starts at rest in the center of the road.
+
+    ## Episode Termination
+    The episode finishes when all the tiles are visited. The car can also go
+    outside the playfield - that is, far off the track, in which case it will
+    receive -100 reward and die.
+
+    ## Arguments
+    `lap_complete_percent` dictates the percentage of tiles that must be visited by
+    the agent before a lap is considered complete.
+
+    Passing `domain_randomize=True` enables the domain randomized variant of the environment.
+    In this scenario, the background and track colours are different on every reset.
+
+    Passing `continuous=False` converts the environment to use discrete action space.
+    The discrete action space has 5 actions: [do nothing, left, right, gas, brake].
+
+    ## Reset Arguments
+    Passing the option `options["randomize"] = True` will change the current colour of the environment on demand.
+    Correspondingly, passing the option `options["randomize"] = False` will not change the current colour of the environment.
+    `domain_randomize` must be `True` on init for this argument to work.
+    Example usage:
+    ```python
+    import gymnasium as gym
+    env = gym.make("CarRacing-v1", domain_randomize=True)
+
+    # normal reset, this changes the colour scheme by default
+    env.reset()
+
+    # reset with colour scheme change
+    env.reset(options={"randomize": True})
+
+    # reset with no colour scheme change
+    env.reset(options={"randomize": False})
+    ```
+
+    ## Version History
+    - v1: Change track completion logic and add domain randomization (0.24.0)
+    - v0: Original version
+
+    ## References
+    - Chris Campbell (2014), http://www.iforce2d.net/b2dtut/top-down-car.
+
+    ## Credits
+    Created by Oleg Klimov
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "state_pixels",
+        ],
+        "render_fps": FPS,
+    }
+
+    def __init__(
+        self,
+        render_mode: Optional[str] = None,
+        verbose: bool = False,
+        lap_complete_percent: float = 0.95,
+        domain_randomize: bool = False,
+        continuous: bool = True,
+    ):
+        EzPickle.__init__(
+            self,
+            render_mode,
+            verbose,
+            lap_complete_percent,
+            domain_randomize,
+            continuous,
+        )
+        self.continuous = continuous
+        self.domain_randomize = domain_randomize
+        self.lap_complete_percent = lap_complete_percent
+        self._init_colors()
+
+        self.contactListener_keepref = FrictionDetector(self, self.lap_complete_percent)
+        self.world = Box2D.b2World((0, 0), contactListener=self.contactListener_keepref)
+        self.screen: Optional[pygame.Surface] = None
+        self.surf = None
+        self.clock = None
+        self.isopen = True
+        self.invisible_state_window = None
+        self.invisible_video_window = None
+        self.road = None
+        self.car: Optional[Car] = None
+        self.reward = 0.0
+        self.prev_reward = 0.0
+        self.verbose = verbose
+        self.new_lap = False
+        self.fd_tile = fixtureDef(
+            shape=polygonShape(vertices=[(0, 0), (1, 0), (1, -1), (0, -1)])
+        )
+
+        # This will throw a warning in tests/envs/test_envs in utils/env_checker.py as the space is not symmetric
+        #   or normalised however this is not possible here so ignore
+        if self.continuous:
+            self.action_space = spaces.Box(
+                np.array([-1, 0, 0]).astype(np.float32),
+                np.array([+1, +1, +1]).astype(np.float32),
+            )  # steer, gas, brake
+        else:
+            self.action_space = spaces.Discrete(5)
+            # do nothing, left, right, gas, brake
+
+        self.observation_space = spaces.Box(
+            low=0, high=255, shape=(STATE_H, STATE_W, 3), dtype=np.uint8
+        )
+
+        self.render_mode = render_mode
+
+    def _destroy(self):
+        if not self.road:
+            return
+        for t in self.road:
+            self.world.DestroyBody(t)
+        self.road = []
+        assert self.car is not None
+        self.car.destroy()
+
+    def _init_colors(self):
+        if self.domain_randomize:
+            # domain randomize the bg and grass colour
+            self.road_color = self.np_random.uniform(0, 210, size=3)
+
+            self.bg_color = self.np_random.uniform(0, 210, size=3)
+
+            self.grass_color = np.copy(self.bg_color)
+            idx = self.np_random.integers(3)
+            self.grass_color[idx] += 20
+        else:
+            # default colours
+            self.road_color = np.array([102, 102, 102])
+            self.bg_color = np.array([102, 204, 102])
+            self.grass_color = np.array([102, 230, 102])
+
+    def _reinit_colors(self, randomize):
+        assert (
+            self.domain_randomize
+        ), "domain_randomize must be True to use this function."
+
+        if randomize:
+            # domain randomize the bg and grass colour
+            self.road_color = self.np_random.uniform(0, 210, size=3)
+
+            self.bg_color = self.np_random.uniform(0, 210, size=3)
+
+            self.grass_color = np.copy(self.bg_color)
+            idx = self.np_random.integers(3)
+            self.grass_color[idx] += 20
+
+    def _create_track(self):
+        CHECKPOINTS = 12
+
+        # Create checkpoints
+        checkpoints = []
+        for c in range(CHECKPOINTS):
+            noise = self.np_random.uniform(0, 2 * math.pi * 1 / CHECKPOINTS)
+            alpha = 2 * math.pi * c / CHECKPOINTS + noise
+            rad = self.np_random.uniform(TRACK_RAD / 3, TRACK_RAD)
+
+            if c == 0:
+                alpha = 0
+                rad = 1.5 * TRACK_RAD
+            if c == CHECKPOINTS - 1:
+                alpha = 2 * math.pi * c / CHECKPOINTS
+                self.start_alpha = 2 * math.pi * (-0.5) / CHECKPOINTS
+                rad = 1.5 * TRACK_RAD
+
+            checkpoints.append((alpha, rad * math.cos(alpha), rad * math.sin(alpha)))
+        self.road = []
+
+        # Go from one checkpoint to another to create track
+        x, y, beta = 1.5 * TRACK_RAD, 0, 0
+        dest_i = 0
+        laps = 0
+        track = []
+        no_freeze = 2500
+        visited_other_side = False
+        while True:
+            alpha = math.atan2(y, x)
+            if visited_other_side and alpha > 0:
+                laps += 1
+                visited_other_side = False
+            if alpha < 0:
+                visited_other_side = True
+                alpha += 2 * math.pi
+
+            while True:  # Find destination from checkpoints
+                failed = True
+
+                while True:
+                    dest_alpha, dest_x, dest_y = checkpoints[dest_i % len(checkpoints)]
+                    if alpha <= dest_alpha:
+                        failed = False
+                        break
+                    dest_i += 1
+                    if dest_i % len(checkpoints) == 0:
+                        break
+
+                if not failed:
+                    break
+
+                alpha -= 2 * math.pi
+                continue
+
+            r1x = math.cos(beta)
+            r1y = math.sin(beta)
+            p1x = -r1y
+            p1y = r1x
+            dest_dx = dest_x - x  # vector towards destination
+            dest_dy = dest_y - y
+            # destination vector projected on rad:
+            proj = r1x * dest_dx + r1y * dest_dy
+            while beta - alpha > 1.5 * math.pi:
+                beta -= 2 * math.pi
+            while beta - alpha < -1.5 * math.pi:
+                beta += 2 * math.pi
+            prev_beta = beta
+            proj *= SCALE
+            if proj > 0.3:
+                beta -= min(TRACK_TURN_RATE, abs(0.001 * proj))
+            if proj < -0.3:
+                beta += min(TRACK_TURN_RATE, abs(0.001 * proj))
+            x += p1x * TRACK_DETAIL_STEP
+            y += p1y * TRACK_DETAIL_STEP
+            track.append((alpha, prev_beta * 0.5 + beta * 0.5, x, y))
+            if laps > 4:
+                break
+            no_freeze -= 1
+            if no_freeze == 0:
+                break
+
+        # Find closed loop range i1..i2, first loop should be ignored, second is OK
+        i1, i2 = -1, -1
+        i = len(track)
+        while True:
+            i -= 1
+            if i == 0:
+                return False  # Failed
+            pass_through_start = (
+                track[i][0] > self.start_alpha and track[i - 1][0] <= self.start_alpha
+            )
+            if pass_through_start and i2 == -1:
+                i2 = i
+            elif pass_through_start and i1 == -1:
+                i1 = i
+                break
+        if self.verbose:
+            print("Track generation: %i..%i -> %i-tiles track" % (i1, i2, i2 - i1))
+        assert i1 != -1
+        assert i2 != -1
+
+        track = track[i1 : i2 - 1]
+
+        first_beta = track[0][1]
+        first_perp_x = math.cos(first_beta)
+        first_perp_y = math.sin(first_beta)
+        # Length of perpendicular jump to put together head and tail
+        well_glued_together = np.sqrt(
+            np.square(first_perp_x * (track[0][2] - track[-1][2]))
+            + np.square(first_perp_y * (track[0][3] - track[-1][3]))
+        )
+        if well_glued_together > TRACK_DETAIL_STEP:
+            return False
+
+        # Red-white border on hard turns
+        border = [False] * len(track)
+        for i in range(len(track)):
+            good = True
+            oneside = 0
+            for neg in range(BORDER_MIN_COUNT):
+                beta1 = track[i - neg - 0][1]
+                beta2 = track[i - neg - 1][1]
+                good &= abs(beta1 - beta2) > TRACK_TURN_RATE * 0.2
+                oneside += np.sign(beta1 - beta2)
+            good &= abs(oneside) == BORDER_MIN_COUNT
+            border[i] = good
+        for i in range(len(track)):
+            for neg in range(BORDER_MIN_COUNT):
+                border[i - neg] |= border[i]
+
+        # Create tiles
+        for i in range(len(track)):
+            alpha1, beta1, x1, y1 = track[i]
+            alpha2, beta2, x2, y2 = track[i - 1]
+            road1_l = (
+                x1 - TRACK_WIDTH * math.cos(beta1),
+                y1 - TRACK_WIDTH * math.sin(beta1),
+            )
+            road1_r = (
+                x1 + TRACK_WIDTH * math.cos(beta1),
+                y1 + TRACK_WIDTH * math.sin(beta1),
+            )
+            road2_l = (
+                x2 - TRACK_WIDTH * math.cos(beta2),
+                y2 - TRACK_WIDTH * math.sin(beta2),
+            )
+            road2_r = (
+                x2 + TRACK_WIDTH * math.cos(beta2),
+                y2 + TRACK_WIDTH * math.sin(beta2),
+            )
+            vertices = [road1_l, road1_r, road2_r, road2_l]
+            self.fd_tile.shape.vertices = vertices
+            t = self.world.CreateStaticBody(fixtures=self.fd_tile)
+            t.userData = t
+            c = 0.01 * (i % 3) * 255
+            t.color = self.road_color + c
+            t.road_visited = False
+            t.road_friction = 1.0
+            t.idx = i
+            t.fixtures[0].sensor = True
+            self.road_poly.append(([road1_l, road1_r, road2_r, road2_l], t.color))
+            self.road.append(t)
+            if border[i]:
+                side = np.sign(beta2 - beta1)
+                b1_l = (
+                    x1 + side * TRACK_WIDTH * math.cos(beta1),
+                    y1 + side * TRACK_WIDTH * math.sin(beta1),
+                )
+                b1_r = (
+                    x1 + side * (TRACK_WIDTH + BORDER) * math.cos(beta1),
+                    y1 + side * (TRACK_WIDTH + BORDER) * math.sin(beta1),
+                )
+                b2_l = (
+                    x2 + side * TRACK_WIDTH * math.cos(beta2),
+                    y2 + side * TRACK_WIDTH * math.sin(beta2),
+                )
+                b2_r = (
+                    x2 + side * (TRACK_WIDTH + BORDER) * math.cos(beta2),
+                    y2 + side * (TRACK_WIDTH + BORDER) * math.sin(beta2),
+                )
+                self.road_poly.append(
+                    (
+                        [b1_l, b1_r, b2_r, b2_l],
+                        (255, 255, 255) if i % 2 == 0 else (255, 0, 0),
+                    )
+                )
+        self.track = track
+        return True
+
+    def reset(
+        self,
+        *,
+        seed: Optional[int] = None,
+        options: Optional[dict] = None,
+    ):
+        super().reset(seed=seed)
+        self._destroy()
+        self.world.contactListener_bug_workaround = FrictionDetector(
+            self, self.lap_complete_percent
+        )
+        self.world.contactListener = self.world.contactListener_bug_workaround
+        self.reward = 0.0
+        self.prev_reward = 0.0
+        self.tile_visited_count = 0
+        self.t = 0.0
+        self.new_lap = False
+        self.road_poly = []
+
+        if self.domain_randomize:
+            randomize = True
+            if isinstance(options, dict):
+                if "randomize" in options:
+                    randomize = options["randomize"]
+
+            self._reinit_colors(randomize)
+
+        while True:
+            success = self._create_track()
+            if success:
+                break
+            if self.verbose:
+                print(
+                    "retry to generate track (normal if there are not many"
+                    "instances of this message)"
+                )
+        self.car = Car(self.world, *self.track[0][1:4])
+
+        if self.render_mode == "human":
+            self.render()
+        return self.step(None)[0], {}
+
+    def step(self, action: Union[np.ndarray, int]):
+        assert self.car is not None
+        if action is not None:
+            if self.continuous:
+                self.car.steer(-action[0])
+                self.car.gas(action[1])
+                self.car.brake(action[2])
+            else:
+                if not self.action_space.contains(action):
+                    raise InvalidAction(
+                        f"you passed the invalid action `{action}`. "
+                        f"The supported action_space is `{self.action_space}`"
+                    )
+                self.car.steer(-0.6 * (action == 1) + 0.6 * (action == 2))
+                self.car.gas(0.2 * (action == 3))
+                self.car.brake(0.8 * (action == 4))
+
+        self.car.step(1.0 / FPS)
+        self.world.Step(1.0 / FPS, 6 * 30, 2 * 30)
+        self.t += 1.0 / FPS
+
+        self.state = self._render("state_pixels")
+
+        step_reward = 0
+        terminated = False
+        truncated = False
+        if action is not None:  # First step without action, called from reset()
+            self.reward -= 0.1
+            # We actually don't want to count fuel spent, we want car to be faster.
+            # self.reward -=  10 * self.car.fuel_spent / ENGINE_POWER
+            self.car.fuel_spent = 0.0
+            step_reward = self.reward - self.prev_reward
+            self.prev_reward = self.reward
+            if self.tile_visited_count == len(self.track) or self.new_lap:
+                # Truncation due to finishing lap
+                # This should not be treated as a failure
+                # but like a timeout
+                truncated = True
+            x, y = self.car.hull.position
+            if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD:
+                terminated = True
+                step_reward = -100
+
+        if self.render_mode == "human":
+            self.render()
+        return self.state, step_reward, terminated, truncated, {}
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+        else:
+            return self._render(self.render_mode)
+
+    def _render(self, mode: str):
+        assert mode in self.metadata["render_modes"]
+
+        pygame.font.init()
+        if self.screen is None and mode == "human":
+            pygame.init()
+            pygame.display.init()
+            self.screen = pygame.display.set_mode((WINDOW_W, WINDOW_H))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        if "t" not in self.__dict__:
+            return  # reset() not called yet
+
+        self.surf = pygame.Surface((WINDOW_W, WINDOW_H))
+
+        assert self.car is not None
+        # computing transformations
+        angle = -self.car.hull.angle
+        # Animating first second zoom.
+        zoom = 0.1 * SCALE * max(1 - self.t, 0) + ZOOM * SCALE * min(self.t, 1)
+        scroll_x = -(self.car.hull.position[0]) * zoom
+        scroll_y = -(self.car.hull.position[1]) * zoom
+        trans = pygame.math.Vector2((scroll_x, scroll_y)).rotate_rad(angle)
+        trans = (WINDOW_W / 2 + trans[0], WINDOW_H / 4 + trans[1])
+
+        self._render_road(zoom, trans, angle)
+        self.car.draw(
+            self.surf,
+            zoom,
+            trans,
+            angle,
+            mode not in ["state_pixels_list", "state_pixels"],
+        )
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+
+        # showing stats
+        self._render_indicators(WINDOW_W, WINDOW_H)
+
+        font = pygame.font.Font(pygame.font.get_default_font(), 42)
+        text = font.render("%04i" % self.reward, True, (255, 255, 255), (0, 0, 0))
+        text_rect = text.get_rect()
+        text_rect.center = (60, WINDOW_H - WINDOW_H * 2.5 / 40.0)
+        self.surf.blit(text, text_rect)
+
+        if mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            assert self.screen is not None
+            self.screen.fill(0)
+            self.screen.blit(self.surf, (0, 0))
+            pygame.display.flip()
+        elif mode == "rgb_array":
+            return self._create_image_array(self.surf, (VIDEO_W, VIDEO_H))
+        elif mode == "state_pixels":
+            return self._create_image_array(self.surf, (STATE_W, STATE_H))
+        else:
+            return self.isopen
+
+    def _render_road(self, zoom, translation, angle):
+        bounds = PLAYFIELD
+        field = [
+            (bounds, bounds),
+            (bounds, -bounds),
+            (-bounds, -bounds),
+            (-bounds, bounds),
+        ]
+
+        # draw background
+        self._draw_colored_polygon(
+            self.surf, field, self.bg_color, zoom, translation, angle, clip=False
+        )
+
+        # draw grass patches
+        grass = []
+        for x in range(-20, 20, 2):
+            for y in range(-20, 20, 2):
+                grass.append(
+                    [
+                        (GRASS_DIM * x + GRASS_DIM, GRASS_DIM * y + 0),
+                        (GRASS_DIM * x + 0, GRASS_DIM * y + 0),
+                        (GRASS_DIM * x + 0, GRASS_DIM * y + GRASS_DIM),
+                        (GRASS_DIM * x + GRASS_DIM, GRASS_DIM * y + GRASS_DIM),
+                    ]
+                )
+        for poly in grass:
+            self._draw_colored_polygon(
+                self.surf, poly, self.grass_color, zoom, translation, angle
+            )
+
+        # draw road
+        for poly, color in self.road_poly:
+            # converting to pixel coordinates
+            poly = [(p[0], p[1]) for p in poly]
+            color = [int(c) for c in color]
+            self._draw_colored_polygon(self.surf, poly, color, zoom, translation, angle)
+
+    def _render_indicators(self, W, H):
+        s = W / 40.0
+        h = H / 40.0
+        color = (0, 0, 0)
+        polygon = [(W, H), (W, H - 5 * h), (0, H - 5 * h), (0, H)]
+        pygame.draw.polygon(self.surf, color=color, points=polygon)
+
+        def vertical_ind(place, val):
+            return [
+                (place * s, H - (h + h * val)),
+                ((place + 1) * s, H - (h + h * val)),
+                ((place + 1) * s, H - h),
+                ((place + 0) * s, H - h),
+            ]
+
+        def horiz_ind(place, val):
+            return [
+                ((place + 0) * s, H - 4 * h),
+                ((place + val) * s, H - 4 * h),
+                ((place + val) * s, H - 2 * h),
+                ((place + 0) * s, H - 2 * h),
+            ]
+
+        assert self.car is not None
+        true_speed = np.sqrt(
+            np.square(self.car.hull.linearVelocity[0])
+            + np.square(self.car.hull.linearVelocity[1])
+        )
+
+        # simple wrapper to render if the indicator value is above a threshold
+        def render_if_min(value, points, color):
+            if abs(value) > 1e-4:
+                pygame.draw.polygon(self.surf, points=points, color=color)
+
+        render_if_min(true_speed, vertical_ind(5, 0.02 * true_speed), (255, 255, 255))
+        # ABS sensors
+        render_if_min(
+            self.car.wheels[0].omega,
+            vertical_ind(7, 0.01 * self.car.wheels[0].omega),
+            (0, 0, 255),
+        )
+        render_if_min(
+            self.car.wheels[1].omega,
+            vertical_ind(8, 0.01 * self.car.wheels[1].omega),
+            (0, 0, 255),
+        )
+        render_if_min(
+            self.car.wheels[2].omega,
+            vertical_ind(9, 0.01 * self.car.wheels[2].omega),
+            (51, 0, 255),
+        )
+        render_if_min(
+            self.car.wheels[3].omega,
+            vertical_ind(10, 0.01 * self.car.wheels[3].omega),
+            (51, 0, 255),
+        )
+
+        render_if_min(
+            self.car.wheels[0].joint.angle,
+            horiz_ind(20, -10.0 * self.car.wheels[0].joint.angle),
+            (0, 255, 0),
+        )
+        render_if_min(
+            self.car.hull.angularVelocity,
+            horiz_ind(30, -0.8 * self.car.hull.angularVelocity),
+            (255, 0, 0),
+        )
+
+    def _draw_colored_polygon(
+        self, surface, poly, color, zoom, translation, angle, clip=True
+    ):
+        poly = [pygame.math.Vector2(c).rotate_rad(angle) for c in poly]
+        poly = [
+            (c[0] * zoom + translation[0], c[1] * zoom + translation[1]) for c in poly
+        ]
+        # This checks if the polygon is out of bounds of the screen, and we skip drawing if so.
+        # Instead of calculating exactly if the polygon and screen overlap,
+        # we simply check if the polygon is in a larger bounding box whose dimension
+        # is greater than the screen by MAX_SHAPE_DIM, which is the maximum
+        # diagonal length of an environment object
+        if not clip or any(
+            (-MAX_SHAPE_DIM <= coord[0] <= WINDOW_W + MAX_SHAPE_DIM)
+            and (-MAX_SHAPE_DIM <= coord[1] <= WINDOW_H + MAX_SHAPE_DIM)
+            for coord in poly
+        ):
+            gfxdraw.aapolygon(self.surf, poly, color)
+            gfxdraw.filled_polygon(self.surf, poly, color)
+
+    def _create_image_array(self, screen, size):
+        scaled_screen = pygame.transform.smoothscale(screen, size)
+        return np.transpose(
+            np.array(pygame.surfarray.pixels3d(scaled_screen)), axes=(1, 0, 2)
+        )
+
+    def close(self):
+        if self.screen is not None:
+            pygame.display.quit()
+            self.isopen = False
+            pygame.quit()
+
+
+if __name__ == "__main__":
+    a = np.array([0.0, 0.0, 0.0])
+
+    def register_input():
+        global quit, restart
+        for event in pygame.event.get():
+            if event.type == pygame.KEYDOWN:
+                if event.key == pygame.K_LEFT:
+                    a[0] = -1.0
+                if event.key == pygame.K_RIGHT:
+                    a[0] = +1.0
+                if event.key == pygame.K_UP:
+                    a[1] = +1.0
+                if event.key == pygame.K_DOWN:
+                    a[2] = +0.8  # set 1.0 for wheels to block to zero rotation
+                if event.key == pygame.K_RETURN:
+                    restart = True
+                if event.key == pygame.K_ESCAPE:
+                    quit = True
+
+            if event.type == pygame.KEYUP:
+                if event.key == pygame.K_LEFT:
+                    a[0] = 0
+                if event.key == pygame.K_RIGHT:
+                    a[0] = 0
+                if event.key == pygame.K_UP:
+                    a[1] = 0
+                if event.key == pygame.K_DOWN:
+                    a[2] = 0
+
+            if event.type == pygame.QUIT:
+                quit = True
+
+    env = CarRacing(render_mode="human")
+
+    quit = False
+    while not quit:
+        env.reset()
+        total_reward = 0.0
+        steps = 0
+        restart = False
+        while True:
+            register_input()
+            s, r, terminated, truncated, info = env.step(a)
+            total_reward += r
+            if steps % 200 == 0 or terminated or truncated:
+                print("\naction " + str([f"{x:+0.2f}" for x in a]))
+                print(f"step {steps} total_reward {total_reward:+0.2f}")
+            steps += 1
+            if terminated or truncated or restart or quit:
+                break
+    env.close()
--- a/rl/Lib/site-packages/gymnasium/envs/box2d/lunar_lander.py
+++ b/rl/Lib/site-packages/gymnasium/envs/box2d/lunar_lander.py
@ -0,0 +1,894 @@
+__credits__ = ["Andrea PIERRÉ"]
+
+import math
+import warnings
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import error, spaces
+from gymnasium.error import DependencyNotInstalled
+from gymnasium.utils import EzPickle, colorize
+from gymnasium.utils.step_api_compatibility import step_api_compatibility
+
+
+try:
+    import Box2D
+    from Box2D.b2 import (
+        circleShape,
+        contactListener,
+        edgeShape,
+        fixtureDef,
+        polygonShape,
+        revoluteJointDef,
+    )
+except ImportError as e:
+    raise DependencyNotInstalled(
+        "Box2D is not installed, run `pip install gymnasium[box2d]`"
+    ) from e
+
+
+if TYPE_CHECKING:
+    import pygame
+
+
+FPS = 50
+SCALE = 30.0  # affects how fast-paced the game is, forces should be adjusted as well
+
+MAIN_ENGINE_POWER = 13.0
+SIDE_ENGINE_POWER = 0.6
+
+INITIAL_RANDOM = 1000.0  # Set 1500 to make game harder
+
+LANDER_POLY = [(-14, +17), (-17, 0), (-17, -10), (+17, -10), (+17, 0), (+14, +17)]
+LEG_AWAY = 20
+LEG_DOWN = 18
+LEG_W, LEG_H = 2, 8
+LEG_SPRING_TORQUE = 40
+
+SIDE_ENGINE_HEIGHT = 14
+SIDE_ENGINE_AWAY = 12
+MAIN_ENGINE_Y_LOCATION = (
+    4  # The Y location of the main engine on the body of the Lander.
+)
+
+VIEWPORT_W = 600
+VIEWPORT_H = 400
+
+
+class ContactDetector(contactListener):
+    def __init__(self, env):
+        contactListener.__init__(self)
+        self.env = env
+
+    def BeginContact(self, contact):
+        if (
+            self.env.lander == contact.fixtureA.body
+            or self.env.lander == contact.fixtureB.body
+        ):
+            self.env.game_over = True
+        for i in range(2):
+            if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]:
+                self.env.legs[i].ground_contact = True
+
+    def EndContact(self, contact):
+        for i in range(2):
+            if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]:
+                self.env.legs[i].ground_contact = False
+
+
+class LunarLander(gym.Env, EzPickle):
+    """
+    ## Description
+    This environment is a classic rocket trajectory optimization problem.
+    According to Pontryagin's maximum principle, it is optimal to fire the
+    engine at full throttle or turn it off. This is the reason why this
+    environment has discrete actions: engine on or off.
+
+    There are two environment versions: discrete or continuous.
+    The landing pad is always at coordinates (0,0). The coordinates are the
+    first two numbers in the state vector.
+    Landing outside of the landing pad is possible. Fuel is infinite, so an agent
+    can learn to fly and then land on its first attempt.
+
+    To see a heuristic landing, run:
+    ```
+    python gymnasium/envs/box2d/lunar_lander.py
+    ```
+    <!-- To play yourself, run: -->
+    <!-- python examples/agents/keyboard_agent.py LunarLander-v2 -->
+
+    ## Action Space
+    There are four discrete actions available:
+    - 0: do nothing
+    - 1: fire left orientation engine
+    - 2: fire main engine
+    - 3: fire right orientation engine
+
+    ## Observation Space
+    The state is an 8-dimensional vector: the coordinates of the lander in `x` & `y`, its linear
+    velocities in `x` & `y`, its angle, its angular velocity, and two booleans
+    that represent whether each leg is in contact with the ground or not.
+
+    ## Rewards
+    After every step a reward is granted. The total reward of an episode is the
+    sum of the rewards for all the steps within that episode.
+
+    For each step, the reward:
+    - is increased/decreased the closer/further the lander is to the landing pad.
+    - is increased/decreased the slower/faster the lander is moving.
+    - is decreased the more the lander is tilted (angle not horizontal).
+    - is increased by 10 points for each leg that is in contact with the ground.
+    - is decreased by 0.03 points each frame a side engine is firing.
+    - is decreased by 0.3 points each frame the main engine is firing.
+
+    The episode receive an additional reward of -100 or +100 points for crashing or landing safely respectively.
+
+    An episode is considered a solution if it scores at least 200 points.
+
+    ## Starting State
+    The lander starts at the top center of the viewport with a random initial
+    force applied to its center of mass.
+
+    ## Episode Termination
+    The episode finishes if:
+    1) the lander crashes (the lander body gets in contact with the moon);
+    2) the lander gets outside of the viewport (`x` coordinate is greater than 1);
+    3) the lander is not awake. From the [Box2D docs](https://box2d.org/documentation/md__d_1__git_hub_box2d_docs_dynamics.html#autotoc_md61),
+        a body which is not awake is a body which doesn't move and doesn't
+        collide with any other body:
+    > When Box2D determines that a body (or group of bodies) has come to rest,
+    > the body enters a sleep state which has very little CPU overhead. If a
+    > body is awake and collides with a sleeping body, then the sleeping body
+    > wakes up. Bodies will also wake up if a joint or contact attached to
+    > them is destroyed.
+
+    ## Arguments
+    To use to the _continuous_ environment, you need to specify the
+    `continuous=True` argument like below:
+    ```python
+    import gymnasium as gym
+    env = gym.make(
+        "LunarLander-v2",
+        continuous: bool = False,
+        gravity: float = -10.0,
+        enable_wind: bool = False,
+        wind_power: float = 15.0,
+        turbulence_power: float = 1.5,
+    )
+    ```
+    If `continuous=True` is passed, continuous actions (corresponding to the throttle of the engines) will be used and the
+    action space will be `Box(-1, +1, (2,), dtype=np.float32)`.
+    The first coordinate of an action determines the throttle of the main engine, while the second
+    coordinate specifies the throttle of the lateral boosters.
+    Given an action `np.array([main, lateral])`, the main engine will be turned off completely if
+    `main < 0` and the throttle scales affinely from 50% to 100% for `0 <= main <= 1` (in particular, the
+    main engine doesn't work  with less than 50% power).
+    Similarly, if `-0.5 < lateral < 0.5`, the lateral boosters will not fire at all. If `lateral < -0.5`, the left
+    booster will fire, and if `lateral > 0.5`, the right booster will fire. Again, the throttle scales affinely
+    from 50% to 100% between -1 and -0.5 (and 0.5 and 1, respectively).
+
+    `gravity` dictates the gravitational constant, this is bounded to be within 0 and -12.
+
+    If `enable_wind=True` is passed, there will be wind effects applied to the lander.
+    The wind is generated using the function `tanh(sin(2 k (t+C)) + sin(pi k (t+C)))`.
+    `k` is set to 0.01.
+    `C` is sampled randomly between -9999 and 9999.
+
+    `wind_power` dictates the maximum magnitude of linear wind applied to the craft. The recommended value for `wind_power` is between 0.0 and 20.0.
+    `turbulence_power` dictates the maximum magnitude of rotational wind applied to the craft. The recommended value for `turbulence_power` is between 0.0 and 2.0.
+
+    ## Version History
+    - v2: Count energy spent and in v0.24, added turbulence with wind power and turbulence_power parameters
+    - v1: Legs contact with ground added in state vector; contact with ground
+        give +10 reward points, and -10 if then lose contact; reward
+        renormalized to 200; harder initial random push.
+    - v0: Initial version
+
+
+    ## Notes
+
+    There are several unexpected bugs with the implementation of the environment.
+
+    1. The position of the side thursters on the body of the lander changes, depending on the orientation of the lander.
+    This in turn results in an orientation depentant torque being applied to the lander.
+
+    2. The units of the state are not consistent. I.e.
+    * The angular velocity is in units of 0.4 radians per second. In order to convert to radians per second, the value needs to be multiplied by a factor of 2.5.
+
+    For the default values of VIEWPORT_W, VIEWPORT_H, SCALE, and FPS, the scale factors equal:
+    'x': 10
+    'y': 6.666
+    'vx': 5
+    'vy': 7.5
+    'angle': 1
+    'angular velocity': 2.5
+
+    After the correction has been made, the units of the state are as follows:
+    'x': (units)
+    'y': (units)
+    'vx': (units/second)
+    'vy': (units/second)
+    'angle': (radians)
+    'angular velocity': (radians/second)
+
+
+    <!-- ## References -->
+
+    ## Credits
+    Created by Oleg Klimov
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": FPS,
+    }
+
+    def __init__(
+        self,
+        render_mode: Optional[str] = None,
+        continuous: bool = False,
+        gravity: float = -10.0,
+        enable_wind: bool = False,
+        wind_power: float = 15.0,
+        turbulence_power: float = 1.5,
+    ):
+        EzPickle.__init__(
+            self,
+            render_mode,
+            continuous,
+            gravity,
+            enable_wind,
+            wind_power,
+            turbulence_power,
+        )
+
+        assert (
+            -12.0 < gravity and gravity < 0.0
+        ), f"gravity (current value: {gravity}) must be between -12 and 0"
+        self.gravity = gravity
+
+        if 0.0 > wind_power or wind_power > 20.0:
+            warnings.warn(
+                colorize(
+                    f"WARN: wind_power value is recommended to be between 0.0 and 20.0, (current value: {wind_power})",
+                    "yellow",
+                ),
+            )
+        self.wind_power = wind_power
+
+        if 0.0 > turbulence_power or turbulence_power > 2.0:
+            warnings.warn(
+                colorize(
+                    f"WARN: turbulence_power value is recommended to be between 0.0 and 2.0, (current value: {turbulence_power})",
+                    "yellow",
+                ),
+            )
+        self.turbulence_power = turbulence_power
+
+        self.enable_wind = enable_wind
+        self.wind_idx = np.random.randint(-9999, 9999)
+        self.torque_idx = np.random.randint(-9999, 9999)
+
+        self.screen: pygame.Surface = None
+        self.clock = None
+        self.isopen = True
+        self.world = Box2D.b2World(gravity=(0, gravity))
+        self.moon = None
+        self.lander: Optional[Box2D.b2Body] = None
+        self.particles = []
+
+        self.prev_reward = None
+
+        self.continuous = continuous
+
+        low = np.array(
+            [
+                # these are bounds for position
+                # realistically the environment should have ended
+                # long before we reach more than 50% outside
+                -1.5,
+                -1.5,
+                # velocity bounds is 5x rated speed
+                -5.0,
+                -5.0,
+                -math.pi,
+                -5.0,
+                -0.0,
+                -0.0,
+            ]
+        ).astype(np.float32)
+        high = np.array(
+            [
+                # these are bounds for position
+                # realistically the environment should have ended
+                # long before we reach more than 50% outside
+                1.5,
+                1.5,
+                # velocity bounds is 5x rated speed
+                5.0,
+                5.0,
+                math.pi,
+                5.0,
+                1.0,
+                1.0,
+            ]
+        ).astype(np.float32)
+
+        # useful range is -1 .. +1, but spikes can be higher
+        self.observation_space = spaces.Box(low, high)
+
+        if self.continuous:
+            # Action is two floats [main engine, left-right engines].
+            # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
+            # Left-right:  -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
+            self.action_space = spaces.Box(-1, +1, (2,), dtype=np.float32)
+        else:
+            # Nop, fire left engine, main engine, right engine
+            self.action_space = spaces.Discrete(4)
+
+        self.render_mode = render_mode
+
+    def _destroy(self):
+        if not self.moon:
+            return
+        self.world.contactListener = None
+        self._clean_particles(True)
+        self.world.DestroyBody(self.moon)
+        self.moon = None
+        self.world.DestroyBody(self.lander)
+        self.lander = None
+        self.world.DestroyBody(self.legs[0])
+        self.world.DestroyBody(self.legs[1])
+
+    def reset(
+        self,
+        *,
+        seed: Optional[int] = None,
+        options: Optional[dict] = None,
+    ):
+        super().reset(seed=seed)
+        self._destroy()
+        self.world.contactListener_keepref = ContactDetector(self)
+        self.world.contactListener = self.world.contactListener_keepref
+        self.game_over = False
+        self.prev_shaping = None
+
+        W = VIEWPORT_W / SCALE
+        H = VIEWPORT_H / SCALE
+
+        # Create Terrain
+        CHUNKS = 11
+        height = self.np_random.uniform(0, H / 2, size=(CHUNKS + 1,))
+        chunk_x = [W / (CHUNKS - 1) * i for i in range(CHUNKS)]
+        self.helipad_x1 = chunk_x[CHUNKS // 2 - 1]
+        self.helipad_x2 = chunk_x[CHUNKS // 2 + 1]
+        self.helipad_y = H / 4
+        height[CHUNKS // 2 - 2] = self.helipad_y
+        height[CHUNKS // 2 - 1] = self.helipad_y
+        height[CHUNKS // 2 + 0] = self.helipad_y
+        height[CHUNKS // 2 + 1] = self.helipad_y
+        height[CHUNKS // 2 + 2] = self.helipad_y
+        smooth_y = [
+            0.33 * (height[i - 1] + height[i + 0] + height[i + 1])
+            for i in range(CHUNKS)
+        ]
+
+        self.moon = self.world.CreateStaticBody(
+            shapes=edgeShape(vertices=[(0, 0), (W, 0)])
+        )
+        self.sky_polys = []
+        for i in range(CHUNKS - 1):
+            p1 = (chunk_x[i], smooth_y[i])
+            p2 = (chunk_x[i + 1], smooth_y[i + 1])
+            self.moon.CreateEdgeFixture(vertices=[p1, p2], density=0, friction=0.1)
+            self.sky_polys.append([p1, p2, (p2[0], H), (p1[0], H)])
+
+        self.moon.color1 = (0.0, 0.0, 0.0)
+        self.moon.color2 = (0.0, 0.0, 0.0)
+
+        # Create Lander body
+        initial_y = VIEWPORT_H / SCALE
+        initial_x = VIEWPORT_W / SCALE / 2
+        self.lander: Box2D.b2Body = self.world.CreateDynamicBody(
+            position=(initial_x, initial_y),
+            angle=0.0,
+            fixtures=fixtureDef(
+                shape=polygonShape(
+                    vertices=[(x / SCALE, y / SCALE) for x, y in LANDER_POLY]
+                ),
+                density=5.0,
+                friction=0.1,
+                categoryBits=0x0010,
+                maskBits=0x001,  # collide only with ground
+                restitution=0.0,
+            ),  # 0.99 bouncy
+        )
+        self.lander.color1 = (128, 102, 230)
+        self.lander.color2 = (77, 77, 128)
+
+        # Apply the initial random impulse to the lander
+        self.lander.ApplyForceToCenter(
+            (
+                self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM),
+                self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM),
+            ),
+            True,
+        )
+
+        # Create Lander Legs
+        self.legs = []
+        for i in [-1, +1]:
+            leg = self.world.CreateDynamicBody(
+                position=(initial_x - i * LEG_AWAY / SCALE, initial_y),
+                angle=(i * 0.05),
+                fixtures=fixtureDef(
+                    shape=polygonShape(box=(LEG_W / SCALE, LEG_H / SCALE)),
+                    density=1.0,
+                    restitution=0.0,
+                    categoryBits=0x0020,
+                    maskBits=0x001,
+                ),
+            )
+            leg.ground_contact = False
+            leg.color1 = (128, 102, 230)
+            leg.color2 = (77, 77, 128)
+            rjd = revoluteJointDef(
+                bodyA=self.lander,
+                bodyB=leg,
+                localAnchorA=(0, 0),
+                localAnchorB=(i * LEG_AWAY / SCALE, LEG_DOWN / SCALE),
+                enableMotor=True,
+                enableLimit=True,
+                maxMotorTorque=LEG_SPRING_TORQUE,
+                motorSpeed=+0.3 * i,  # low enough not to jump back into the sky
+            )
+            if i == -1:
+                rjd.lowerAngle = (
+                    +0.9 - 0.5
+                )  # The most esoteric numbers here, angled legs have freedom to travel within
+                rjd.upperAngle = +0.9
+            else:
+                rjd.lowerAngle = -0.9
+                rjd.upperAngle = -0.9 + 0.5
+            leg.joint = self.world.CreateJoint(rjd)
+            self.legs.append(leg)
+
+        self.drawlist = [self.lander] + self.legs
+
+        if self.render_mode == "human":
+            self.render()
+        return self.step(np.array([0, 0]) if self.continuous else 0)[0], {}
+
+    def _create_particle(self, mass, x, y, ttl):
+        p = self.world.CreateDynamicBody(
+            position=(x, y),
+            angle=0.0,
+            fixtures=fixtureDef(
+                shape=circleShape(radius=2 / SCALE, pos=(0, 0)),
+                density=mass,
+                friction=0.1,
+                categoryBits=0x0100,
+                maskBits=0x001,  # collide only with ground
+                restitution=0.3,
+            ),
+        )
+        p.ttl = ttl
+        self.particles.append(p)
+        self._clean_particles(False)
+        return p
+
+    def _clean_particles(self, all):
+        while self.particles and (all or self.particles[0].ttl < 0):
+            self.world.DestroyBody(self.particles.pop(0))
+
+    def step(self, action):
+        assert self.lander is not None
+
+        # Update wind and apply to the lander
+        assert self.lander is not None, "You forgot to call reset()"
+        if self.enable_wind and not (
+            self.legs[0].ground_contact or self.legs[1].ground_contact
+        ):
+            # the function used for wind is tanh(sin(2 k x) + sin(pi k x)),
+            # which is proven to never be periodic, k = 0.01
+            wind_mag = (
+                math.tanh(
+                    math.sin(0.02 * self.wind_idx)
+                    + (math.sin(math.pi * 0.01 * self.wind_idx))
+                )
+                * self.wind_power
+            )
+            self.wind_idx += 1
+            self.lander.ApplyForceToCenter(
+                (wind_mag, 0.0),
+                True,
+            )
+
+            # the function used for torque is tanh(sin(2 k x) + sin(pi k x)),
+            # which is proven to never be periodic, k = 0.01
+            torque_mag = math.tanh(
+                math.sin(0.02 * self.torque_idx)
+                + (math.sin(math.pi * 0.01 * self.torque_idx))
+            ) * (self.turbulence_power)
+            self.torque_idx += 1
+            self.lander.ApplyTorque(
+                (torque_mag),
+                True,
+            )
+
+        if self.continuous:
+            action = np.clip(action, -1, +1).astype(np.float32)
+        else:
+            assert self.action_space.contains(
+                action
+            ), f"{action!r} ({type(action)}) invalid "
+
+        # Apply Engine Impulses
+
+        # Tip is a the (X and Y) components of the rotation of the lander.
+        tip = (math.sin(self.lander.angle), math.cos(self.lander.angle))
+
+        # Side is the (-Y and X) components of the rotation of the lander.
+        side = (-tip[1], tip[0])
+
+        # Generate two random numbers between -1/SCALE and 1/SCALE.
+        dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)]
+
+        m_power = 0.0
+        if (self.continuous and action[0] > 0.0) or (
+            not self.continuous and action == 2
+        ):
+            # Main engine
+            if self.continuous:
+                m_power = (np.clip(action[0], 0.0, 1.0) + 1.0) * 0.5  # 0.5..1.0
+                assert m_power >= 0.5 and m_power <= 1.0
+            else:
+                m_power = 1.0
+
+            # 4 is move a bit downwards, +-2 for randomness
+            # The components of the impulse to be applied by the main engine.
+            ox = (
+                tip[0] * (MAIN_ENGINE_Y_LOCATION / SCALE + 2 * dispersion[0])
+                + side[0] * dispersion[1]
+            )
+            oy = (
+                -tip[1] * (MAIN_ENGINE_Y_LOCATION / SCALE + 2 * dispersion[0])
+                - side[1] * dispersion[1]
+            )
+
+            impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy)
+            if self.render_mode is not None:
+                # particles are just a decoration, with no impact on the physics, so don't add them when not rendering
+                p = self._create_particle(
+                    3.5,  # 3.5 is here to make particle speed adequate
+                    impulse_pos[0],
+                    impulse_pos[1],
+                    m_power,
+                )
+                p.ApplyLinearImpulse(
+                    (
+                        ox * MAIN_ENGINE_POWER * m_power,
+                        oy * MAIN_ENGINE_POWER * m_power,
+                    ),
+                    impulse_pos,
+                    True,
+                )
+            self.lander.ApplyLinearImpulse(
+                (-ox * MAIN_ENGINE_POWER * m_power, -oy * MAIN_ENGINE_POWER * m_power),
+                impulse_pos,
+                True,
+            )
+
+        s_power = 0.0
+        if (self.continuous and np.abs(action[1]) > 0.5) or (
+            not self.continuous and action in [1, 3]
+        ):
+            # Orientation/Side engines
+            if self.continuous:
+                direction = np.sign(action[1])
+                s_power = np.clip(np.abs(action[1]), 0.5, 1.0)
+                assert s_power >= 0.5 and s_power <= 1.0
+            else:
+                # action = 1 is left, action = 3 is right
+                direction = action - 2
+                s_power = 1.0
+
+            # The components of the impulse to be applied by the side engines.
+            ox = tip[0] * dispersion[0] + side[0] * (
+                3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE
+            )
+            oy = -tip[1] * dispersion[0] - side[1] * (
+                3 * dispersion[1] + direction * SIDE_ENGINE_AWAY / SCALE
+            )
+
+            # The constant 17 is a constant, that is presumably meant to be SIDE_ENGINE_HEIGHT.
+            # However, SIDE_ENGINE_HEIGHT is defined as 14
+            # This casuses the position of the thurst on the body of the lander to change, depending on the orientation of the lander.
+            # This in turn results in an orientation depentant torque being applied to the lander.
+            impulse_pos = (
+                self.lander.position[0] + ox - tip[0] * 17 / SCALE,
+                self.lander.position[1] + oy + tip[1] * SIDE_ENGINE_HEIGHT / SCALE,
+            )
+            if self.render_mode is not None:
+                # particles are just a decoration, with no impact on the physics, so don't add them when not rendering
+                p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power)
+                p.ApplyLinearImpulse(
+                    (
+                        ox * SIDE_ENGINE_POWER * s_power,
+                        oy * SIDE_ENGINE_POWER * s_power,
+                    ),
+                    impulse_pos,
+                    True,
+                )
+            self.lander.ApplyLinearImpulse(
+                (-ox * SIDE_ENGINE_POWER * s_power, -oy * SIDE_ENGINE_POWER * s_power),
+                impulse_pos,
+                True,
+            )
+
+        self.world.Step(1.0 / FPS, 6 * 30, 2 * 30)
+
+        pos = self.lander.position
+        vel = self.lander.linearVelocity
+
+        state = [
+            (pos.x - VIEWPORT_W / SCALE / 2) / (VIEWPORT_W / SCALE / 2),
+            (pos.y - (self.helipad_y + LEG_DOWN / SCALE)) / (VIEWPORT_H / SCALE / 2),
+            vel.x * (VIEWPORT_W / SCALE / 2) / FPS,
+            vel.y * (VIEWPORT_H / SCALE / 2) / FPS,
+            self.lander.angle,
+            20.0 * self.lander.angularVelocity / FPS,
+            1.0 if self.legs[0].ground_contact else 0.0,
+            1.0 if self.legs[1].ground_contact else 0.0,
+        ]
+        assert len(state) == 8
+
+        reward = 0
+        shaping = (
+            -100 * np.sqrt(state[0] * state[0] + state[1] * state[1])
+            - 100 * np.sqrt(state[2] * state[2] + state[3] * state[3])
+            - 100 * abs(state[4])
+            + 10 * state[6]
+            + 10 * state[7]
+        )  # And ten points for legs contact, the idea is if you
+        # lose contact again after landing, you get negative reward
+        if self.prev_shaping is not None:
+            reward = shaping - self.prev_shaping
+        self.prev_shaping = shaping
+
+        reward -= (
+            m_power * 0.30
+        )  # less fuel spent is better, about -30 for heuristic landing
+        reward -= s_power * 0.03
+
+        terminated = False
+        if self.game_over or abs(state[0]) >= 1.0:
+            terminated = True
+            reward = -100
+        if not self.lander.awake:
+            terminated = True
+            reward = +100
+
+        if self.render_mode == "human":
+            self.render()
+        return np.array(state, dtype=np.float32), reward, terminated, False, {}
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError as e:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[box2d]`"
+            ) from e
+
+        if self.screen is None and self.render_mode == "human":
+            pygame.init()
+            pygame.display.init()
+            self.screen = pygame.display.set_mode((VIEWPORT_W, VIEWPORT_H))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        self.surf = pygame.Surface((VIEWPORT_W, VIEWPORT_H))
+
+        pygame.transform.scale(self.surf, (SCALE, SCALE))
+        pygame.draw.rect(self.surf, (255, 255, 255), self.surf.get_rect())
+
+        for obj in self.particles:
+            obj.ttl -= 0.15
+            obj.color1 = (
+                int(max(0.2, 0.15 + obj.ttl) * 255),
+                int(max(0.2, 0.5 * obj.ttl) * 255),
+                int(max(0.2, 0.5 * obj.ttl) * 255),
+            )
+            obj.color2 = (
+                int(max(0.2, 0.15 + obj.ttl) * 255),
+                int(max(0.2, 0.5 * obj.ttl) * 255),
+                int(max(0.2, 0.5 * obj.ttl) * 255),
+            )
+
+        self._clean_particles(False)
+
+        for p in self.sky_polys:
+            scaled_poly = []
+            for coord in p:
+                scaled_poly.append((coord[0] * SCALE, coord[1] * SCALE))
+            pygame.draw.polygon(self.surf, (0, 0, 0), scaled_poly)
+            gfxdraw.aapolygon(self.surf, scaled_poly, (0, 0, 0))
+
+        for obj in self.particles + self.drawlist:
+            for f in obj.fixtures:
+                trans = f.body.transform
+                if type(f.shape) is circleShape:
+                    pygame.draw.circle(
+                        self.surf,
+                        color=obj.color1,
+                        center=trans * f.shape.pos * SCALE,
+                        radius=f.shape.radius * SCALE,
+                    )
+                    pygame.draw.circle(
+                        self.surf,
+                        color=obj.color2,
+                        center=trans * f.shape.pos * SCALE,
+                        radius=f.shape.radius * SCALE,
+                    )
+
+                else:
+                    path = [trans * v * SCALE for v in f.shape.vertices]
+                    pygame.draw.polygon(self.surf, color=obj.color1, points=path)
+                    gfxdraw.aapolygon(self.surf, path, obj.color1)
+                    pygame.draw.aalines(
+                        self.surf, color=obj.color2, points=path, closed=True
+                    )
+
+                for x in [self.helipad_x1, self.helipad_x2]:
+                    x = x * SCALE
+                    flagy1 = self.helipad_y * SCALE
+                    flagy2 = flagy1 + 50
+                    pygame.draw.line(
+                        self.surf,
+                        color=(255, 255, 255),
+                        start_pos=(x, flagy1),
+                        end_pos=(x, flagy2),
+                        width=1,
+                    )
+                    pygame.draw.polygon(
+                        self.surf,
+                        color=(204, 204, 0),
+                        points=[
+                            (x, flagy2),
+                            (x, flagy2 - 10),
+                            (x + 25, flagy2 - 5),
+                        ],
+                    )
+                    gfxdraw.aapolygon(
+                        self.surf,
+                        [(x, flagy2), (x, flagy2 - 10), (x + 25, flagy2 - 5)],
+                        (204, 204, 0),
+                    )
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+
+        if self.render_mode == "human":
+            assert self.screen is not None
+            self.screen.blit(self.surf, (0, 0))
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.surf)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
+
+
+def heuristic(env, s):
+    """
+    The heuristic for
+    1. Testing
+    2. Demonstration rollout.
+
+    Args:
+        env: The environment
+        s (list): The state. Attributes:
+            s[0] is the horizontal coordinate
+            s[1] is the vertical coordinate
+            s[2] is the horizontal speed
+            s[3] is the vertical speed
+            s[4] is the angle
+            s[5] is the angular speed
+            s[6] 1 if first leg has contact, else 0
+            s[7] 1 if second leg has contact, else 0
+
+    Returns:
+         a: The heuristic to be fed into the step function defined above to determine the next step and reward.
+    """
+
+    angle_targ = s[0] * 0.5 + s[2] * 1.0  # angle should point towards center
+    if angle_targ > 0.4:
+        angle_targ = 0.4  # more than 0.4 radians (22 degrees) is bad
+    if angle_targ < -0.4:
+        angle_targ = -0.4
+    hover_targ = 0.55 * np.abs(
+        s[0]
+    )  # target y should be proportional to horizontal offset
+
+    angle_todo = (angle_targ - s[4]) * 0.5 - (s[5]) * 1.0
+    hover_todo = (hover_targ - s[1]) * 0.5 - (s[3]) * 0.5
+
+    if s[6] or s[7]:  # legs have contact
+        angle_todo = 0
+        hover_todo = (
+            -(s[3]) * 0.5
+        )  # override to reduce fall speed, that's all we need after contact
+
+    if env.continuous:
+        a = np.array([hover_todo * 20 - 1, -angle_todo * 20])
+        a = np.clip(a, -1, +1)
+    else:
+        a = 0
+        if hover_todo > np.abs(angle_todo) and hover_todo > 0.05:
+            a = 2
+        elif angle_todo < -0.05:
+            a = 3
+        elif angle_todo > +0.05:
+            a = 1
+    return a
+
+
+def demo_heuristic_lander(env, seed=None, render=False):
+    total_reward = 0
+    steps = 0
+    s, info = env.reset(seed=seed)
+    while True:
+        a = heuristic(env, s)
+        s, r, terminated, truncated, info = step_api_compatibility(env.step(a), True)
+        total_reward += r
+
+        if render:
+            still_open = env.render()
+            if still_open is False:
+                break
+
+        if steps % 20 == 0 or terminated or truncated:
+            print("observations:", " ".join([f"{x:+0.2f}" for x in s]))
+            print(f"step {steps} total_reward {total_reward:+0.2f}")
+        steps += 1
+        if terminated or truncated:
+            break
+    if render:
+        env.close()
+    return total_reward
+
+
+class LunarLanderContinuous:
+    def __init__(self):
+        raise error.Error(
+            "Error initializing LunarLanderContinuous Environment.\n"
+            "Currently, we do not support initializing this mode of environment by calling the class directly.\n"
+            "To use this environment, instead create it by specifying the continuous keyword in gym.make, i.e.\n"
+            'gym.make("LunarLander-v2", continuous=True)'
+        )
+
+
+if __name__ == "__main__":
+    env = gym.make("LunarLander-v2", render_mode="rgb_array")
+    demo_heuristic_lander(env, render=True)
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/init.py
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/init.py
@ -0,0 +1,7 @@
+from gymnasium.envs.classic_control.acrobot import AcrobotEnv
+from gymnasium.envs.classic_control.cartpole import CartPoleEnv
+from gymnasium.envs.classic_control.continuous_mountain_car import (
+    Continuous_MountainCarEnv,
+)
+from gymnasium.envs.classic_control.mountain_car import MountainCarEnv
+from gymnasium.envs.classic_control.pendulum import PendulumEnv
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/init.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/init.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/acrobot.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/acrobot.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/cartpole.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/cartpole.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/continuous_mountain_car.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/continuous_mountain_car.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/mountain_car.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/mountain_car.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/pendulum.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/pendulum.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/utils.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pycache/utils.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/acrobot.py
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/acrobot.py
@ -0,0 +1,470 @@
+"""classic Acrobot task"""
+from typing import Optional
+
+import numpy as np
+from numpy import cos, pi, sin
+
+import gymnasium as gym
+from gymnasium import Env, spaces
+from gymnasium.envs.classic_control import utils
+from gymnasium.error import DependencyNotInstalled
+
+
+__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy"
+__credits__ = [
+    "Alborz Geramifard",
+    "Robert H. Klein",
+    "Christoph Dann",
+    "William Dabney",
+    "Jonathan P. How",
+]
+__license__ = "BSD 3-Clause"
+__author__ = "Christoph Dann <cdann@cdann.de>"
+
+# SOURCE:
+# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py
+
+
+class AcrobotEnv(Env):
+    """
+    ## Description
+
+    The Acrobot environment is based on Sutton's work in
+    ["Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding"](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html)
+    and [Sutton and Barto's book](http://www.incompleteideas.net/book/the-book-2nd.html).
+    The system consists of two links connected linearly to form a chain, with one end of
+    the chain fixed. The joint between the two links is actuated. The goal is to apply
+    torques on the actuated joint to swing the free end of the linear chain above a
+    given height while starting from the initial state of hanging downwards.
+
+    As seen in the **Gif**: two blue links connected by two green joints. The joint in
+    between the two links is actuated. The goal is to swing the free end of the outer-link
+    to reach the target height (black horizontal line above system) by applying torque on
+    the actuator.
+
+    ## Action Space
+
+    The action is discrete, deterministic, and represents the torque applied on the actuated
+    joint between the two links.
+
+    | Num | Action                                | Unit         |
+    |-----|---------------------------------------|--------------|
+    | 0   | apply -1 torque to the actuated joint | torque (N m) |
+    | 1   | apply 0 torque to the actuated joint  | torque (N m) |
+    | 2   | apply 1 torque to the actuated joint  | torque (N m) |
+
+    ## Observation Space
+
+    The observation is a `ndarray` with shape `(6,)` that provides information about the
+    two rotational joint angles as well as their angular velocities:
+
+    | Num | Observation                  | Min                 | Max               |
+    |-----|------------------------------|---------------------|-------------------|
+    | 0   | Cosine of `theta1`           | -1                  | 1                 |
+    | 1   | Sine of `theta1`             | -1                  | 1                 |
+    | 2   | Cosine of `theta2`           | -1                  | 1                 |
+    | 3   | Sine of `theta2`             | -1                  | 1                 |
+    | 4   | Angular velocity of `theta1` | ~ -12.567 (-4 * pi) | ~ 12.567 (4 * pi) |
+    | 5   | Angular velocity of `theta2` | ~ -28.274 (-9 * pi) | ~ 28.274 (9 * pi) |
+
+    where
+    - `theta1` is the angle of the first joint, where an angle of 0 indicates the first link is pointing directly
+    downwards.
+    - `theta2` is ***relative to the angle of the first link.***
+        An angle of 0 corresponds to having the same angle between the two links.
+
+    The angular velocities of `theta1` and `theta2` are bounded at ±4π, and ±9π rad/s respectively.
+    A state of `[1, 0, 1, 0, ..., ...]` indicates that both links are pointing downwards.
+
+    ## Rewards
+
+    The goal is to have the free end reach a designated target height in as few steps as possible,
+    and as such all steps that do not reach the goal incur a reward of -1.
+    Achieving the target height results in termination with a reward of 0. The reward threshold is -100.
+
+    ## Starting State
+
+    Each parameter in the underlying state (`theta1`, `theta2`, and the two angular velocities) is initialized
+    uniformly between -0.1 and 0.1. This means both links are pointing downwards with some initial stochasticity.
+
+    ## Episode End
+
+    The episode ends if one of the following occurs:
+    1. Termination: The free end reaches the target height, which is constructed as:
+    `-cos(theta1) - cos(theta2 + theta1) > 1.0`
+    2. Truncation: Episode length is greater than 500 (200 for v0)
+
+    ## Arguments
+
+    No additional arguments are currently supported during construction.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('Acrobot-v1')
+    ```
+
+    On reset, the `options` parameter allows the user to change the bounds used to determine
+    the new random state.
+
+    By default, the dynamics of the acrobot follow those described in Sutton and Barto's book
+    [Reinforcement Learning: An Introduction](http://incompleteideas.net/book/11/node4.html).
+    However, a `book_or_nips` parameter can be modified to change the pendulum dynamics to those described
+    in the original [NeurIPS paper](https://papers.nips.cc/paper/1995/hash/8f1d43620bc6bb580df6e80b0dc05c48-Abstract.html).
+
+    ```python
+    # To change the dynamics as described above
+    env.unwrapped.book_or_nips = 'nips'
+    ```
+
+    See the following note for details:
+
+    > The dynamics equations were missing some terms in the NIPS paper which
+            are present in the book. R. Sutton confirmed in personal correspondence
+            that the experimental results shown in the paper and the book were
+            generated with the equations shown in the book.
+            However, there is the option to run the domain with the paper equations
+            by setting `book_or_nips = 'nips'`
+
+
+    ## Version History
+
+    - v1: Maximum number of steps increased from 200 to 500. The observation space for v0 provided direct readings of
+    `theta1` and `theta2` in radians, having a range of `[-pi, pi]`. The v1 observation space as described here provides the
+    sine and cosine of each angle instead.
+    - v0: Initial versions release (1.0.0) (removed from gymnasium for v1)
+
+    ## References
+    - Sutton, R. S. (1996). Generalization in Reinforcement Learning: Successful Examples Using Sparse Coarse Coding.
+        In D. Touretzky, M. C. Mozer, & M. Hasselmo (Eds.), Advances in Neural Information Processing Systems (Vol. 8).
+        MIT Press. https://proceedings.neurips.cc/paper/1995/file/8f1d43620bc6bb580df6e80b0dc05c48-Paper.pdf
+    - Sutton, R. S., Barto, A. G. (2018 ). Reinforcement Learning: An Introduction. The MIT Press.
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 15,
+    }
+
+    dt = 0.2
+
+    LINK_LENGTH_1 = 1.0  # [m]
+    LINK_LENGTH_2 = 1.0  # [m]
+    LINK_MASS_1 = 1.0  #: [kg] mass of link 1
+    LINK_MASS_2 = 1.0  #: [kg] mass of link 2
+    LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
+    LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
+    LINK_MOI = 1.0  #: moments of inertia for both links
+
+    MAX_VEL_1 = 4 * pi
+    MAX_VEL_2 = 9 * pi
+
+    AVAIL_TORQUE = [-1.0, 0.0, +1]
+
+    torque_noise_max = 0.0
+
+    SCREEN_DIM = 500
+
+    #: use dynamics equations from the nips paper or the book
+    book_or_nips = "book"
+    action_arrow = None
+    domain_fig = None
+    actions_num = 3
+
+    def __init__(self, render_mode: Optional[str] = None):
+        self.render_mode = render_mode
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+        high = np.array(
+            [1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32
+        )
+        low = -high
+        self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32)
+        self.action_space = spaces.Discrete(3)
+        self.state = None
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(
+            options, -0.1, 0.1  # default low
+        )  # default high
+        self.state = self.np_random.uniform(low=low, high=high, size=(4,)).astype(
+            np.float32
+        )
+
+        if self.render_mode == "human":
+            self.render()
+        return self._get_ob(), {}
+
+    def step(self, a):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        torque = self.AVAIL_TORQUE[a]
+
+        # Add noise to the force action
+        if self.torque_noise_max > 0:
+            torque += self.np_random.uniform(
+                -self.torque_noise_max, self.torque_noise_max
+            )
+
+        # Now, augment the state with our force action so it can be passed to
+        # _dsdt
+        s_augmented = np.append(s, torque)
+
+        ns = rk4(self._dsdt, s_augmented, [0, self.dt])
+
+        ns[0] = wrap(ns[0], -pi, pi)
+        ns[1] = wrap(ns[1], -pi, pi)
+        ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
+        ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
+        self.state = ns
+        terminated = self._terminal()
+        reward = -1.0 if not terminated else 0.0
+
+        if self.render_mode == "human":
+            self.render()
+        return (self._get_ob(), reward, terminated, False, {})
+
+    def _get_ob(self):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        return np.array(
+            [cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]], dtype=np.float32
+        )
+
+    def _terminal(self):
+        s = self.state
+        assert s is not None, "Call reset before using AcrobotEnv object."
+        return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.0)
+
+    def _dsdt(self, s_augmented):
+        m1 = self.LINK_MASS_1
+        m2 = self.LINK_MASS_2
+        l1 = self.LINK_LENGTH_1
+        lc1 = self.LINK_COM_POS_1
+        lc2 = self.LINK_COM_POS_2
+        I1 = self.LINK_MOI
+        I2 = self.LINK_MOI
+        g = 9.8
+        a = s_augmented[-1]
+        s = s_augmented[:-1]
+        theta1 = s[0]
+        theta2 = s[1]
+        dtheta1 = s[2]
+        dtheta2 = s[3]
+        d1 = (
+            m1 * lc1**2
+            + m2 * (l1**2 + lc2**2 + 2 * l1 * lc2 * cos(theta2))
+            + I1
+            + I2
+        )
+        d2 = m2 * (lc2**2 + l1 * lc2 * cos(theta2)) + I2
+        phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.0)
+        phi1 = (
+            -m2 * l1 * lc2 * dtheta2**2 * sin(theta2)
+            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2)
+            + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2)
+            + phi2
+        )
+        if self.book_or_nips == "nips":
+            # the following line is consistent with the description in the
+            # paper
+            ddtheta2 = (a + d2 / d1 * phi1 - phi2) / (m2 * lc2**2 + I2 - d2**2 / d1)
+        else:
+            # the following line is consistent with the java implementation and the
+            # book
+            ddtheta2 = (
+                a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1**2 * sin(theta2) - phi2
+            ) / (m2 * lc2**2 + I2 - d2**2 / d1)
+        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
+        return dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError as e:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[classic-control]`"
+            ) from e
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.SCREEN_DIM, self.SCREEN_DIM)
+                )
+            else:  # mode in "rgb_array"
+                self.screen = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        surf = pygame.Surface((self.SCREEN_DIM, self.SCREEN_DIM))
+        surf.fill((255, 255, 255))
+        s = self.state
+
+        bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2  # 2.2 for default
+        scale = self.SCREEN_DIM / (bound * 2)
+        offset = self.SCREEN_DIM / 2
+
+        if s is None:
+            return None
+
+        p1 = [
+            -self.LINK_LENGTH_1 * cos(s[0]) * scale,
+            self.LINK_LENGTH_1 * sin(s[0]) * scale,
+        ]
+
+        p2 = [
+            p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]) * scale,
+            p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1]) * scale,
+        ]
+
+        xys = np.array([[0, 0], p1, p2])[:, ::-1]
+        thetas = [s[0] - pi / 2, s[0] + s[1] - pi / 2]
+        link_lengths = [self.LINK_LENGTH_1 * scale, self.LINK_LENGTH_2 * scale]
+
+        pygame.draw.line(
+            surf,
+            start_pos=(-2.2 * scale + offset, 1 * scale + offset),
+            end_pos=(2.2 * scale + offset, 1 * scale + offset),
+            color=(0, 0, 0),
+        )
+
+        for (x, y), th, llen in zip(xys, thetas, link_lengths):
+            x = x + offset
+            y = y + offset
+            l, r, t, b = 0, llen, 0.1 * scale, -0.1 * scale
+            coords = [(l, b), (l, t), (r, t), (r, b)]
+            transformed_coords = []
+            for coord in coords:
+                coord = pygame.math.Vector2(coord).rotate_rad(th)
+                coord = (coord[0] + x, coord[1] + y)
+                transformed_coords.append(coord)
+            gfxdraw.aapolygon(surf, transformed_coords, (0, 204, 204))
+            gfxdraw.filled_polygon(surf, transformed_coords, (0, 204, 204))
+
+            gfxdraw.aacircle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0))
+            gfxdraw.filled_circle(surf, int(x), int(y), int(0.1 * scale), (204, 204, 0))
+
+        surf = pygame.transform.flip(surf, False, True)
+        self.screen.blit(surf, (0, 0))
+
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
+
+
+def wrap(x, m, M):
+    """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which
+    truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n
+    For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
+
+    Args:
+        x: a scalar
+        m: minimum possible value in range
+        M: maximum possible value in range
+
+    Returns:
+        x: a scalar, wrapped
+    """
+    diff = M - m
+    while x > M:
+        x = x - diff
+    while x < m:
+        x = x + diff
+    return x
+
+
+def bound(x, m, M=None):
+    """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR*
+    have m as length 2 vector, bound(x,m, <IGNORED>) returns m[0] <= x <= m[1].
+
+    Args:
+        x: scalar
+        m: The lower bound
+        M: The upper bound
+
+    Returns:
+        x: scalar, bound between min (m) and Max (M)
+    """
+    if M is None:
+        M = m[1]
+        m = m[0]
+    # bound x between min (m) and Max (M)
+    return min(max(x, m), M)
+
+
+def rk4(derivs, y0, t):
+    """
+    Integrate 1-D or N-D system of ODEs using 4-th order Runge-Kutta.
+
+    Example for 2D system:
+
+        >>> def derivs(x):
+        ...     d1 =  x[0] + 2*x[1]
+        ...     d2 =  -3*x[0] + 4*x[1]
+        ...     return d1, d2
+
+        >>> dt = 0.0005
+        >>> t = np.arange(0.0, 2.0, dt)
+        >>> y0 = (1,2)
+        >>> yout = rk4(derivs, y0, t)
+
+    Args:
+        derivs: the derivative of the system and has the signature ``dy = derivs(yi)``
+        y0: initial state vector
+        t: sample times
+
+    Returns:
+        yout: Runge-Kutta approximation of the ODE
+    """
+
+    try:
+        Ny = len(y0)
+    except TypeError:
+        yout = np.zeros((len(t),), np.float_)
+    else:
+        yout = np.zeros((len(t), Ny), np.float_)
+
+    yout[0] = y0
+
+    for i in np.arange(len(t) - 1):
+        this = t[i]
+        dt = t[i + 1] - this
+        dt2 = dt / 2.0
+        y0 = yout[i]
+
+        k1 = np.asarray(derivs(y0))
+        k2 = np.asarray(derivs(y0 + dt2 * k1))
+        k3 = np.asarray(derivs(y0 + dt2 * k2))
+        k4 = np.asarray(derivs(y0 + dt * k3))
+        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
+    # We only care about the final timestep and we cleave off action value which will be zero
+    return yout[-1][:4]
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/assets/clockwise.png
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/assets/clockwise.png
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/assets/clockwise.png.import
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/assets/clockwise.png.import
@ -0,0 +1,34 @@
+[remap]
+
+importer="texture"
+type="CompressedTexture2D"
+uid="uid://dk5m5liwecsk"
+path="res://.godot/imported/clockwise.png-0b6c4e15d302e93ea5c480ebf7734edb.ctex"
+metadata={
+"vram_texture": false
+}
+
+[deps]
+
+source_file="res://rl/Lib/site-packages/gymnasium/envs/classic_control/assets/clockwise.png"
+dest_files=["res://.godot/imported/clockwise.png-0b6c4e15d302e93ea5c480ebf7734edb.ctex"]
+
+[params]
+
+compress/mode=0
+compress/high_quality=false
+compress/lossy_quality=0.7
+compress/hdr_compression=1
+compress/normal_map=0
+compress/channel_pack=0
+mipmaps/generate=false
+mipmaps/limit=-1
+roughness/mode=0
+roughness/src_normal=""
+process/fix_alpha_border=true
+process/premult_alpha=false
+process/normal_map_invert_y=false
+process/hdr_as_srgb=false
+process/hdr_clamp_exposure=false
+process/size_limit=0
+detect_3d/compress_to=1
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/cartpole.py
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/cartpole.py
@ -0,0 +1,577 @@
+"""
+Classic cart-pole system implemented by Rich Sutton et al.
+Copied from http://incompleteideas.net/sutton/book/code/pole.c
+permalink: https://perma.cc/C9ZM-652R
+"""
+import math
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import logger, spaces
+from gymnasium.envs.classic_control import utils
+from gymnasium.error import DependencyNotInstalled
+from gymnasium.experimental.vector import VectorEnv
+from gymnasium.vector.utils import batch_space
+
+
+class CartPoleEnv(gym.Env[np.ndarray, Union[int, np.ndarray]]):
+    """
+    ## Description
+
+    This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson in
+    ["Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problem"](https://ieeexplore.ieee.org/document/6313077).
+    A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
+    The pendulum is placed upright on the cart and the goal is to balance the pole by applying forces
+     in the left and right direction on the cart.
+
+    ## Action Space
+
+    The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
+     of the fixed force the cart is pushed with.
+
+    - 0: Push cart to the left
+    - 1: Push cart to the right
+
+    **Note**: The velocity that is reduced or increased by the applied force is not fixed and it depends on the angle
+     the pole is pointing. The center of gravity of the pole varies the amount of energy needed to move the cart underneath it
+
+    ## Observation Space
+
+    The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:
+
+    | Num | Observation           | Min                 | Max               |
+    |-----|-----------------------|---------------------|-------------------|
+    | 0   | Cart Position         | -4.8                | 4.8               |
+    | 1   | Cart Velocity         | -Inf                | Inf               |
+    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
+    | 3   | Pole Angular Velocity | -Inf                | Inf               |
+
+    **Note:** While the ranges above denote the possible values for observation space of each element,
+        it is not reflective of the allowed values of the state space in an unterminated episode. Particularly:
+    -  The cart x-position (index 0) can be take values between `(-4.8, 4.8)`, but the episode terminates
+       if the cart leaves the `(-2.4, 2.4)` range.
+    -  The pole angle can be observed between  `(-.418, .418)` radians (or **±24°**), but the episode terminates
+       if the pole angle is not in the range `(-.2095, .2095)` (or **±12°**)
+
+    ## Rewards
+
+    Since the goal is to keep the pole upright for as long as possible, a reward of `+1` for every step taken,
+    including the termination step, is allotted. The threshold for rewards is 500 for v1 and 200 for v0.
+
+    ## Starting State
+
+    All observations are assigned a uniformly random value in `(-0.05, 0.05)`
+
+    ## Episode End
+
+    The episode ends if any one of the following occurs:
+
+    1. Termination: Pole Angle is greater than ±12°
+    2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
+    3. Truncation: Episode length is greater than 500 (200 for v0)
+
+    ## Arguments
+
+    ```python
+    import gymnasium as gym
+    gym.make('CartPole-v1')
+    ```
+
+    On reset, the `options` parameter allows the user to change the bounds used to determine
+    the new random state.
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 50,
+    }
+
+    def __init__(self, render_mode: Optional[str] = None):
+        self.gravity = 9.8
+        self.masscart = 1.0
+        self.masspole = 0.1
+        self.total_mass = self.masspole + self.masscart
+        self.length = 0.5  # actually half the pole's length
+        self.polemass_length = self.masspole * self.length
+        self.force_mag = 10.0
+        self.tau = 0.02  # seconds between state updates
+        self.kinematics_integrator = "euler"
+
+        # Angle at which to fail the episode
+        self.theta_threshold_radians = 12 * 2 * math.pi / 360
+        self.x_threshold = 2.4
+
+        # Angle limit set to 2 * theta_threshold_radians so failing observation
+        # is still within bounds.
+        high = np.array(
+            [
+                self.x_threshold * 2,
+                np.finfo(np.float32).max,
+                self.theta_threshold_radians * 2,
+                np.finfo(np.float32).max,
+            ],
+            dtype=np.float32,
+        )
+
+        self.action_space = spaces.Discrete(2)
+        self.observation_space = spaces.Box(-high, high, dtype=np.float32)
+
+        self.render_mode = render_mode
+
+        self.screen_width = 600
+        self.screen_height = 400
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+        self.state = None
+
+        self.steps_beyond_terminated = None
+
+    def step(self, action):
+        assert self.action_space.contains(
+            action
+        ), f"{action!r} ({type(action)}) invalid"
+        assert self.state is not None, "Call reset before using step method."
+        x, x_dot, theta, theta_dot = self.state
+        force = self.force_mag if action == 1 else -self.force_mag
+        costheta = math.cos(theta)
+        sintheta = math.sin(theta)
+
+        # For the interested reader:
+        # https://coneural.org/florian/papers/05_cart_pole.pdf
+        temp = (
+            force + self.polemass_length * theta_dot**2 * sintheta
+        ) / self.total_mass
+        thetaacc = (self.gravity * sintheta - costheta * temp) / (
+            self.length * (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)
+        )
+        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
+
+        if self.kinematics_integrator == "euler":
+            x = x + self.tau * x_dot
+            x_dot = x_dot + self.tau * xacc
+            theta = theta + self.tau * theta_dot
+            theta_dot = theta_dot + self.tau * thetaacc
+        else:  # semi-implicit euler
+            x_dot = x_dot + self.tau * xacc
+            x = x + self.tau * x_dot
+            theta_dot = theta_dot + self.tau * thetaacc
+            theta = theta + self.tau * theta_dot
+
+        self.state = (x, x_dot, theta, theta_dot)
+
+        terminated = bool(
+            x < -self.x_threshold
+            or x > self.x_threshold
+            or theta < -self.theta_threshold_radians
+            or theta > self.theta_threshold_radians
+        )
+
+        if not terminated:
+            reward = 1.0
+        elif self.steps_beyond_terminated is None:
+            # Pole just fell!
+            self.steps_beyond_terminated = 0
+            reward = 1.0
+        else:
+            if self.steps_beyond_terminated == 0:
+                logger.warn(
+                    "You are calling 'step()' even though this "
+                    "environment has already returned terminated = True. You "
+                    "should always call 'reset()' once you receive 'terminated = "
+                    "True' -- any further steps are undefined behavior."
+                )
+            self.steps_beyond_terminated += 1
+            reward = 0.0
+
+        if self.render_mode == "human":
+            self.render()
+        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+
+    def reset(
+        self,
+        *,
+        seed: Optional[int] = None,
+        options: Optional[dict] = None,
+    ):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(
+            options, -0.05, 0.05  # default low
+        )  # default high
+        self.state = self.np_random.uniform(low=low, high=high, size=(4,))
+        self.steps_beyond_terminated = None
+
+        if self.render_mode == "human":
+            self.render()
+        return np.array(self.state, dtype=np.float32), {}
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError as e:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[classic-control]`"
+            ) from e
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.screen_width, self.screen_height)
+                )
+            else:  # mode == "rgb_array"
+                self.screen = pygame.Surface((self.screen_width, self.screen_height))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        world_width = self.x_threshold * 2
+        scale = self.screen_width / world_width
+        polewidth = 10.0
+        polelen = scale * (2 * self.length)
+        cartwidth = 50.0
+        cartheight = 30.0
+
+        if self.state is None:
+            return None
+
+        x = self.state
+
+        self.surf = pygame.Surface((self.screen_width, self.screen_height))
+        self.surf.fill((255, 255, 255))
+
+        l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
+        axleoffset = cartheight / 4.0
+        cartx = x[0] * scale + self.screen_width / 2.0  # MIDDLE OF CART
+        carty = 100  # TOP OF CART
+        cart_coords = [(l, b), (l, t), (r, t), (r, b)]
+        cart_coords = [(c[0] + cartx, c[1] + carty) for c in cart_coords]
+        gfxdraw.aapolygon(self.surf, cart_coords, (0, 0, 0))
+        gfxdraw.filled_polygon(self.surf, cart_coords, (0, 0, 0))
+
+        l, r, t, b = (
+            -polewidth / 2,
+            polewidth / 2,
+            polelen - polewidth / 2,
+            -polewidth / 2,
+        )
+
+        pole_coords = []
+        for coord in [(l, b), (l, t), (r, t), (r, b)]:
+            coord = pygame.math.Vector2(coord).rotate_rad(-x[2])
+            coord = (coord[0] + cartx, coord[1] + carty + axleoffset)
+            pole_coords.append(coord)
+        gfxdraw.aapolygon(self.surf, pole_coords, (202, 152, 101))
+        gfxdraw.filled_polygon(self.surf, pole_coords, (202, 152, 101))
+
+        gfxdraw.aacircle(
+            self.surf,
+            int(cartx),
+            int(carty + axleoffset),
+            int(polewidth / 2),
+            (129, 132, 203),
+        )
+        gfxdraw.filled_circle(
+            self.surf,
+            int(cartx),
+            int(carty + axleoffset),
+            int(polewidth / 2),
+            (129, 132, 203),
+        )
+
+        gfxdraw.hline(self.surf, 0, self.screen_width, carty, (0, 0, 0))
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+        self.screen.blit(self.surf, (0, 0))
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
+
+
+class CartPoleVectorEnv(VectorEnv):
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 50,
+    }
+
+    def __init__(
+        self,
+        num_envs: int = 2,
+        max_episode_steps: int = 500,
+        render_mode: Optional[str] = None,
+    ):
+        super().__init__()
+        self.num_envs = num_envs
+        self.gravity = 9.8
+        self.masscart = 1.0
+        self.masspole = 0.1
+        self.total_mass = self.masspole + self.masscart
+        self.length = 0.5  # actually half the pole's length
+        self.polemass_length = self.masspole * self.length
+        self.force_mag = 10.0
+        self.tau = 0.02  # seconds between state updates
+        self.kinematics_integrator = "euler"
+        self.max_episode_steps = max_episode_steps
+
+        self.steps = np.zeros(num_envs, dtype=np.int32)
+
+        # Angle at which to fail the episode
+        self.theta_threshold_radians = 12 * 2 * math.pi / 360
+        self.x_threshold = 2.4
+
+        # Angle limit set to 2 * theta_threshold_radians so failing observation
+        # is still within bounds.
+        high = np.array(
+            [
+                self.x_threshold * 2,
+                np.finfo(np.float32).max,
+                self.theta_threshold_radians * 2,
+                np.finfo(np.float32).max,
+            ],
+            dtype=np.float32,
+        )
+
+        self.low = -0.05
+        self.high = 0.05
+
+        self.single_action_space = spaces.Discrete(2)
+        self.action_space = batch_space(self.single_action_space, num_envs)
+        self.single_observation_space = spaces.Box(-high, high, dtype=np.float32)
+        self.observation_space = batch_space(self.single_observation_space, num_envs)
+
+        self.render_mode = render_mode
+
+        self.screen_width = 600
+        self.screen_height = 400
+        self.screens = None
+        self.clocks = None
+        self.isopen = True
+        self.state = None
+
+        self.steps_beyond_terminated = None
+
+    def step(
+        self, action: np.ndarray
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, dict]:
+        assert self.action_space.contains(
+            action
+        ), f"{action!r} ({type(action)}) invalid"
+        assert self.state is not None, "Call reset before using step method."
+
+        x, x_dot, theta, theta_dot = self.state
+        force = np.sign(action - 0.5) * self.force_mag
+        costheta = np.cos(theta)
+        sintheta = np.sin(theta)
+
+        # For the interested reader:
+        # https://coneural.org/florian/papers/05_cart_pole.pdf
+        temp = (
+            force + self.polemass_length * theta_dot**2 * sintheta
+        ) / self.total_mass
+        thetaacc = (self.gravity * sintheta - costheta * temp) / (
+            self.length * (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)
+        )
+        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
+
+        if self.kinematics_integrator == "euler":
+            x = x + self.tau * x_dot
+            x_dot = x_dot + self.tau * xacc
+            theta = theta + self.tau * theta_dot
+            theta_dot = theta_dot + self.tau * thetaacc
+        else:  # semi-implicit euler
+            x_dot = x_dot + self.tau * xacc
+            x = x + self.tau * x_dot
+            theta_dot = theta_dot + self.tau * thetaacc
+            theta = theta + self.tau * theta_dot
+
+        self.state = np.stack((x, x_dot, theta, theta_dot))
+
+        terminated: np.ndarray = (
+            (x < -self.x_threshold)
+            | (x > self.x_threshold)
+            | (theta < -self.theta_threshold_radians)
+            | (theta > self.theta_threshold_radians)
+        )
+
+        self.steps += 1
+
+        truncated = self.steps >= self.max_episode_steps
+
+        done = terminated | truncated
+
+        if any(done):
+            # This code was generated by copilot, need to check if it works
+            self.state[:, done] = self.np_random.uniform(
+                low=self.low, high=self.high, size=(4, done.sum())
+            ).astype(np.float32)
+            self.steps[done] = 0
+
+        reward = np.ones_like(terminated, dtype=np.float32)
+
+        if self.render_mode == "human":
+            self.render()
+
+        return self.state.T, reward, terminated, truncated, {}
+
+    def reset(
+        self,
+        *,
+        seed: Optional[int] = None,
+        options: Optional[dict] = None,
+    ):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        self.low, self.high = utils.maybe_parse_reset_bounds(
+            options, -0.05, 0.05  # default low
+        )  # default high
+        self.state = self.np_random.uniform(
+            low=self.low, high=self.high, size=(4, self.num_envs)
+        ).astype(np.float32)
+        self.steps_beyond_terminated = None
+
+        if self.render_mode == "human":
+            self.render()
+        return self.state.T, {}
+
+    def render(self):
+        if self.render_mode is None:
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[classic_control]`"
+            )
+
+        if self.screens is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screens = [
+                    pygame.display.set_mode((self.screen_width, self.screen_height))
+                    for _ in range(self.num_envs)
+                ]
+            else:  # mode == "rgb_array"
+                self.screens = [
+                    pygame.Surface((self.screen_width, self.screen_height))
+                    for _ in range(self.num_envs)
+                ]
+        if self.clocks is None:
+            self.clock = [pygame.time.Clock() for _ in range(self.num_envs)]
+
+        world_width = self.x_threshold * 2
+        scale = self.screen_width / world_width
+        polewidth = 10.0
+        polelen = scale * (2 * self.length)
+        cartwidth = 50.0
+        cartheight = 30.0
+
+        if self.state is None:
+            return None
+
+        for state, screen, clock in zip(self.state, self.screens, self.clocks):
+            x = self.state.T
+
+            self.surf = pygame.Surface((self.screen_width, self.screen_height))
+            self.surf.fill((255, 255, 255))
+
+            l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
+            axleoffset = cartheight / 4.0
+            cartx = x[0] * scale + self.screen_width / 2.0  # MIDDLE OF CART
+            carty = 100  # TOP OF CART
+            cart_coords = [(l, b), (l, t), (r, t), (r, b)]
+            cart_coords = [(c[0] + cartx, c[1] + carty) for c in cart_coords]
+            gfxdraw.aapolygon(self.surf, cart_coords, (0, 0, 0))
+            gfxdraw.filled_polygon(self.surf, cart_coords, (0, 0, 0))
+
+            l, r, t, b = (
+                -polewidth / 2,
+                polewidth / 2,
+                polelen - polewidth / 2,
+                -polewidth / 2,
+            )
+
+            pole_coords = []
+            for coord in [(l, b), (l, t), (r, t), (r, b)]:
+                coord = pygame.math.Vector2(coord).rotate_rad(-x[2])
+                coord = (coord[0] + cartx, coord[1] + carty + axleoffset)
+                pole_coords.append(coord)
+            gfxdraw.aapolygon(self.surf, pole_coords, (202, 152, 101))
+            gfxdraw.filled_polygon(self.surf, pole_coords, (202, 152, 101))
+
+            gfxdraw.aacircle(
+                self.surf,
+                int(cartx),
+                int(carty + axleoffset),
+                int(polewidth / 2),
+                (129, 132, 203),
+            )
+            gfxdraw.filled_circle(
+                self.surf,
+                int(cartx),
+                int(carty + axleoffset),
+                int(polewidth / 2),
+                (129, 132, 203),
+            )
+
+            gfxdraw.hline(self.surf, 0, self.screen_width, carty, (0, 0, 0))
+
+            self.surf = pygame.transform.flip(self.surf, False, True)
+            screen.blit(self.surf, (0, 0))
+
+        if self.render_mode == "human":
+            pygame.event.pump()
+            [clock.tick(self.metadata["render_fps"]) for clock in self.clocks]
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return [
+                np.transpose(
+                    np.array(pygame.surfarray.pixels3d(screen)), axes=(1, 0, 2)
+                )
+                for screen in self.screens
+            ]
+
+    def close(self):
+        if self.screens is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/continuous_mountain_car.py
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/continuous_mountain_car.py
@ -0,0 +1,304 @@
+"""
+@author: Olivier Sigaud
+
+A merge between two sources:
+
+* Adaptation of the MountainCar Environment from the "FAReinforcement" library
+of Jose Antonio Martin H. (version 1.0), adapted by  'Tom Schaul, tom@idsia.ch'
+and then modified by Arnaud de Broissia
+
+* the gymnasium MountainCar environment
+itself from
+http://incompleteideas.net/sutton/MountainCar/MountainCar1.cp
+permalink: https://perma.cc/6Z2N-PFWC
+"""
+
+import math
+from typing import Optional
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import spaces
+from gymnasium.envs.classic_control import utils
+from gymnasium.error import DependencyNotInstalled
+
+
+class Continuous_MountainCarEnv(gym.Env):
+    """
+    ## Description
+
+    The Mountain Car MDP is a deterministic MDP that consists of a car placed stochastically
+    at the bottom of a sinusoidal valley, with the only possible actions being the accelerations
+    that can be applied to the car in either direction. The goal of the MDP is to strategically
+    accelerate the car to reach the goal state on top of the right hill. There are two versions
+    of the mountain car domain in gymnasium: one with discrete actions and one with continuous.
+    This version is the one with continuous actions.
+
+    This MDP first appeared in [Andrew Moore's PhD Thesis (1990)](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-209.pdf)
+
+    ```
+    @TECHREPORT{Moore90efficientmemory-based,
+        author = {Andrew William Moore},
+        title = {Efficient Memory-based Learning for Robot Control},
+        institution = {University of Cambridge},
+        year = {1990}
+    }
+    ```
+
+    ## Observation Space
+
+    The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:
+
+    | Num | Observation                          | Min  | Max | Unit         |
+    |-----|--------------------------------------|------|-----|--------------|
+    | 0   | position of the car along the x-axis | -Inf | Inf | position (m) |
+    | 1   | velocity of the car                  | -Inf | Inf | position (m) |
+
+    ## Action Space
+
+    The action is a `ndarray` with shape `(1,)`, representing the directional force applied on the car.
+    The action is clipped in the range `[-1,1]` and multiplied by a power of 0.0015.
+
+    ## Transition Dynamics:
+
+    Given an action, the mountain car follows the following transition dynamics:
+
+    *velocity<sub>t+1</sub> = velocity<sub>t+1</sub> + force * self.power - 0.0025 * cos(3 * position<sub>t</sub>)*
+
+    *position<sub>t+1</sub> = position<sub>t</sub> + velocity<sub>t+1</sub>*
+
+    where force is the action clipped to the range `[-1,1]` and power is a constant 0.0015.
+    The collisions at either end are inelastic with the velocity set to 0 upon collision with the wall.
+    The position is clipped to the range [-1.2, 0.6] and velocity is clipped to the range [-0.07, 0.07].
+
+    ## Reward
+
+    A negative reward of *-0.1 * action<sup>2</sup>* is received at each timestep to penalise for
+    taking actions of large magnitude. If the mountain car reaches the goal then a positive reward of +100
+    is added to the negative reward for that timestep.
+
+    ## Starting State
+
+    The position of the car is assigned a uniform random value in `[-0.6 , -0.4]`.
+    The starting velocity of the car is always assigned to 0.
+
+    ## Episode End
+
+    The episode ends if either of the following happens:
+    1. Termination: The position of the car is greater than or equal to 0.45 (the goal position on top of the right hill)
+    2. Truncation: The length of the episode is 999.
+
+    ## Arguments
+
+    ```python
+    import gymnasium as gym
+    gym.make('MountainCarContinuous-v0')
+    ```
+
+    On reset, the `options` parameter allows the user to change the bounds used to determine
+    the new random state.
+
+    ## Version History
+
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 30,
+    }
+
+    def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
+        self.min_action = -1.0
+        self.max_action = 1.0
+        self.min_position = -1.2
+        self.max_position = 0.6
+        self.max_speed = 0.07
+        self.goal_position = (
+            0.45  # was 0.5 in gymnasium, 0.45 in Arnaud de Broissia's version
+        )
+        self.goal_velocity = goal_velocity
+        self.power = 0.0015
+
+        self.low_state = np.array(
+            [self.min_position, -self.max_speed], dtype=np.float32
+        )
+        self.high_state = np.array(
+            [self.max_position, self.max_speed], dtype=np.float32
+        )
+
+        self.render_mode = render_mode
+
+        self.screen_width = 600
+        self.screen_height = 400
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+
+        self.action_space = spaces.Box(
+            low=self.min_action, high=self.max_action, shape=(1,), dtype=np.float32
+        )
+        self.observation_space = spaces.Box(
+            low=self.low_state, high=self.high_state, dtype=np.float32
+        )
+
+    def step(self, action: np.ndarray):
+        position = self.state[0]
+        velocity = self.state[1]
+        force = min(max(action[0], self.min_action), self.max_action)
+
+        velocity += force * self.power - 0.0025 * math.cos(3 * position)
+        if velocity > self.max_speed:
+            velocity = self.max_speed
+        if velocity < -self.max_speed:
+            velocity = -self.max_speed
+        position += velocity
+        if position > self.max_position:
+            position = self.max_position
+        if position < self.min_position:
+            position = self.min_position
+        if position == self.min_position and velocity < 0:
+            velocity = 0
+
+        # Convert a possible numpy bool to a Python bool.
+        terminated = bool(
+            position >= self.goal_position and velocity >= self.goal_velocity
+        )
+
+        reward = 0
+        if terminated:
+            reward = 100.0
+        reward -= math.pow(action[0], 2) * 0.1
+
+        self.state = np.array([position, velocity], dtype=np.float32)
+
+        if self.render_mode == "human":
+            self.render()
+        return self.state, reward, terminated, False, {}
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(options, -0.6, -0.4)
+        self.state = np.array([self.np_random.uniform(low=low, high=high), 0])
+
+        if self.render_mode == "human":
+            self.render()
+        return np.array(self.state, dtype=np.float32), {}
+
+    def _height(self, xs):
+        return np.sin(3 * xs) * 0.45 + 0.55
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError as e:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[classic-control]`"
+            ) from e
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.screen_width, self.screen_height)
+                )
+            else:  # mode == "rgb_array":
+                self.screen = pygame.Surface((self.screen_width, self.screen_height))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        world_width = self.max_position - self.min_position
+        scale = self.screen_width / world_width
+        carwidth = 40
+        carheight = 20
+
+        self.surf = pygame.Surface((self.screen_width, self.screen_height))
+        self.surf.fill((255, 255, 255))
+
+        pos = self.state[0]
+
+        xs = np.linspace(self.min_position, self.max_position, 100)
+        ys = self._height(xs)
+        xys = list(zip((xs - self.min_position) * scale, ys * scale))
+
+        pygame.draw.aalines(self.surf, points=xys, closed=False, color=(0, 0, 0))
+
+        clearance = 10
+
+        l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0
+        coords = []
+        for c in [(l, b), (l, t), (r, t), (r, b)]:
+            c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos))
+            coords.append(
+                (
+                    c[0] + (pos - self.min_position) * scale,
+                    c[1] + clearance + self._height(pos) * scale,
+                )
+            )
+
+        gfxdraw.aapolygon(self.surf, coords, (0, 0, 0))
+        gfxdraw.filled_polygon(self.surf, coords, (0, 0, 0))
+
+        for c in [(carwidth / 4, 0), (-carwidth / 4, 0)]:
+            c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos))
+            wheel = (
+                int(c[0] + (pos - self.min_position) * scale),
+                int(c[1] + clearance + self._height(pos) * scale),
+            )
+
+            gfxdraw.aacircle(
+                self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)
+            )
+            gfxdraw.filled_circle(
+                self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)
+            )
+
+        flagx = int((self.goal_position - self.min_position) * scale)
+        flagy1 = int(self._height(self.goal_position) * scale)
+        flagy2 = flagy1 + 50
+        gfxdraw.vline(self.surf, flagx, flagy1, flagy2, (0, 0, 0))
+
+        gfxdraw.aapolygon(
+            self.surf,
+            [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)],
+            (204, 204, 0),
+        )
+        gfxdraw.filled_polygon(
+            self.surf,
+            [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)],
+            (204, 204, 0),
+        )
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+        self.screen.blit(self.surf, (0, 0))
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/mountain_car.py
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/mountain_car.py
@ -0,0 +1,284 @@
+"""
+http://incompleteideas.net/MountainCar/MountainCar1.cp
+permalink: https://perma.cc/6Z2N-PFWC
+"""
+import math
+from typing import Optional
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import spaces
+from gymnasium.envs.classic_control import utils
+from gymnasium.error import DependencyNotInstalled
+
+
+class MountainCarEnv(gym.Env):
+    """
+    ## Description
+
+    The Mountain Car MDP is a deterministic MDP that consists of a car placed stochastically
+    at the bottom of a sinusoidal valley, with the only possible actions being the accelerations
+    that can be applied to the car in either direction. The goal of the MDP is to strategically
+    accelerate the car to reach the goal state on top of the right hill. There are two versions
+    of the mountain car domain in gymnasium: one with discrete actions and one with continuous.
+    This version is the one with discrete actions.
+
+    This MDP first appeared in [Andrew Moore's PhD Thesis (1990)](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-209.pdf)
+
+    ```
+    @TECHREPORT{Moore90efficientmemory-based,
+        author = {Andrew William Moore},
+        title = {Efficient Memory-based Learning for Robot Control},
+        institution = {University of Cambridge},
+        year = {1990}
+    }
+    ```
+
+    ## Observation Space
+
+    The observation is a `ndarray` with shape `(2,)` where the elements correspond to the following:
+
+    | Num | Observation                          | Min   | Max  | Unit         |
+    |-----|--------------------------------------|-------|------|--------------|
+    | 0   | position of the car along the x-axis | -1.2  | 0.6  | position (m) |
+    | 1   | velocity of the car                  | -0.07 | 0.07 | velocity (v) |
+
+    ## Action Space
+
+    There are 3 discrete deterministic actions:
+
+    - 0: Accelerate to the left
+    - 1: Don't accelerate
+    - 2: Accelerate to the right
+
+    ## Transition Dynamics:
+
+    Given an action, the mountain car follows the following transition dynamics:
+
+    *velocity<sub>t+1</sub> = velocity<sub>t</sub> + (action - 1) * force - cos(3 * position<sub>t</sub>) * gravity*
+
+    *position<sub>t+1</sub> = position<sub>t</sub> + velocity<sub>t+1</sub>*
+
+    where force = 0.001 and gravity = 0.0025. The collisions at either end are inelastic with the velocity set to 0
+    upon collision with the wall. The position is clipped to the range `[-1.2, 0.6]` and
+    velocity is clipped to the range `[-0.07, 0.07]`.
+
+    ## Reward:
+
+    The goal is to reach the flag placed on top of the right hill as quickly as possible, as such the agent is
+    penalised with a reward of -1 for each timestep.
+
+    ## Starting State
+
+    The position of the car is assigned a uniform random value in *[-0.6 , -0.4]*.
+    The starting velocity of the car is always assigned to 0.
+
+    ## Episode End
+
+    The episode ends if either of the following happens:
+    1. Termination: The position of the car is greater than or equal to 0.5 (the goal position on top of the right hill)
+    2. Truncation: The length of the episode is 200.
+
+
+    ## Arguments
+
+    ```python
+    import gymnasium as gym
+    gym.make('MountainCar-v0')
+    ```
+
+    On reset, the `options` parameter allows the user to change the bounds used to determine
+    the new random state.
+
+    ## Version History
+
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 30,
+    }
+
+    def __init__(self, render_mode: Optional[str] = None, goal_velocity=0):
+        self.min_position = -1.2
+        self.max_position = 0.6
+        self.max_speed = 0.07
+        self.goal_position = 0.5
+        self.goal_velocity = goal_velocity
+
+        self.force = 0.001
+        self.gravity = 0.0025
+
+        self.low = np.array([self.min_position, -self.max_speed], dtype=np.float32)
+        self.high = np.array([self.max_position, self.max_speed], dtype=np.float32)
+
+        self.render_mode = render_mode
+
+        self.screen_width = 600
+        self.screen_height = 400
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+
+        self.action_space = spaces.Discrete(3)
+        self.observation_space = spaces.Box(self.low, self.high, dtype=np.float32)
+
+    def step(self, action: int):
+        assert self.action_space.contains(
+            action
+        ), f"{action!r} ({type(action)}) invalid"
+
+        position, velocity = self.state
+        velocity += (action - 1) * self.force + math.cos(3 * position) * (-self.gravity)
+        velocity = np.clip(velocity, -self.max_speed, self.max_speed)
+        position += velocity
+        position = np.clip(position, self.min_position, self.max_position)
+        if position == self.min_position and velocity < 0:
+            velocity = 0
+
+        terminated = bool(
+            position >= self.goal_position and velocity >= self.goal_velocity
+        )
+        reward = -1.0
+
+        self.state = (position, velocity)
+        if self.render_mode == "human":
+            self.render()
+        return np.array(self.state, dtype=np.float32), reward, terminated, False, {}
+
+    def reset(
+        self,
+        *,
+        seed: Optional[int] = None,
+        options: Optional[dict] = None,
+    ):
+        super().reset(seed=seed)
+        # Note that if you use custom reset bounds, it may lead to out-of-bound
+        # state/observations.
+        low, high = utils.maybe_parse_reset_bounds(options, -0.6, -0.4)
+        self.state = np.array([self.np_random.uniform(low=low, high=high), 0])
+
+        if self.render_mode == "human":
+            self.render()
+        return np.array(self.state, dtype=np.float32), {}
+
+    def _height(self, xs):
+        return np.sin(3 * xs) * 0.45 + 0.55
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError as e:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[classic-control]`"
+            ) from e
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.screen_width, self.screen_height)
+                )
+            else:  # mode in "rgb_array"
+                self.screen = pygame.Surface((self.screen_width, self.screen_height))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        world_width = self.max_position - self.min_position
+        scale = self.screen_width / world_width
+        carwidth = 40
+        carheight = 20
+
+        self.surf = pygame.Surface((self.screen_width, self.screen_height))
+        self.surf.fill((255, 255, 255))
+
+        pos = self.state[0]
+
+        xs = np.linspace(self.min_position, self.max_position, 100)
+        ys = self._height(xs)
+        xys = list(zip((xs - self.min_position) * scale, ys * scale))
+
+        pygame.draw.aalines(self.surf, points=xys, closed=False, color=(0, 0, 0))
+
+        clearance = 10
+
+        l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0
+        coords = []
+        for c in [(l, b), (l, t), (r, t), (r, b)]:
+            c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos))
+            coords.append(
+                (
+                    c[0] + (pos - self.min_position) * scale,
+                    c[1] + clearance + self._height(pos) * scale,
+                )
+            )
+
+        gfxdraw.aapolygon(self.surf, coords, (0, 0, 0))
+        gfxdraw.filled_polygon(self.surf, coords, (0, 0, 0))
+
+        for c in [(carwidth / 4, 0), (-carwidth / 4, 0)]:
+            c = pygame.math.Vector2(c).rotate_rad(math.cos(3 * pos))
+            wheel = (
+                int(c[0] + (pos - self.min_position) * scale),
+                int(c[1] + clearance + self._height(pos) * scale),
+            )
+
+            gfxdraw.aacircle(
+                self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)
+            )
+            gfxdraw.filled_circle(
+                self.surf, wheel[0], wheel[1], int(carheight / 2.5), (128, 128, 128)
+            )
+
+        flagx = int((self.goal_position - self.min_position) * scale)
+        flagy1 = int(self._height(self.goal_position) * scale)
+        flagy2 = flagy1 + 50
+        gfxdraw.vline(self.surf, flagx, flagy1, flagy2, (0, 0, 0))
+
+        gfxdraw.aapolygon(
+            self.surf,
+            [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)],
+            (204, 204, 0),
+        )
+        gfxdraw.filled_polygon(
+            self.surf,
+            [(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)],
+            (204, 204, 0),
+        )
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+        self.screen.blit(self.surf, (0, 0))
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        elif self.render_mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def get_keys_to_action(self):
+        # Control with left and right arrow keys.
+        return {(): 1, (276,): 0, (275,): 2, (275, 276): 1}
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/pendulum.py
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/pendulum.py
@ -0,0 +1,277 @@
+__credits__ = ["Carlos Luis"]
+
+from os import path
+from typing import Optional
+
+import numpy as np
+
+import gymnasium as gym
+from gymnasium import spaces
+from gymnasium.envs.classic_control import utils
+from gymnasium.error import DependencyNotInstalled
+
+
+DEFAULT_X = np.pi
+DEFAULT_Y = 1.0
+
+
+class PendulumEnv(gym.Env):
+    """
+    ## Description
+
+    The inverted pendulum swingup problem is based on the classic problem in control theory.
+    The system consists of a pendulum attached at one end to a fixed point, and the other end being free.
+    The pendulum starts in a random position and the goal is to apply torque on the free end to swing it
+    into an upright position, with its center of gravity right above the fixed point.
+
+    The diagram below specifies the coordinate system used for the implementation of the pendulum's
+    dynamic equations.
+
+    ![Pendulum Coordinate System](/_static/diagrams/pendulum.png)
+
+    -  `x-y`: cartesian coordinates of the pendulum's end in meters.
+    - `theta` : angle in radians.
+    - `tau`: torque in `N m`. Defined as positive _counter-clockwise_.
+
+    ## Action Space
+
+    The action is a `ndarray` with shape `(1,)` representing the torque applied to free end of the pendulum.
+
+    | Num | Action | Min  | Max |
+    |-----|--------|------|-----|
+    | 0   | Torque | -2.0 | 2.0 |
+
+
+    ## Observation Space
+
+    The observation is a `ndarray` with shape `(3,)` representing the x-y coordinates of the pendulum's free
+    end and its angular velocity.
+
+    | Num | Observation      | Min  | Max |
+    |-----|------------------|------|-----|
+    | 0   | x = cos(theta)   | -1.0 | 1.0 |
+    | 1   | y = sin(theta)   | -1.0 | 1.0 |
+    | 2   | Angular Velocity | -8.0 | 8.0 |
+
+    ## Rewards
+
+    The reward function is defined as:
+
+    *r = -(theta<sup>2</sup> + 0.1 * theta_dt<sup>2</sup> + 0.001 * torque<sup>2</sup>)*
+
+    where `theta` is the pendulum's angle normalized between *[-pi, pi]* (with 0 being in the upright position).
+    Based on the above equation, the minimum reward that can be obtained is
+    *-(pi<sup>2</sup> + 0.1 * 8<sup>2</sup> + 0.001 * 2<sup>2</sup>) = -16.2736044*,
+    while the maximum reward is zero (pendulum is upright with zero velocity and no torque applied).
+
+    ## Starting State
+
+    The starting state is a random angle in *[-pi, pi]* and a random angular velocity in *[-1,1]*.
+
+    ## Episode Truncation
+
+    The episode truncates at 200 time steps.
+
+    ## Arguments
+
+    - `g`: acceleration of gravity measured in *(m s<sup>-2</sup>)* used to calculate the pendulum dynamics.
+      The default value is g = 10.0 .
+
+    ```python
+    import gymnasium as gym
+    gym.make('Pendulum-v1', g=9.81)
+    ```
+
+    On reset, the `options` parameter allows the user to change the bounds used to determine
+    the new random state.
+
+    ## Version History
+
+    * v1: Simplify the math equations, no difference in behavior.
+    * v0: Initial versions release (1.0.0)
+
+    """
+
+    metadata = {
+        "render_modes": ["human", "rgb_array"],
+        "render_fps": 30,
+    }
+
+    def __init__(self, render_mode: Optional[str] = None, g=10.0):
+        self.max_speed = 8
+        self.max_torque = 2.0
+        self.dt = 0.05
+        self.g = g
+        self.m = 1.0
+        self.l = 1.0
+
+        self.render_mode = render_mode
+
+        self.screen_dim = 500
+        self.screen = None
+        self.clock = None
+        self.isopen = True
+
+        high = np.array([1.0, 1.0, self.max_speed], dtype=np.float32)
+        # This will throw a warning in tests/envs/test_envs in utils/env_checker.py as the space is not symmetric
+        #   or normalised as max_torque == 2 by default. Ignoring the issue here as the default settings are too old
+        #   to update to follow the gymnasium api
+        self.action_space = spaces.Box(
+            low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32
+        )
+        self.observation_space = spaces.Box(low=-high, high=high, dtype=np.float32)
+
+    def step(self, u):
+        th, thdot = self.state  # th := theta
+
+        g = self.g
+        m = self.m
+        l = self.l
+        dt = self.dt
+
+        u = np.clip(u, -self.max_torque, self.max_torque)[0]
+        self.last_u = u  # for rendering
+        costs = angle_normalize(th) ** 2 + 0.1 * thdot**2 + 0.001 * (u**2)
+
+        newthdot = thdot + (3 * g / (2 * l) * np.sin(th) + 3.0 / (m * l**2) * u) * dt
+        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
+        newth = th + newthdot * dt
+
+        self.state = np.array([newth, newthdot])
+
+        if self.render_mode == "human":
+            self.render()
+        return self._get_obs(), -costs, False, False, {}
+
+    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
+        super().reset(seed=seed)
+        if options is None:
+            high = np.array([DEFAULT_X, DEFAULT_Y])
+        else:
+            # Note that if you use custom reset bounds, it may lead to out-of-bound
+            # state/observations.
+            x = options.get("x_init") if "x_init" in options else DEFAULT_X
+            y = options.get("y_init") if "y_init" in options else DEFAULT_Y
+            x = utils.verify_number_and_cast(x)
+            y = utils.verify_number_and_cast(y)
+            high = np.array([x, y])
+        low = -high  # We enforce symmetric limits.
+        self.state = self.np_random.uniform(low=low, high=high)
+        self.last_u = None
+
+        if self.render_mode == "human":
+            self.render()
+        return self._get_obs(), {}
+
+    def _get_obs(self):
+        theta, thetadot = self.state
+        return np.array([np.cos(theta), np.sin(theta), thetadot], dtype=np.float32)
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        try:
+            import pygame
+            from pygame import gfxdraw
+        except ImportError as e:
+            raise DependencyNotInstalled(
+                "pygame is not installed, run `pip install gymnasium[classic-control]`"
+            ) from e
+
+        if self.screen is None:
+            pygame.init()
+            if self.render_mode == "human":
+                pygame.display.init()
+                self.screen = pygame.display.set_mode(
+                    (self.screen_dim, self.screen_dim)
+                )
+            else:  # mode in "rgb_array"
+                self.screen = pygame.Surface((self.screen_dim, self.screen_dim))
+        if self.clock is None:
+            self.clock = pygame.time.Clock()
+
+        self.surf = pygame.Surface((self.screen_dim, self.screen_dim))
+        self.surf.fill((255, 255, 255))
+
+        bound = 2.2
+        scale = self.screen_dim / (bound * 2)
+        offset = self.screen_dim // 2
+
+        rod_length = 1 * scale
+        rod_width = 0.2 * scale
+        l, r, t, b = 0, rod_length, rod_width / 2, -rod_width / 2
+        coords = [(l, b), (l, t), (r, t), (r, b)]
+        transformed_coords = []
+        for c in coords:
+            c = pygame.math.Vector2(c).rotate_rad(self.state[0] + np.pi / 2)
+            c = (c[0] + offset, c[1] + offset)
+            transformed_coords.append(c)
+        gfxdraw.aapolygon(self.surf, transformed_coords, (204, 77, 77))
+        gfxdraw.filled_polygon(self.surf, transformed_coords, (204, 77, 77))
+
+        gfxdraw.aacircle(self.surf, offset, offset, int(rod_width / 2), (204, 77, 77))
+        gfxdraw.filled_circle(
+            self.surf, offset, offset, int(rod_width / 2), (204, 77, 77)
+        )
+
+        rod_end = (rod_length, 0)
+        rod_end = pygame.math.Vector2(rod_end).rotate_rad(self.state[0] + np.pi / 2)
+        rod_end = (int(rod_end[0] + offset), int(rod_end[1] + offset))
+        gfxdraw.aacircle(
+            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
+        )
+        gfxdraw.filled_circle(
+            self.surf, rod_end[0], rod_end[1], int(rod_width / 2), (204, 77, 77)
+        )
+
+        fname = path.join(path.dirname(__file__), "assets/clockwise.png")
+        img = pygame.image.load(fname)
+        if self.last_u is not None:
+            scale_img = pygame.transform.smoothscale(
+                img,
+                (scale * np.abs(self.last_u) / 2, scale * np.abs(self.last_u) / 2),
+            )
+            is_flip = bool(self.last_u > 0)
+            scale_img = pygame.transform.flip(scale_img, is_flip, True)
+            self.surf.blit(
+                scale_img,
+                (
+                    offset - scale_img.get_rect().centerx,
+                    offset - scale_img.get_rect().centery,
+                ),
+            )
+
+        # drawing axle
+        gfxdraw.aacircle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))
+        gfxdraw.filled_circle(self.surf, offset, offset, int(0.05 * scale), (0, 0, 0))
+
+        self.surf = pygame.transform.flip(self.surf, False, True)
+        self.screen.blit(self.surf, (0, 0))
+        if self.render_mode == "human":
+            pygame.event.pump()
+            self.clock.tick(self.metadata["render_fps"])
+            pygame.display.flip()
+
+        else:  # mode == "rgb_array":
+            return np.transpose(
+                np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
+            )
+
+    def close(self):
+        if self.screen is not None:
+            import pygame
+
+            pygame.display.quit()
+            pygame.quit()
+            self.isopen = False
+
+
+def angle_normalize(x):
+    return ((x + np.pi) % (2 * np.pi)) - np.pi
--- a/rl/Lib/site-packages/gymnasium/envs/classic_control/utils.py
+++ b/rl/Lib/site-packages/gymnasium/envs/classic_control/utils.py
@ -0,0 +1,46 @@
+"""
+Utility functions used for classic control environments.
+"""
+
+from typing import Optional, SupportsFloat, Tuple
+
+
+def verify_number_and_cast(x: SupportsFloat) -> float:
+    """Verify parameter is a single number and cast to a float."""
+    try:
+        x = float(x)
+    except (ValueError, TypeError) as e:
+        raise ValueError(f"An option ({x}) could not be converted to a float.") from e
+    return x
+
+
+def maybe_parse_reset_bounds(
+    options: Optional[dict], default_low: float, default_high: float
+) -> Tuple[float, float]:
+    """
+    This function can be called during a reset() to customize the sampling
+    ranges for setting the initial state distributions.
+
+    Args:
+      options: Options passed in to reset().
+      default_low: Default lower limit to use, if none specified in options.
+      default_high: Default upper limit to use, if none specified in options.
+
+    Returns:
+      Tuple of the lower and upper limits.
+    """
+    if options is None:
+        return default_low, default_high
+
+    low = options.get("low") if "low" in options else default_low
+    high = options.get("high") if "high" in options else default_high
+
+    # We expect only numerical inputs.
+    low = verify_number_and_cast(low)
+    high = verify_number_and_cast(high)
+    if low > high:
+        raise ValueError(
+            f"Lower bound ({low}) must be lower than higher bound ({high})."
+        )
+
+    return low, high
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/init.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/init.py
@ -0,0 +1,15 @@
+from gymnasium.envs.mujoco.mujoco_env import MujocoEnv, MuJocoPyEnv  # isort:skip
+
+# ^^^^^ so that user gets the correct error
+# message if mujoco is not installed correctly
+from gymnasium.envs.mujoco.ant import AntEnv
+from gymnasium.envs.mujoco.half_cheetah import HalfCheetahEnv
+from gymnasium.envs.mujoco.hopper import HopperEnv
+from gymnasium.envs.mujoco.humanoid import HumanoidEnv
+from gymnasium.envs.mujoco.humanoidstandup import HumanoidStandupEnv
+from gymnasium.envs.mujoco.inverted_double_pendulum import InvertedDoublePendulumEnv
+from gymnasium.envs.mujoco.inverted_pendulum import InvertedPendulumEnv
+from gymnasium.envs.mujoco.pusher import PusherEnv
+from gymnasium.envs.mujoco.reacher import ReacherEnv
+from gymnasium.envs.mujoco.swimmer import SwimmerEnv
+from gymnasium.envs.mujoco.walker2d import Walker2dEnv
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/init.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/init.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/ant.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/ant.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/ant_v3.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/ant_v3.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/ant_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/ant_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/half_cheetah.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/half_cheetah.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/half_cheetah_v3.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/half_cheetah_v3.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/half_cheetah_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/half_cheetah_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/hopper.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/hopper.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/hopper_v3.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/hopper_v3.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/hopper_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/hopper_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoid.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoid.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoid_v3.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoid_v3.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoid_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoid_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoidstandup.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoidstandup.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoidstandup_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/humanoidstandup_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_double_pendulum.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_double_pendulum.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_double_pendulum_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_double_pendulum_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_pendulum.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_pendulum.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_pendulum_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/inverted_pendulum_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/mujoco_env.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/mujoco_env.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/mujoco_rendering.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/mujoco_rendering.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/pusher.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/pusher.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/pusher_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/pusher_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/reacher.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/reacher.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/reacher_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/reacher_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/swimmer.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/swimmer.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/swimmer_v3.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/swimmer_v3.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/swimmer_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/swimmer_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/walker2d.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/walker2d.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/walker2d_v3.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/walker2d_v3.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/walker2d_v4.cpython-312.pyc
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pycache/walker2d_v4.cpython-312.pyc
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/ant.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/ant.py
@ -0,0 +1,80 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+class AntEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(
+            low=-np.inf, high=np.inf, shape=(111,), dtype=np.float64
+        )
+        MuJocoPyEnv.__init__(
+            self, "ant.xml", 5, observation_space=observation_space, **kwargs
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def step(self, a):
+        xposbefore = self.get_body_com("torso")[0]
+        self.do_simulation(a, self.frame_skip)
+        xposafter = self.get_body_com("torso")[0]
+
+        forward_reward = (xposafter - xposbefore) / self.dt
+        ctrl_cost = 0.5 * np.square(a).sum()
+        contact_cost = (
+            0.5 * 1e-3 * np.sum(np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
+        )
+        survive_reward = 1.0
+        reward = forward_reward - ctrl_cost - contact_cost + survive_reward
+        state = self.state_vector()
+        not_terminated = (
+            np.isfinite(state).all() and state[2] >= 0.2 and state[2] <= 1.0
+        )
+        terminated = not not_terminated
+        ob = self._get_obs()
+
+        if self.render_mode == "human":
+            self.render()
+        return (
+            ob,
+            reward,
+            terminated,
+            False,
+            dict(
+                reward_forward=forward_reward,
+                reward_ctrl=-ctrl_cost,
+                reward_contact=-contact_cost,
+                reward_survive=survive_reward,
+            ),
+        )
+
+    def _get_obs(self):
+        return np.concatenate(
+            [
+                self.sim.data.qpos.flat[2:],
+                self.sim.data.qvel.flat,
+                np.clip(self.sim.data.cfrc_ext, -1, 1).flat,
+            ]
+        )
+
+    def reset_model(self):
+        qpos = self.init_qpos + self.np_random.uniform(
+            size=self.model.nq, low=-0.1, high=0.1
+        )
+        qvel = self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        self.viewer.cam.distance = self.model.stat.extent * 0.5
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/ant_v3.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/ant_v3.py
@ -0,0 +1,187 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "distance": 4.0,
+}
+
+
+class AntEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(
+        self,
+        xml_file="ant.xml",
+        ctrl_cost_weight=0.5,
+        contact_cost_weight=5e-4,
+        healthy_reward=1.0,
+        terminate_when_unhealthy=True,
+        healthy_z_range=(0.2, 1.0),
+        contact_force_range=(-1.0, 1.0),
+        reset_noise_scale=0.1,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            xml_file,
+            ctrl_cost_weight,
+            contact_cost_weight,
+            healthy_reward,
+            terminate_when_unhealthy,
+            healthy_z_range,
+            contact_force_range,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._contact_cost_weight = contact_cost_weight
+
+        self._healthy_reward = healthy_reward
+        self._terminate_when_unhealthy = terminate_when_unhealthy
+        self._healthy_z_range = healthy_z_range
+
+        self._contact_force_range = contact_force_range
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        if exclude_current_positions_from_observation:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(111,), dtype=np.float64
+            )
+        else:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(113,), dtype=np.float64
+            )
+
+        MuJocoPyEnv.__init__(
+            self, xml_file, 5, observation_space=observation_space, **kwargs
+        )
+
+    @property
+    def healthy_reward(self):
+        return (
+            float(self.is_healthy or self._terminate_when_unhealthy)
+            * self._healthy_reward
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+        return control_cost
+
+    @property
+    def contact_forces(self):
+        raw_contact_forces = self.sim.data.cfrc_ext
+        min_value, max_value = self._contact_force_range
+        contact_forces = np.clip(raw_contact_forces, min_value, max_value)
+        return contact_forces
+
+    @property
+    def contact_cost(self):
+        contact_cost = self._contact_cost_weight * np.sum(
+            np.square(self.contact_forces)
+        )
+        return contact_cost
+
+    @property
+    def is_healthy(self):
+        state = self.state_vector()
+        min_z, max_z = self._healthy_z_range
+        is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z
+        return is_healthy
+
+    @property
+    def terminated(self):
+        terminated = not self.is_healthy if self._terminate_when_unhealthy else False
+        return terminated
+
+    def step(self, action):
+        xy_position_before = self.get_body_com("torso")[:2].copy()
+        self.do_simulation(action, self.frame_skip)
+        xy_position_after = self.get_body_com("torso")[:2].copy()
+
+        xy_velocity = (xy_position_after - xy_position_before) / self.dt
+        x_velocity, y_velocity = xy_velocity
+
+        ctrl_cost = self.control_cost(action)
+        contact_cost = self.contact_cost
+
+        forward_reward = x_velocity
+        healthy_reward = self.healthy_reward
+
+        rewards = forward_reward + healthy_reward
+        costs = ctrl_cost + contact_cost
+
+        reward = rewards - costs
+        terminated = self.terminated
+        observation = self._get_obs()
+        info = {
+            "reward_forward": forward_reward,
+            "reward_ctrl": -ctrl_cost,
+            "reward_contact": -contact_cost,
+            "reward_survive": healthy_reward,
+            "x_position": xy_position_after[0],
+            "y_position": xy_position_after[1],
+            "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
+            "x_velocity": x_velocity,
+            "y_velocity": y_velocity,
+            "forward_reward": forward_reward,
+        }
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def _get_obs(self):
+        position = self.sim.data.qpos.flat.copy()
+        velocity = self.sim.data.qvel.flat.copy()
+        contact_force = self.contact_forces.flat.copy()
+
+        if self._exclude_current_positions_from_observation:
+            position = position[2:]
+
+        observations = np.concatenate((position, velocity, contact_force))
+
+        return observations
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = (
+            self.init_qvel
+            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
+        )
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+
+        return observation
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        for key, value in DEFAULT_CAMERA_CONFIG.items():
+            if isinstance(value, np.ndarray):
+                getattr(self.viewer.cam, key)[:] = value
+            else:
+                setattr(self.viewer.cam, key, value)
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/ant_v4.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/ant_v4.py
@ -0,0 +1,379 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "distance": 4.0,
+}
+
+
+class AntEnv(MujocoEnv, utils.EzPickle):
+    """
+    ## Description
+
+    This environment is based on the environment introduced by Schulman,
+    Moritz, Levine, Jordan and Abbeel in ["High-Dimensional Continuous Control
+    Using Generalized Advantage Estimation"](https://arxiv.org/abs/1506.02438).
+    The ant is a 3D robot consisting of one torso (free rotational body) with
+    four legs attached to it with each leg having two body parts. The goal is to
+    coordinate the four legs to move in the forward (right) direction by applying
+    torques on the eight hinges connecting the two body parts of each leg and the torso
+    (nine body parts and eight hinges).
+
+    ## Action Space
+    The action space is a `Box(-1, 1, (8,), float32)`. An action represents the torques applied at the hinge joints.
+
+    | Num | Action                                                            | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit         |
+    | --- | ----------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ |
+    | 0   | Torque applied on the rotor between the torso and back right hip  | -1          | 1           | hip_4 (right_back_leg)           | hinge | torque (N m) |
+    | 1   | Torque applied on the rotor between the back right two links      | -1          | 1           | angle_4 (right_back_leg)         | hinge | torque (N m) |
+    | 2   | Torque applied on the rotor between the torso and front left hip  | -1          | 1           | hip_1 (front_left_leg)           | hinge | torque (N m) |
+    | 3   | Torque applied on the rotor between the front left two links      | -1          | 1           | angle_1 (front_left_leg)         | hinge | torque (N m) |
+    | 4   | Torque applied on the rotor between the torso and front right hip | -1          | 1           | hip_2 (front_right_leg)          | hinge | torque (N m) |
+    | 5   | Torque applied on the rotor between the front right two links     | -1          | 1           | angle_2 (front_right_leg)        | hinge | torque (N m) |
+    | 6   | Torque applied on the rotor between the torso and back left hip   | -1          | 1           | hip_3 (back_leg)                 | hinge | torque (N m) |
+    | 7   | Torque applied on the rotor between the back left two links       | -1          | 1           | angle_3 (back_leg)               | hinge | torque (N m) |
+
+    ## Observation Space
+    Observations consist of positional values of different body parts of the ant,
+    followed by the velocities of those individual parts (their derivatives) with all
+    the positions ordered before all the velocities.
+
+    By default, observations do not include the x- and y-coordinates of the ant's torso. These may
+    be included by passing `exclude_current_positions_from_observation=False` during construction.
+    In that case, the observation space will be a `Box(-Inf, Inf, (29,), float64)` where the first two observations
+    represent the x- and y- coordinates of the ant's torso.
+    Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
+    of the torso will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
+
+    However, by default, observation Space is a `Box(-Inf, Inf, (27,), float64)` where the elements correspond to the following:
+
+    | Num | Observation                                                  | Min    | Max    | Name (in corresponding XML file)       | Joint | Unit                     |
+    |-----|--------------------------------------------------------------|--------|--------|----------------------------------------|-------|--------------------------|
+    | 0   | z-coordinate of the torso (centre)                           | -Inf   | Inf    | torso                                  | free  | position (m)             |
+    | 1   | x-orientation of the torso (centre)                          | -Inf   | Inf    | torso                                  | free  | angle (rad)              |
+    | 2   | y-orientation of the torso (centre)                          | -Inf   | Inf    | torso                                  | free  | angle (rad)              |
+    | 3   | z-orientation of the torso (centre)                          | -Inf   | Inf    | torso                                  | free  | angle (rad)              |
+    | 4   | w-orientation of the torso (centre)                          | -Inf   | Inf    | torso                                  | free  | angle (rad)              |
+    | 5   | angle between torso and first link on front left             | -Inf   | Inf    | hip_1 (front_left_leg)                 | hinge | angle (rad)              |
+    | 6   | angle between the two links on the front left                | -Inf   | Inf    | ankle_1 (front_left_leg)               | hinge | angle (rad)              |
+    | 7   | angle between torso and first link on front right            | -Inf   | Inf    | hip_2 (front_right_leg)                | hinge | angle (rad)              |
+    | 8   | angle between the two links on the front right               | -Inf   | Inf    | ankle_2 (front_right_leg)              | hinge | angle (rad)              |
+    | 9   | angle between torso and first link on back left              | -Inf   | Inf    | hip_3 (back_leg)                       | hinge | angle (rad)              |
+    | 10  | angle between the two links on the back left                 | -Inf   | Inf    | ankle_3 (back_leg)                     | hinge | angle (rad)              |
+    | 11  | angle between torso and first link on back right             | -Inf   | Inf    | hip_4 (right_back_leg)                 | hinge | angle (rad)              |
+    | 12  | angle between the two links on the back right                | -Inf   | Inf    | ankle_4 (right_back_leg)               | hinge | angle (rad)              |
+    | 13  | x-coordinate velocity of the torso                           | -Inf   | Inf    | torso                                  | free  | velocity (m/s)           |
+    | 14  | y-coordinate velocity of the torso                           | -Inf   | Inf    | torso                                  | free  | velocity (m/s)           |
+    | 15  | z-coordinate velocity of the torso                           | -Inf   | Inf    | torso                                  | free  | velocity (m/s)           |
+    | 16  | x-coordinate angular velocity of the torso                   | -Inf   | Inf    | torso                                  | free  | angular velocity (rad/s) |
+    | 17  | y-coordinate angular velocity of the torso                   | -Inf   | Inf    | torso                                  | free  | angular velocity (rad/s) |
+    | 18  | z-coordinate angular velocity of the torso                   | -Inf   | Inf    | torso                                  | free  | angular velocity (rad/s) |
+    | 19  | angular velocity of angle between torso and front left link  | -Inf   | Inf    | hip_1 (front_left_leg)                 | hinge | angle (rad)              |
+    | 20  | angular velocity of the angle between front left links       | -Inf   | Inf    | ankle_1 (front_left_leg)               | hinge | angle (rad)              |
+    | 21  | angular velocity of angle between torso and front right link | -Inf   | Inf    | hip_2 (front_right_leg)                | hinge | angle (rad)              |
+    | 22  | angular velocity of the angle between front right links      | -Inf   | Inf    | ankle_2 (front_right_leg)              | hinge | angle (rad)              |
+    | 23  | angular velocity of angle between torso and back left link   | -Inf   | Inf    | hip_3 (back_leg)                       | hinge | angle (rad)              |
+    | 24  | angular velocity of the angle between back left links        | -Inf   | Inf    | ankle_3 (back_leg)                     | hinge | angle (rad)              |
+    | 25  | angular velocity of angle between torso and back right link  | -Inf   | Inf    | hip_4 (right_back_leg)                 | hinge | angle (rad)              |
+    | 26  | angular velocity of the angle between back right links       | -Inf   | Inf    | ankle_4 (right_back_leg)               | hinge | angle (rad)              |
+    | excluded | x-coordinate of the torso (centre)                      | -Inf   | Inf    | torso                                  | free  | position (m)             |
+    | excluded | y-coordinate of the torso (centre)                      | -Inf   | Inf    | torso                                  | free  | position (m)             |
+
+
+    If version < `v4` or `use_contact_forces` is `True` then the observation space is extended by 14*6 = 84 elements, which are contact forces
+    (external forces - force x, y, z and torque x, y, z) applied to the
+    center of mass of each of the body parts. The 14 body parts are:
+
+    | id (for `v2`, `v3`, `v4)` | body parts |
+    |  ---  |  ------------  |
+    | 0  | worldbody (note: forces are always full of zeros) |
+    | 1  | torso |
+    | 2  | front_left_leg |
+    | 3  | aux_1 (front left leg) |
+    | 4  | ankle_1 (front left leg) |
+    | 5  | front_right_leg |
+    | 6  | aux_2 (front right leg) |
+    | 7  | ankle_2 (front right leg) |
+    | 8  | back_leg (back left leg) |
+    | 9  | aux_3 (back left leg) |
+    | 10 | ankle_3 (back left leg) |
+    | 11 | right_back_leg |
+    | 12 | aux_4 (back right leg) |
+    | 13 | ankle_4 (back right leg) |
+
+
+    The (x,y,z) coordinates are translational DOFs while the orientations are rotational
+    DOFs expressed as quaternions. One can read more about free joints on the [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
+
+
+    **Note:** Ant-v4 environment no longer has the following contact forces issue.
+    If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
+    in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
+    when using the Ant environment if you would like to report results with contact forces (if
+    contact forces are not used in your experiments, you can use version > 2.0).
+
+    ## Rewards
+    The reward consists of three parts:
+    - *healthy_reward*: Every timestep that the ant is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`
+    - *forward_reward*: A reward of moving forward which is measured as
+    *(x-coordinate before action - x-coordinate after action)/dt*. *dt* is the time
+    between actions and is dependent on the `frame_skip` parameter (default is 5),
+    where the frametime is 0.01 - making the default *dt = 5 * 0.01 = 0.05*.
+    This reward would be positive if the ant moves forward (in positive x direction).
+    - *ctrl_cost*: A negative reward for penalising the ant if it takes actions
+    that are too large. It is measured as *`ctrl_cost_weight` * sum(action<sup>2</sup>)*
+    where *`ctr_cost_weight`* is a parameter set for the control and has a default value of 0.5.
+    - *contact_cost*: A negative reward for penalising the ant if the external contact
+    force is too large. It is calculated *`contact_cost_weight` * sum(clip(external contact
+    force to `contact_force_range`)<sup>2</sup>)*.
+
+    The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost*.
+
+    But if `use_contact_forces=True` or version < `v4`
+    The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost*.
+
+    In either case `info` will also contain the individual reward terms.
+
+    ## Starting State
+    All observations start in state
+    (0.0, 0.0,  0.75, 1.0, 0.0  ... 0.0) with a uniform noise in the range
+    of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional values and standard normal noise
+    with mean 0 and standard deviation `reset_noise_scale` added to the velocity values for
+    stochasticity. Note that the initial z coordinate is intentionally selected
+    to be slightly high, thereby indicating a standing up ant. The initial orientation
+    is designed to make it face forward as well.
+
+    ## Episode End
+    The ant is said to be unhealthy if any of the following happens:
+
+    1. Any of the state space values is no longer finite
+    2. The z-coordinate of the torso is **not** in the closed interval given by `healthy_z_range` (defaults to [0.2, 1.0])
+
+    If `terminate_when_unhealthy=True` is passed during construction (which is the default),
+    the episode ends when any of the following happens:
+
+    1. Truncation: The episode duration reaches a 1000 timesteps
+    2. Termination: The ant is unhealthy
+
+    If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded.
+
+    ## Arguments
+
+    No additional arguments are currently supported in v2 and lower.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('Ant-v2')
+    ```
+
+    v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('Ant-v4', ctrl_cost_weight=0.1, ...)
+    ```
+
+    | Parameter               | Type       | Default      |Description                    |
+    |-------------------------|------------|--------------|-------------------------------|
+    | `xml_file`              | **str**    | `"ant.xml"`  | Path to a MuJoCo model |
+    | `ctrl_cost_weight`      | **float**  | `0.5`        | Weight for *ctrl_cost* term (see section on reward) |
+    | `use_contact_forces`    | **bool**  | `False`      | If true, it extends the observation space by adding contact forces (see `Observation Space` section) and includes contact_cost to the reward function (see `Rewards` section) |
+    | `contact_cost_weight`   | **float**  | `5e-4`       | Weight for *contact_cost* term (see section on reward) |
+    | `healthy_reward`        | **float**  | `1`          | Constant reward given if the ant is "healthy" after timestep |
+    | `terminate_when_unhealthy` | **bool**| `True`       | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range` |
+    | `healthy_z_range`       | **tuple**  | `(0.2, 1)`   | The ant is considered healthy if the z-coordinate of the torso is in this range |
+    | `contact_force_range`   | **tuple**  | `(-1, 1)`    | Contact forces are clipped to this range in the computation of *contact_cost* |
+    | `reset_noise_scale`     | **float**  | `0.1`        | Scale of random perturbations of initial position and velocity (see section on Starting State) |
+    | `exclude_current_positions_from_observation`| **bool** | `True`| Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
+
+    ## Version History
+    * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3, also removed contact forces from the default observation space (new variable `use_contact_forces=True` can restore them)
+    * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+    * v2: All continuous control environments now use mujoco-py >= 1.50
+    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(
+        self,
+        xml_file="ant.xml",
+        ctrl_cost_weight=0.5,
+        use_contact_forces=False,
+        contact_cost_weight=5e-4,
+        healthy_reward=1.0,
+        terminate_when_unhealthy=True,
+        healthy_z_range=(0.2, 1.0),
+        contact_force_range=(-1.0, 1.0),
+        reset_noise_scale=0.1,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            xml_file,
+            ctrl_cost_weight,
+            use_contact_forces,
+            contact_cost_weight,
+            healthy_reward,
+            terminate_when_unhealthy,
+            healthy_z_range,
+            contact_force_range,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._contact_cost_weight = contact_cost_weight
+
+        self._healthy_reward = healthy_reward
+        self._terminate_when_unhealthy = terminate_when_unhealthy
+        self._healthy_z_range = healthy_z_range
+
+        self._contact_force_range = contact_force_range
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._use_contact_forces = use_contact_forces
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        obs_shape = 27
+        if not exclude_current_positions_from_observation:
+            obs_shape += 2
+        if use_contact_forces:
+            obs_shape += 84
+
+        observation_space = Box(
+            low=-np.inf, high=np.inf, shape=(obs_shape,), dtype=np.float64
+        )
+
+        MujocoEnv.__init__(
+            self,
+            xml_file,
+            5,
+            observation_space=observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+
+    @property
+    def healthy_reward(self):
+        return (
+            float(self.is_healthy or self._terminate_when_unhealthy)
+            * self._healthy_reward
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+        return control_cost
+
+    @property
+    def contact_forces(self):
+        raw_contact_forces = self.data.cfrc_ext
+        min_value, max_value = self._contact_force_range
+        contact_forces = np.clip(raw_contact_forces, min_value, max_value)
+        return contact_forces
+
+    @property
+    def contact_cost(self):
+        contact_cost = self._contact_cost_weight * np.sum(
+            np.square(self.contact_forces)
+        )
+        return contact_cost
+
+    @property
+    def is_healthy(self):
+        state = self.state_vector()
+        min_z, max_z = self._healthy_z_range
+        is_healthy = np.isfinite(state).all() and min_z <= state[2] <= max_z
+        return is_healthy
+
+    @property
+    def terminated(self):
+        terminated = not self.is_healthy if self._terminate_when_unhealthy else False
+        return terminated
+
+    def step(self, action):
+        xy_position_before = self.get_body_com("torso")[:2].copy()
+        self.do_simulation(action, self.frame_skip)
+        xy_position_after = self.get_body_com("torso")[:2].copy()
+
+        xy_velocity = (xy_position_after - xy_position_before) / self.dt
+        x_velocity, y_velocity = xy_velocity
+
+        forward_reward = x_velocity
+        healthy_reward = self.healthy_reward
+
+        rewards = forward_reward + healthy_reward
+
+        costs = ctrl_cost = self.control_cost(action)
+
+        terminated = self.terminated
+        observation = self._get_obs()
+        info = {
+            "reward_forward": forward_reward,
+            "reward_ctrl": -ctrl_cost,
+            "reward_survive": healthy_reward,
+            "x_position": xy_position_after[0],
+            "y_position": xy_position_after[1],
+            "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
+            "x_velocity": x_velocity,
+            "y_velocity": y_velocity,
+            "forward_reward": forward_reward,
+        }
+        if self._use_contact_forces:
+            contact_cost = self.contact_cost
+            costs += contact_cost
+            info["reward_ctrl"] = -contact_cost
+
+        reward = rewards - costs
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def _get_obs(self):
+        position = self.data.qpos.flat.copy()
+        velocity = self.data.qvel.flat.copy()
+
+        if self._exclude_current_positions_from_observation:
+            position = position[2:]
+
+        if self._use_contact_forces:
+            contact_force = self.contact_forces.flat.copy()
+            return np.concatenate((position, velocity, contact_force))
+        else:
+            return np.concatenate((position, velocity))
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = (
+            self.init_qvel
+            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
+        )
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+
+        return observation
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/ant.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/ant.xml
@ -0,0 +1,81 @@
+<mujoco model="ant">
+  <compiler angle="degree" coordinate="local" inertiafromgeom="true"/>
+  <option integrator="RK4" timestep="0.01"/>
+  <custom>
+    <numeric data="0.0 0.0 0.55 1.0 0.0 0.0 0.0 0.0 1.0 0.0 -1.0 0.0 -1.0 0.0 1.0" name="init_qpos"/>
+  </custom>
+  <default>
+    <joint armature="1" damping="1" limited="true"/>
+    <geom conaffinity="0" condim="3" density="5.0" friction="1 0.5 0.5" margin="0.01" rgba="0.8 0.6 0.4 1"/>
+  </default>
+  <asset>
+    <texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>
+    <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+    <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+    <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="60 60" texture="texplane"/>
+    <material name="geom" texture="texgeom" texuniform="true"/>
+  </asset>
+  <worldbody>
+    <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+    <geom conaffinity="1" condim="3" material="MatPlane" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1" size="40 40 40" type="plane"/>
+    <body name="torso" pos="0 0 0.75">
+      <camera name="track" mode="trackcom" pos="0 -3 0.3" xyaxes="1 0 0 0 0 1"/>
+      <geom name="torso_geom" pos="0 0 0" size="0.25" type="sphere"/>
+      <joint armature="0" damping="0" limited="false" margin="0.01" name="root" pos="0 0 0" type="free"/>
+      <body name="front_left_leg" pos="0 0 0">
+        <geom fromto="0.0 0.0 0.0 0.2 0.2 0.0" name="aux_1_geom" size="0.08" type="capsule"/>
+        <body name="aux_1" pos="0.2 0.2 0">
+          <joint axis="0 0 1" name="hip_1" pos="0.0 0.0 0.0" range="-30 30" type="hinge"/>
+          <geom fromto="0.0 0.0 0.0 0.2 0.2 0.0" name="left_leg_geom" size="0.08" type="capsule"/>
+          <body pos="0.2 0.2 0">
+            <joint axis="-1 1 0" name="ankle_1" pos="0.0 0.0 0.0" range="30 70" type="hinge"/>
+            <geom fromto="0.0 0.0 0.0 0.4 0.4 0.0" name="left_ankle_geom" size="0.08" type="capsule"/>
+          </body>
+        </body>
+      </body>
+      <body name="front_right_leg" pos="0 0 0">
+        <geom fromto="0.0 0.0 0.0 -0.2 0.2 0.0" name="aux_2_geom" size="0.08" type="capsule"/>
+        <body name="aux_2" pos="-0.2 0.2 0">
+          <joint axis="0 0 1" name="hip_2" pos="0.0 0.0 0.0" range="-30 30" type="hinge"/>
+          <geom fromto="0.0 0.0 0.0 -0.2 0.2 0.0" name="right_leg_geom" size="0.08" type="capsule"/>
+          <body pos="-0.2 0.2 0">
+            <joint axis="1 1 0" name="ankle_2" pos="0.0 0.0 0.0" range="-70 -30" type="hinge"/>
+            <geom fromto="0.0 0.0 0.0 -0.4 0.4 0.0" name="right_ankle_geom" size="0.08" type="capsule"/>
+          </body>
+        </body>
+      </body>
+      <body name="back_leg" pos="0 0 0">
+        <geom fromto="0.0 0.0 0.0 -0.2 -0.2 0.0" name="aux_3_geom" size="0.08" type="capsule"/>
+        <body name="aux_3" pos="-0.2 -0.2 0">
+          <joint axis="0 0 1" name="hip_3" pos="0.0 0.0 0.0" range="-30 30" type="hinge"/>
+          <geom fromto="0.0 0.0 0.0 -0.2 -0.2 0.0" name="back_leg_geom" size="0.08" type="capsule"/>
+          <body pos="-0.2 -0.2 0">
+            <joint axis="-1 1 0" name="ankle_3" pos="0.0 0.0 0.0" range="-70 -30" type="hinge"/>
+            <geom fromto="0.0 0.0 0.0 -0.4 -0.4 0.0" name="third_ankle_geom" size="0.08" type="capsule"/>
+          </body>
+        </body>
+      </body>
+      <body name="right_back_leg" pos="0 0 0">
+        <geom fromto="0.0 0.0 0.0 0.2 -0.2 0.0" name="aux_4_geom" size="0.08" type="capsule"/>
+        <body name="aux_4" pos="0.2 -0.2 0">
+          <joint axis="0 0 1" name="hip_4" pos="0.0 0.0 0.0" range="-30 30" type="hinge"/>
+          <geom fromto="0.0 0.0 0.0 0.2 -0.2 0.0" name="rightback_leg_geom" size="0.08" type="capsule"/>
+          <body pos="0.2 -0.2 0">
+            <joint axis="1 1 0" name="ankle_4" pos="0.0 0.0 0.0" range="30 70" type="hinge"/>
+            <geom fromto="0.0 0.0 0.0 0.4 -0.4 0.0" name="fourth_ankle_geom" size="0.08" type="capsule"/>
+          </body>
+        </body>
+      </body>
+    </body>
+  </worldbody>
+  <actuator>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="hip_4" gear="150"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="ankle_4" gear="150"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="hip_1" gear="150"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="ankle_1" gear="150"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="hip_2" gear="150"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="ankle_2" gear="150"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="hip_3" gear="150"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" joint="ankle_3" gear="150"/>
+  </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/half_cheetah.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/half_cheetah.xml
@ -0,0 +1,96 @@
+<!-- Cheetah Model
+
+    The state space is populated with joints in the order that they are
+    defined in this file. The actuators also operate on joints.
+
+    State-Space (name/joint/parameter):
+        - rootx     slider      position (m)
+        - rootz     slider      position (m)
+        - rooty     hinge       angle (rad)
+        - bthigh    hinge       angle (rad)
+        - bshin     hinge       angle (rad)
+        - bfoot     hinge       angle (rad)
+        - fthigh    hinge       angle (rad)
+        - fshin     hinge       angle (rad)
+        - ffoot     hinge       angle (rad)
+        - rootx     slider      velocity (m/s)
+        - rootz     slider      velocity (m/s)
+        - rooty     hinge       angular velocity (rad/s)
+        - bthigh    hinge       angular velocity (rad/s)
+        - bshin     hinge       angular velocity (rad/s)
+        - bfoot     hinge       angular velocity (rad/s)
+        - fthigh    hinge       angular velocity (rad/s)
+        - fshin     hinge       angular velocity (rad/s)
+        - ffoot     hinge       angular velocity (rad/s)
+
+    Actuators (name/actuator/parameter):
+        - bthigh    hinge       torque (N m)
+        - bshin     hinge       torque (N m)
+        - bfoot     hinge       torque (N m)
+        - fthigh    hinge       torque (N m)
+        - fshin     hinge       torque (N m)
+        - ffoot     hinge       torque (N m)
+
+-->
+<mujoco model="cheetah">
+  <compiler angle="radian" coordinate="local" inertiafromgeom="true" settotalmass="14"/>
+  <default>
+    <joint armature=".1" damping=".01" limited="true" solimplimit="0 .8 .03" solreflimit=".02 1" stiffness="8"/>
+    <geom conaffinity="0" condim="3" contype="1" friction=".4 .1 .1" rgba="0.8 0.6 .4 1" solimp="0.0 0.8 0.01" solref="0.02 1"/>
+    <motor ctrllimited="true" ctrlrange="-1 1"/>
+  </default>
+  <size nstack="300000" nuser_geom="1"/>
+  <option gravity="0 0 -9.81" timestep="0.01"/>
+  <asset>
+    <texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>
+    <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+    <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+    <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="60 60" texture="texplane"/>
+    <material name="geom" texture="texgeom" texuniform="true"/>
+  </asset>
+  <worldbody>
+    <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+    <geom conaffinity="1" condim="3" material="MatPlane" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1" size="40 40 40" type="plane"/>
+    <body name="torso" pos="0 0 .7">
+      <camera name="track" mode="trackcom" pos="0 -3 0.3" xyaxes="1 0 0 0 0 1"/>
+      <joint armature="0" axis="1 0 0" damping="0" limited="false" name="rootx" pos="0 0 0" stiffness="0" type="slide"/>
+      <joint armature="0" axis="0 0 1" damping="0" limited="false" name="rootz" pos="0 0 0" stiffness="0" type="slide"/>
+      <joint armature="0" axis="0 1 0" damping="0" limited="false" name="rooty" pos="0 0 0" stiffness="0" type="hinge"/>
+      <geom fromto="-.5 0 0 .5 0 0" name="torso" size="0.046" type="capsule"/>
+      <geom axisangle="0 1 0 .87" name="head" pos=".6 0 .1" size="0.046 .15" type="capsule"/>
+      <!-- <site name='tip'  pos='.15 0 .11'/>-->
+      <body name="bthigh" pos="-.5 0 0">
+        <joint axis="0 1 0" damping="6" name="bthigh" pos="0 0 0" range="-.52 1.05" stiffness="240" type="hinge"/>
+        <geom axisangle="0 1 0 -3.8" name="bthigh" pos=".1 0 -.13" size="0.046 .145" type="capsule"/>
+        <body name="bshin" pos=".16 0 -.25">
+          <joint axis="0 1 0" damping="4.5" name="bshin" pos="0 0 0" range="-.785 .785" stiffness="180" type="hinge"/>
+          <geom axisangle="0 1 0 -2.03" name="bshin" pos="-.14 0 -.07" rgba="0.9 0.6 0.6 1" size="0.046 .15" type="capsule"/>
+          <body name="bfoot" pos="-.28 0 -.14">
+            <joint axis="0 1 0" damping="3" name="bfoot" pos="0 0 0" range="-.4 .785" stiffness="120" type="hinge"/>
+            <geom axisangle="0 1 0 -.27" name="bfoot" pos=".03 0 -.097" rgba="0.9 0.6 0.6 1" size="0.046 .094" type="capsule"/>
+          </body>
+        </body>
+      </body>
+      <body name="fthigh" pos=".5 0 0">
+        <joint axis="0 1 0" damping="4.5" name="fthigh" pos="0 0 0" range="-1 .7" stiffness="180" type="hinge"/>
+        <geom axisangle="0 1 0 .52" name="fthigh" pos="-.07 0 -.12" size="0.046 .133" type="capsule"/>
+        <body name="fshin" pos="-.14 0 -.24">
+          <joint axis="0 1 0" damping="3" name="fshin" pos="0 0 0" range="-1.2 .87" stiffness="120" type="hinge"/>
+          <geom axisangle="0 1 0 -.6" name="fshin" pos=".065 0 -.09" rgba="0.9 0.6 0.6 1" size="0.046 .106" type="capsule"/>
+          <body name="ffoot" pos=".13 0 -.18">
+            <joint axis="0 1 0" damping="1.5" name="ffoot" pos="0 0 0" range="-.5 .5" stiffness="60" type="hinge"/>
+            <geom axisangle="0 1 0 -.6" name="ffoot" pos=".045 0 -.07" rgba="0.9 0.6 0.6 1" size="0.046 .07" type="capsule"/>
+          </body>
+        </body>
+      </body>
+    </body>
+  </worldbody>
+  <actuator>
+    <motor gear="120" joint="bthigh" name="bthigh"/>
+    <motor gear="90" joint="bshin" name="bshin"/>
+    <motor gear="60" joint="bfoot" name="bfoot"/>
+    <motor gear="120" joint="fthigh" name="fthigh"/>
+    <motor gear="60" joint="fshin" name="fshin"/>
+    <motor gear="30" joint="ffoot" name="ffoot"/>
+  </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/hopper.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/hopper.xml
@ -0,0 +1,53 @@
+<!--
+Hopper model for `Hopper-v5`, based on openai/gym/Walker2d
+modified by @saran_t
+  - To not require `coordinate="global"`
+-->
+<mujoco model="hopper">
+  <compiler angle="degree" inertiafromgeom="true"/>
+  <default>
+    <joint armature="1" damping="1" limited="true"/>
+    <geom conaffinity="1" condim="1" contype="1" margin="0.001" material="geom" rgba="0.8 0.6 .4 1" solimp=".8 .8 .01" solref=".02 1"/>
+    <motor ctrllimited="true" ctrlrange="-.4 .4"/>
+  </default>
+  <option integrator="RK4" timestep="0.002"/>
+  <visual>
+    <map znear="0.02"/>
+  </visual>
+  <worldbody>
+    <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+    <geom conaffinity="1" condim="3" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1" size="20 20 .125" type="plane" material="MatPlane"/>
+    <body name="torso" pos="0 0 1.25">
+      <camera name="track" mode="trackcom" pos="0 -3 -0.25" xyaxes="1 0 0 0 0 1"/>
+      <joint armature="0" axis="1 0 0" damping="0" limited="false" name="rootx" pos="0 0 -1.25" stiffness="0" type="slide"/>
+      <joint armature="0" axis="0 0 1" damping="0" limited="false" name="rootz" pos="0 0 -1.25" ref="1.25" stiffness="0" type="slide"/>
+      <joint armature="0" axis="0 1 0" damping="0" limited="false" name="rooty" pos="0 0 0" stiffness="0" type="hinge"/>
+      <geom friction="0.9" name="torso_geom" size="0.05 0.19999999999999996" type="capsule"/>
+      <body name="thigh" pos="0 0 -0.19999999999999996">
+        <joint axis="0 -1 0" name="thigh_joint" pos="0 0 0" range="-150 0" type="hinge"/>
+        <geom friction="0.9" pos="0 0 -0.22500000000000009" name="thigh_geom" size="0.05 0.22500000000000003" type="capsule"/>
+        <body name="leg" pos="0 0 -0.70000000000000007">
+          <joint axis="0 -1 0" name="leg_joint" pos="0 0 0.25" range="-150 0" type="hinge"/>
+          <geom friction="0.9" name="leg_geom" size="0.04 0.25" type="capsule"/>
+          <body name="foot" pos="0.13 0 -0.35">
+            <joint axis="0 -1 0" name="foot_joint" pos="-0.13 0 0.1" range="-45 45" type="hinge"/>
+            <geom friction="2.0" pos="-0.065 0 0.1" quat="0.70710678118654757 0 -0.70710678118654746 0" name="foot_geom" size="0.06 0.195" type="capsule"/>
+          </body>
+        </body>
+      </body>
+    </body>
+  </worldbody>
+  <actuator>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="200.0" joint="thigh_joint"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="200.0" joint="leg_joint"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="200.0" joint="foot_joint"/>
+  </actuator>
+    <asset>
+        <texture type="skybox" builtin="gradient" rgb1=".4 .5 .6" rgb2="0 0 0"
+            width="100" height="100"/>
+        <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+        <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+        <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="60 60" texture="texplane"/>
+        <material name="geom" texture="texgeom" texuniform="true"/>
+    </asset>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/humanoid.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/humanoid.xml
@ -0,0 +1,121 @@
+<mujoco model="humanoid">
+    <compiler angle="degree" inertiafromgeom="true"/>
+    <default>
+        <joint armature="1" damping="1" limited="true"/>
+        <geom conaffinity="1" condim="1" contype="1" margin="0.001" material="geom" rgba="0.8 0.6 .4 1"/>
+        <motor ctrllimited="true" ctrlrange="-.4 .4"/>
+    </default>
+    <option integrator="RK4" iterations="50" solver="PGS" timestep="0.003">
+        <!-- <flags solverstat="enable" energy="enable"/>-->
+    </option>
+    <size nkey="5" nuser_geom="1"/>
+    <visual>
+        <map fogend="5" fogstart="3"/>
+    </visual>
+    <asset>
+        <texture builtin="gradient" height="100" rgb1=".4 .5 .6" rgb2="0 0 0" type="skybox" width="100"/>
+        <!-- <texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>-->
+        <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+        <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+        <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="60 60" texture="texplane"/>
+        <material name="geom" texture="texgeom" texuniform="true"/>
+    </asset>
+    <worldbody>
+        <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+        <geom condim="3" friction="1 .1 .1" material="MatPlane" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1" size="20 20 0.125" type="plane"/>
+        <!-- <geom condim="3" material="MatPlane" name="floor" pos="0 0 0" size="10 10 0.125" type="plane"/>-->
+        <body name="torso" pos="0 0 1.4">
+            <camera name="track" mode="trackcom" pos="0 -4 0" xyaxes="1 0 0 0 0 1"/>
+            <joint armature="0" damping="0" limited="false" name="root" pos="0 0 0" stiffness="0" type="free"/>
+            <geom fromto="0 -.07 0 0 .07 0" name="torso1" size="0.07" type="capsule"/>
+            <geom name="head" pos="0 0 .19" size=".09" type="sphere" user="258"/>
+            <geom fromto="-.01 -.06 -.12 -.01 .06 -.12" name="uwaist" size="0.06" type="capsule"/>
+            <body name="lwaist" pos="-.01 0 -0.260" quat="1.000 0 -0.002 0">
+                <geom fromto="0 -.06 0 0 .06 0" name="lwaist" size="0.06" type="capsule"/>
+                <joint armature="0.02" axis="0 0 1" damping="5" name="abdomen_z" pos="0 0 0.065" range="-45 45" stiffness="20" type="hinge"/>
+                <joint armature="0.02" axis="0 1 0" damping="5" name="abdomen_y" pos="0 0 0.065" range="-75 30" stiffness="10" type="hinge"/>
+                <body name="pelvis" pos="0 0 -0.165" quat="1.000 0 -0.002 0">
+                    <joint armature="0.02" axis="1 0 0" damping="5" name="abdomen_x" pos="0 0 0.1" range="-35 35" stiffness="10" type="hinge"/>
+                    <geom fromto="-.02 -.07 0 -.02 .07 0" name="butt" size="0.09" type="capsule"/>
+                    <body name="right_thigh" pos="0 -0.1 -0.04">
+                        <joint armature="0.01" axis="1 0 0" damping="5" name="right_hip_x" pos="0 0 0" range="-25 5" stiffness="10" type="hinge"/>
+                        <joint armature="0.01" axis="0 0 1" damping="5" name="right_hip_z" pos="0 0 0" range="-60 35" stiffness="10" type="hinge"/>
+                        <joint armature="0.0080" axis="0 1 0" damping="5" name="right_hip_y" pos="0 0 0" range="-110 20" stiffness="20" type="hinge"/>
+                        <geom fromto="0 0 0 0 0.01 -.34" name="right_thigh1" size="0.06" type="capsule"/>
+                        <body name="right_shin" pos="0 0.01 -0.403">
+                            <joint armature="0.0060" axis="0 -1 0" name="right_knee" pos="0 0 .02" range="-160 -2" type="hinge"/>
+                            <geom fromto="0 0 0 0 0 -.3" name="right_shin1" size="0.049" type="capsule"/>
+                            <body name="right_foot" pos="0 0 -0.45">
+                                <geom name="right_foot" pos="0 0 0.1" size="0.075" type="sphere" user="0"/>
+                            </body>
+                        </body>
+                    </body>
+                    <body name="left_thigh" pos="0 0.1 -0.04">
+                        <joint armature="0.01" axis="-1 0 0" damping="5" name="left_hip_x" pos="0 0 0" range="-25 5" stiffness="10" type="hinge"/>
+                        <joint armature="0.01" axis="0 0 -1" damping="5" name="left_hip_z" pos="0 0 0" range="-60 35" stiffness="10" type="hinge"/>
+                        <joint armature="0.01" axis="0 1 0" damping="5" name="left_hip_y" pos="0 0 0" range="-110 20" stiffness="20" type="hinge"/>
+                        <geom fromto="0 0 0 0 -0.01 -.34" name="left_thigh1" size="0.06" type="capsule"/>
+                        <body name="left_shin" pos="0 -0.01 -0.403">
+                            <joint armature="0.0060" axis="0 -1 0" name="left_knee" pos="0 0 .02" range="-160 -2" stiffness="1" type="hinge"/>
+                            <geom fromto="0 0 0 0 0 -.3" name="left_shin1" size="0.049" type="capsule"/>
+                            <body name="left_foot" pos="0 0 -0.45">
+                                <geom name="left_foot" type="sphere" size="0.075" pos="0 0 0.1" user="0" />
+                            </body>
+                        </body>
+                    </body>
+                </body>
+            </body>
+            <body name="right_upper_arm" pos="0 -0.17 0.06">
+                <joint armature="0.0068" axis="2 1 1" name="right_shoulder1" pos="0 0 0" range="-85 60" stiffness="1" type="hinge"/>
+                <joint armature="0.0051" axis="0 -1 1" name="right_shoulder2" pos="0 0 0" range="-85 60" stiffness="1" type="hinge"/>
+                <geom fromto="0 0 0 .16 -.16 -.16" name="right_uarm1" size="0.04 0.16" type="capsule"/>
+                <body name="right_lower_arm" pos=".18 -.18 -.18">
+                    <joint armature="0.0028" axis="0 -1 1" name="right_elbow" pos="0 0 0" range="-90 50" stiffness="0" type="hinge"/>
+                    <geom fromto="0.01 0.01 0.01 .17 .17 .17" name="right_larm" size="0.031" type="capsule"/>
+                    <geom name="right_hand" pos=".18 .18 .18" size="0.04" type="sphere"/>
+                    <camera pos="0 0 0"/>
+                </body>
+            </body>
+            <body name="left_upper_arm" pos="0 0.17 0.06">
+                <joint armature="0.0068" axis="2 -1 1" name="left_shoulder1" pos="0 0 0" range="-60 85" stiffness="1" type="hinge"/>
+                <joint armature="0.0051" axis="0 1 1" name="left_shoulder2" pos="0 0 0" range="-60 85" stiffness="1" type="hinge"/>
+                <geom fromto="0 0 0 .16 .16 -.16" name="left_uarm1" size="0.04 0.16" type="capsule"/>
+                <body name="left_lower_arm" pos=".18 .18 -.18">
+                    <joint armature="0.0028" axis="0 -1 -1" name="left_elbow" pos="0 0 0" range="-90 50" stiffness="0" type="hinge"/>
+                    <geom fromto="0.01 -0.01 0.01 .17 -.17 .17" name="left_larm" size="0.031" type="capsule"/>
+                    <geom name="left_hand" pos=".18 -.18 .18" size="0.04" type="sphere"/>
+                </body>
+            </body>
+        </body>
+    </worldbody>
+    <tendon>
+        <fixed name="left_hipknee">
+            <joint coef="-1" joint="left_hip_y"/>
+            <joint coef="1" joint="left_knee"/>
+        </fixed>
+        <fixed name="right_hipknee">
+            <joint coef="-1" joint="right_hip_y"/>
+            <joint coef="1" joint="right_knee"/>
+        </fixed>
+    </tendon>
+
+    <actuator>
+        <motor gear="100" joint="abdomen_y" name="abdomen_y"/>
+        <motor gear="100" joint="abdomen_z" name="abdomen_z"/>
+        <motor gear="100" joint="abdomen_x" name="abdomen_x"/>
+        <motor gear="100" joint="right_hip_x" name="right_hip_x"/>
+        <motor gear="100" joint="right_hip_z" name="right_hip_z"/>
+        <motor gear="300" joint="right_hip_y" name="right_hip_y"/>
+        <motor gear="200" joint="right_knee" name="right_knee"/>
+        <motor gear="100" joint="left_hip_x" name="left_hip_x"/>
+        <motor gear="100" joint="left_hip_z" name="left_hip_z"/>
+        <motor gear="300" joint="left_hip_y" name="left_hip_y"/>
+        <motor gear="200" joint="left_knee" name="left_knee"/>
+        <motor gear="25" joint="right_shoulder1" name="right_shoulder1"/>
+        <motor gear="25" joint="right_shoulder2" name="right_shoulder2"/>
+        <motor gear="25" joint="right_elbow" name="right_elbow"/>
+        <motor gear="25" joint="left_shoulder1" name="left_shoulder1"/>
+        <motor gear="25" joint="left_shoulder2" name="left_shoulder2"/>
+        <motor gear="25" joint="left_elbow" name="left_elbow"/>
+    </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/humanoidstandup.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/humanoidstandup.xml
@ -0,0 +1,121 @@
+<mujoco model="humanoidstandup">
+    <compiler angle="degree" inertiafromgeom="true"/>
+    <default>
+        <joint armature="1" damping="1" limited="true"/>
+        <geom conaffinity="1" condim="1" contype="1" margin="0.001" material="geom" rgba="0.8 0.6 .4 1"/>
+        <motor ctrllimited="true" ctrlrange="-.4 .4"/>
+    </default>
+    <option integrator="RK4" iterations="50" solver="PGS" timestep="0.003">
+        <!-- <flags solverstat="enable" energy="enable"/>-->
+    </option>
+    <size nkey="5" nuser_geom="1"/>
+    <visual>
+        <map fogend="5" fogstart="3"/>
+    </visual>
+    <asset>
+        <texture builtin="gradient" height="100" rgb1=".4 .5 .6" rgb2="0 0 0" type="skybox" width="100"/>
+        <!-- <texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>-->
+        <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+        <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+        <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="60 60" texture="texplane"/>
+        <material name="geom" texture="texgeom" texuniform="true"/>
+    </asset>
+    <worldbody>
+        <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+        <geom condim="3" friction="1 .1 .1" material="MatPlane" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1" size="20 20 0.125" type="plane"/>
+        <!-- <geom condim="3" material="MatPlane" name="floor" pos="0 0 0" size="10 10 0.125" type="plane"/>-->
+        <body name="torso" pos="0 0 .105">
+            <camera name="track" mode="trackcom" pos="0 -3 .5" xyaxes="1 0 0 0 0 1"/>
+            <joint armature="0" damping="0" limited="false" name="root" pos="0 0 0" stiffness="0" type="free"/>
+            <geom fromto="0 -.07 0 0 .07 0" name="torso1" size="0.07" type="capsule"/>
+            <geom name="head" pos="-.15 0 0" size=".09" type="sphere" user="258"/>
+            <geom fromto=".11 -.06 0 .11 .06 0" name="uwaist" size="0.06" type="capsule"/>
+            <body name="lwaist" pos=".21 0 0" quat="1.000 0 -0.002 0">
+                <geom fromto="0 -.06 0 0 .06 0" name="lwaist" size="0.06" type="capsule"/>
+                <joint armature="0.02" axis="0 0 1" damping="5" name="abdomen_z" pos="0 0 0.065" range="-45 45" stiffness="20" type="hinge"/>
+                <joint armature="0.02" axis="0 1 0" damping="5" name="abdomen_y" pos="0 0 0.065" range="-75 30" stiffness="10" type="hinge"/>
+                <body name="pelvis" pos="0.165 0 0" quat="1.000 0 -0.002 0">
+                    <joint armature="0.02" axis="1 0 0" damping="5" name="abdomen_x" pos="0 0 0.1" range="-35 35" stiffness="10" type="hinge"/>
+                    <geom fromto="-.02 -.07 0 -.02 .07 0" name="butt" size="0.09" type="capsule"/>
+                    <body name="right_thigh" pos="0 -0.1 0">
+                        <joint armature="0.01" axis="1 0 0" damping="5" name="right_hip_x" pos="0 0 0" range="-25 5" stiffness="10" type="hinge"/>
+                        <joint armature="0.01" axis="0 0 1" damping="5" name="right_hip_z" pos="0 0 0" range="-60 35" stiffness="10" type="hinge"/>
+                        <joint armature="0.0080" axis="0 1 0" damping="5" name="right_hip_y" pos="0 0 0" range="-110 20" stiffness="20" type="hinge"/>
+                        <geom fromto="0 0 0 0.34 0.01 0" name="right_thigh1" size="0.06" type="capsule"/>
+                        <body name="right_shin" pos="0.403 0.01 0">
+                            <joint armature="0.0060" axis="0 -1 0" name="right_knee" pos="0 0 .02" range="-160 -2" type="hinge"/>
+                            <geom fromto="0 0 0 0.3 0 0" name="right_shin1" size="0.049" type="capsule"/>
+                            <body name="right_foot" pos="0.35 0 -.10">
+                                <geom name="right_foot" pos="0 0 0.1" size="0.075" type="sphere" user="0"/>
+                            </body>
+                        </body>
+                    </body>
+                    <body name="left_thigh" pos="0 0.1 0">
+                        <joint armature="0.01" axis="-1 0 0" damping="5" name="left_hip_x" pos="0 0 0" range="-25 5" stiffness="10" type="hinge"/>
+                        <joint armature="0.01" axis="0 0 -1" damping="5" name="left_hip_z" pos="0 0 0" range="-60 35" stiffness="10" type="hinge"/>
+                        <joint armature="0.01" axis="0 1 0" damping="5" name="left_hip_y" pos="0 0 0" range="-120 20" stiffness="20" type="hinge"/>
+                        <geom fromto="0 0 0 0.34 -0.01 0" name="left_thigh1" size="0.06" type="capsule"/>
+                        <body name="left_shin" pos="0.403 -0.01 0">
+                            <joint armature="0.0060" axis="0 -1 0" name="left_knee" pos="0 0 .02" range="-160 -2" stiffness="1" type="hinge"/>
+                            <geom fromto="0 0 0 0.3 0 0" name="left_shin1" size="0.049" type="capsule"/>
+                            <body name="left_foot" pos="0.35 0 -.1">
+                                <geom name="left_foot" type="sphere" size="0.075" pos="0 0 0.1" user="0" />
+                            </body>
+                        </body>
+                    </body>
+                </body>
+            </body>
+            <body name="right_upper_arm" pos="0 -0.17 0.06">
+                <joint armature="0.0068" axis="2 1 1" name="right_shoulder1" pos="0 0 0" range="-85 60" stiffness="1" type="hinge"/>
+                <joint armature="0.0051" axis="0 -1 1" name="right_shoulder2" pos="0 0 0" range="-85 60" stiffness="1" type="hinge"/>
+                <geom fromto="0 0 0 .16 -.16 -.16" name="right_uarm1" size="0.04 0.16" type="capsule"/>
+                <body name="right_lower_arm" pos=".18 -.18 -.18">
+                    <joint armature="0.0028" axis="0 -1 1" name="right_elbow" pos="0 0 0" range="-90 50" stiffness="0" type="hinge"/>
+                    <geom fromto="0.01 0.01 0.01 .17 .17 .17" name="right_larm" size="0.031" type="capsule"/>
+                    <geom name="right_hand" pos=".18 .18 .18" size="0.04" type="sphere"/>
+                    <camera pos="0 0 0"/>
+                </body>
+            </body>
+            <body name="left_upper_arm" pos="0 0.17 0.06">
+                <joint armature="0.0068" axis="2 -1 1" name="left_shoulder1" pos="0 0 0" range="-60 85" stiffness="1" type="hinge"/>
+                <joint armature="0.0051" axis="0 1 1" name="left_shoulder2" pos="0 0 0" range="-60 85" stiffness="1" type="hinge"/>
+                <geom fromto="0 0 0 .16 .16 -.16" name="left_uarm1" size="0.04 0.16" type="capsule"/>
+                <body name="left_lower_arm" pos=".18 .18 -.18">
+                    <joint armature="0.0028" axis="0 -1 -1" name="left_elbow" pos="0 0 0" range="-90 50" stiffness="0" type="hinge"/>
+                    <geom fromto="0.01 -0.01 0.01 .17 -.17 .17" name="left_larm" size="0.031" type="capsule"/>
+                    <geom name="left_hand" pos=".18 -.18 .18" size="0.04" type="sphere"/>
+                </body>
+            </body>
+        </body>
+    </worldbody>
+    <tendon>
+        <fixed name="left_hipknee">
+            <joint coef="-1" joint="left_hip_y"/>
+            <joint coef="1" joint="left_knee"/>
+        </fixed>
+        <fixed name="right_hipknee">
+            <joint coef="-1" joint="right_hip_y"/>
+            <joint coef="1" joint="right_knee"/>
+        </fixed>
+    </tendon>
+
+    <actuator>
+        <motor gear="100" joint="abdomen_y" name="abdomen_y"/>
+        <motor gear="100" joint="abdomen_z" name="abdomen_z"/>
+        <motor gear="100" joint="abdomen_x" name="abdomen_x"/>
+        <motor gear="100" joint="right_hip_x" name="right_hip_x"/>
+        <motor gear="100" joint="right_hip_z" name="right_hip_z"/>
+        <motor gear="300" joint="right_hip_y" name="right_hip_y"/>
+        <motor gear="200" joint="right_knee" name="right_knee"/>
+        <motor gear="100" joint="left_hip_x" name="left_hip_x"/>
+        <motor gear="100" joint="left_hip_z" name="left_hip_z"/>
+        <motor gear="300" joint="left_hip_y" name="left_hip_y"/>
+        <motor gear="200" joint="left_knee" name="left_knee"/>
+        <motor gear="25" joint="right_shoulder1" name="right_shoulder1"/>
+        <motor gear="25" joint="right_shoulder2" name="right_shoulder2"/>
+        <motor gear="25" joint="right_elbow" name="right_elbow"/>
+        <motor gear="25" joint="left_shoulder1" name="left_shoulder1"/>
+        <motor gear="25" joint="left_shoulder2" name="left_shoulder2"/>
+        <motor gear="25" joint="left_elbow" name="left_elbow"/>
+    </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/inverted_double_pendulum.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/inverted_double_pendulum.xml
@ -0,0 +1,47 @@
+<!-- Cartpole Model
+
+    The state space is populated with joints in the order that they are
+    defined in this file. The actuators also operate on joints.
+
+    State-Space (name/joint/parameter):
+        - cart      slider      position (m)
+        - pole      hinge       angle (rad)
+        - cart      slider      velocity (m/s)
+        - pole      hinge       angular velocity (rad/s)
+
+    Actuators (name/actuator/parameter):
+        - cart      motor       force x (N)
+
+-->
+<mujoco model="cartpole">
+  <compiler coordinate="local" inertiafromgeom="true"/>
+  <custom>
+    <numeric data="2" name="frame_skip"/>
+  </custom>
+  <default>
+    <joint damping="0.05"/>
+    <geom contype="0" friction="1 0.1 0.1" rgba="0.7 0.7 0 1"/>
+  </default>
+  <option gravity="1e-5 0 -9.81" integrator="RK4" timestep="0.01"/>
+  <size nstack="3000"/>
+  <worldbody>
+    <geom name="floor" pos="0 0 -3.0" rgba="0.8 0.9 0.8 1" size="40 40 40" type="plane"/>
+    <geom name="rail" pos="0 0 0" quat="0.707 0 0.707 0" rgba="0.3 0.3 0.7 1" size="0.02 1" type="capsule"/>
+    <body name="cart" pos="0 0 0">
+      <joint axis="1 0 0" limited="true" margin="0.01" name="slider" pos="0 0 0" range="-1 1" type="slide"/>
+      <geom name="cart" pos="0 0 0" quat="0.707 0 0.707 0" size="0.1 0.1" type="capsule"/>
+      <body name="pole" pos="0 0 0">
+        <joint axis="0 1 0" name="hinge" pos="0 0 0" type="hinge"/>
+        <geom fromto="0 0 0 0 0 0.6" name="cpole" rgba="0 0.7 0.7 1" size="0.045 0.3" type="capsule"/>
+        <body name="pole2" pos="0 0 0.6">
+          <joint axis="0 1 0" name="hinge2" pos="0 0 0" type="hinge"/>
+          <geom fromto="0 0 0 0 0 0.6" name="cpole2" rgba="0 0.7 0.7 1" size="0.045 0.3" type="capsule"/>
+          <site name="tip" pos="0 0 .6" size="0.01 0.01"/>
+        </body>
+      </body>
+    </body>
+  </worldbody>
+  <actuator>
+    <motor ctrllimited="true" ctrlrange="-1 1" gear="500" joint="slider" name="slide"/>
+  </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/inverted_pendulum.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/inverted_pendulum.xml
@ -0,0 +1,27 @@
+<mujoco model="inverted pendulum">
+	<compiler inertiafromgeom="true"/>
+	<default>
+		<joint armature="0" damping="1" limited="true"/>
+		<geom contype="0" friction="1 0.1 0.1" rgba="0.7 0.7 0 1"/>
+		<tendon/>
+		<motor ctrlrange="-3 3"/>
+	</default>
+	<option gravity="0 0 -9.81" integrator="RK4" timestep="0.02"/>
+	<size nstack="3000"/>
+	<worldbody>
+		<!--geom name="ground" type="plane" pos="0 0 0" /-->
+		<geom name="rail" pos="0 0 0" quat="0.707 0 0.707 0" rgba="0.3 0.3 0.7 1" size="0.02 1" type="capsule"/>
+		<body name="cart" pos="0 0 0">
+			<joint axis="1 0 0" limited="true" name="slider" pos="0 0 0" range="-1 1" type="slide"/>
+			<geom name="cart" pos="0 0 0" quat="0.707 0 0.707 0" size="0.1 0.1" type="capsule"/>
+			<body name="pole" pos="0 0 0">
+				<joint axis="0 1 0" name="hinge" pos="0 0 0" range="-90 90" type="hinge"/>
+				<geom fromto="0 0 0 0.001 0 0.6" name="cpole" rgba="0 0.7 0.7 1" size="0.049 0.3" type="capsule"/>
+				<!--                 <body name="pole2" pos="0.001 0 0.6"><joint name="hinge2" type="hinge" pos="0 0 0" axis="0 1 0"/><geom name="cpole2" type="capsule" fromto="0 0 0 0 0 0.6" size="0.05 0.3" rgba="0.7 0 0.7 1"/><site name="tip2" pos="0 0 .6"/></body>-->
+			</body>
+		</body>
+	</worldbody>
+	<actuator>
+		<motor ctrllimited="true" ctrlrange="-3 3" gear="100" joint="slider" name="slide"/>
+	</actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/point.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/point.xml
@ -0,0 +1,31 @@
+<mujoco>
+  <compiler angle="degree" coordinate="local" inertiafromgeom="true"/>
+  <option integrator="RK4" timestep="0.02"/>
+  <default>
+    <joint armature="0" damping="0" limited="false"/>
+    <geom conaffinity="0" condim="3" density="100" friction="1 0.5 0.5" margin="0" rgba="0.8 0.6 0.4 1"/>
+  </default>
+  <asset>
+    <texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>
+    <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+    <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+    <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="30 30" texture="texplane"/>
+    <material name="geom" texture="texgeom" texuniform="true"/>
+  </asset>
+  <worldbody>
+    <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+    <geom conaffinity="1" condim="3" material="MatPlane" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1" size="40 40 40" type="plane"/>
+    <body name="torso" pos="0 0 0">
+      <geom name="pointbody" pos="0 0 0.5" size="0.5" type="sphere"/>
+      <geom name="pointarrow" pos="0.6 0 0.5" size="0.5 0.1 0.1" type="box"/>
+      <joint axis="1 0 0" name="ballx" pos="0 0 0" type="slide"/>
+      <joint axis="0 1 0" name="bally" pos="0 0 0" type="slide"/>
+      <joint axis="0 0 1" limited="false" name="rot" pos="0 0 0" type="hinge"/>
+    </body>
+  </worldbody>
+  <actuator>
+    <!-- Those are just dummy actuators for providing ranges -->
+    <motor ctrllimited="true" ctrlrange="-1 1" joint="ballx"/>
+    <motor ctrllimited="true" ctrlrange="-0.25 0.25" joint="rot"/>
+  </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/pusher.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/pusher.xml
@ -0,0 +1,91 @@
+<mujoco model="arm3d">
+  <compiler inertiafromgeom="true" angle="radian" coordinate="local"/>
+  <option timestep="0.01" gravity="0 0 0" iterations="20" integrator="Euler" />
+
+  <default>
+    <joint armature='0.04' damping="1" limited="true"/>
+    <geom friction=".8 .1 .1" density="300" margin="0.002" condim="1" contype="0" conaffinity="0"/>
+  </default>
+
+  <worldbody>
+    <light diffuse=".5 .5 .5" pos="0 0 3" dir="0 0 -1"/>
+    <geom name="table" type="plane" pos="0 0.5 -0.325" size="1 1 0.1" contype="1" conaffinity="1"/>
+
+    <body name="r_shoulder_pan_link" pos="0 -0.6 0">
+      <geom name="e1" type="sphere" rgba="0.6 0.6 0.6 1" pos="-0.06 0.05 0.2" size="0.05" />
+      <geom name="e2" type="sphere" rgba="0.6 0.6 0.6 1" pos=" 0.06 0.05 0.2" size="0.05" />
+      <geom name="e1p" type="sphere" rgba="0.1 0.1 0.1 1" pos="-0.06 0.09 0.2" size="0.03" />
+      <geom name="e2p" type="sphere" rgba="0.1 0.1 0.1 1" pos=" 0.06 0.09 0.2" size="0.03" />
+      <geom name="sp" type="capsule" fromto="0 0 -0.4 0 0 0.2" size="0.1" />
+      <joint name="r_shoulder_pan_joint" type="hinge" pos="0 0 0" axis="0 0 1" range="-2.2854 1.714602" damping="1.0" />
+
+      <body name="r_shoulder_lift_link" pos="0.1 0 0">
+        <geom name="sl" type="capsule" fromto="0 -0.1 0 0 0.1 0" size="0.1" />
+        <joint name="r_shoulder_lift_joint" type="hinge" pos="0 0 0" axis="0 1 0" range="-0.5236 1.3963" damping="1.0" />
+
+        <body name="r_upper_arm_roll_link" pos="0 0 0">
+          <geom name="uar" type="capsule" fromto="-0.1 0 0 0.1 0 0" size="0.02" />
+          <joint name="r_upper_arm_roll_joint" type="hinge" pos="0 0 0" axis="1 0 0" range="-1.5 1.7" damping="0.1" />
+
+          <body name="r_upper_arm_link" pos="0 0 0">
+            <geom name="ua" type="capsule" fromto="0 0 0 0.4 0 0" size="0.06" />
+
+            <body name="r_elbow_flex_link" pos="0.4 0 0">
+              <geom name="ef" type="capsule" fromto="0 -0.02 0 0.0 0.02 0" size="0.06" />
+              <joint name="r_elbow_flex_joint" type="hinge" pos="0 0 0" axis="0 1 0" range="-2.3213 0" damping="0.1" />
+
+              <body name="r_forearm_roll_link" pos="0 0 0">
+                <geom name="fr" type="capsule" fromto="-0.1 0 0 0.1 0 0" size="0.02" />
+                <joint name="r_forearm_roll_joint" type="hinge" limited="true" pos="0 0 0" axis="1 0 0" damping=".1" range="-1.5 1.5"/>
+
+                <body name="r_forearm_link" pos="0 0 0">
+                  <geom name="fa" type="capsule" fromto="0 0 0 0.291 0 0" size="0.05" />
+
+                  <body name="r_wrist_flex_link" pos="0.321 0 0">
+                    <geom name="wf" type="capsule" fromto="0 -0.02 0 0 0.02 0" size="0.01" />
+                    <joint name="r_wrist_flex_joint" type="hinge" pos="0 0 0" axis="0 1 0" range="-1.094 0" damping=".1" />
+
+                    <body name="r_wrist_roll_link" pos="0 0 0">
+                      <joint name="r_wrist_roll_joint" type="hinge" pos="0 0 0" limited="true" axis="1 0 0" damping="0.1" range="-1.5 1.5"/>
+                      <body name="tips_arm" pos="0 0 0">
+                        <geom name="tip_arml" type="sphere" pos="0.1 -0.1 0." size="0.01" />
+                        <geom name="tip_armr" type="sphere" pos="0.1 0.1 0." size="0.01" />
+                      </body>
+                      <geom type="capsule" fromto="0 -0.1 0. 0.0 +0.1 0" size="0.02" contype="1" conaffinity="1" />
+                      <geom type="capsule" fromto="0 -0.1 0. 0.1 -0.1 0" size="0.02" contype="1" conaffinity="1" />
+                      <geom type="capsule" fromto="0 +0.1 0. 0.1 +0.1 0." size="0.02" contype="1" conaffinity="1" />
+                    </body>
+                  </body>
+                </body>
+              </body>
+            </body>
+          </body>
+        </body>
+      </body>
+    </body>
+
+    <!--<body name="object" pos="0.55 -0.3 -0.275" >-->
+    <body name="object" pos="0.45 -0.05 -0.275" >
+      <geom rgba="1 1 1 0" type="sphere" size="0.05 0.05 0.05" density="0.00001" conaffinity="0"/>
+      <geom rgba="1 1 1 1" type="cylinder" size="0.05 0.05 0.05" density="0.00001" contype="1" conaffinity="0"/>
+      <joint name="obj_slidey" type="slide" pos="0 0 0" axis="0 1 0" range="-10.3213 10.3" damping="0.5"/>
+      <joint name="obj_slidex" type="slide" pos="0 0 0" axis="1 0 0" range="-10.3213 10.3" damping="0.5"/>
+    </body>
+
+    <body name="goal" pos="0.45 -0.05 -0.3230">
+      <geom rgba="1 0 0 1" type="cylinder" size="0.08 0.001 0.1" density='0.00001' contype="0" conaffinity="0"/>
+      <joint name="goal_slidey" type="slide" pos="0 0 0" axis="0 1 0" range="-10.3213 10.3" damping="0.5"/>
+      <joint name="goal_slidex" type="slide" pos="0 0 0" axis="1 0 0" range="-10.3213 10.3" damping="0.5"/>
+    </body>
+  </worldbody>
+
+  <actuator>
+    <motor joint="r_shoulder_pan_joint" ctrlrange="-2.0 2.0" ctrllimited="true" />
+    <motor joint="r_shoulder_lift_joint" ctrlrange="-2.0 2.0" ctrllimited="true" />
+    <motor joint="r_upper_arm_roll_joint" ctrlrange="-2.0 2.0" ctrllimited="true" />
+    <motor joint="r_elbow_flex_joint" ctrlrange="-2.0 2.0" ctrllimited="true" />
+    <motor joint="r_forearm_roll_joint" ctrlrange="-2.0 2.0" ctrllimited="true" />
+    <motor joint="r_wrist_flex_joint" ctrlrange="-2.0 2.0" ctrllimited="true" />
+    <motor joint="r_wrist_roll_joint" ctrlrange="-2.0 2.0" ctrllimited="true"/>
+  </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/reacher.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/reacher.xml
@ -0,0 +1,39 @@
+<mujoco model="reacher">
+	<compiler angle="radian" inertiafromgeom="true"/>
+	<default>
+		<joint armature="1" damping="1" limited="true"/>
+		<geom contype="0" friction="1 0.1 0.1" rgba="0.7 0.7 0 1"/>
+	</default>
+	<option gravity="0 0 -9.81" integrator="RK4" timestep="0.01"/>
+	<worldbody>
+		<!-- Arena -->
+		<geom conaffinity="0" contype="0" name="ground" pos="0 0 0" rgba="0.9 0.9 0.9 1" size="1 1 10" type="plane"/>
+		<geom conaffinity="0" fromto="-.3 -.3 .01 .3 -.3 .01" name="sideS" rgba="0.9 0.4 0.6 1" size=".02" type="capsule"/>
+		<geom conaffinity="0" fromto=" .3 -.3 .01 .3  .3 .01" name="sideE" rgba="0.9 0.4 0.6 1" size=".02" type="capsule"/>
+		<geom conaffinity="0" fromto="-.3  .3 .01 .3  .3 .01" name="sideN" rgba="0.9 0.4 0.6 1" size=".02" type="capsule"/>
+		<geom conaffinity="0" fromto="-.3 -.3 .01 -.3 .3 .01" name="sideW" rgba="0.9 0.4 0.6 1" size=".02" type="capsule"/>
+		<!-- Arm -->
+		<geom conaffinity="0" contype="0" fromto="0 0 0 0 0 0.02" name="root" rgba="0.9 0.4 0.6 1" size=".011" type="cylinder"/>
+		<body name="body0" pos="0 0 .01">
+			<geom fromto="0 0 0 0.1 0 0" name="link0" rgba="0.0 0.4 0.6 1" size=".01" type="capsule"/>
+			<joint axis="0 0 1" limited="false" name="joint0" pos="0 0 0" type="hinge"/>
+			<body name="body1" pos="0.1 0 0">
+				<joint axis="0 0 1" limited="true" name="joint1" pos="0 0 0" range="-3.0 3.0" type="hinge"/>
+				<geom fromto="0 0 0 0.1 0 0" name="link1" rgba="0.0 0.4 0.6 1" size=".01" type="capsule"/>
+				<body name="fingertip" pos="0.11 0 0">
+					<geom contype="0" name="fingertip" pos="0 0 0" rgba="0.0 0.8 0.6 1" size=".01" type="sphere"/>
+				</body>
+			</body>
+		</body>
+		<!-- Target -->
+		<body name="target" pos=".1 -.1 .01">
+			<joint armature="0" axis="1 0 0" damping="0" limited="true" name="target_x" pos="0 0 0" range="-.27 .27" ref=".1" stiffness="0" type="slide"/>
+			<joint armature="0" axis="0 1 0" damping="0" limited="true" name="target_y" pos="0 0 0" range="-.27 .27" ref="-.1" stiffness="0" type="slide"/>
+			<geom conaffinity="0" contype="0" name="target" pos="0 0 0" rgba="0.9 0.2 0.2 1" size=".009" type="sphere"/>
+		</body>
+	</worldbody>
+	<actuator>
+		<motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="200.0" joint="joint0"/>
+		<motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="200.0" joint="joint1"/>
+	</actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/swimmer.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/swimmer.xml
@ -0,0 +1,39 @@
+<mujoco model="swimmer">
+  <compiler angle="degree" coordinate="local" inertiafromgeom="true"/>
+  <option collision="predefined" density="4000" integrator="RK4" timestep="0.01" viscosity="0.1"/>
+  <default>
+    <geom conaffinity="1" condim="1" contype="1" material="geom" rgba="0.8 0.6 .4 1"/>
+    <joint armature='0.1'  />
+  </default>
+  <asset>
+    <texture builtin="gradient" height="100" rgb1="1 1 1" rgb2="0 0 0" type="skybox" width="100"/>
+    <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+    <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+    <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="30 30" texture="texplane"/>
+    <material name="geom" texture="texgeom" texuniform="true"/>
+  </asset>
+  <worldbody>
+    <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+    <geom conaffinity="1" condim="3" material="MatPlane" name="floor" pos="0 0 -0.1" rgba="0.8 0.9 0.8 1" size="40 40 0.1" type="plane"/>
+    <!--  ================= SWIMMER ================= /-->
+    <body name="torso" pos="0 0 0">
+      <camera name="track" mode="trackcom" pos="0 -3 3" xyaxes="1 0 0 0 1 1"/>
+      <geom density="1000" fromto="1.5 0 0 0.5 0 0" size="0.1" type="capsule"/>
+      <joint axis="1 0 0" name="slider1" pos="0 0 0" type="slide"/>
+      <joint axis="0 1 0" name="slider2" pos="0 0 0" type="slide"/>
+      <joint axis="0 0 1" name="free_body_rot" pos="0 0 0" type="hinge"/>
+      <body name="mid" pos="0.5 0 0">
+        <geom density="1000" fromto="0 0 0 -1 0 0" size="0.1" type="capsule"/>
+        <joint axis="0 0 1" limited="true" name="motor1_rot" pos="0 0 0" range="-100 100" type="hinge"/>
+        <body name="back" pos="-1 0 0">
+          <geom density="1000" fromto="0 0 0 -1 0 0" size="0.1" type="capsule"/>
+          <joint axis="0 0 1" limited="true" name="motor2_rot" pos="0 0 0" range="-100 100" type="hinge"/>
+        </body>
+      </body>
+    </body>
+  </worldbody>
+  <actuator>
+    <motor ctrllimited="true" ctrlrange="-1 1" gear="150.0" joint="motor1_rot"/>
+    <motor ctrllimited="true" ctrlrange="-1 1" gear="150.0" joint="motor2_rot"/>
+  </actuator>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/walker2d.xml
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/assets/walker2d.xml
@ -0,0 +1,68 @@
+<!--
+Walker2D model for `Walker2d-v5`, based on openai/gym/Walker2d
+modified by @kallinteris-Andreas
+  - To not require `coordinate="global"`
+  - keep feet friction to 0.9/1.9
+-->
+<mujoco model="walker2d">
+  <compiler angle="degree" inertiafromgeom="true"/>
+  <default>
+    <joint armature="0.01" damping=".1" limited="true"/>
+    <geom conaffinity="0" condim="3" contype="1" density="1000" friction=".7 .1 .1" rgba="0.8 0.6 .4 1"/>
+  </default>
+  <option integrator="RK4" timestep="0.002"/>
+  <worldbody>
+    <light cutoff="100" diffuse="1 1 1" dir="-0 0 -1.3" directional="true" exponent="1" pos="0 0 1.3" specular=".1 .1 .1"/>
+    <geom conaffinity="1" condim="3" name="floor" pos="0 0 0" rgba="0.8 0.9 0.8 1" size="40 40 40" type="plane" material="MatPlane"/>
+    <body name="torso" pos="0 0 1.25">
+      <camera name="track" mode="trackcom" pos="0 -3 -0.25" xyaxes="1 0 0 0 0 1"/>
+      <joint armature="0" axis="1 0 0" damping="0" limited="false" name="rootx" pos="0 0 -1.25" stiffness="0" type="slide"/>
+      <joint armature="0" axis="0 0 1" damping="0" limited="false" name="rootz" pos="0 0 -1.25" ref="1.25" stiffness="0" type="slide"/>
+      <joint armature="0" axis="0 1 0" damping="0" limited="false" name="rooty" pos="0 0 0" stiffness="0" type="hinge"/>
+      <geom friction="0.9" name="torso_geom" size="0.050000000000000003 0.19999999999999996" type="capsule"/>
+      <body name="thigh" pos="0 0 -0.19999999999999996">
+        <joint axis="0 -1 0" name="thigh_joint" pos="0 0 0" range="-150 0" type="hinge"/>
+        <geom friction="0.9" pos="0 0 -0.22500000000000009" name="thigh_geom" size="0.050000000000000003 0.22500000000000003" type="capsule"/>
+        <body name="leg" pos="0 0 -0.70000000000000007">
+          <joint axis="0 -1 0" name="leg_joint" pos="0 0 0.25" range="-150 0" type="hinge"/>
+          <geom friction="0.9" name="leg_geom" size="0.040000000000000001 0.25" type="capsule"/>
+          <body name="foot" pos="0.20000000000000001 0 -0.34999999999999998">
+            <joint axis="0 -1 0" name="foot_joint" pos="-0.20000000000000001 0 0.10000000000000001" range="-45 45" type="hinge"/>
+            <geom friction="0.9" pos="-0.10000000000000001 0 0.10000000000000001" quat="0.70710678118654757 0 -0.70710678118654746 0" name="foot_geom" size="0.059999999999999998 0.10000000000000001" type="capsule"/>
+          </body>
+        </body>
+      </body>
+      <!-- copied and then replace thigh->thigh_left, leg->leg_left, foot->foot_right -->
+      <body name="thigh_left" pos="0 0 -0.19999999999999996">
+        <joint axis="0 -1 0" name="thigh_left_joint" pos="0 0 0" range="-150 0" type="hinge"/>
+        <geom friction="0.9" name="thigh_left_geom" rgba=".7 .3 .6 1" size="0.050000000000000003 0.22500000000000003" pos="0 0 -0.22500000000000009" type="capsule"/>
+        <body name="leg_left" pos="0 0 -0.70000000000000007">
+          <joint axis="0 -1 0" name="leg_left_joint" pos="0 0 0.25" range="-150 0" type="hinge"/>
+          <geom friction="0.9" name="leg_left_geom" rgba=".7 .3 .6 1" size="0.040000000000000001 0.25" type="capsule"/>
+          <body name="foot_left" pos="0.20000000000000001 0 -0.34999999999999998">
+            <joint axis="0 -1 0" name="foot_left_joint" pos="-0.20000000000000001 0 0.10000000000000001" range="-45 45" type="hinge"/>
+            <geom friction="1.9" name="foot_left_geom" rgba=".7 .3 .6 1" size="0.059999999999999998 0.10000000000000001" pos="-0.10000000000000001 0 0.10000000000000001" type="capsule" quat="0.70710678118654757 0 -0.70710678118654746 0"/>
+          </body>
+        </body>
+      </body>
+    </body>
+  </worldbody>
+  <actuator>
+    <!-- <motor joint="torso_joint" ctrlrange="-100.0 100.0" isctrllimited="true"/>-->
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100" joint="thigh_joint"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100" joint="leg_joint"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100" joint="foot_joint"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100" joint="thigh_left_joint"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100" joint="leg_left_joint"/>
+    <motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100" joint="foot_left_joint"/>
+    <!-- <motor joint="finger2_rot" ctrlrange="-20.0 20.0" isctrllimited="true"/>-->
+  </actuator>
+    <asset>
+        <texture type="skybox" builtin="gradient" rgb1=".4 .5 .6" rgb2="0 0 0"
+            width="100" height="100"/>
+        <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
+        <texture builtin="checker" height="100" name="texplane" rgb1="0 0 0" rgb2="0.8 0.8 0.8" type="2d" width="100"/>
+        <material name="MatPlane" reflectance="0.5" shininess="1" specular="1" texrepeat="60 60" texture="texplane"/>
+        <material name="geom" texture="texgeom" texuniform="true"/>
+    </asset>
+</mujoco>
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/half_cheetah.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/half_cheetah.py
@ -0,0 +1,64 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+class HalfCheetahEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(low=-np.inf, high=np.inf, shape=(17,), dtype=np.float64)
+        MuJocoPyEnv.__init__(
+            self, "half_cheetah.xml", 5, observation_space=observation_space, **kwargs
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def step(self, action):
+        xposbefore = self.sim.data.qpos[0]
+        self.do_simulation(action, self.frame_skip)
+        xposafter = self.sim.data.qpos[0]
+
+        ob = self._get_obs()
+        reward_ctrl = -0.1 * np.square(action).sum()
+        reward_run = (xposafter - xposbefore) / self.dt
+        reward = reward_ctrl + reward_run
+        terminated = False
+
+        if self.render_mode == "human":
+            self.render()
+        return (
+            ob,
+            reward,
+            terminated,
+            False,
+            dict(reward_run=reward_run, reward_ctrl=reward_ctrl),
+        )
+
+    def _get_obs(self):
+        return np.concatenate(
+            [
+                self.sim.data.qpos.flat[1:],
+                self.sim.data.qvel.flat,
+            ]
+        )
+
+    def reset_model(self):
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=-0.1, high=0.1, size=self.model.nq
+        )
+        qvel = self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        self.viewer.cam.distance = self.model.stat.extent * 0.5
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/half_cheetah_v3.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/half_cheetah_v3.py
@ -0,0 +1,128 @@
+__credits__ = ["Rushiv Arora"]
+
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "distance": 4.0,
+}
+
+
+class HalfCheetahEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(
+        self,
+        xml_file="half_cheetah.xml",
+        forward_reward_weight=1.0,
+        ctrl_cost_weight=0.1,
+        reset_noise_scale=0.1,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            xml_file,
+            forward_reward_weight,
+            ctrl_cost_weight,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._forward_reward_weight = forward_reward_weight
+
+        self._ctrl_cost_weight = ctrl_cost_weight
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        if exclude_current_positions_from_observation:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(17,), dtype=np.float64
+            )
+        else:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(18,), dtype=np.float64
+            )
+
+        MuJocoPyEnv.__init__(
+            self, xml_file, 5, observation_space=observation_space, **kwargs
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+        return control_cost
+
+    def step(self, action):
+        x_position_before = self.sim.data.qpos[0]
+        self.do_simulation(action, self.frame_skip)
+        x_position_after = self.sim.data.qpos[0]
+        x_velocity = (x_position_after - x_position_before) / self.dt
+
+        ctrl_cost = self.control_cost(action)
+
+        forward_reward = self._forward_reward_weight * x_velocity
+
+        observation = self._get_obs()
+        reward = forward_reward - ctrl_cost
+        terminated = False
+        info = {
+            "x_position": x_position_after,
+            "x_velocity": x_velocity,
+            "reward_run": forward_reward,
+            "reward_ctrl": -ctrl_cost,
+        }
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def _get_obs(self):
+        position = self.sim.data.qpos.flat.copy()
+        velocity = self.sim.data.qvel.flat.copy()
+
+        if self._exclude_current_positions_from_observation:
+            position = position[1:]
+
+        observation = np.concatenate((position, velocity)).ravel()
+        return observation
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = (
+            self.init_qvel
+            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
+        )
+
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+        return observation
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        for key, value in DEFAULT_CAMERA_CONFIG.items():
+            if isinstance(value, np.ndarray):
+                getattr(self.viewer.cam, key)[:] = value
+            else:
+                setattr(self.viewer.cam, key, value)
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/half_cheetah_v4.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/half_cheetah_v4.py
@ -0,0 +1,244 @@
+__credits__ = ["Rushiv Arora"]
+
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "distance": 4.0,
+}
+
+
+class HalfCheetahEnv(MujocoEnv, utils.EzPickle):
+    """
+    ## Description
+
+    This environment is based on the work by P. Wawrzyński in
+    ["A Cat-Like Robot Real-Time Learning to Run"](http://staff.elka.pw.edu.pl/~pwawrzyn/pub-s/0812_LSCLRR.pdf).
+    The HalfCheetah is a 2-dimensional robot consisting of 9 body parts and 8
+    joints connecting them (including two paws). The goal is to apply a torque
+    on the joints to make the cheetah run forward (right) as fast as possible,
+    with a positive reward allocated based on the distance moved forward and a
+    negative reward allocated for moving backward. The torso and head of the
+    cheetah are fixed, and the torque can only be applied on the other 6 joints
+    over the front and back thighs (connecting to the torso), shins
+    (connecting to the thighs) and feet (connecting to the shins).
+
+    ## Action Space
+    The action space is a `Box(-1, 1, (6,), float32)`. An action represents the torques applied at the hinge joints.
+
+    | Num | Action                                  | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit         |
+    | --- | --------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ |
+    | 0   | Torque applied on the back thigh rotor  | -1          | 1           | bthigh                           | hinge | torque (N m) |
+    | 1   | Torque applied on the back shin rotor   | -1          | 1           | bshin                            | hinge | torque (N m) |
+    | 2   | Torque applied on the back foot rotor   | -1          | 1           | bfoot                            | hinge | torque (N m) |
+    | 3   | Torque applied on the front thigh rotor | -1          | 1           | fthigh                           | hinge | torque (N m) |
+    | 4   | Torque applied on the front shin rotor  | -1          | 1           | fshin                            | hinge | torque (N m) |
+    | 5   | Torque applied on the front foot rotor  | -1          | 1           | ffoot                            | hinge | torque (N m) |
+
+
+    ## Observation Space
+    Observations consist of positional values of different body parts of the
+    cheetah, followed by the velocities of those individual parts (their derivatives) with all the positions ordered before all the velocities.
+
+    By default, observations do not include the cheetah's `rootx`. It may
+    be included by passing `exclude_current_positions_from_observation=False` during construction.
+    In that case, the observation space will be a `Box(-Inf, Inf, (18,), float64)` where the first element
+    represents the `rootx`.
+    Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the
+    will be returned in `info` with key `"x_position"`.
+
+    However, by default, the observation is a `Box(-Inf, Inf, (17,), float64)` where the elements correspond to the following:
+
+    | Num | Observation                          | Min  | Max | Name (in corresponding XML file) | Joint | Unit                     |
+    | --- | ------------------------------------ | ---- | --- | -------------------------------- | ----- | ------------------------ |
+    | 0   | z-coordinate of the front tip        | -Inf | Inf | rootz                            | slide | position (m)             |
+    | 1   | angle of the front tip               | -Inf | Inf | rooty                            | hinge | angle (rad)              |
+    | 2   | angle of the second rotor            | -Inf | Inf | bthigh                           | hinge | angle (rad)              |
+    | 3   | angle of the second rotor            | -Inf | Inf | bshin                            | hinge | angle (rad)              |
+    | 4   | velocity of the tip along the x-axis | -Inf | Inf | bfoot                            | hinge | angle (rad)              |
+    | 5   | velocity of the tip along the y-axis | -Inf | Inf | fthigh                           | hinge | angle (rad)              |
+    | 6   | angular velocity of front tip        | -Inf | Inf | fshin                            | hinge | angle (rad)              |
+    | 7   | angular velocity of second rotor     | -Inf | Inf | ffoot                            | hinge | angle (rad)              |
+    | 8   | x-coordinate of the front tip        | -Inf | Inf | rootx                            | slide | velocity (m/s)           |
+    | 9   | y-coordinate of the front tip        | -Inf | Inf | rootz                            | slide | velocity (m/s)           |
+    | 10  | angle of the front tip               | -Inf | Inf | rooty                            | hinge | angular velocity (rad/s) |
+    | 11  | angle of the second rotor            | -Inf | Inf | bthigh                           | hinge | angular velocity (rad/s) |
+    | 12  | angle of the second rotor            | -Inf | Inf | bshin                            | hinge | angular velocity (rad/s) |
+    | 13  | velocity of the tip along the x-axis | -Inf | Inf | bfoot                            | hinge | angular velocity (rad/s) |
+    | 14  | velocity of the tip along the y-axis | -Inf | Inf | fthigh                           | hinge | angular velocity (rad/s) |
+    | 15  | angular velocity of front tip        | -Inf | Inf | fshin                            | hinge | angular velocity (rad/s) |
+    | 16  | angular velocity of second rotor     | -Inf | Inf | ffoot                            | hinge | angular velocity (rad/s) |
+    | excluded |  x-coordinate of the front tip  | -Inf | Inf | rootx                            | slide | position (m)             |
+
+    ## Rewards
+    The reward consists of two parts:
+    - *forward_reward*: A reward of moving forward which is measured
+    as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
+    the time between actions and is dependent on the frame_skip parameter
+    (fixed to 5), where the frametime is 0.01 - making the
+    default *dt = 5 * 0.01 = 0.05*. This reward would be positive if the cheetah
+    runs forward (right).
+    - *ctrl_cost*: A cost for penalising the cheetah if it takes
+    actions that are too large. It is measured as *`ctrl_cost_weight` *
+    sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
+    control and has a default value of 0.1
+
+    The total reward returned is ***reward*** *=* *forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
+
+    ## Starting State
+    All observations start in state (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,) with a noise added to the
+    initial state for stochasticity. As seen before, the first 8 values in the
+    state are positional and the last 9 values are velocity. A uniform noise in
+    the range of [-`reset_noise_scale`, `reset_noise_scale`] is added to the positional values while a standard
+    normal noise with a mean of 0 and standard deviation of `reset_noise_scale` is added to the
+    initial velocity values of all zeros.
+
+    ## Episode End
+    The episode truncates when the episode length is greater than 1000.
+
+    ## Arguments
+
+    No additional arguments are currently supported in v2 and lower.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('HalfCheetah-v2')
+    ```
+
+    v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('HalfCheetah-v4', ctrl_cost_weight=0.1, ....)
+    ```
+
+    | Parameter                                    | Type      | Default              | Description                                                                                                                                                       |
+    | -------------------------------------------- | --------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+    | `xml_file`                                   | **str**   | `"half_cheetah.xml"` | Path to a MuJoCo model                                                                                                                                            |
+    | `forward_reward_weight`                      | **float** | `1.0`                | Weight for _forward_reward_ term (see section on reward)                                                                                                          |
+    | `ctrl_cost_weight`                           | **float** | `0.1`                | Weight for _ctrl_cost_ weight (see section on reward)                                                                                                             |
+    | `reset_noise_scale`                          | **float** | `0.1`                | Scale of random perturbations of initial position and velocity (see section on Starting State)                                                                    |
+    | `exclude_current_positions_from_observation` | **bool**  | `True`               | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
+
+    ## Version History
+
+    * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
+    * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+    * v2: All continuous control environments now use mujoco-py >= 1.50
+    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(
+        self,
+        forward_reward_weight=1.0,
+        ctrl_cost_weight=0.1,
+        reset_noise_scale=0.1,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            forward_reward_weight,
+            ctrl_cost_weight,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._forward_reward_weight = forward_reward_weight
+
+        self._ctrl_cost_weight = ctrl_cost_weight
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        if exclude_current_positions_from_observation:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(17,), dtype=np.float64
+            )
+        else:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(18,), dtype=np.float64
+            )
+
+        MujocoEnv.__init__(
+            self,
+            "half_cheetah.xml",
+            5,
+            observation_space=observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+        return control_cost
+
+    def step(self, action):
+        x_position_before = self.data.qpos[0]
+        self.do_simulation(action, self.frame_skip)
+        x_position_after = self.data.qpos[0]
+        x_velocity = (x_position_after - x_position_before) / self.dt
+
+        ctrl_cost = self.control_cost(action)
+
+        forward_reward = self._forward_reward_weight * x_velocity
+
+        observation = self._get_obs()
+        reward = forward_reward - ctrl_cost
+        terminated = False
+        info = {
+            "x_position": x_position_after,
+            "x_velocity": x_velocity,
+            "reward_run": forward_reward,
+            "reward_ctrl": -ctrl_cost,
+        }
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def _get_obs(self):
+        position = self.data.qpos.flat.copy()
+        velocity = self.data.qvel.flat.copy()
+
+        if self._exclude_current_positions_from_observation:
+            position = position[1:]
+
+        observation = np.concatenate((position, velocity)).ravel()
+        return observation
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = (
+            self.init_qvel
+            + self._reset_noise_scale * self.np_random.standard_normal(self.model.nv)
+        )
+
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+        return observation
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/hopper.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/hopper.py
@ -0,0 +1,67 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+class HopperEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 125,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(low=-np.inf, high=np.inf, shape=(11,), dtype=np.float64)
+        MuJocoPyEnv.__init__(
+            self, "hopper.xml", 4, observation_space=observation_space, **kwargs
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def step(self, a):
+        posbefore = self.sim.data.qpos[0]
+        self.do_simulation(a, self.frame_skip)
+        posafter, height, ang = self.sim.data.qpos[0:3]
+
+        alive_bonus = 1.0
+        reward = (posafter - posbefore) / self.dt
+        reward += alive_bonus
+        reward -= 1e-3 * np.square(a).sum()
+        s = self.state_vector()
+        terminated = not (
+            np.isfinite(s).all()
+            and (np.abs(s[2:]) < 100).all()
+            and (height > 0.7)
+            and (abs(ang) < 0.2)
+        )
+        ob = self._get_obs()
+
+        if self.render_mode == "human":
+            self.render()
+        return ob, reward, terminated, False, {}
+
+    def _get_obs(self):
+        return np.concatenate(
+            [self.sim.data.qpos.flat[1:], np.clip(self.sim.data.qvel.flat, -10, 10)]
+        )
+
+    def reset_model(self):
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=-0.005, high=0.005, size=self.model.nq
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            low=-0.005, high=0.005, size=self.model.nv
+        )
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        self.viewer.cam.trackbodyid = 2
+        self.viewer.cam.distance = self.model.stat.extent * 0.75
+        self.viewer.cam.lookat[2] = 1.15
+        self.viewer.cam.elevation = -20
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/hopper_v3.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/hopper_v3.py
@ -0,0 +1,178 @@
+__credits__ = ["Rushiv Arora"]
+
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "trackbodyid": 2,
+    "distance": 3.0,
+    "lookat": np.array((0.0, 0.0, 1.15)),
+    "elevation": -20.0,
+}
+
+
+class HopperEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 125,
+    }
+
+    def __init__(
+        self,
+        xml_file="hopper.xml",
+        forward_reward_weight=1.0,
+        ctrl_cost_weight=1e-3,
+        healthy_reward=1.0,
+        terminate_when_unhealthy=True,
+        healthy_state_range=(-100.0, 100.0),
+        healthy_z_range=(0.7, float("inf")),
+        healthy_angle_range=(-0.2, 0.2),
+        reset_noise_scale=5e-3,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            xml_file,
+            forward_reward_weight,
+            ctrl_cost_weight,
+            healthy_reward,
+            terminate_when_unhealthy,
+            healthy_state_range,
+            healthy_z_range,
+            healthy_angle_range,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._forward_reward_weight = forward_reward_weight
+
+        self._ctrl_cost_weight = ctrl_cost_weight
+
+        self._healthy_reward = healthy_reward
+        self._terminate_when_unhealthy = terminate_when_unhealthy
+
+        self._healthy_state_range = healthy_state_range
+        self._healthy_z_range = healthy_z_range
+        self._healthy_angle_range = healthy_angle_range
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        if exclude_current_positions_from_observation:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(11,), dtype=np.float64
+            )
+        else:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(12,), dtype=np.float64
+            )
+
+        MuJocoPyEnv.__init__(
+            self, xml_file, 4, observation_space=observation_space, **kwargs
+        )
+
+    @property
+    def healthy_reward(self):
+        return (
+            float(self.is_healthy or self._terminate_when_unhealthy)
+            * self._healthy_reward
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+        return control_cost
+
+    @property
+    def is_healthy(self):
+        z, angle = self.sim.data.qpos[1:3]
+        state = self.state_vector()[2:]
+
+        min_state, max_state = self._healthy_state_range
+        min_z, max_z = self._healthy_z_range
+        min_angle, max_angle = self._healthy_angle_range
+
+        healthy_state = np.all(np.logical_and(min_state < state, state < max_state))
+        healthy_z = min_z < z < max_z
+        healthy_angle = min_angle < angle < max_angle
+
+        is_healthy = all((healthy_state, healthy_z, healthy_angle))
+
+        return is_healthy
+
+    @property
+    def terminated(self):
+        terminated = not self.is_healthy if self._terminate_when_unhealthy else False
+        return terminated
+
+    def _get_obs(self):
+        position = self.sim.data.qpos.flat.copy()
+        velocity = np.clip(self.sim.data.qvel.flat.copy(), -10, 10)
+
+        if self._exclude_current_positions_from_observation:
+            position = position[1:]
+
+        observation = np.concatenate((position, velocity)).ravel()
+        return observation
+
+    def step(self, action):
+        x_position_before = self.sim.data.qpos[0]
+        self.do_simulation(action, self.frame_skip)
+        x_position_after = self.sim.data.qpos[0]
+        x_velocity = (x_position_after - x_position_before) / self.dt
+
+        ctrl_cost = self.control_cost(action)
+
+        forward_reward = self._forward_reward_weight * x_velocity
+        healthy_reward = self.healthy_reward
+
+        rewards = forward_reward + healthy_reward
+        costs = ctrl_cost
+
+        observation = self._get_obs()
+        reward = rewards - costs
+        terminated = self.terminated
+        info = {
+            "x_position": x_position_after,
+            "x_velocity": x_velocity,
+        }
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nv
+        )
+
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+        return observation
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        for key, value in DEFAULT_CAMERA_CONFIG.items():
+            if isinstance(value, np.ndarray):
+                getattr(self.viewer.cam, key)[:] = value
+            else:
+                setattr(self.viewer.cam, key, value)
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/hopper_v4.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/hopper_v4.py
@ -0,0 +1,298 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "trackbodyid": 2,
+    "distance": 3.0,
+    "lookat": np.array((0.0, 0.0, 1.15)),
+    "elevation": -20.0,
+}
+
+
+class HopperEnv(MujocoEnv, utils.EzPickle):
+    """
+    ## Description
+
+    This environment is based on the work done by Erez, Tassa, and Todorov in
+    ["Infinite Horizon Model Predictive Control for Nonlinear Periodic Tasks"](http://www.roboticsproceedings.org/rss07/p10.pdf). The environment aims to
+    increase the number of independent state and control variables as compared to
+    the classic control environments. The hopper is a two-dimensional
+    one-legged figure that consist of four main body parts - the torso at the
+    top, the thigh in the middle, the leg in the bottom, and a single foot on
+    which the entire body rests. The goal is to make hops that move in the
+    forward (right) direction by applying torques on the three hinges
+    connecting the four body parts.
+
+    ## Action Space
+    The action space is a `Box(-1, 1, (3,), float32)`. An action represents the torques applied at the hinge joints.
+
+    | Num | Action                             | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit         |
+    |-----|------------------------------------|-------------|-------------|----------------------------------|-------|--------------|
+    | 0   | Torque applied on the thigh rotor  | -1          | 1           | thigh_joint                      | hinge | torque (N m) |
+    | 1   | Torque applied on the leg rotor    | -1          | 1           | leg_joint                        | hinge | torque (N m) |
+    | 2   | Torque applied on the foot rotor   | -1          | 1           | foot_joint                       | hinge | torque (N m) |
+
+    ## Observation Space
+    Observations consist of positional values of different body parts of the
+    hopper, followed by the velocities of those individual parts
+    (their derivatives) with all the positions ordered before all the velocities.
+
+    By default, observations do not include the x-coordinate of the hopper. It may
+    be included by passing `exclude_current_positions_from_observation=False` during construction.
+    In that case, the observation space will be `Box(-Inf, Inf, (12,), float64)` where the first observation
+    represents the x-coordinate of the hopper.
+    Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x-coordinate
+    will be returned in `info` with key `"x_position"`.
+
+    However, by default, the observation is a `Box(-Inf, Inf, (11,), float64)` where the elements
+    correspond to the following:
+
+    | Num | Observation                                        | Min  | Max | Name (in corresponding XML file) | Joint | Unit                     |
+    | --- | -------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ |
+    | 0   | z-coordinate of the torso (height of hopper)       | -Inf | Inf | rootz                            | slide | position (m)             |
+    | 1   | angle of the torso                                 | -Inf | Inf | rooty                            | hinge | angle (rad)              |
+    | 2   | angle of the thigh joint                           | -Inf | Inf | thigh_joint                      | hinge | angle (rad)              |
+    | 3   | angle of the leg joint                             | -Inf | Inf | leg_joint                        | hinge | angle (rad)              |
+    | 4   | angle of the foot joint                            | -Inf | Inf | foot_joint                       | hinge | angle (rad)              |
+    | 5   | velocity of the x-coordinate of the torso          | -Inf | Inf | rootx                          | slide | velocity (m/s)           |
+    | 6   | velocity of the z-coordinate (height) of the torso | -Inf | Inf | rootz                          | slide | velocity (m/s)           |
+    | 7   | angular velocity of the angle of the torso         | -Inf | Inf | rooty                          | hinge | angular velocity (rad/s) |
+    | 8   | angular velocity of the thigh hinge                | -Inf | Inf | thigh_joint                      | hinge | angular velocity (rad/s) |
+    | 9   | angular velocity of the leg hinge                  | -Inf | Inf | leg_joint                        | hinge | angular velocity (rad/s) |
+    | 10  | angular velocity of the foot hinge                 | -Inf | Inf | foot_joint                       | hinge | angular velocity (rad/s) |
+    | excluded | x-coordinate of the torso                     | -Inf | Inf | rootx                            | slide | position (m)             |
+
+
+    ## Rewards
+    The reward consists of three parts:
+    - *healthy_reward*: Every timestep that the hopper is healthy (see definition in section "Episode Termination"), it gets a reward of fixed value `healthy_reward`.
+    - *forward_reward*: A reward of hopping forward which is measured
+    as *`forward_reward_weight` * (x-coordinate before action - x-coordinate after action)/dt*. *dt* is
+    the time between actions and is dependent on the frame_skip parameter
+    (fixed to 4), where the frametime is 0.002 - making the
+    default *dt = 4 * 0.002 = 0.008*. This reward would be positive if the hopper
+    hops forward (positive x direction).
+    - *ctrl_cost*: A cost for penalising the hopper if it takes
+    actions that are too large. It is measured as *`ctrl_cost_weight` *
+    sum(action<sup>2</sup>)* where *`ctrl_cost_weight`* is a parameter set for the
+    control and has a default value of 0.001
+
+    The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost* and `info` will also contain the individual reward terms
+
+    ## Starting State
+    All observations start in state
+    (0.0, 1.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise
+     in the range of [-`reset_noise_scale`, `reset_noise_scale`] added to the values for stochasticity.
+
+    ## Episode End
+    The hopper is said to be unhealthy if any of the following happens:
+
+    1. An element of `observation[1:]` (if  `exclude_current_positions_from_observation=True`, else `observation[2:]`) is no longer contained in the closed interval specified by the argument `healthy_state_range`
+    2. The height of the hopper (`observation[0]` if  `exclude_current_positions_from_observation=True`, else `observation[1]`) is no longer contained in the closed interval specified by the argument `healthy_z_range` (usually meaning that it has fallen)
+    3. The angle (`observation[1]` if  `exclude_current_positions_from_observation=True`, else `observation[2]`) is no longer contained in the closed interval specified by the argument `healthy_angle_range`
+
+    If `terminate_when_unhealthy=True` is passed during construction (which is the default),
+    the episode ends when any of the following happens:
+
+    1. Truncation: The episode duration reaches a 1000 timesteps
+    2. Termination: The hopper is unhealthy
+
+    If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded.
+
+    ## Arguments
+
+    No additional arguments are currently supported in v2 and lower.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('Hopper-v2')
+    ```
+
+    v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('Hopper-v4', ctrl_cost_weight=0.1, ....)
+    ```
+
+    | Parameter                                    | Type      | Default               | Description                                                                                                                                                                     |
+    | -------------------------------------------- | --------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+    | `xml_file`                                   | **str**   | `"hopper.xml"`        | Path to a MuJoCo model                                                                                                                                                          |
+    | `forward_reward_weight`                      | **float** | `1.0`                 | Weight for _forward_reward_ term (see section on reward)                                                                                                                        |
+    | `ctrl_cost_weight`                           | **float** | `0.001`               | Weight for _ctrl_cost_ reward (see section on reward)                                                                                                                           |
+    | `healthy_reward`                             | **float** | `1`                   | Constant reward given if the ant is "healthy" after timestep                                                                                                                    |
+    | `terminate_when_unhealthy`                   | **bool**  | `True`                | If true, issue a done signal if the hopper is no longer healthy                                                                                                                 |
+    | `healthy_state_range`                        | **tuple** | `(-100, 100)`         | The elements of `observation[1:]` (if `exclude_current_positions_from_observation=True`, else `observation[2:]`) must be in this range for the hopper to be considered healthy  |
+    | `healthy_z_range`                            | **tuple** | `(0.7, float("inf"))` | The z-coordinate must be in this range for the hopper to be considered healthy                                                                                                  |
+    | `healthy_angle_range`                        | **tuple** | `(-0.2, 0.2)`         | The angle given by `observation[1]` (if `exclude_current_positions_from_observation=True`, else `observation[2]`) must be in this range for the hopper to be considered healthy |
+    | `reset_noise_scale`                          | **float** | `5e-3`                | Scale of random perturbations of initial position and velocity (see section on Starting State)                                                                                  |
+    | `exclude_current_positions_from_observation` | **bool**  | `True`                | Whether or not to omit the x-coordinate from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies               |
+
+    ## Version History
+
+    * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
+    * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+    * v2: All continuous control environments now use mujoco-py >= 1.50
+    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 125,
+    }
+
+    def __init__(
+        self,
+        forward_reward_weight=1.0,
+        ctrl_cost_weight=1e-3,
+        healthy_reward=1.0,
+        terminate_when_unhealthy=True,
+        healthy_state_range=(-100.0, 100.0),
+        healthy_z_range=(0.7, float("inf")),
+        healthy_angle_range=(-0.2, 0.2),
+        reset_noise_scale=5e-3,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            forward_reward_weight,
+            ctrl_cost_weight,
+            healthy_reward,
+            terminate_when_unhealthy,
+            healthy_state_range,
+            healthy_z_range,
+            healthy_angle_range,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._forward_reward_weight = forward_reward_weight
+
+        self._ctrl_cost_weight = ctrl_cost_weight
+
+        self._healthy_reward = healthy_reward
+        self._terminate_when_unhealthy = terminate_when_unhealthy
+
+        self._healthy_state_range = healthy_state_range
+        self._healthy_z_range = healthy_z_range
+        self._healthy_angle_range = healthy_angle_range
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        if exclude_current_positions_from_observation:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(11,), dtype=np.float64
+            )
+        else:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(12,), dtype=np.float64
+            )
+
+        MujocoEnv.__init__(
+            self,
+            "hopper.xml",
+            4,
+            observation_space=observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+
+    @property
+    def healthy_reward(self):
+        return (
+            float(self.is_healthy or self._terminate_when_unhealthy)
+            * self._healthy_reward
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
+        return control_cost
+
+    @property
+    def is_healthy(self):
+        z, angle = self.data.qpos[1:3]
+        state = self.state_vector()[2:]
+
+        min_state, max_state = self._healthy_state_range
+        min_z, max_z = self._healthy_z_range
+        min_angle, max_angle = self._healthy_angle_range
+
+        healthy_state = np.all(np.logical_and(min_state < state, state < max_state))
+        healthy_z = min_z < z < max_z
+        healthy_angle = min_angle < angle < max_angle
+
+        is_healthy = all((healthy_state, healthy_z, healthy_angle))
+
+        return is_healthy
+
+    @property
+    def terminated(self):
+        terminated = not self.is_healthy if self._terminate_when_unhealthy else False
+        return terminated
+
+    def _get_obs(self):
+        position = self.data.qpos.flat.copy()
+        velocity = np.clip(self.data.qvel.flat.copy(), -10, 10)
+
+        if self._exclude_current_positions_from_observation:
+            position = position[1:]
+
+        observation = np.concatenate((position, velocity)).ravel()
+        return observation
+
+    def step(self, action):
+        x_position_before = self.data.qpos[0]
+        self.do_simulation(action, self.frame_skip)
+        x_position_after = self.data.qpos[0]
+        x_velocity = (x_position_after - x_position_before) / self.dt
+
+        ctrl_cost = self.control_cost(action)
+
+        forward_reward = self._forward_reward_weight * x_velocity
+        healthy_reward = self.healthy_reward
+
+        rewards = forward_reward + healthy_reward
+        costs = ctrl_cost
+
+        observation = self._get_obs()
+        reward = rewards - costs
+        terminated = self.terminated
+        info = {
+            "x_position": x_position_after,
+            "x_velocity": x_velocity,
+        }
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nv
+        )
+
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+        return observation
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoid.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoid.py
@ -0,0 +1,94 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+def mass_center(model, sim):
+    mass = np.expand_dims(model.body_mass, 1)
+    xpos = sim.data.xipos
+    return (np.sum(mass * xpos, 0) / np.sum(mass))[0]
+
+
+class HumanoidEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 67,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(
+            low=-np.inf, high=np.inf, shape=(376,), dtype=np.float64
+        )
+        MuJocoPyEnv.__init__(
+            self, "humanoid.xml", 5, observation_space=observation_space, **kwargs
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def _get_obs(self):
+        data = self.sim.data
+        return np.concatenate(
+            [
+                data.qpos.flat[2:],
+                data.qvel.flat,
+                data.cinert.flat,
+                data.cvel.flat,
+                data.qfrc_actuator.flat,
+                data.cfrc_ext.flat,
+            ]
+        )
+
+    def step(self, a):
+        pos_before = mass_center(self.model, self.sim)
+        self.do_simulation(a, self.frame_skip)
+        pos_after = mass_center(self.model, self.sim)
+
+        alive_bonus = 5.0
+        data = self.sim.data
+        lin_vel_cost = 1.25 * (pos_after - pos_before) / self.dt
+        quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
+        quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum()
+        quad_impact_cost = min(quad_impact_cost, 10)
+        reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
+        qpos = self.sim.data.qpos
+        terminated = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
+
+        if self.render_mode == "human":
+            self.render()
+        return (
+            self._get_obs(),
+            reward,
+            terminated,
+            False,
+            dict(
+                reward_linvel=lin_vel_cost,
+                reward_quadctrl=-quad_ctrl_cost,
+                reward_alive=alive_bonus,
+                reward_impact=-quad_impact_cost,
+            ),
+        )
+
+    def reset_model(self):
+        c = 0.01
+        self.set_state(
+            self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
+            self.init_qvel
+            + self.np_random.uniform(
+                low=-c,
+                high=c,
+                size=self.model.nv,
+            ),
+        )
+        return self._get_obs()
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        self.viewer.cam.trackbodyid = 1
+        self.viewer.cam.distance = self.model.stat.extent * 1.0
+        self.viewer.cam.lookat[2] = 2.0
+        self.viewer.cam.elevation = -20
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoid_v3.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoid_v3.py
@ -0,0 +1,200 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "trackbodyid": 1,
+    "distance": 4.0,
+    "lookat": np.array((0.0, 0.0, 2.0)),
+    "elevation": -20.0,
+}
+
+
+def mass_center(model, sim):
+    mass = np.expand_dims(model.body_mass, axis=1)
+    xpos = sim.data.xipos
+    return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy()
+
+
+class HumanoidEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 67,
+    }
+
+    def __init__(
+        self,
+        xml_file="humanoid.xml",
+        forward_reward_weight=1.25,
+        ctrl_cost_weight=0.1,
+        contact_cost_weight=5e-7,
+        contact_cost_range=(-np.inf, 10.0),
+        healthy_reward=5.0,
+        terminate_when_unhealthy=True,
+        healthy_z_range=(1.0, 2.0),
+        reset_noise_scale=1e-2,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            xml_file,
+            forward_reward_weight,
+            ctrl_cost_weight,
+            contact_cost_weight,
+            contact_cost_range,
+            healthy_reward,
+            terminate_when_unhealthy,
+            healthy_z_range,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._forward_reward_weight = forward_reward_weight
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._contact_cost_weight = contact_cost_weight
+        self._contact_cost_range = contact_cost_range
+        self._healthy_reward = healthy_reward
+        self._terminate_when_unhealthy = terminate_when_unhealthy
+        self._healthy_z_range = healthy_z_range
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+        if exclude_current_positions_from_observation:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(376,), dtype=np.float64
+            )
+        else:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(378,), dtype=np.float64
+            )
+
+        MuJocoPyEnv.__init__(
+            self, xml_file, 5, observation_space=observation_space, **kwargs
+        )
+
+    @property
+    def healthy_reward(self):
+        return (
+            float(self.is_healthy or self._terminate_when_unhealthy)
+            * self._healthy_reward
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(self.sim.data.ctrl))
+        return control_cost
+
+    @property
+    def contact_cost(self):
+        contact_forces = self.sim.data.cfrc_ext
+        contact_cost = self._contact_cost_weight * np.sum(np.square(contact_forces))
+        min_cost, max_cost = self._contact_cost_range
+        contact_cost = np.clip(contact_cost, min_cost, max_cost)
+        return contact_cost
+
+    @property
+    def is_healthy(self):
+        min_z, max_z = self._healthy_z_range
+        is_healthy = min_z < self.sim.data.qpos[2] < max_z
+
+        return is_healthy
+
+    @property
+    def terminated(self):
+        terminated = (not self.is_healthy) if self._terminate_when_unhealthy else False
+        return terminated
+
+    def _get_obs(self):
+        position = self.sim.data.qpos.flat.copy()
+        velocity = self.sim.data.qvel.flat.copy()
+
+        com_inertia = self.sim.data.cinert.flat.copy()
+        com_velocity = self.sim.data.cvel.flat.copy()
+
+        actuator_forces = self.sim.data.qfrc_actuator.flat.copy()
+        external_contact_forces = self.sim.data.cfrc_ext.flat.copy()
+
+        if self._exclude_current_positions_from_observation:
+            position = position[2:]
+
+        return np.concatenate(
+            (
+                position,
+                velocity,
+                com_inertia,
+                com_velocity,
+                actuator_forces,
+                external_contact_forces,
+            )
+        )
+
+    def step(self, action):
+        xy_position_before = mass_center(self.model, self.sim)
+        self.do_simulation(action, self.frame_skip)
+        xy_position_after = mass_center(self.model, self.sim)
+
+        xy_velocity = (xy_position_after - xy_position_before) / self.dt
+        x_velocity, y_velocity = xy_velocity
+
+        ctrl_cost = self.control_cost(action)
+        contact_cost = self.contact_cost
+
+        forward_reward = self._forward_reward_weight * x_velocity
+        healthy_reward = self.healthy_reward
+
+        rewards = forward_reward + healthy_reward
+        costs = ctrl_cost + contact_cost
+
+        observation = self._get_obs()
+        reward = rewards - costs
+        terminated = self.terminated
+        info = {
+            "reward_linvel": forward_reward,
+            "reward_quadctrl": -ctrl_cost,
+            "reward_alive": healthy_reward,
+            "reward_impact": -contact_cost,
+            "x_position": xy_position_after[0],
+            "y_position": xy_position_after[1],
+            "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
+            "x_velocity": x_velocity,
+            "y_velocity": y_velocity,
+            "forward_reward": forward_reward,
+        }
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nv
+        )
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+        return observation
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        for key, value in DEFAULT_CAMERA_CONFIG.items():
+            if isinstance(value, np.ndarray):
+                getattr(self.viewer.cam, key)[:] = value
+            else:
+                setattr(self.viewer.cam, key, value)
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoid_v4.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoid_v4.py
@ -0,0 +1,422 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "trackbodyid": 1,
+    "distance": 4.0,
+    "lookat": np.array((0.0, 0.0, 2.0)),
+    "elevation": -20.0,
+}
+
+
+def mass_center(model, data):
+    mass = np.expand_dims(model.body_mass, axis=1)
+    xpos = data.xipos
+    return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy()
+
+
+class HumanoidEnv(MujocoEnv, utils.EzPickle):
+    """
+    ## Description
+
+    This environment is based on the environment introduced by Tassa, Erez and Todorov
+    in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
+    The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a pair of
+    legs and arms. The legs each consist of three body parts, and the arms 2 body parts (representing the knees and
+    elbows respectively). The goal of the environment is to walk forward as fast as possible without falling over.
+
+    ## Action Space
+    The action space is a `Box(-1, 1, (17,), float32)`. An action represents the torques applied at the hinge joints.
+
+    | Num | Action                    | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit |
+    |-----|----------------------|---------------|----------------|---------------------------------------|-------|------|
+    | 0   | Torque applied on the hinge in the y-coordinate of the abdomen                     | -0.4 | 0.4 | abdomen_y                   | hinge | torque (N m) |
+    | 1   | Torque applied on the hinge in the z-coordinate of the abdomen                     | -0.4 | 0.4 | abdomen_z                   | hinge | torque (N m) |
+    | 2   | Torque applied on the hinge in the x-coordinate of the abdomen                     | -0.4 | 0.4 | abdomen_x                   | hinge | torque (N m) |
+    | 3   | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4 | 0.4 | right_hip_x (right_thigh)   | hinge | torque (N m) |
+    | 4   | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4 | 0.4 | right_hip_z (right_thigh)   | hinge | torque (N m) |
+    | 5   | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4 | 0.4 | right_hip_y (right_thigh)   | hinge | torque (N m) |
+    | 6   | Torque applied on the rotor between the right hip/thigh and the right shin         | -0.4 | 0.4 | right_knee                  | hinge | torque (N m) |
+    | 7   | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate)  | -0.4 | 0.4 | left_hip_x (left_thigh)     | hinge | torque (N m) |
+    | 8   | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate)  | -0.4 | 0.4 | left_hip_z (left_thigh)     | hinge | torque (N m) |
+    | 9   | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate)  | -0.4 | 0.4 | left_hip_y (left_thigh)     | hinge | torque (N m) |
+    | 10  | Torque applied on the rotor between the left hip/thigh and the left shin           | -0.4 | 0.4 | left_knee                   | hinge | torque (N m) |
+    | 11  | Torque applied on the rotor between the torso and right upper arm (coordinate -1)  | -0.4 | 0.4 | right_shoulder1             | hinge | torque (N m) |
+    | 12  | Torque applied on the rotor between the torso and right upper arm (coordinate -2)  | -0.4 | 0.4 | right_shoulder2             | hinge | torque (N m) |
+    | 13  | Torque applied on the rotor between the right upper arm and right lower arm        | -0.4 | 0.4 | right_elbow                 | hinge | torque (N m) |
+    | 14  | Torque applied on the rotor between the torso and left upper arm (coordinate -1)   | -0.4 | 0.4 | left_shoulder1              | hinge | torque (N m) |
+    | 15  | Torque applied on the rotor between the torso and left upper arm (coordinate -2)   | -0.4 | 0.4 | left_shoulder2              | hinge | torque (N m) |
+    | 16  | Torque applied on the rotor between the left upper arm and left lower arm          | -0.4 | 0.4 | left_elbow                  | hinge | torque (N m) |
+
+    ## Observation Space
+    Observations consist of positional values of different body parts of the Humanoid,
+    followed by the velocities of those individual parts (their derivatives) with all the
+    positions ordered before all the velocities.
+
+    By default, observations do not include the x- and y-coordinates of the torso. These may
+    be included by passing `exclude_current_positions_from_observation=False` during construction.
+    In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations
+    represent the x- and y-coordinates of the torso.
+    Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
+    will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
+
+    However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following:
+
+    | Num | Observation                                                                                                     | Min  | Max | Name (in corresponding XML file) | Joint | Unit                       |
+    | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- |
+    | 0   | z-coordinate of the torso (centre)                                                                              | -Inf | Inf | root                             | free  | position (m)               |
+    | 1   | x-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 2   | y-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 3   | z-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 4   | w-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 5   | z-angle of the abdomen (in lower_waist)                                                                         | -Inf | Inf | abdomen_z                        | hinge | angle (rad)                |
+    | 6   | y-angle of the abdomen (in lower_waist)                                                                         | -Inf | Inf | abdomen_y                        | hinge | angle (rad)                |
+    | 7   | x-angle of the abdomen (in pelvis)                                                                              | -Inf | Inf | abdomen_x                        | hinge | angle (rad)                |
+    | 8   | x-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_x                      | hinge | angle (rad)                |
+    | 9   | z-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_z                      | hinge | angle (rad)                |
+    | 10  | y-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_y                      | hinge | angle (rad)                |
+    | 11  | angle between right hip and the right shin (in right_knee)                                                      | -Inf | Inf | right_knee                       | hinge | angle (rad)                |
+    | 12  | x-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_x                       | hinge | angle (rad)                |
+    | 13  | z-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_z                       | hinge | angle (rad)                |
+    | 14  | y-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_y                       | hinge | angle (rad)                |
+    | 15  | angle between left hip and the left shin (in left_knee)                                                         | -Inf | Inf | left_knee                        | hinge | angle (rad)                |
+    | 16  | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm)                                | -Inf | Inf | right_shoulder1                  | hinge | angle (rad)                |
+    | 17  | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm)                                | -Inf | Inf | right_shoulder2                  | hinge | angle (rad)                |
+    | 18  | angle between right upper arm and right_lower_arm                                                               | -Inf | Inf | right_elbow                      | hinge | angle (rad)                |
+    | 19  | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm)                                  | -Inf | Inf | left_shoulder1                   | hinge | angle (rad)                |
+    | 20  | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm)                                  | -Inf | Inf | left_shoulder2                   | hinge | angle (rad)                |
+    | 21  | angle between left upper arm and left_lower_arm                                                                 | -Inf | Inf | left_elbow                       | hinge | angle (rad)                |
+    | 22  | x-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
+    | 23  | y-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
+    | 24  | z-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
+    | 25  | x-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
+    | 26  | y-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
+    | 27  | z-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
+    | 28  | z-coordinate of angular velocity of the abdomen (in lower_waist)                                                | -Inf | Inf | abdomen_z                        | hinge | anglular velocity (rad/s)  |
+    | 29  | y-coordinate of angular velocity of the abdomen (in lower_waist)                                                | -Inf | Inf | abdomen_y                        | hinge | anglular velocity (rad/s)  |
+    | 30  | x-coordinate of angular velocity of the abdomen (in pelvis)                                                     | -Inf | Inf | abdomen_x                        | hinge | aanglular velocity (rad/s) |
+    | 31  | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_x                      | hinge | anglular velocity (rad/s)  |
+    | 32  | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_z                      | hinge | anglular velocity (rad/s)  |
+    | 33  | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_y                      | hinge | anglular velocity (rad/s)  |
+    | 34  | angular velocity of the angle between right hip and the right shin (in right_knee)                              | -Inf | Inf | right_knee                       | hinge | anglular velocity (rad/s)  |
+    | 35  | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_x                       | hinge | anglular velocity (rad/s)  |
+    | 36  | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_z                       | hinge | anglular velocity (rad/s)  |
+    | 37  | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_y                       | hinge | anglular velocity (rad/s)  |
+    | 38  | angular velocity of the angle between left hip and the left shin (in left_knee)                                 | -Inf | Inf | left_knee                        | hinge | anglular velocity (rad/s)  |
+    | 39  | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1                  | hinge | anglular velocity (rad/s)  |
+    | 40  | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2                  | hinge | anglular velocity (rad/s)  |
+    | 41  | angular velocity of the angle between right upper arm and right_lower_arm                                       | -Inf | Inf | right_elbow                      | hinge | anglular velocity (rad/s)  |
+    | 42  | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm)   | -Inf | Inf | left_shoulder1                   | hinge | anglular velocity (rad/s)  |
+    | 43  | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm)   | -Inf | Inf | left_shoulder2                   | hinge | anglular velocity (rad/s)  |
+    | 44  | angular velocity of the angle between left upper arm and left_lower_arm                                         | -Inf | Inf | left_elbow                       | hinge | anglular velocity (rad/s)  |
+    | excluded | x-coordinate of the torso (centre)                                                                         | -Inf | Inf | root                             | free  | position (m)               |
+    | excluded | y-coordinate of the torso (centre)                                                                         | -Inf | Inf | root                             | free  | position (m)               |
+
+    Additionally, after all the positional and velocity based values in the table,
+    the observation contains (in order):
+    - *cinert:* Mass and inertia of a single rigid body relative to the center of mass
+    (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
+    and hence adds to another 140 elements in the state space.
+    - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
+    adds another 84 elements in the state space
+    - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
+    `(23,)`  *(nv * 1)* and hence adds another 23 elements to the state space.
+    - *cfrc_ext:* This is the center of mass based external force on the body.  It has shape
+    14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
+    where *nbody* stands for the number of bodies in the robot and *nv* stands for the
+    number of degrees of freedom (*= dim(qvel)*)
+
+    The body parts are:
+
+    | id (for `v2`,`v3`,`v4`) | body part |
+    | --- |  ------------  |
+    | 0   | worldBody (note: all values are constant 0) |
+    | 1   | torso |
+    | 2   | lwaist |
+    | 3   | pelvis |
+    | 4   | right_thigh |
+    | 5   | right_sin |
+    | 6   | right_foot |
+    | 7   | left_thigh |
+    | 8   | left_sin |
+    | 9   | left_foot |
+    | 10  | right_upper_arm |
+    | 11  | right_lower_arm |
+    | 12  | left_upper_arm |
+    | 13  | left_lower_arm |
+
+    The joints are:
+
+    | id (for `v2`,`v3`,`v4`) | joint |
+    | --- |  ------------  |
+    | 0   | root |
+    | 1   | root |
+    | 2   | root |
+    | 3   | root |
+    | 4   | root |
+    | 5   | root |
+    | 6   | abdomen_z |
+    | 7   | abdomen_y |
+    | 8   | abdomen_x |
+    | 9   | right_hip_x |
+    | 10  | right_hip_z |
+    | 11  | right_hip_y |
+    | 12  | right_knee |
+    | 13  | left_hip_x |
+    | 14  | left_hiz_z |
+    | 15  | left_hip_y |
+    | 16  | left_knee |
+    | 17  | right_shoulder1 |
+    | 18  | right_shoulder2 |
+    | 19  | right_elbow|
+    | 20  | left_shoulder1 |
+    | 21  | left_shoulder2 |
+    | 22  | left_elfbow |
+
+    The (x,y,z) coordinates are translational DOFs while the orientations are rotational
+    DOFs expressed as quaternions. One can read more about free joints on the
+    [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
+
+    **Note:** Humanoid-v4 environment no longer has the following contact forces issue.
+    If using previous Humanoid versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0
+    results in the contact forces always being 0. As such we recommend to use a Mujoco-Py
+    version < 2.0 when using the Humanoid environment if you would like to report results
+    with contact forces (if contact forces are not used in your experiments, you can use
+    version > 2.0).
+
+    ## Rewards
+    The reward consists of three parts:
+    - *healthy_reward*: Every timestep that the humanoid is alive (see section Episode Termination for definition), it gets a reward of fixed value `healthy_reward`
+    - *forward_reward*: A reward of walking forward which is measured as *`forward_reward_weight` *
+    (average center of mass before action - average center of mass after action)/dt*.
+    *dt* is the time between actions and is dependent on the frame_skip parameter
+    (default is 5), where the frametime is 0.003 - making the default *dt = 5 * 0.003 = 0.015*.
+    This reward would be positive if the humanoid walks forward (in positive x-direction). The calculation
+    for the center of mass is defined in the `.py` file for the Humanoid.
+    - *ctrl_cost*: A negative reward for penalising the humanoid if it has too
+    large of a control force. If there are *nu* actuators/controls, then the control has
+    shape  `nu x 1`. It is measured as *`ctrl_cost_weight` * sum(control<sup>2</sup>)*.
+    - *contact_cost*: A negative reward for penalising the humanoid if the external
+    contact force is too large. It is calculated by clipping
+    *`contact_cost_weight` * sum(external contact force<sup>2</sup>)* to the interval specified by `contact_cost_range`.
+
+    The total reward returned is ***reward*** *=* *healthy_reward + forward_reward - ctrl_cost - contact_cost* and `info` will also contain the individual reward terms
+
+    ## Starting State
+    All observations start in state
+    (0.0, 0.0,  1.4, 1.0, 0.0  ... 0.0) with a uniform noise in the range
+    of [-`reset_noise_scale`, `reset_noise_scale`] added to the positional and velocity values (values in the table)
+    for stochasticity. Note that the initial z coordinate is intentionally
+    selected to be high, thereby indicating a standing up humanoid. The initial
+    orientation is designed to make it face forward as well.
+
+    ## Episode End
+    The humanoid is said to be unhealthy if the z-position of the torso is no longer contained in the
+    closed interval specified by the argument `healthy_z_range`.
+
+    If `terminate_when_unhealthy=True` is passed during construction (which is the default),
+    the episode ends when any of the following happens:
+
+    1. Truncation: The episode duration reaches a 1000 timesteps
+    3. Termination: The humanoid is unhealthy
+
+    If `terminate_when_unhealthy=False` is passed, the episode is ended only when 1000 timesteps are exceeded.
+
+    ## Arguments
+
+    No additional arguments are currently supported in v2 and lower.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('Humanoid-v4')
+    ```
+
+    v3 and v4 take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('Humanoid-v4', ctrl_cost_weight=0.1, ....)
+    ```
+
+    | Parameter                                    | Type      | Default          | Description                                                                                                                                                               |
+    | -------------------------------------------- | --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+    | `xml_file`                                   | **str**   | `"humanoid.xml"` | Path to a MuJoCo model                                                                                                                                                    |
+    | `forward_reward_weight`                      | **float** | `1.25`           | Weight for _forward_reward_ term (see section on reward)                                                                                                                  |
+    | `ctrl_cost_weight`                           | **float** | `0.1`            | Weight for _ctrl_cost_ term (see section on reward)                                                                                                                       |
+    | `contact_cost_weight`                        | **float** | `5e-7`           | Weight for _contact_cost_ term (see section on reward)                                                                                                                    |
+    | `healthy_reward`                             | **float** | `5.0`            | Constant reward given if the humanoid is "healthy" after timestep                                                                                                         |
+    | `terminate_when_unhealthy`                   | **bool**  | `True`           | If true, issue a done signal if the z-coordinate of the torso is no longer in the `healthy_z_range`                                                                       |
+    | `healthy_z_range`                            | **tuple** | `(1.0, 2.0)`     | The humanoid is considered healthy if the z-coordinate of the torso is in this range                                                                                      |
+    | `reset_noise_scale`                          | **float** | `1e-2`           | Scale of random perturbations of initial position and velocity (see section on Starting State)                                                                            |
+    | `exclude_current_positions_from_observation` | **bool**  | `True`           | Whether or not to omit the x- and y-coordinates from observations. Excluding the position can serve as an inductive bias to induce position-agnostic behavior in policies |
+
+    ## Version History
+
+    * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
+    * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+    * v2: All continuous control environments now use mujoco-py >= 1.50
+    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 67,
+    }
+
+    def __init__(
+        self,
+        forward_reward_weight=1.25,
+        ctrl_cost_weight=0.1,
+        healthy_reward=5.0,
+        terminate_when_unhealthy=True,
+        healthy_z_range=(1.0, 2.0),
+        reset_noise_scale=1e-2,
+        exclude_current_positions_from_observation=True,
+        **kwargs,
+    ):
+        utils.EzPickle.__init__(
+            self,
+            forward_reward_weight,
+            ctrl_cost_weight,
+            healthy_reward,
+            terminate_when_unhealthy,
+            healthy_z_range,
+            reset_noise_scale,
+            exclude_current_positions_from_observation,
+            **kwargs,
+        )
+
+        self._forward_reward_weight = forward_reward_weight
+        self._ctrl_cost_weight = ctrl_cost_weight
+        self._healthy_reward = healthy_reward
+        self._terminate_when_unhealthy = terminate_when_unhealthy
+        self._healthy_z_range = healthy_z_range
+
+        self._reset_noise_scale = reset_noise_scale
+
+        self._exclude_current_positions_from_observation = (
+            exclude_current_positions_from_observation
+        )
+
+        if exclude_current_positions_from_observation:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(376,), dtype=np.float64
+            )
+        else:
+            observation_space = Box(
+                low=-np.inf, high=np.inf, shape=(378,), dtype=np.float64
+            )
+
+        MujocoEnv.__init__(
+            self,
+            "humanoid.xml",
+            5,
+            observation_space=observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+
+    @property
+    def healthy_reward(self):
+        return (
+            float(self.is_healthy or self._terminate_when_unhealthy)
+            * self._healthy_reward
+        )
+
+    def control_cost(self, action):
+        control_cost = self._ctrl_cost_weight * np.sum(np.square(self.data.ctrl))
+        return control_cost
+
+    @property
+    def is_healthy(self):
+        min_z, max_z = self._healthy_z_range
+        is_healthy = min_z < self.data.qpos[2] < max_z
+
+        return is_healthy
+
+    @property
+    def terminated(self):
+        terminated = (not self.is_healthy) if self._terminate_when_unhealthy else False
+        return terminated
+
+    def _get_obs(self):
+        position = self.data.qpos.flat.copy()
+        velocity = self.data.qvel.flat.copy()
+
+        com_inertia = self.data.cinert.flat.copy()
+        com_velocity = self.data.cvel.flat.copy()
+
+        actuator_forces = self.data.qfrc_actuator.flat.copy()
+        external_contact_forces = self.data.cfrc_ext.flat.copy()
+
+        if self._exclude_current_positions_from_observation:
+            position = position[2:]
+
+        return np.concatenate(
+            (
+                position,
+                velocity,
+                com_inertia,
+                com_velocity,
+                actuator_forces,
+                external_contact_forces,
+            )
+        )
+
+    def step(self, action):
+        xy_position_before = mass_center(self.model, self.data)
+        self.do_simulation(action, self.frame_skip)
+        xy_position_after = mass_center(self.model, self.data)
+
+        xy_velocity = (xy_position_after - xy_position_before) / self.dt
+        x_velocity, y_velocity = xy_velocity
+
+        ctrl_cost = self.control_cost(action)
+
+        forward_reward = self._forward_reward_weight * x_velocity
+        healthy_reward = self.healthy_reward
+
+        rewards = forward_reward + healthy_reward
+
+        observation = self._get_obs()
+        reward = rewards - ctrl_cost
+        terminated = self.terminated
+        info = {
+            "reward_linvel": forward_reward,
+            "reward_quadctrl": -ctrl_cost,
+            "reward_alive": healthy_reward,
+            "x_position": xy_position_after[0],
+            "y_position": xy_position_after[1],
+            "distance_from_origin": np.linalg.norm(xy_position_after, ord=2),
+            "x_velocity": x_velocity,
+            "y_velocity": y_velocity,
+            "forward_reward": forward_reward,
+        }
+
+        if self.render_mode == "human":
+            self.render()
+        return observation, reward, terminated, False, info
+
+    def reset_model(self):
+        noise_low = -self._reset_noise_scale
+        noise_high = self._reset_noise_scale
+
+        qpos = self.init_qpos + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nq
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            low=noise_low, high=noise_high, size=self.model.nv
+        )
+        self.set_state(qpos, qvel)
+
+        observation = self._get_obs()
+        return observation
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoidstandup.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoidstandup.py
@ -0,0 +1,87 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+class HumanoidStandupEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 67,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(
+            low=-np.inf, high=np.inf, shape=(376,), dtype=np.float64
+        )
+        MuJocoPyEnv.__init__(
+            self,
+            "humanoidstandup.xml",
+            5,
+            observation_space=observation_space,
+            **kwargs,
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def _get_obs(self):
+        data = self.sim.data
+        return np.concatenate(
+            [
+                data.qpos.flat[2:],
+                data.qvel.flat,
+                data.cinert.flat,
+                data.cvel.flat,
+                data.qfrc_actuator.flat,
+                data.cfrc_ext.flat,
+            ]
+        )
+
+    def step(self, a):
+        self.do_simulation(a, self.frame_skip)
+        pos_after = self.sim.data.qpos[2]
+        data = self.sim.data
+        uph_cost = (pos_after - 0) / self.model.opt.timestep
+
+        quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
+        quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum()
+        quad_impact_cost = min(quad_impact_cost, 10)
+        reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1
+
+        if self.render_mode == "human":
+            self.render()
+        return (
+            self._get_obs(),
+            reward,
+            False,
+            False,
+            dict(
+                reward_linup=uph_cost,
+                reward_quadctrl=-quad_ctrl_cost,
+                reward_impact=-quad_impact_cost,
+            ),
+        )
+
+    def reset_model(self):
+        c = 0.01
+        self.set_state(
+            self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
+            self.init_qvel
+            + self.np_random.uniform(
+                low=-c,
+                high=c,
+                size=self.model.nv,
+            ),
+        )
+        return self._get_obs()
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        self.viewer.cam.trackbodyid = 1
+        self.viewer.cam.distance = self.model.stat.extent * 1.0
+        self.viewer.cam.lookat[2] = 0.8925
+        self.viewer.cam.elevation = -20
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoidstandup_v4.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/humanoidstandup_v4.py
@ -0,0 +1,319 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "trackbodyid": 1,
+    "distance": 4.0,
+    "lookat": np.array((0.0, 0.0, 0.8925)),
+    "elevation": -20.0,
+}
+
+
+class HumanoidStandupEnv(MujocoEnv, utils.EzPickle):
+    """
+    ## Description
+
+    This environment is based on the environment introduced by Tassa, Erez and Todorov
+    in ["Synthesis and stabilization of complex behaviors through online trajectory optimization"](https://ieeexplore.ieee.org/document/6386025).
+    The 3D bipedal robot is designed to simulate a human. It has a torso (abdomen) with a
+    pair of legs and arms. The legs each consist of two links, and so the arms (representing the
+    knees and elbows respectively). The environment starts with the humanoid laying on the ground,
+    and then the goal of the environment is to make the humanoid standup and then keep it standing
+    by applying torques on the various hinges.
+
+    ## Action Space
+    The agent take a 17-element vector for actions.
+
+    The action space is a continuous `(action, ...)` all in `[-1, 1]`, where `action`
+    represents the numerical torques applied at the hinge joints.
+
+    | Num | Action                                                                             | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit         |
+    | --- | ---------------------------------------------------------------------------------- | ----------- | ----------- | -------------------------------- | ----- | ------------ |
+    | 0   | Torque applied on the hinge in the y-coordinate of the abdomen                     | -0.4        | 0.4         | abdomen_y                        | hinge | torque (N m) |
+    | 1   | Torque applied on the hinge in the z-coordinate of the abdomen                     | -0.4        | 0.4         | abdomen_z                        | hinge | torque (N m) |
+    | 2   | Torque applied on the hinge in the x-coordinate of the abdomen                     | -0.4        | 0.4         | abdomen_x                        | hinge | torque (N m) |
+    | 3   | Torque applied on the rotor between torso/abdomen and the right hip (x-coordinate) | -0.4        | 0.4         | right_hip_x (right_thigh)        | hinge | torque (N m) |
+    | 4   | Torque applied on the rotor between torso/abdomen and the right hip (z-coordinate) | -0.4        | 0.4         | right_hip_z (right_thigh)        | hinge | torque (N m) |
+    | 5   | Torque applied on the rotor between torso/abdomen and the right hip (y-coordinate) | -0.4        | 0.4         | right_hip_y (right_thigh)        | hinge | torque (N m) |
+    | 6   | Torque applied on the rotor between the right hip/thigh and the right shin         | -0.4        | 0.4         | right_knee                       | hinge | torque (N m) |
+    | 7   | Torque applied on the rotor between torso/abdomen and the left hip (x-coordinate)  | -0.4        | 0.4         | left_hip_x (left_thigh)          | hinge | torque (N m) |
+    | 8   | Torque applied on the rotor between torso/abdomen and the left hip (z-coordinate)  | -0.4        | 0.4         | left_hip_z (left_thigh)          | hinge | torque (N m) |
+    | 9   | Torque applied on the rotor between torso/abdomen and the left hip (y-coordinate)  | -0.4        | 0.4         | left_hip_y (left_thigh)          | hinge | torque (N m) |
+    | 10  | Torque applied on the rotor between the left hip/thigh and the left shin           | -0.4        | 0.4         | left_knee                        | hinge | torque (N m) |
+    | 11  | Torque applied on the rotor between the torso and right upper arm (coordinate -1)  | -0.4        | 0.4         | right_shoulder1                  | hinge | torque (N m) |
+    | 12  | Torque applied on the rotor between the torso and right upper arm (coordinate -2)  | -0.4        | 0.4         | right_shoulder2                  | hinge | torque (N m) |
+    | 13  | Torque applied on the rotor between the right upper arm and right lower arm        | -0.4        | 0.4         | right_elbow                      | hinge | torque (N m) |
+    | 14  | Torque applied on the rotor between the torso and left upper arm (coordinate -1)   | -0.4        | 0.4         | left_shoulder1                   | hinge | torque (N m) |
+    | 15  | Torque applied on the rotor between the torso and left upper arm (coordinate -2)   | -0.4        | 0.4         | left_shoulder2                   | hinge | torque (N m) |
+    | 16  | Torque applied on the rotor between the left upper arm and left lower arm          | -0.4        | 0.4         | left_elbow                       | hinge | torque (N m) |
+
+    ## Observation Space
+    Observations consist of positional values of different body parts of the Humanoid,
+    followed by the velocities of those individual parts (their derivatives) with all the
+    positions ordered before all the velocities.
+
+    By default, observations do not include the x- and y-coordinates of the torso. These may
+    be included by passing `exclude_current_positions_from_observation=False` during construction.
+    In that case, the observation space will be a `Box(-Inf, Inf, (378,), float64)` where the first two observations
+    represent the x- and y-coordinates of the torso.
+    Regardless of whether `exclude_current_positions_from_observation` was set to true or false, the x- and y-coordinates
+    will be returned in `info` with keys `"x_position"` and `"y_position"`, respectively.
+
+    However, by default, the observation is a `Box(-Inf, Inf, (376,), float64)`. The elements correspond to the following:
+
+    | Num | Observation                                                                                                     | Min  | Max | Name (in corresponding XML file) | Joint | Unit                       |
+    | --- | --------------------------------------------------------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | -------------------------- |
+    | 0   | z-coordinate of the torso (centre)                                                                              | -Inf | Inf | root                             | free  | position (m)               |
+    | 1   | x-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 2   | y-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 3   | z-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 4   | w-orientation of the torso (centre)                                                                             | -Inf | Inf | root                             | free  | angle (rad)                |
+    | 5   | z-angle of the abdomen (in lower_waist)                                                                         | -Inf | Inf | abdomen_z                        | hinge | angle (rad)                |
+    | 6   | y-angle of the abdomen (in lower_waist)                                                                         | -Inf | Inf | abdomen_y                        | hinge | angle (rad)                |
+    | 7   | x-angle of the abdomen (in pelvis)                                                                              | -Inf | Inf | abdomen_x                        | hinge | angle (rad)                |
+    | 8   | x-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_x                      | hinge | angle (rad)                |
+    | 9   | z-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_z                      | hinge | angle (rad)                |
+    | 10  | y-coordinate of angle between pelvis and right hip (in right_thigh)                                             | -Inf | Inf | right_hip_y                      | hinge | angle (rad)                |
+    | 11  | angle between right hip and the right shin (in right_knee)                                                      | -Inf | Inf | right_knee                       | hinge | angle (rad)                |
+    | 12  | x-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_x                       | hinge | angle (rad)                |
+    | 13  | z-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_z                       | hinge | angle (rad)                |
+    | 14  | y-coordinate of angle between pelvis and left hip (in left_thigh)                                               | -Inf | Inf | left_hip_y                       | hinge | angle (rad)                |
+    | 15  | angle between left hip and the left shin (in left_knee)                                                         | -Inf | Inf | left_knee                        | hinge | angle (rad)                |
+    | 16  | coordinate-1 (multi-axis) angle between torso and right arm (in right_upper_arm)                                | -Inf | Inf | right_shoulder1                  | hinge | angle (rad)                |
+    | 17  | coordinate-2 (multi-axis) angle between torso and right arm (in right_upper_arm)                                | -Inf | Inf | right_shoulder2                  | hinge | angle (rad)                |
+    | 18  | angle between right upper arm and right_lower_arm                                                               | -Inf | Inf | right_elbow                      | hinge | angle (rad)                |
+    | 19  | coordinate-1 (multi-axis) angle between torso and left arm (in left_upper_arm)                                  | -Inf | Inf | left_shoulder1                   | hinge | angle (rad)                |
+    | 20  | coordinate-2 (multi-axis) angle between torso and left arm (in left_upper_arm)                                  | -Inf | Inf | left_shoulder2                   | hinge | angle (rad)                |
+    | 21  | angle between left upper arm and left_lower_arm                                                                 | -Inf | Inf | left_elbow                       | hinge | angle (rad)                |
+    | 22  | x-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
+    | 23  | y-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
+    | 24  | z-coordinate velocity of the torso (centre)                                                                     | -Inf | Inf | root                             | free  | velocity (m/s)             |
+    | 25  | x-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
+    | 26  | y-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
+    | 27  | z-coordinate angular velocity of the torso (centre)                                                             | -Inf | Inf | root                             | free  | anglular velocity (rad/s)  |
+    | 28  | z-coordinate of angular velocity of the abdomen (in lower_waist)                                                | -Inf | Inf | abdomen_z                        | hinge | anglular velocity (rad/s)  |
+    | 29  | y-coordinate of angular velocity of the abdomen (in lower_waist)                                                | -Inf | Inf | abdomen_y                        | hinge | anglular velocity (rad/s)  |
+    | 30  | x-coordinate of angular velocity of the abdomen (in pelvis)                                                     | -Inf | Inf | abdomen_x                        | hinge | aanglular velocity (rad/s) |
+    | 31  | x-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_x                      | hinge | anglular velocity (rad/s)  |
+    | 32  | z-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_z                      | hinge | anglular velocity (rad/s)  |
+    | 33  | y-coordinate of the angular velocity of the angle between pelvis and right hip (in right_thigh)                 | -Inf | Inf | right_hip_y                      | hinge | anglular velocity (rad/s)  |
+    | 34  | angular velocity of the angle between right hip and the right shin (in right_knee)                              | -Inf | Inf | right_knee                       | hinge | anglular velocity (rad/s)  |
+    | 35  | x-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_x                       | hinge | anglular velocity (rad/s)  |
+    | 36  | z-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_z                       | hinge | anglular velocity (rad/s)  |
+    | 37  | y-coordinate of the angular velocity of the angle between pelvis and left hip (in left_thigh)                   | -Inf | Inf | left_hip_y                       | hinge | anglular velocity (rad/s)  |
+    | 38  | angular velocity of the angle between left hip and the left shin (in left_knee)                                 | -Inf | Inf | left_knee                        | hinge | anglular velocity (rad/s)  |
+    | 39  | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder1                  | hinge | anglular velocity (rad/s)  |
+    | 40  | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and right arm (in right_upper_arm) | -Inf | Inf | right_shoulder2                  | hinge | anglular velocity (rad/s)  |
+    | 41  | angular velocity of the angle between right upper arm and right_lower_arm                                       | -Inf | Inf | right_elbow                      | hinge | anglular velocity (rad/s)  |
+    | 42  | coordinate-1 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm)   | -Inf | Inf | left_shoulder1                   | hinge | anglular velocity (rad/s)  |
+    | 43  | coordinate-2 (multi-axis) of the angular velocity of the angle between torso and left arm (in left_upper_arm)   | -Inf | Inf | left_shoulder2                   | hinge | anglular velocity (rad/s)  |
+    | 44  | angular velocity of the angle between left upper arm and left_lower_arm                                         | -Inf | Inf | left_elbow                       | hinge | anglular velocity (rad/s)  |
+    | excluded | x-coordinate of the torso (centre)                                                                         | -Inf | Inf | root                             | free  | position (m)               |
+    | excluded | y-coordinate of the torso (centre)                                                                         | -Inf | Inf | root                             | free  | position (m)               |
+
+    Additionally, after all the positional and velocity based values in the table,
+    the observation contains (in order):
+    - *cinert:* Mass and inertia of a single rigid body relative to the center of mass
+    (this is an intermediate result of transition). It has shape 14*10 (*nbody * 10*)
+    and hence adds to another 140 elements in the state space.
+    - *cvel:* Center of mass based velocity. It has shape 14 * 6 (*nbody * 6*) and hence
+    adds another 84 elements in the state space
+    - *qfrc_actuator:* Constraint force generated as the actuator force. This has shape
+    `(23,)`  *(nv * 1)* and hence adds another 23 elements to the state space.
+    - *cfrc_ext:* This is the center of mass based external force on the body.  It has shape
+    14 * 6 (*nbody * 6*) and hence adds to another 84 elements in the state space.
+    where *nbody* stands for the number of bodies in the robot and *nv* stands for the
+    number of degrees of freedom (*= dim(qvel)*)
+
+    The body parts are:
+
+    | id (for `v2`,`v3`,`v4`) | body part |
+    | --- |  ------------  |
+    | 0   | worldBody (note: all values are constant 0) |
+    | 1   | torso |
+    | 2   | lwaist |
+    | 3   | pelvis |
+    | 4   | right_thigh |
+    | 5   | right_sin |
+    | 6   | right_foot |
+    | 7   | left_thigh |
+    | 8   | left_sin |
+    | 9   | left_foot |
+    | 10  | right_upper_arm |
+    | 11  | right_lower_arm |
+    | 12  | left_upper_arm |
+    | 13  | left_lower_arm |
+
+    The joints are:
+
+    | id (for `v2`,`v3`,`v4`) | joint |
+    | --- |  ------------  |
+    | 0   | root |
+    | 1   | root |
+    | 2   | root |
+    | 3   | root |
+    | 4   | root |
+    | 5   | root |
+    | 6   | abdomen_z |
+    | 7   | abdomen_y |
+    | 8   | abdomen_x |
+    | 9   | right_hip_x |
+    | 10  | right_hip_z |
+    | 11  | right_hip_y |
+    | 12  | right_knee |
+    | 13  | left_hip_x |
+    | 14  | left_hiz_z |
+    | 15  | left_hip_y |
+    | 16  | left_knee |
+    | 17  | right_shoulder1 |
+    | 18  | right_shoulder2 |
+    | 19  | right_elbow|
+    | 20  | left_shoulder1 |
+    | 21  | left_shoulder2 |
+    | 22  | left_elfbow |
+
+    The (x,y,z) coordinates are translational DOFs while the orientations are rotational
+    DOFs expressed as quaternions. One can read more about free joints on the
+    [Mujoco Documentation](https://mujoco.readthedocs.io/en/latest/XMLreference.html).
+
+    **Note:** HumanoidStandup-v4 environment no longer has the following contact forces issue.
+    If using previous HumanoidStandup versions from v4, there have been reported issues that using a Mujoco-Py version > 2.0 results
+    in the contact forces always being 0. As such we recommend to use a Mujoco-Py version < 2.0
+    when using the Humanoid environment if you would like to report results with contact forces
+    (if contact forces are not used in your experiments, you can use version > 2.0).
+
+    ## Rewards
+    The reward consists of three parts:
+    - *uph_cost*: A reward for moving upward (in an attempt to stand up). This is not a relative
+    reward which measures how much upward it has moved from the last timestep, but it is an
+    absolute reward which measures how much upward the Humanoid has moved overall. It is
+    measured as *(z coordinate after action - 0)/(atomic timestep)*, where *z coordinate after
+    action* is index 0 in the state/index 2 in the table, and *atomic timestep* is the time for
+    one frame of movement even though the simulation has a framerate of 5 (done in order to inflate
+    rewards a little for faster learning).
+    - *quad_ctrl_cost*: A negative reward for penalising the humanoid if it has too large of
+    a control force. If there are *nu* actuators/controls, then the control has shape  `nu x 1`.
+    It is measured as *0.1 **x** sum(control<sup>2</sup>)*.
+    - *quad_impact_cost*: A negative reward for penalising the humanoid if the external
+    contact force is too large. It is calculated as *min(0.5 * 0.000001 * sum(external
+    contact force<sup>2</sup>), 10)*.
+
+    The total reward returned is ***reward*** *=* *uph_cost + 1 - quad_ctrl_cost - quad_impact_cost*
+
+    ## Starting State
+    All observations start in state
+    (0.0, 0.0,  0.105, 1.0, 0.0  ... 0.0) with a uniform noise in the range of
+    [-0.01, 0.01] added to the positional and velocity values (values in the table)
+    for stochasticity. Note that the initial z coordinate is intentionally selected
+    to be low, thereby indicating a laying down humanoid. The initial orientation is
+    designed to make it face forward as well.
+
+    ## Episode End
+    The episode ends when any of the following happens:
+
+    1. Truncation: The episode duration reaches a 1000 timesteps
+    2. Termination: Any of the state space values is no longer finite
+
+    ## Arguments
+
+    No additional arguments are currently supported.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('HumanoidStandup-v4')
+    ```
+
+    There is no v3 for HumanoidStandup, unlike the robot environments where a v3 and
+    beyond take gymnasium.make kwargs such as xml_file, ctrl_cost_weight, reset_noise_scale etc.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('HumanoidStandup-v2')
+    ```
+
+    ## Version History
+
+    * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
+    * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+    * v2: All continuous control environments now use mujoco-py >= 1.50
+    * v1: max_time_steps raised to 1000 for robot based tasks. Added reward_threshold to environments.
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 67,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(
+            low=-np.inf, high=np.inf, shape=(376,), dtype=np.float64
+        )
+        MujocoEnv.__init__(
+            self,
+            "humanoidstandup.xml",
+            5,
+            observation_space=observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def _get_obs(self):
+        data = self.data
+        return np.concatenate(
+            [
+                data.qpos.flat[2:],
+                data.qvel.flat,
+                data.cinert.flat,
+                data.cvel.flat,
+                data.qfrc_actuator.flat,
+                data.cfrc_ext.flat,
+            ]
+        )
+
+    def step(self, a):
+        self.do_simulation(a, self.frame_skip)
+        pos_after = self.data.qpos[2]
+        data = self.data
+        uph_cost = (pos_after - 0) / self.model.opt.timestep
+
+        quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
+        quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum()
+        quad_impact_cost = min(quad_impact_cost, 10)
+        reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1
+
+        if self.render_mode == "human":
+            self.render()
+        return (
+            self._get_obs(),
+            reward,
+            False,
+            False,
+            dict(
+                reward_linup=uph_cost,
+                reward_quadctrl=-quad_ctrl_cost,
+                reward_impact=-quad_impact_cost,
+            ),
+        )
+
+    def reset_model(self):
+        c = 0.01
+        self.set_state(
+            self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
+            self.init_qvel
+            + self.np_random.uniform(
+                low=-c,
+                high=c,
+                size=self.model.nv,
+            ),
+        )
+        return self._get_obs()
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_double_pendulum.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_double_pendulum.py
@ -0,0 +1,69 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+class InvertedDoublePendulumEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(low=-np.inf, high=np.inf, shape=(11,), dtype=np.float64)
+        MuJocoPyEnv.__init__(
+            self,
+            "inverted_double_pendulum.xml",
+            5,
+            observation_space=observation_space,
+            **kwargs,
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def step(self, action):
+        self.do_simulation(action, self.frame_skip)
+
+        ob = self._get_obs()
+        x, _, y = self.sim.data.site_xpos[0]
+        dist_penalty = 0.01 * x**2 + (y - 2) ** 2
+        v1, v2 = self.sim.data.qvel[1:3]
+        vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
+        alive_bonus = 10
+        r = alive_bonus - dist_penalty - vel_penalty
+        terminated = bool(y <= 1)
+
+        if self.render_mode == "human":
+            self.render()
+        return ob, r, terminated, False, {}
+
+    def _get_obs(self):
+        return np.concatenate(
+            [
+                self.sim.data.qpos[:1],  # cart x pos
+                np.sin(self.sim.data.qpos[1:]),  # link angles
+                np.cos(self.sim.data.qpos[1:]),
+                np.clip(self.sim.data.qvel, -10, 10),
+                np.clip(self.sim.data.qfrc_constraint, -10, 10),
+            ]
+        ).ravel()
+
+    def reset_model(self):
+        self.set_state(
+            self.init_qpos
+            + self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq),
+            self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1,
+        )
+        return self._get_obs()
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        v = self.viewer
+        v.cam.trackbodyid = 0
+        v.cam.distance = self.model.stat.extent * 0.5
+        v.cam.lookat[2] = 0.12250000000000005  # v.model.stat.center[2]
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_double_pendulum_v4.py
@ -0,0 +1,179 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "trackbodyid": 0,
+    "distance": 4.1225,
+    "lookat": np.array((0.0, 0.0, 0.12250000000000005)),
+}
+
+
+class InvertedDoublePendulumEnv(MujocoEnv, utils.EzPickle):
+    """
+    ## Description
+
+    This environment originates from control theory and builds on the cartpole
+    environment based on the work done by Barto, Sutton, and Anderson in
+    ["Neuronlike adaptive elements that can solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
+    powered by the Mujoco physics simulator - allowing for more complex experiments
+    (such as varying the effects of gravity or constraints). This environment involves a cart that can
+    moved linearly, with a pole fixed on it and a second pole fixed on the other end of the first one
+    (leaving the second pole as the only one with one free end). The cart can be pushed left or right,
+    and the goal is to balance the second pole on top of the first pole, which is in turn on top of the
+    cart, by applying continuous forces on the cart.
+
+    ## Action Space
+    The agent take a 1-element vector for actions.
+    The action space is a continuous `(action)` in `[-1, 1]`, where `action` represents the
+    numerical force applied to the cart (with magnitude representing the amount of force and
+    sign representing the direction)
+
+    | Num | Action                    | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit      |
+    |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
+    | 0   | Force applied on the cart | -1          | 1           | slider                           | slide | Force (N) |
+
+    ## Observation Space
+
+    The state space consists of positional values of different body parts of the pendulum system,
+    followed by the velocities of those individual parts (their derivatives) with all the
+    positions ordered before all the velocities.
+
+    The observation is a `ndarray` with shape `(11,)` where the elements correspond to the following:
+
+    | Num | Observation                                                       | Min  | Max | Name (in corresponding XML file) | Joint | Unit                     |
+    | --- | ----------------------------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------ |
+    | 0   | position of the cart along the linear surface                     | -Inf | Inf | slider                           | slide | position (m)             |
+    | 1   | sine of the angle between the cart and the first pole             | -Inf | Inf | sin(hinge)                       | hinge | unitless                 |
+    | 2   | sine of the angle between the two poles                           | -Inf | Inf | sin(hinge2)                      | hinge | unitless                 |
+    | 3   | cosine of the angle between the cart and the first pole           | -Inf | Inf | cos(hinge)                       | hinge | unitless                 |
+    | 4   | cosine of the angle between the two poles                         | -Inf | Inf | cos(hinge2)                      | hinge | unitless                 |
+    | 5   | velocity of the cart                                              | -Inf | Inf | slider                           | slide | velocity (m/s)           |
+    | 6   | angular velocity of the angle between the cart and the first pole | -Inf | Inf | hinge                            | hinge | angular velocity (rad/s) |
+    | 7   | angular velocity of the angle between the two poles               | -Inf | Inf | hinge2                           | hinge | angular velocity (rad/s) |
+    | 8   | constraint force - 1                                              | -Inf | Inf |                                  |       | Force (N)                |
+    | 9   | constraint force - 2                                              | -Inf | Inf |                                  |       | Force (N)                |
+    | 10  | constraint force - 3                                              | -Inf | Inf |                                  |       | Force (N)                |
+
+
+    There is physical contact between the robots and their environment - and Mujoco
+    attempts at getting realistic physics simulations for the possible physical contact
+    dynamics by aiming for physical accuracy and computational efficiency.
+
+    There is one constraint force for contacts for each degree of freedom (3).
+    The approach and handling of constraints by Mujoco is unique to the simulator
+    and is based on their research. Once can find more information in their
+    [*documentation*](https://mujoco.readthedocs.io/en/latest/computation.html)
+    or in their paper
+    ["Analytically-invertible dynamics with contacts and constraints: Theory and implementation in MuJoCo"](https://homes.cs.washington.edu/~todorov/papers/TodorovICRA14.pdf).
+
+
+    ## Rewards
+
+    The reward consists of two parts:
+    - *alive_bonus*: The goal is to make the second inverted pendulum stand upright
+    (within a certain angle limit) as long as possible - as such a reward of +10 is awarded
+     for each timestep that the second pole is upright.
+    - *distance_penalty*: This reward is a measure of how far the *tip* of the second pendulum
+    (the only free end) moves, and it is calculated as
+    *0.01 * x<sup>2</sup> + (y - 2)<sup>2</sup>*, where *x* is the x-coordinate of the tip
+    and *y* is the y-coordinate of the tip of the second pole.
+    - *velocity_penalty*: A negative reward for penalising the agent if it moves too
+    fast *0.001 *  v<sub>1</sub><sup>2</sup> + 0.005 * v<sub>2</sub> <sup>2</sup>*
+
+    The total reward returned is ***reward*** *=* *alive_bonus - distance_penalty - velocity_penalty*
+
+    ## Starting State
+    All observations start in state
+    (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
+    of [-0.1, 0.1] added to the positional values (cart position and pole angles) and standard
+    normal force with a standard deviation of 0.1 added to the velocity values for stochasticity.
+
+    ## Episode End
+    The episode ends when any of the following happens:
+
+    1.Truncation:  The episode duration reaches 1000 timesteps.
+    2.Termination: Any of the state space values is no longer finite.
+    3.Termination: The y_coordinate of the tip of the second pole *is less than or equal* to 1. The maximum standing height of the system is 1.196 m when all the parts are perpendicularly vertical on top of each other).
+
+    ## Arguments
+
+    No additional arguments are currently supported.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('InvertedDoublePendulum-v4')
+    ```
+    There is no v3 for InvertedPendulum, unlike the robot environments where a v3 and
+    beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('InvertedDoublePendulum-v2')
+    ```
+
+    ## Version History
+
+    * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
+    * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+    * v2: All continuous control environments now use mujoco-py >= 1.50
+    * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(self, **kwargs):
+        observation_space = Box(low=-np.inf, high=np.inf, shape=(11,), dtype=np.float64)
+        MujocoEnv.__init__(
+            self,
+            "inverted_double_pendulum.xml",
+            5,
+            observation_space=observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+        utils.EzPickle.__init__(self, **kwargs)
+
+    def step(self, action):
+        self.do_simulation(action, self.frame_skip)
+        ob = self._get_obs()
+        x, _, y = self.data.site_xpos[0]
+        dist_penalty = 0.01 * x**2 + (y - 2) ** 2
+        v1, v2 = self.data.qvel[1:3]
+        vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
+        alive_bonus = 10
+        r = alive_bonus - dist_penalty - vel_penalty
+        terminated = bool(y <= 1)
+        if self.render_mode == "human":
+            self.render()
+        return ob, r, terminated, False, {}
+
+    def _get_obs(self):
+        return np.concatenate(
+            [
+                self.data.qpos[:1],  # cart x pos
+                np.sin(self.data.qpos[1:]),  # link angles
+                np.cos(self.data.qpos[1:]),
+                np.clip(self.data.qvel, -10, 10),
+                np.clip(self.data.qfrc_constraint, -10, 10),
+            ]
+        ).ravel()
+
+    def reset_model(self):
+        self.set_state(
+            self.init_qpos
+            + self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq),
+            self.init_qvel + self.np_random.standard_normal(self.model.nv) * 0.1,
+        )
+        return self._get_obs()
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_pendulum.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_pendulum.py
@ -0,0 +1,56 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+class InvertedPendulumEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 25,
+    }
+
+    def __init__(self, **kwargs):
+        utils.EzPickle.__init__(self, **kwargs)
+        observation_space = Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float64)
+        MuJocoPyEnv.__init__(
+            self,
+            "inverted_pendulum.xml",
+            2,
+            observation_space=observation_space,
+            **kwargs,
+        )
+
+    def step(self, a):
+        reward = 1.0
+        self.do_simulation(a, self.frame_skip)
+
+        ob = self._get_obs()
+        terminated = bool(not np.isfinite(ob).all() or (np.abs(ob[1]) > 0.2))
+
+        if self.render_mode == "human":
+            self.render()
+        return ob, reward, terminated, False, {}
+
+    def reset_model(self):
+        qpos = self.init_qpos + self.np_random.uniform(
+            size=self.model.nq, low=-0.01, high=0.01
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            size=self.model.nv, low=-0.01, high=0.01
+        )
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def _get_obs(self):
+        return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel()
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        self.viewer.cam.trackbodyid = 0
+        self.viewer.cam.distance = self.model.stat.extent
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_pendulum_v4.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/inverted_pendulum_v4.py
@ -0,0 +1,137 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MujocoEnv
+from gymnasium.spaces import Box
+
+
+DEFAULT_CAMERA_CONFIG = {
+    "trackbodyid": 0,
+    "distance": 2.04,
+}
+
+
+class InvertedPendulumEnv(MujocoEnv, utils.EzPickle):
+    """
+    ## Description
+
+    This environment is the cartpole environment based on the work done by
+    Barto, Sutton, and Anderson in ["Neuronlike adaptive elements that can
+    solve difficult learning control problems"](https://ieeexplore.ieee.org/document/6313077),
+    just like in the classic environments but now powered by the Mujoco physics simulator -
+    allowing for more complex experiments (such as varying the effects of gravity).
+    This environment involves a cart that can moved linearly, with a pole fixed on it
+    at one end and having another end free. The cart can be pushed left or right, and the
+    goal is to balance the pole on the top of the cart by applying forces on the cart.
+
+    ## Action Space
+    The agent take a 1-element vector for actions.
+
+    The action space is a continuous `(action)` in `[-3, 3]`, where `action` represents
+    the numerical force applied to the cart (with magnitude representing the amount of
+    force and sign representing the direction)
+
+    | Num | Action                    | Control Min | Control Max | Name (in corresponding XML file) | Joint | Unit      |
+    |-----|---------------------------|-------------|-------------|----------------------------------|-------|-----------|
+    | 0   | Force applied on the cart | -3          | 3           | slider                           | slide | Force (N) |
+
+    ## Observation Space
+
+    The state space consists of positional values of different body parts of
+    the pendulum system, followed by the velocities of those individual parts (their derivatives)
+    with all the positions ordered before all the velocities.
+
+    The observation is a `ndarray` with shape `(4,)` where the elements correspond to the following:
+
+    | Num | Observation                                   | Min  | Max | Name (in corresponding XML file) | Joint | Unit                      |
+    | --- | --------------------------------------------- | ---- | --- | -------------------------------- | ----- | ------------------------- |
+    | 0   | position of the cart along the linear surface | -Inf | Inf | slider                           | slide | position (m)              |
+    | 1   | vertical angle of the pole on the cart        | -Inf | Inf | hinge                            | hinge | angle (rad)               |
+    | 2   | linear velocity of the cart                   | -Inf | Inf | slider                           | slide | velocity (m/s)            |
+    | 3   | angular velocity of the pole on the cart      | -Inf | Inf | hinge                            | hinge | anglular velocity (rad/s) |
+
+
+    ## Rewards
+
+    The goal is to make the inverted pendulum stand upright (within a certain angle limit)
+    as long as possible - as such a reward of +1 is awarded for each timestep that
+    the pole is upright.
+
+    ## Starting State
+    All observations start in state
+    (0.0, 0.0, 0.0, 0.0) with a uniform noise in the range
+    of [-0.01, 0.01] added to the values for stochasticity.
+
+    ## Episode End
+    The episode ends when any of the following happens:
+
+    1. Truncation: The episode duration reaches 1000 timesteps.
+    2. Termination: Any of the state space values is no longer finite.
+    3. Termination: The absolute value of the vertical angle between the pole and the cart is greater than 0.2 radian.
+
+    ## Arguments
+
+    No additional arguments are currently supported.
+
+    ```python
+    import gymnasium as gym
+    env = gym.make('InvertedPendulum-v4')
+    ```
+    There is no v3 for InvertedPendulum, unlike the robot environments where a
+    v3 and beyond take `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc.
+    ```python
+    import gymnasium as gym
+    env = gym.make('InvertedPendulum-v2')
+    ```
+
+    ## Version History
+
+    * v4: All MuJoCo environments now use the MuJoCo bindings in mujoco >= 2.1.3
+    * v3: Support for `gymnasium.make` kwargs such as `xml_file`, `ctrl_cost_weight`, `reset_noise_scale`, etc. rgb rendering comes from tracking camera (so agent does not run away from screen)
+    * v2: All continuous control environments now use mujoco-py >= 1.50
+    * v1: max_time_steps raised to 1000 for robot based tasks (including inverted pendulum)
+    * v0: Initial versions release (1.0.0)
+    """
+
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 25,
+    }
+
+    def __init__(self, **kwargs):
+        utils.EzPickle.__init__(self, **kwargs)
+        observation_space = Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float64)
+        MujocoEnv.__init__(
+            self,
+            "inverted_pendulum.xml",
+            2,
+            observation_space=observation_space,
+            default_camera_config=DEFAULT_CAMERA_CONFIG,
+            **kwargs,
+        )
+
+    def step(self, a):
+        reward = 1.0
+        self.do_simulation(a, self.frame_skip)
+        ob = self._get_obs()
+        terminated = bool(not np.isfinite(ob).all() or (np.abs(ob[1]) > 0.2))
+        if self.render_mode == "human":
+            self.render()
+        return ob, reward, terminated, False, {}
+
+    def reset_model(self):
+        qpos = self.init_qpos + self.np_random.uniform(
+            size=self.model.nq, low=-0.01, high=0.01
+        )
+        qvel = self.init_qvel + self.np_random.uniform(
+            size=self.model.nv, low=-0.01, high=0.01
+        )
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def _get_obs(self):
+        return np.concatenate([self.data.qpos, self.data.qvel]).ravel()
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/mujoco_env.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/mujoco_env.py
@ -0,0 +1,418 @@
+from os import path
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+from numpy.typing import NDArray
+
+import gymnasium as gym
+from gymnasium import error, logger, spaces
+from gymnasium.spaces import Space
+
+
+try:
+    import mujoco_py
+except ImportError as e:
+    MUJOCO_PY_IMPORT_ERROR = e
+else:
+    MUJOCO_PY_IMPORT_ERROR = None
+
+try:
+    import mujoco
+except ImportError as e:
+    MUJOCO_IMPORT_ERROR = e
+else:
+    MUJOCO_IMPORT_ERROR = None
+
+
+DEFAULT_SIZE = 480
+
+
+class BaseMujocoEnv(gym.Env[np.float64, np.float32]):
+    """Superclass for all MuJoCo environments."""
+
+    def __init__(
+        self,
+        model_path,
+        frame_skip,
+        observation_space: Space,
+        render_mode: Optional[str] = None,
+        width: int = DEFAULT_SIZE,
+        height: int = DEFAULT_SIZE,
+        camera_id: Optional[int] = None,
+        camera_name: Optional[str] = None,
+    ):
+        """Base abstract class for mujoco based environments.
+
+        Args:
+            model_path: Path to the MuJoCo Model.
+            frame_skip: Number of MuJoCo simulation steps per gym `step()`.
+            observation_space: The observation space of the environment.
+            render_mode: The `render_mode` used.
+            width: The width of the render window.
+            height: The height of the render window.
+            camera_id: The camera ID used.
+            camera_name: The name of the camera used (can not be used in conjunction with `camera_id`).
+
+        Raises:
+            OSError: when the `model_path` does not exist.
+            error.DependencyNotInstalled: When `mujoco` is not installed.
+        """
+        if model_path.startswith(".") or model_path.startswith("/"):
+            self.fullpath = model_path
+        elif model_path.startswith("~"):
+            self.fullpath = path.expanduser(model_path)
+        else:
+            self.fullpath = path.join(path.dirname(__file__), "assets", model_path)
+        if not path.exists(self.fullpath):
+            raise OSError(f"File {self.fullpath} does not exist")
+
+        self.width = width
+        self.height = height
+        # may use width and height
+        self.model, self.data = self._initialize_simulation()
+
+        self.init_qpos = self.data.qpos.ravel().copy()
+        self.init_qvel = self.data.qvel.ravel().copy()
+
+        self.frame_skip = frame_skip
+
+        assert self.metadata["render_modes"] == [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ], self.metadata["render_modes"]
+        if "render_fps" in self.metadata:
+            assert (
+                int(np.round(1.0 / self.dt)) == self.metadata["render_fps"]
+            ), f'Expected value: {int(np.round(1.0 / self.dt))}, Actual value: {self.metadata["render_fps"]}'
+
+        self.observation_space = observation_space
+        self._set_action_space()
+
+        self.render_mode = render_mode
+        self.camera_name = camera_name
+        self.camera_id = camera_id
+
+    def _set_action_space(self):
+        bounds = self.model.actuator_ctrlrange.copy().astype(np.float32)
+        low, high = bounds.T
+        self.action_space = spaces.Box(low=low, high=high, dtype=np.float32)
+        return self.action_space
+
+    # methods to override:
+    # ----------------------------
+
+    def reset_model(self) -> NDArray[np.float64]:
+        """
+        Reset the robot degrees of freedom (qpos and qvel).
+        Implement this in each subclass.
+        """
+        raise NotImplementedError
+
+    def _initialize_simulation(self) -> Tuple[Any, Any]:
+        """
+        Initialize MuJoCo simulation data structures mjModel and mjData.
+        """
+        raise NotImplementedError
+
+    def _reset_simulation(self) -> None:
+        """
+        Reset MuJoCo simulation data structures, mjModel and mjData.
+        """
+        raise NotImplementedError
+
+    def _step_mujoco_simulation(self, ctrl, n_frames) -> None:
+        """
+        Step over the MuJoCo simulation.
+        """
+        raise NotImplementedError
+
+    def render(self) -> Union[NDArray[np.float64], None]:
+        """
+        Render a frame from the MuJoCo simulation as specified by the render_mode.
+        """
+        raise NotImplementedError
+
+    # -----------------------------
+    def _get_reset_info(self) -> Dict[str, float]:
+        """Function that generates the `info` that is returned during a `reset()`."""
+        return {}
+
+    def reset(
+        self,
+        *,
+        seed: Optional[int] = None,
+        options: Optional[dict] = None,
+    ):
+        super().reset(seed=seed)
+
+        self._reset_simulation()
+
+        ob = self.reset_model()
+        info = self._get_reset_info()
+
+        if self.render_mode == "human":
+            self.render()
+        return ob, info
+
+    def set_state(self, qpos, qvel) -> None:
+        """
+        Set the joints position qpos and velocity qvel of the model. Override this method depending on the MuJoCo bindings used.
+        """
+        assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
+
+    @property
+    def dt(self) -> float:
+        return self.model.opt.timestep * self.frame_skip
+
+    def do_simulation(self, ctrl, n_frames) -> None:
+        """
+        Step the simulation n number of frames and applying a control action.
+        """
+        # Check control input is contained in the action space
+        if np.array(ctrl).shape != (self.model.nu,):
+            raise ValueError(
+                f"Action dimension mismatch. Expected {(self.model.nu,)}, found {np.array(ctrl).shape}"
+            )
+        self._step_mujoco_simulation(ctrl, n_frames)
+
+    def close(self):
+        """Close all processes like rendering contexts"""
+        raise NotImplementedError
+
+    def get_body_com(self, body_name) -> NDArray[np.float64]:
+        """Return the cartesian position of a body frame"""
+        raise NotImplementedError
+
+    def state_vector(self) -> NDArray[np.float64]:
+        """Return the position and velocity joint states of the model"""
+        return np.concatenate([self.data.qpos.flat, self.data.qvel.flat])
+
+
+class MuJocoPyEnv(BaseMujocoEnv):
+    def __init__(
+        self,
+        model_path: str,
+        frame_skip: int,
+        observation_space: Space,
+        render_mode: Optional[str] = None,
+        width: int = DEFAULT_SIZE,
+        height: int = DEFAULT_SIZE,
+        camera_id: Optional[int] = None,
+        camera_name: Optional[str] = None,
+    ):
+        if MUJOCO_PY_IMPORT_ERROR is not None:
+            raise error.DependencyNotInstalled(
+                f"{MUJOCO_PY_IMPORT_ERROR}. "
+                "(HINT: you need to install mujoco-py, and also perform the setup instructions "
+                "here: https://github.com/openai/mujoco-py.)"
+            )
+
+        logger.deprecation(
+            "This version of the mujoco environments depends "
+            "on the mujoco-py bindings, which are no longer maintained "
+            "and may stop working. Please upgrade to the v4 versions of "
+            "the environments (which depend on the mujoco python bindings instead), unless "
+            "you are trying to precisely replicate previous works)."
+        )
+
+        self.viewer = None
+        self._viewers = {}
+
+        super().__init__(
+            model_path,
+            frame_skip,
+            observation_space,
+            render_mode,
+            width,
+            height,
+            camera_id,
+            camera_name,
+        )
+
+    def _initialize_simulation(self):
+        model = mujoco_py.load_model_from_path(self.fullpath)
+        self.sim = mujoco_py.MjSim(model)
+        data = self.sim.data
+        return model, data
+
+    def _reset_simulation(self):
+        self.sim.reset()
+
+    def set_state(self, qpos, qvel):
+        super().set_state(qpos, qvel)
+        state = self.sim.get_state()
+        state = mujoco_py.MjSimState(state.time, qpos, qvel, state.act, state.udd_state)
+        self.sim.set_state(state)
+        self.sim.forward()
+
+    def get_body_com(self, body_name):
+        return self.data.get_body_xpos(body_name)
+
+    def _step_mujoco_simulation(self, ctrl, n_frames):
+        self.sim.data.ctrl[:] = ctrl
+
+        for _ in range(n_frames):
+            self.sim.step()
+
+    def render(self):
+        if self.render_mode is None:
+            assert self.spec is not None
+            gym.logger.warn(
+                "You are calling render method without specifying any render mode. "
+                "You can specify the render_mode at initialization, "
+                f'e.g. gym.make("{self.spec.id}", render_mode="rgb_array")'
+            )
+            return
+
+        width, height = self.width, self.height
+        camera_name, camera_id = self.camera_name, self.camera_id
+        if self.render_mode in {"rgb_array", "depth_array"}:
+            if camera_id is not None and camera_name is not None:
+                raise ValueError(
+                    "Both `camera_id` and `camera_name` cannot be"
+                    " specified at the same time."
+                )
+
+            no_camera_specified = camera_name is None and camera_id is None
+            if no_camera_specified:
+                camera_name = "track"
+
+            if camera_id is None and camera_name in self.model._camera_name2id:
+                if camera_name in self.model._camera_name2id:
+                    camera_id = self.model.camera_name2id(camera_name)
+
+                self._get_viewer(self.render_mode).render(
+                    width, height, camera_id=camera_id
+                )
+
+        if self.render_mode == "rgb_array":
+            data = self._get_viewer(self.render_mode).read_pixels(
+                width, height, depth=False
+            )
+            # original image is upside-down, so flip it
+            return data[::-1, :, :]
+        elif self.render_mode == "depth_array":
+            self._get_viewer(self.render_mode).render(width, height)
+            # Extract depth part of the read_pixels() tuple
+            data = self._get_viewer(self.render_mode).read_pixels(
+                width, height, depth=True
+            )[1]
+            # original image is upside-down, so flip it
+            return data[::-1, :]
+        elif self.render_mode == "human":
+            self._get_viewer(self.render_mode).render()
+
+    def _get_viewer(
+        self, mode
+    ) -> Union["mujoco_py.MjViewer", "mujoco_py.MjRenderContextOffscreen"]:
+        self.viewer = self._viewers.get(mode)
+        if self.viewer is None:
+            if mode == "human":
+                self.viewer = mujoco_py.MjViewer(self.sim)
+
+            elif mode in {"rgb_array", "depth_array"}:
+                self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, -1)
+            else:
+                raise AttributeError(
+                    f"Unknown mode: {mode}, expected modes: {self.metadata['render_modes']}"
+                )
+
+            self.viewer_setup()
+            self._viewers[mode] = self.viewer
+
+        return self.viewer
+
+    def close(self):
+        if self.viewer is not None:
+            self.viewer = None
+            self._viewers = {}
+
+    def viewer_setup(self):
+        """
+        This method is called when the viewer is initialized.
+        Optionally implement this method, if you need to tinker with camera position and so forth.
+        """
+        raise NotImplementedError
+
+
+class MujocoEnv(BaseMujocoEnv):
+    """Superclass for MuJoCo environments."""
+
+    def __init__(
+        self,
+        model_path,
+        frame_skip,
+        observation_space: Space,
+        render_mode: Optional[str] = None,
+        width: int = DEFAULT_SIZE,
+        height: int = DEFAULT_SIZE,
+        camera_id: Optional[int] = None,
+        camera_name: Optional[str] = None,
+        default_camera_config: Optional[dict] = None,
+    ):
+        if MUJOCO_IMPORT_ERROR is not None:
+            raise error.DependencyNotInstalled(
+                f"{MUJOCO_IMPORT_ERROR}. "
+                "(HINT: you need to install mujoco, run `pip install gymnasium[mujoco]`.)"
+            )
+
+        super().__init__(
+            model_path,
+            frame_skip,
+            observation_space,
+            render_mode,
+            width,
+            height,
+            camera_id,
+            camera_name,
+        )
+
+        from gymnasium.envs.mujoco.mujoco_rendering import MujocoRenderer
+
+        self.mujoco_renderer = MujocoRenderer(
+            self.model, self.data, default_camera_config
+        )
+
+    def _initialize_simulation(
+        self,
+    ):
+        model = mujoco.MjModel.from_xml_path(self.fullpath)
+        # MjrContext will copy model.vis.global_.off* to con.off*
+        model.vis.global_.offwidth = self.width
+        model.vis.global_.offheight = self.height
+        data = mujoco.MjData(model)
+        return model, data
+
+    def _reset_simulation(self):
+        mujoco.mj_resetData(self.model, self.data)
+
+    def set_state(self, qpos, qvel):
+        super().set_state(qpos, qvel)
+        self.data.qpos[:] = np.copy(qpos)
+        self.data.qvel[:] = np.copy(qvel)
+        if self.model.na == 0:
+            self.data.act[:] = None
+        mujoco.mj_forward(self.model, self.data)
+
+    def _step_mujoco_simulation(self, ctrl, n_frames):
+        self.data.ctrl[:] = ctrl
+
+        mujoco.mj_step(self.model, self.data, nstep=n_frames)
+
+        # As of MuJoCo 2.0, force-related quantities like cacc are not computed
+        # unless there's a force sensor in the model.
+        # See https://github.com/openai/gym/issues/1541
+        mujoco.mj_rnePostConstraint(self.model, self.data)
+
+    def render(self):
+        return self.mujoco_renderer.render(
+            self.render_mode, self.camera_id, self.camera_name
+        )
+
+    def close(self):
+        if self.mujoco_renderer is not None:
+            self.mujoco_renderer.close()
+
+    def get_body_com(self, body_name):
+        return self.data.body(body_name).xpos
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/mujoco_rendering.py
@ -0,0 +1,714 @@
+import collections
+import os
+import time
+from typing import Optional
+
+import glfw
+import imageio
+import mujoco
+import numpy as np
+
+
+def _import_egl(width, height):
+    from mujoco.egl import GLContext
+
+    return GLContext(width, height)
+
+
+def _import_glfw(width, height):
+    from mujoco.glfw import GLContext
+
+    return GLContext(width, height)
+
+
+def _import_osmesa(width, height):
+    from mujoco.osmesa import GLContext
+
+    return GLContext(width, height)
+
+
+_ALL_RENDERERS = collections.OrderedDict(
+    [
+        ("glfw", _import_glfw),
+        ("egl", _import_egl),
+        ("osmesa", _import_osmesa),
+    ]
+)
+
+
+class BaseRender:
+    def __init__(
+        self, model: "mujoco.MjModel", data: "mujoco.MjData", width: int, height: int
+    ):
+        """Render context superclass for offscreen and window rendering."""
+        self.model = model
+        self.data = data
+
+        self._markers = []
+        self._overlays = {}
+
+        self.viewport = mujoco.MjrRect(0, 0, width, height)
+
+        # This goes to specific visualizer
+        self.scn = mujoco.MjvScene(self.model, 1000)
+        self.cam = mujoco.MjvCamera()
+        self.vopt = mujoco.MjvOption()
+        self.pert = mujoco.MjvPerturb()
+
+        self.make_context_current()
+
+        # Keep in Mujoco Context
+        self.con = mujoco.MjrContext(self.model, mujoco.mjtFontScale.mjFONTSCALE_150)
+
+        self._set_mujoco_buffer()
+
+    def _set_mujoco_buffer(self):
+        raise NotImplementedError
+
+    def make_context_current(self):
+        raise NotImplementedError
+
+    def add_overlay(self, gridpos: int, text1: str, text2: str):
+        """Overlays text on the scene."""
+        if gridpos not in self._overlays:
+            self._overlays[gridpos] = ["", ""]
+        self._overlays[gridpos][0] += text1 + "\n"
+        self._overlays[gridpos][1] += text2 + "\n"
+
+    def add_marker(self, **marker_params):
+        self._markers.append(marker_params)
+
+    def _add_marker_to_scene(self, marker: dict):
+        if self.scn.ngeom >= self.scn.maxgeom:
+            raise RuntimeError("Ran out of geoms. maxgeom: %d" % self.scn.maxgeom)
+
+        g = self.scn.geoms[self.scn.ngeom]
+        # default values.
+        g.dataid = -1
+        g.objtype = mujoco.mjtObj.mjOBJ_UNKNOWN
+        g.objid = -1
+        g.category = mujoco.mjtCatBit.mjCAT_DECOR
+        g.texid = -1
+        g.texuniform = 0
+        g.texrepeat[0] = 1
+        g.texrepeat[1] = 1
+        g.emission = 0
+        g.specular = 0.5
+        g.shininess = 0.5
+        g.reflectance = 0
+        g.type = mujoco.mjtGeom.mjGEOM_BOX
+        g.size[:] = np.ones(3) * 0.1
+        g.mat[:] = np.eye(3)
+        g.rgba[:] = np.ones(4)
+
+        for key, value in marker.items():
+            if isinstance(value, (int, float, mujoco._enums.mjtGeom)):
+                setattr(g, key, value)
+            elif isinstance(value, (tuple, list, np.ndarray)):
+                attr = getattr(g, key)
+                attr[:] = np.asarray(value).reshape(attr.shape)
+            elif isinstance(value, str):
+                assert key == "label", "Only label is a string in mjtGeom."
+                if value is None:
+                    g.label[0] = 0
+                else:
+                    g.label = value
+            elif hasattr(g, key):
+                raise ValueError(
+                    "mjtGeom has attr {} but type {} is invalid".format(
+                        key, type(value)
+                    )
+                )
+            else:
+                raise ValueError("mjtGeom doesn't have field %s" % key)
+
+        self.scn.ngeom += 1
+
+    def close(self):
+        """Override close in your rendering subclass to perform any necessary cleanup
+        after env.close() is called.
+        """
+        raise NotImplementedError
+
+
+class OffScreenViewer(BaseRender):
+    """Offscreen rendering class with opengl context."""
+
+    def __init__(self, model: "mujoco.MjMujoco", data: "mujoco.MjData"):
+        width = model.vis.global_.offwidth
+        height = model.vis.global_.offheight
+
+        # We must make GLContext before MjrContext
+        self._get_opengl_backend(width, height)
+
+        super().__init__(model, data, width, height)
+
+        self._init_camera()
+
+    def _init_camera(self):
+        self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
+        self.cam.fixedcamid = -1
+        for i in range(3):
+            self.cam.lookat[i] = np.median(self.data.geom_xpos[:, i])
+        self.cam.distance = self.model.stat.extent
+
+    def _get_opengl_backend(self, width: int, height: int):
+        self.backend = os.environ.get("MUJOCO_GL")
+        if self.backend is not None:
+            try:
+                self.opengl_context = _ALL_RENDERERS[self.backend](width, height)
+            except KeyError as e:
+                raise RuntimeError(
+                    "Environment variable {} must be one of {!r}: got {!r}.".format(
+                        "MUJOCO_GL", _ALL_RENDERERS.keys(), self.backend
+                    )
+                ) from e
+
+        else:
+            for name, _ in _ALL_RENDERERS.items():
+                try:
+                    self.opengl_context = _ALL_RENDERERS[name](width, height)
+                    self.backend = name
+                    break
+                except:  # noqa:E722
+                    pass
+            if self.backend is None:
+                raise RuntimeError(
+                    "No OpenGL backend could be imported. Attempting to create a "
+                    "rendering context will result in a RuntimeError."
+                )
+
+    def _set_mujoco_buffer(self):
+        mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_OFFSCREEN, self.con)
+
+    def make_context_current(self):
+        self.opengl_context.make_current()
+
+    def free(self):
+        self.opengl_context.free()
+
+    def __del__(self):
+        self.free()
+
+    def render(
+        self,
+        render_mode: str,
+        camera_id: Optional[int] = None,
+        segmentation: bool = False,
+    ):
+        if camera_id is not None:
+            if camera_id == -1:
+                self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
+            else:
+                self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
+            self.cam.fixedcamid = camera_id
+
+        mujoco.mjv_updateScene(
+            self.model,
+            self.data,
+            self.vopt,
+            self.pert,
+            self.cam,
+            mujoco.mjtCatBit.mjCAT_ALL,
+            self.scn,
+        )
+
+        if segmentation:
+            self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 1
+            self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 1
+
+        for marker_params in self._markers:
+            self._add_marker_to_scene(marker_params)
+
+        mujoco.mjr_render(self.viewport, self.scn, self.con)
+
+        for gridpos, (text1, text2) in self._overlays.items():
+            mujoco.mjr_overlay(
+                mujoco.mjtFontScale.mjFONTSCALE_150,
+                gridpos,
+                self.viewport,
+                text1.encode(),
+                text2.encode(),
+                self.con,
+            )
+
+        if segmentation:
+            self.scn.flags[mujoco.mjtRndFlag.mjRND_SEGMENT] = 0
+            self.scn.flags[mujoco.mjtRndFlag.mjRND_IDCOLOR] = 0
+
+        rgb_arr = np.zeros(
+            3 * self.viewport.width * self.viewport.height, dtype=np.uint8
+        )
+        depth_arr = np.zeros(
+            self.viewport.width * self.viewport.height, dtype=np.float32
+        )
+
+        mujoco.mjr_readPixels(rgb_arr, depth_arr, self.viewport, self.con)
+
+        if render_mode == "depth_array":
+            depth_img = depth_arr.reshape(self.viewport.height, self.viewport.width)
+            # original image is upside-down, so flip it
+            return depth_img[::-1, :]
+        else:
+            rgb_img = rgb_arr.reshape(self.viewport.height, self.viewport.width, 3)
+
+            if segmentation:
+                seg_img = (
+                    rgb_img[:, :, 0]
+                    + rgb_img[:, :, 1] * (2**8)
+                    + rgb_img[:, :, 2] * (2**16)
+                )
+                seg_img[seg_img >= (self.scn.ngeom + 1)] = 0
+                seg_ids = np.full(
+                    (self.scn.ngeom + 1, 2), fill_value=-1, dtype=np.int32
+                )
+
+                for i in range(self.scn.ngeom):
+                    geom = self.scn.geoms[i]
+                    if geom.segid != -1:
+                        seg_ids[geom.segid + 1, 0] = geom.objtype
+                        seg_ids[geom.segid + 1, 1] = geom.objid
+                rgb_img = seg_ids[seg_img]
+
+            # original image is upside-down, so flip i
+            return rgb_img[::-1, :, :]
+
+    def close(self):
+        self.free()
+        glfw.terminate()
+
+
+class WindowViewer(BaseRender):
+    """Class for window rendering in all MuJoCo environments."""
+
+    def __init__(self, model: "mujoco.MjModel", data: "mujoco.MjData"):
+        glfw.init()
+
+        self._button_left_pressed = False
+        self._button_right_pressed = False
+        self._last_mouse_x = 0
+        self._last_mouse_y = 0
+        self._paused = False
+        self._transparent = False
+        self._contacts = False
+        self._render_every_frame = True
+        self._image_idx = 0
+        self._image_path = "/tmp/frame_%07d.png"
+        self._time_per_render = 1 / 60.0
+        self._run_speed = 1.0
+        self._loop_count = 0
+        self._advance_by_one_step = False
+        self._hide_menu = False
+
+        width, height = glfw.get_video_mode(glfw.get_primary_monitor()).size
+        glfw.window_hint(glfw.VISIBLE, 1)
+        self.window = glfw.create_window(width // 2, height // 2, "mujoco", None, None)
+
+        self.width, self.height = glfw.get_framebuffer_size(self.window)
+        window_width, _ = glfw.get_window_size(self.window)
+        self._scale = self.width * 1.0 / window_width
+
+        # set callbacks
+        glfw.set_cursor_pos_callback(self.window, self._cursor_pos_callback)
+        glfw.set_mouse_button_callback(self.window, self._mouse_button_callback)
+        glfw.set_scroll_callback(self.window, self._scroll_callback)
+        glfw.set_key_callback(self.window, self._key_callback)
+
+        super().__init__(model, data, width, height)
+        glfw.swap_interval(1)
+
+    def _set_mujoco_buffer(self):
+        mujoco.mjr_setBuffer(mujoco.mjtFramebuffer.mjFB_WINDOW, self.con)
+
+    def make_context_current(self):
+        glfw.make_context_current(self.window)
+
+    def free(self):
+        if self.window:
+            if glfw.get_current_context() == self.window:
+                glfw.make_context_current(None)
+            glfw.destroy_window(self.window)
+            self.window = None
+
+    def __del__(self):
+        """Eliminate all of the OpenGL glfw contexts and windows"""
+        self.free()
+
+    def render(self):
+        """
+        Renders the environment geometries in the OpenGL glfw window:
+            1. Create the overlay for the left side panel menu.
+            2. Update the geometries used for rendering based on the current state of the model - `mujoco.mjv_updateScene()`.
+            3. Add markers to scene, these are additional geometries to include in the model, i.e arrows, https://mujoco.readthedocs.io/en/latest/APIreference.html?highlight=arrow#mjtgeom.
+                These markers are added with the `add_marker()` method before rendering.
+            4. Render the 3D scene to the window context - `mujoco.mjr_render()`.
+            5. Render overlays in the window context - `mujoco.mjr_overlay()`.
+            6. Swap front and back buffer, https://www.glfw.org/docs/3.3/quick.html.
+            7. Poll events like mouse clicks or keyboard input.
+        """
+
+        # mjv_updateScene, mjr_render, mjr_overlay
+        def update():
+            # fill overlay items
+            self._create_overlay()
+
+            render_start = time.time()
+            if self.window is None:
+                return
+            elif glfw.window_should_close(self.window):
+                glfw.destroy_window(self.window)
+                glfw.terminate()
+            self.viewport.width, self.viewport.height = glfw.get_framebuffer_size(
+                self.window
+            )
+            # update scene
+            mujoco.mjv_updateScene(
+                self.model,
+                self.data,
+                self.vopt,
+                mujoco.MjvPerturb(),
+                self.cam,
+                mujoco.mjtCatBit.mjCAT_ALL.value,
+                self.scn,
+            )
+
+            # marker items
+            for marker in self._markers:
+                self._add_marker_to_scene(marker)
+
+            # render
+            mujoco.mjr_render(self.viewport, self.scn, self.con)
+
+            # overlay items
+            if not self._hide_menu:
+                for gridpos, [t1, t2] in self._overlays.items():
+                    mujoco.mjr_overlay(
+                        mujoco.mjtFontScale.mjFONTSCALE_150,
+                        gridpos,
+                        self.viewport,
+                        t1,
+                        t2,
+                        self.con,
+                    )
+
+            glfw.swap_buffers(self.window)
+            glfw.poll_events()
+            self._time_per_render = 0.9 * self._time_per_render + 0.1 * (
+                time.time() - render_start
+            )
+
+        if self._paused:
+            while self._paused:
+                update()
+                if self._advance_by_one_step:
+                    self._advance_by_one_step = False
+                    break
+        else:
+            self._loop_count += self.model.opt.timestep / (
+                self._time_per_render * self._run_speed
+            )
+            if self._render_every_frame:
+                self._loop_count = 1
+            while self._loop_count > 0:
+                update()
+                self._loop_count -= 1
+
+        # clear overlay
+        self._overlays.clear()
+        # clear markers
+        self._markers.clear()
+
+    def close(self):
+        self.free()
+        glfw.terminate()
+
+    def _key_callback(self, window, key: int, scancode, action: int, mods):
+        if action != glfw.RELEASE:
+            return
+        # Switch cameras
+        elif key == glfw.KEY_TAB:
+            self.cam.fixedcamid += 1
+            self.cam.type = mujoco.mjtCamera.mjCAMERA_FIXED
+            if self.cam.fixedcamid >= self.model.ncam:
+                self.cam.fixedcamid = -1
+                self.cam.type = mujoco.mjtCamera.mjCAMERA_FREE
+        # Pause simulation
+        elif key == glfw.KEY_SPACE and self._paused is not None:
+            self._paused = not self._paused
+        # Advances simulation by one step.
+        elif key == glfw.KEY_RIGHT and self._paused is not None:
+            self._advance_by_one_step = True
+            self._paused = True
+        # Slows down simulation
+        elif key == glfw.KEY_S:
+            self._run_speed /= 2.0
+        # Speeds up simulation
+        elif key == glfw.KEY_F:
+            self._run_speed *= 2.0
+        # Turn off / turn on rendering every frame.
+        elif key == glfw.KEY_D:
+            self._render_every_frame = not self._render_every_frame
+        # Capture screenshot
+        elif key == glfw.KEY_T:
+            img = np.zeros(
+                (
+                    glfw.get_framebuffer_size(self.window)[1],
+                    glfw.get_framebuffer_size(self.window)[0],
+                    3,
+                ),
+                dtype=np.uint8,
+            )
+            mujoco.mjr_readPixels(img, None, self.viewport, self.con)
+            imageio.imwrite(self._image_path % self._image_idx, np.flipud(img))
+            self._image_idx += 1
+        # Display contact forces
+        elif key == glfw.KEY_C:
+            self._contacts = not self._contacts
+            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTPOINT] = self._contacts
+            self.vopt.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = self._contacts
+        # Display coordinate frames
+        elif key == glfw.KEY_E:
+            self.vopt.frame = 1 - self.vopt.frame
+        # Hide overlay menu
+        elif key == glfw.KEY_H:
+            self._hide_menu = not self._hide_menu
+        # Make transparent
+        elif key == glfw.KEY_R:
+            self._transparent = not self._transparent
+            if self._transparent:
+                self.model.geom_rgba[:, 3] /= 5.0
+            else:
+                self.model.geom_rgba[:, 3] *= 5.0
+        # Geom group visibility
+        elif key in (glfw.KEY_0, glfw.KEY_1, glfw.KEY_2, glfw.KEY_3, glfw.KEY_4):
+            self.vopt.geomgroup[key - glfw.KEY_0] ^= 1
+        # Quit
+        if key == glfw.KEY_ESCAPE:
+            print("Pressed ESC")
+            print("Quitting.")
+            glfw.destroy_window(self.window)
+            glfw.terminate()
+
+    def _cursor_pos_callback(
+        self, window: "glfw.LP__GLFWwindow", xpos: float, ypos: float
+    ):
+        if not (self._button_left_pressed or self._button_right_pressed):
+            return
+
+        mod_shift = (
+            glfw.get_key(window, glfw.KEY_LEFT_SHIFT) == glfw.PRESS
+            or glfw.get_key(window, glfw.KEY_RIGHT_SHIFT) == glfw.PRESS
+        )
+        if self._button_right_pressed:
+            action = (
+                mujoco.mjtMouse.mjMOUSE_MOVE_H
+                if mod_shift
+                else mujoco.mjtMouse.mjMOUSE_MOVE_V
+            )
+        elif self._button_left_pressed:
+            action = (
+                mujoco.mjtMouse.mjMOUSE_ROTATE_H
+                if mod_shift
+                else mujoco.mjtMouse.mjMOUSE_ROTATE_V
+            )
+        else:
+            action = mujoco.mjtMouse.mjMOUSE_ZOOM
+
+        dx = int(self._scale * xpos) - self._last_mouse_x
+        dy = int(self._scale * ypos) - self._last_mouse_y
+        width, height = glfw.get_framebuffer_size(window)
+
+        mujoco.mjv_moveCamera(
+            self.model, action, dx / height, dy / height, self.scn, self.cam
+        )
+
+        self._last_mouse_x = int(self._scale * xpos)
+        self._last_mouse_y = int(self._scale * ypos)
+
+    def _mouse_button_callback(self, window: "glfw.LP__GLFWwindow", button, act, mods):
+        self._button_left_pressed = (
+            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_LEFT) == glfw.PRESS
+        )
+        self._button_right_pressed = (
+            glfw.get_mouse_button(window, glfw.MOUSE_BUTTON_RIGHT) == glfw.PRESS
+        )
+
+        x, y = glfw.get_cursor_pos(window)
+        self._last_mouse_x = int(self._scale * x)
+        self._last_mouse_y = int(self._scale * y)
+
+    def _scroll_callback(self, window, x_offset, y_offset: float):
+        mujoco.mjv_moveCamera(
+            self.model,
+            mujoco.mjtMouse.mjMOUSE_ZOOM,
+            0,
+            -0.05 * y_offset,
+            self.scn,
+            self.cam,
+        )
+
+    def _create_overlay(self):
+        topleft = mujoco.mjtGridPos.mjGRID_TOPLEFT
+        bottomleft = mujoco.mjtGridPos.mjGRID_BOTTOMLEFT
+
+        if self._render_every_frame:
+            self.add_overlay(topleft, "", "")
+        else:
+            self.add_overlay(
+                topleft,
+                "Run speed = %.3f x real time" % self._run_speed,
+                "[S]lower, [F]aster",
+            )
+        self.add_overlay(
+            topleft, "Ren[d]er every frame", "On" if self._render_every_frame else "Off"
+        )
+        self.add_overlay(
+            topleft,
+            "Switch camera (#cams = %d)" % (self.model.ncam + 1),
+            "[Tab] (camera ID = %d)" % self.cam.fixedcamid,
+        )
+        self.add_overlay(topleft, "[C]ontact forces", "On" if self._contacts else "Off")
+        self.add_overlay(topleft, "T[r]ansparent", "On" if self._transparent else "Off")
+        if self._paused is not None:
+            if not self._paused:
+                self.add_overlay(topleft, "Stop", "[Space]")
+            else:
+                self.add_overlay(topleft, "Start", "[Space]")
+                self.add_overlay(
+                    topleft, "Advance simulation by one step", "[right arrow]"
+                )
+        self.add_overlay(
+            topleft, "Referenc[e] frames", "On" if self.vopt.frame == 1 else "Off"
+        )
+        self.add_overlay(topleft, "[H]ide Menu", "")
+        if self._image_idx > 0:
+            fname = self._image_path % (self._image_idx - 1)
+            self.add_overlay(topleft, "Cap[t]ure frame", "Saved as %s" % fname)
+        else:
+            self.add_overlay(topleft, "Cap[t]ure frame", "")
+        self.add_overlay(topleft, "Toggle geomgroup visibility", "0-4")
+
+        self.add_overlay(bottomleft, "FPS", "%d%s" % (1 / self._time_per_render, ""))
+        self.add_overlay(
+            bottomleft, "Solver iterations", str(self.data.solver_iter + 1)
+        )
+        self.add_overlay(
+            bottomleft, "Step", str(round(self.data.time / self.model.opt.timestep))
+        )
+        self.add_overlay(bottomleft, "timestep", "%.5f" % self.model.opt.timestep)
+
+
+class MujocoRenderer:
+    """This is the MuJoCo renderer manager class for every MuJoCo environment.
+
+    The class has two main public methods available:
+    - :meth:`render` - Renders the environment in three possible modes: "human", "rgb_array", or "depth_array"
+    - :meth:`close` - Closes all contexts initialized with the renderer
+
+    """
+
+    def __init__(
+        self,
+        model: "mujoco.MjModel",
+        data: "mujoco.MjData",
+        default_cam_config: Optional[dict] = None,
+    ):
+        """A wrapper for clipping continuous actions within the valid bound.
+
+        Args:
+            model: MjModel data structure of the MuJoCo simulation
+            data: MjData data structure of the MuJoCo simulation
+            default_cam_config: dictionary with attribute values of the viewer's default camera, https://mujoco.readthedocs.io/en/latest/XMLreference.html?highlight=camera#visual-global
+        """
+        self.model = model
+        self.data = data
+        self._viewers = {}
+        self.viewer = None
+        self.default_cam_config = default_cam_config
+
+    def render(
+        self,
+        render_mode: str,
+        camera_id: Optional[int] = None,
+        camera_name: Optional[str] = None,
+    ):
+        """Renders a frame of the simulation in a specific format and camera view.
+
+        Args:
+            render_mode: The format to render the frame, it can be: "human", "rgb_array", or "depth_array"
+            camera_id: The integer camera id from which to render the frame in the MuJoCo simulation
+            camera_name: The string name of the camera from which to render the frame in the MuJoCo simulation. This argument should not be passed if using cameara_id instead and vice versa
+
+        Returns:
+            If render_mode is "rgb_array" or "depth_arra" it returns a numpy array in the specified format. "human" render mode does not return anything.
+        """
+
+        viewer = self._get_viewer(render_mode=render_mode)
+
+        if render_mode in {
+            "rgb_array",
+            "depth_array",
+        }:
+            if camera_id is not None and camera_name is not None:
+                raise ValueError(
+                    "Both `camera_id` and `camera_name` cannot be"
+                    " specified at the same time."
+                )
+
+            no_camera_specified = camera_name is None and camera_id is None
+            if no_camera_specified:
+                camera_name = "track"
+
+            if camera_id is None:
+                camera_id = mujoco.mj_name2id(
+                    self.model,
+                    mujoco.mjtObj.mjOBJ_CAMERA,
+                    camera_name,
+                )
+
+            img = viewer.render(render_mode=render_mode, camera_id=camera_id)
+            return img
+
+        elif render_mode == "human":
+            return viewer.render()
+
+    def _get_viewer(self, render_mode: str):
+        """Initializes and returns a viewer class depending on the render_mode
+        - `WindowViewer` class for "human" render mode
+        - `OffScreenViewer` class for "rgb_array" or "depth_array" render mode
+        """
+        self.viewer = self._viewers.get(render_mode)
+        if self.viewer is None:
+            if render_mode == "human":
+                self.viewer = WindowViewer(self.model, self.data)
+
+            elif render_mode in {"rgb_array", "depth_array"}:
+                self.viewer = OffScreenViewer(self.model, self.data)
+            else:
+                raise AttributeError(
+                    f"Unexpected mode: {render_mode}, expected modes: human, rgb_array, or depth_array"
+                )
+            # Add default camera parameters
+            self._set_cam_config()
+            self._viewers[render_mode] = self.viewer
+
+        if len(self._viewers.keys()) > 1:
+            # Only one context can be current at a time
+            self.viewer.make_context_current()
+
+        return self.viewer
+
+    def _set_cam_config(self):
+        """Set the default camera parameters"""
+        assert self.viewer is not None
+        if self.default_cam_config is not None:
+            for key, value in self.default_cam_config.items():
+                if isinstance(value, np.ndarray):
+                    getattr(self.viewer.cam, key)[:] = value
+                else:
+                    setattr(self.viewer.cam, key, value)
+
+    def close(self):
+        """Close the OpenGL rendering contexts of all viewer modes"""
+        for _, viewer in self._viewers.items():
+            viewer.close()
--- a/rl/Lib/site-packages/gymnasium/envs/mujoco/pusher.py
+++ b/rl/Lib/site-packages/gymnasium/envs/mujoco/pusher.py
@ -0,0 +1,84 @@
+import numpy as np
+
+from gymnasium import utils
+from gymnasium.envs.mujoco import MuJocoPyEnv
+from gymnasium.spaces import Box
+
+
+class PusherEnv(MuJocoPyEnv, utils.EzPickle):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 20,
+    }
+
+    def __init__(self, **kwargs):
+        utils.EzPickle.__init__(self, **kwargs)
+        observation_space = Box(low=-np.inf, high=np.inf, shape=(23,), dtype=np.float64)
+        MuJocoPyEnv.__init__(
+            self, "pusher.xml", 5, observation_space=observation_space, **kwargs
+        )
+
+    def step(self, a):
+        vec_1 = self.get_body_com("object") - self.get_body_com("tips_arm")
+        vec_2 = self.get_body_com("object") - self.get_body_com("goal")
+
+        reward_near = -np.linalg.norm(vec_1)
+        reward_dist = -np.linalg.norm(vec_2)
+        reward_ctrl = -np.square(a).sum()
+        reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near
+
+        self.do_simulation(a, self.frame_skip)
+        if self.render_mode == "human":
+            self.render()
+
+        ob = self._get_obs()
+        return (
+            ob,
+            reward,
+            False,
+            False,
+            dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl),
+        )
+
+    def viewer_setup(self):
+        assert self.viewer is not None
+        self.viewer.cam.trackbodyid = -1
+        self.viewer.cam.distance = 4.0
+
+    def reset_model(self):
+        qpos = self.init_qpos
+
+        self.goal_pos = np.asarray([0, 0])
+        while True:
+            self.cylinder_pos = np.concatenate(
+                [
+                    self.np_random.uniform(low=-0.3, high=0, size=1),
+                    self.np_random.uniform(low=-0.2, high=0.2, size=1),
+                ]
+            )
+            if np.linalg.norm(self.cylinder_pos - self.goal_pos) > 0.17:
+                break
+
+        qpos[-4:-2] = self.cylinder_pos
+        qpos[-2:] = self.goal_pos
+        qvel = self.init_qvel + self.np_random.uniform(
+            low=-0.005, high=0.005, size=self.model.nv
+        )
+        qvel[-4:] = 0
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def _get_obs(self):
+        return np.concatenate(
+            [
+                self.sim.data.qpos.flat[:7],
+                self.sim.data.qvel.flat[:7],
+                self.get_body_com("tips_arm"),
+                self.get_body_com("object"),
+                self.get_body_com("goal"),
+            ]
+        )
--- a/Show More
+++ b/Show More