I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
from stable_baselines3.common.envs.bit_flipping_env import BitFlippingEnv
from stable_baselines3.common.envs.identity_env import (
FakeImageEnv,
IdentityEnv,
IdentityEnvBox,
IdentityEnvMultiBinary,
IdentityEnvMultiDiscrete,
)
from stable_baselines3.common.envs.multi_input_envs import SimpleMultiObsEnv
__all__ = [
"BitFlippingEnv",
"FakeImageEnv",
"IdentityEnv",
"IdentityEnvBox",
"IdentityEnvMultiBinary",
"IdentityEnvMultiDiscrete",
"SimpleMultiObsEnv",
"SimpleMultiObsEnv",
]
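These re-exports form the package's public test-env API. As a hedged illustration (assuming the package is importable as stable_baselines3), the environments defined in the files below can be used directly:

from stable_baselines3.common.envs import BitFlippingEnv, IdentityEnvBox, SimpleMultiObsEnv

# Each env follows the Gymnasium API: reset() -> (obs, info), step() -> (obs, reward, terminated, truncated, info)
print(BitFlippingEnv(n_bits=4).observation_space)  # Dict with observation/achieved_goal/desired_goal keys
print(IdentityEnvBox().action_space)               # Box(-1.0, 1.0, (1,), float32)
print(SimpleMultiObsEnv().observation_space)       # Dict with 'vec' (Box) and 'img' (Box) keys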

View File

@@ -0,0 +1,235 @@
from collections import OrderedDict
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from gymnasium import Env, spaces
from gymnasium.envs.registration import EnvSpec
from stable_baselines3.common.type_aliases import GymStepReturn
class BitFlippingEnv(Env):
"""
Simple bit flipping env, useful to test HER.
The goal is to flip all the bits to get a vector of ones.
In the continuous variant, if the ith action component has a value > 0,
then the ith bit will be flipped. Uses a ``MultiBinary`` observation space
by default.
:param n_bits: Number of bits to flip
:param continuous: Whether to use the continuous action version;
by default, the discrete one is used
:param max_steps: Max number of steps; by default, equal to n_bits
:param discrete_obs_space: Whether to use the discrete observation
version, i.e. the state encoded as a single integer index
:param image_obs_space: Whether to use an image observation version,
i.e. a greyscale image encoding of the state
:param channel_first: Whether to use channel-first or channel-last images.
"""
spec = EnvSpec("BitFlippingEnv-v0", "no-entry-point")
state: np.ndarray
def __init__(
self,
n_bits: int = 10,
continuous: bool = False,
max_steps: Optional[int] = None,
discrete_obs_space: bool = False,
image_obs_space: bool = False,
channel_first: bool = True,
render_mode: str = "human",
):
super().__init__()
self.render_mode = render_mode
# Shape of the observation when using image space
self.image_shape = (1, 36, 36) if channel_first else (36, 36, 1)
# The achieved goal is determined by the current state
# here, it is a special case where they are equal
# observation space for observations given to the model
self.observation_space = self._make_observation_space(discrete_obs_space, image_obs_space, n_bits)
# observation space used to update internal state
self._obs_space = spaces.MultiBinary(n_bits)
if continuous:
self.action_space = spaces.Box(-1, 1, shape=(n_bits,), dtype=np.float32)
else:
self.action_space = spaces.Discrete(n_bits)
self.continuous = continuous
self.discrete_obs_space = discrete_obs_space
self.image_obs_space = image_obs_space
self.desired_goal = np.ones((n_bits,), dtype=self.observation_space["desired_goal"].dtype)
if max_steps is None:
max_steps = n_bits
self.max_steps = max_steps
self.current_step = 0
def seed(self, seed: int) -> None:
self._obs_space.seed(seed)
def convert_if_needed(self, state: np.ndarray) -> Union[int, np.ndarray]:
"""
Convert to discrete space if needed.
:param state: The state to convert.
:return: The state in the observation format in use (integer index, image, or bit vector).
"""
if self.discrete_obs_space:
# The internal state is the binary representation of the
# observed one
return int(sum(state[i] * 2**i for i in range(len(state))))
if self.image_obs_space:
size = np.prod(self.image_shape)
image = np.concatenate((state * 255, np.zeros(size - len(state), dtype=np.uint8)))
return image.reshape(self.image_shape).astype(np.uint8)
return state
def convert_to_bit_vector(self, state: Union[int, np.ndarray], batch_size: int) -> np.ndarray:
"""
Convert to bit vector if needed.
:param state: The state to be converted, which can be either an integer or a numpy array.
:param batch_size: The batch size.
:return: The state converted into a bit vector.
"""
# Convert back to bit vector
if isinstance(state, int):
bit_vector = np.array(state).reshape(batch_size, -1)
# Convert to binary representation
bit_vector = ((bit_vector[:, :] & (1 << np.arange(len(self.state)))) > 0).astype(int)
elif self.image_obs_space:
bit_vector = state.reshape(batch_size, -1)[:, : len(self.state)] / 255
else:
bit_vector = np.array(state).reshape(batch_size, -1)
return bit_vector
def _make_observation_space(self, discrete_obs_space: bool, image_obs_space: bool, n_bits: int) -> spaces.Dict:
"""
Helper to create observation space
:param discrete_obs_space: Whether to use the discrete observation version
:param image_obs_space: Whether to use the image observation version
:param n_bits: The number of bits used to represent the state
:return: the environment observation space
"""
if discrete_obs_space and image_obs_space:
raise ValueError("Cannot use both discrete and image observation spaces")
if discrete_obs_space:
# In the discrete case, the agent acts on the binary
# representation of the observation
return spaces.Dict(
{
"observation": spaces.Discrete(2**n_bits),
"achieved_goal": spaces.Discrete(2**n_bits),
"desired_goal": spaces.Discrete(2**n_bits),
}
)
if image_obs_space:
# When using image as input,
# one image contains the bits 0 -> 0, 1 -> 255
# and the rest is filled with zeros
return spaces.Dict(
{
"observation": spaces.Box(
low=0,
high=255,
shape=self.image_shape,
dtype=np.uint8,
),
"achieved_goal": spaces.Box(
low=0,
high=255,
shape=self.image_shape,
dtype=np.uint8,
),
"desired_goal": spaces.Box(
low=0,
high=255,
shape=self.image_shape,
dtype=np.uint8,
),
}
)
return spaces.Dict(
{
"observation": spaces.MultiBinary(n_bits),
"achieved_goal": spaces.MultiBinary(n_bits),
"desired_goal": spaces.MultiBinary(n_bits),
}
)
def _get_obs(self) -> Dict[str, Union[int, np.ndarray]]:
"""
Helper to create the observation.
:return: The current observation.
"""
return OrderedDict(
[
("observation", self.convert_if_needed(self.state.copy())),
("achieved_goal", self.convert_if_needed(self.state.copy())),
("desired_goal", self.convert_if_needed(self.desired_goal.copy())),
]
)
def reset(
self, *, seed: Optional[int] = None, options: Optional[Dict] = None
) -> Tuple[Dict[str, Union[int, np.ndarray]], Dict]:
if seed is not None:
self._obs_space.seed(seed)
self.current_step = 0
self.state = self._obs_space.sample()
return self._get_obs(), {}
def step(self, action: Union[np.ndarray, int]) -> GymStepReturn:
"""
Step into the env.
:param action: The bit index to flip (discrete case) or a continuous action vector.
:return: The Gymnasium step tuple (observation, reward, terminated, truncated, info).
"""
if self.continuous:
self.state[action > 0] = 1 - self.state[action > 0]
else:
self.state[action] = 1 - self.state[action]
obs = self._get_obs()
reward = float(self.compute_reward(obs["achieved_goal"], obs["desired_goal"], None).item())
terminated = reward == 0
self.current_step += 1
# The episode terminates when we reach the goal or the max number of steps
info = {"is_success": terminated}
truncated = self.current_step >= self.max_steps
return obs, reward, terminated, truncated, info
def compute_reward(
self, achieved_goal: Union[int, np.ndarray], desired_goal: Union[int, np.ndarray], _info: Optional[Dict[str, Any]]
) -> np.float32:
# As we are using a vectorized version, we need to keep track of the `batch_size`
if isinstance(achieved_goal, int):
batch_size = 1
elif self.image_obs_space:
batch_size = achieved_goal.shape[0] if len(achieved_goal.shape) > 3 else 1
else:
batch_size = achieved_goal.shape[0] if len(achieved_goal.shape) > 1 else 1
desired_goal = self.convert_to_bit_vector(desired_goal, batch_size)
achieved_goal = self.convert_to_bit_vector(achieved_goal, batch_size)
# Deceptive (sparse) reward: it is 0 only when the goal is achieved, -1 otherwise
# Here we are using a vectorized version
distance = np.linalg.norm(achieved_goal - desired_goal, axis=-1)
return -(distance > 0).astype(np.float32)
def render(self) -> Optional[np.ndarray]: # type: ignore[override]
if self.render_mode == "rgb_array":
return self.state.copy()
print(self.state)
return None
def close(self) -> None:
pass
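A minimal, hedged sketch of driving BitFlippingEnv with random actions (a plain Gymnasium loop, not something defined in this file):

from stable_baselines3.common.envs import BitFlippingEnv

env = BitFlippingEnv(n_bits=4)
obs, _ = env.reset(seed=0)
terminated = truncated = False
while not (terminated or truncated):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    # reward is 0.0 only when all bits are 1 (goal reached), -1.0 otherwise
print(info["is_success"], obs["achieved_goal"], obs["desired_goal"])

Because the reward is sparse and the observation exposes achieved_goal/desired_goal, this env is typically paired with hindsight experience replay (e.g. SB3's HerReplayBuffer) during training, as the docstring suggests.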

View File

@@ -0,0 +1,159 @@
from typing import Any, Dict, Generic, Optional, Tuple, TypeVar, Union
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from stable_baselines3.common.type_aliases import GymStepReturn
T = TypeVar("T", int, np.ndarray)
class IdentityEnv(gym.Env, Generic[T]):
def __init__(self, dim: Optional[int] = None, space: Optional[spaces.Space] = None, ep_length: int = 100):
"""
Identity environment for testing purposes
:param dim: the size of the action and observation dimension you want
to learn. Provide at most one of ``dim`` and ``space``. If both are
None, ``dim`` defaults to 1 and a ``Discrete(1)`` space is used.
:param space: the action and observation space. Provide at most one of
``dim`` and ``space``.
:param ep_length: the length of each episode in timesteps
"""
if space is None:
if dim is None:
dim = 1
space = spaces.Discrete(dim)
else:
assert dim is None, "arguments for both 'dim' and 'space' provided: at most one allowed"
self.action_space = self.observation_space = space
self.ep_length = ep_length
self.current_step = 0
self.num_resets = -1 # Becomes 0 after __init__ exits.
self.reset()
def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[T, Dict]:
if seed is not None:
super().reset(seed=seed)
self.current_step = 0
self.num_resets += 1
self._choose_next_state()
return self.state, {}
def step(self, action: T) -> Tuple[T, float, bool, bool, Dict[str, Any]]:
reward = self._get_reward(action)
self._choose_next_state()
self.current_step += 1
terminated = False
truncated = self.current_step >= self.ep_length
return self.state, reward, terminated, truncated, {}
def _choose_next_state(self) -> None:
self.state = self.action_space.sample()
def _get_reward(self, action: T) -> float:
return 1.0 if np.all(self.state == action) else 0.0
def render(self, mode: str = "human") -> None:
pass
class IdentityEnvBox(IdentityEnv[np.ndarray]):
def __init__(self, low: float = -1.0, high: float = 1.0, eps: float = 0.05, ep_length: int = 100):
"""
Identity environment for testing purposes
:param low: the lower bound of the box dim
:param high: the upper bound of the box dim
:param eps: the epsilon bound for correct value
:param ep_length: the length of each episode in timesteps
"""
space = spaces.Box(low=low, high=high, shape=(1,), dtype=np.float32)
super().__init__(ep_length=ep_length, space=space)
self.eps = eps
def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]:
reward = self._get_reward(action)
self._choose_next_state()
self.current_step += 1
terminated = False
truncated = self.current_step >= self.ep_length
return self.state, reward, terminated, truncated, {}
def _get_reward(self, action: np.ndarray) -> float:
return 1.0 if (self.state - self.eps) <= action <= (self.state + self.eps) else 0.0
class IdentityEnvMultiDiscrete(IdentityEnv[np.ndarray]):
def __init__(self, dim: int = 1, ep_length: int = 100) -> None:
"""
Identity environment for testing purposes
:param dim: the size of the dimensions you want to learn
:param ep_length: the length of each episode in timesteps
"""
space = spaces.MultiDiscrete([dim, dim])
super().__init__(ep_length=ep_length, space=space)
class IdentityEnvMultiBinary(IdentityEnv[np.ndarray]):
def __init__(self, dim: int = 1, ep_length: int = 100) -> None:
"""
Identity environment for testing purposes
:param dim: the size of the dimensions you want to learn
:param ep_length: the length of each episode in timesteps
"""
space = spaces.MultiBinary(dim)
super().__init__(ep_length=ep_length, space=space)
class FakeImageEnv(gym.Env):
"""
Fake image environment for testing purposes, it mimics Atari games.
:param action_dim: Number of discrete actions
:param screen_height: Height of the image
:param screen_width: Width of the image
:param n_channels: Number of color channels
:param discrete: Create discrete action space instead of continuous
:param channel_first: Put channels on first axis instead of last
"""
def __init__(
self,
action_dim: int = 6,
screen_height: int = 84,
screen_width: int = 84,
n_channels: int = 1,
discrete: bool = True,
channel_first: bool = False,
) -> None:
self.observation_shape = (screen_height, screen_width, n_channels)
if channel_first:
self.observation_shape = (n_channels, screen_height, screen_width)
self.observation_space = spaces.Box(low=0, high=255, shape=self.observation_shape, dtype=np.uint8)
if discrete:
self.action_space = spaces.Discrete(action_dim)
else:
self.action_space = spaces.Box(low=-1, high=1, shape=(5,), dtype=np.float32)
self.ep_length = 10
self.current_step = 0
def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[np.ndarray, Dict]:
if seed is not None:
super().reset(seed=seed)
self.current_step = 0
return self.observation_space.sample(), {}
def step(self, action: Union[np.ndarray, int]) -> GymStepReturn:
reward = 0.0
self.current_step += 1
terminated = False
truncated = self.current_step >= self.ep_length
return self.observation_space.sample(), reward, terminated, truncated, {}
def render(self, mode: str = "human") -> None:
pass
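As a hedged illustration of the identity tasks above: the optimal policy simply echoes the observation back as the action, which is what makes these envs convenient sanity checks for new algorithms.

from stable_baselines3.common.envs import IdentityEnvBox

env = IdentityEnvBox(eps=0.05, ep_length=10)
obs, _ = env.reset(seed=0)
# Echoing the current observation lands within eps of the state, so the reward is 1.0
obs, reward, terminated, truncated, _ = env.step(obs)
assert reward == 1.0 and not terminated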

View File

@@ -0,0 +1,183 @@
from typing import Dict, List, Optional, Tuple, Union
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from stable_baselines3.common.type_aliases import GymStepReturn
class SimpleMultiObsEnv(gym.Env):
"""
Base class for GridWorld-based MultiObs environments: a 4x4 grid world.
.. code-block:: text
____________
| 0 1 2 3|
| 4|¯5¯¯6¯| 7|
| 8|_9_10_|11|
|12 13 14 15|
¯¯¯¯¯¯¯¯¯¯¯¯¯¯
start is 0
states 5, 6, 9, and 10 are blocked
goal is 15
actions are [left, down, right, up]
a simple env with a single integer state over the 16 grid cells (12 traversable), encoded with a vector and an image observation:
each column is represented by a random vector and each row is
represented by a random image, both sampled once at creation time.
:param num_col: Number of columns in the grid
:param num_row: Number of rows in the grid
:param random_start: If true, the agent starts in a random position
:param discrete_actions: If true, use a ``Discrete(4)`` action space; else a 4-dim ``Box`` whose argmax selects the action
:param channel_last: If true, the image will be channel-last, else channel-first
"""
def __init__(
self,
num_col: int = 4,
num_row: int = 4,
random_start: bool = True,
discrete_actions: bool = True,
channel_last: bool = True,
):
super().__init__()
self.vector_size = 5
if channel_last:
self.img_size = [64, 64, 1]
else:
self.img_size = [1, 64, 64]
self.random_start = random_start
self.discrete_actions = discrete_actions
if discrete_actions:
self.action_space = spaces.Discrete(4)
else:
self.action_space = spaces.Box(0, 1, (4,))
self.observation_space = spaces.Dict(
spaces={
"vec": spaces.Box(0, 1, (self.vector_size,), dtype=np.float64),
"img": spaces.Box(0, 255, self.img_size, dtype=np.uint8),
}
)
self.count = 0
# Timeout
self.max_count = 100
self.log = ""
self.state = 0
self.action2str = ["left", "down", "right", "up"]
self.init_possible_transitions()
self.num_col = num_col
self.state_mapping: List[Dict[str, np.ndarray]] = []
self.init_state_mapping(num_col, num_row)
self.max_state = len(self.state_mapping) - 1
def init_state_mapping(self, num_col: int, num_row: int) -> None:
"""
Initializes the state_mapping array which holds the observation values for each state
:param num_col: Number of columns.
:param num_row: Number of rows.
"""
# Each column is represented by a random vector
col_vecs = np.random.random((num_col, self.vector_size))
# Each row is represented by a random image
row_imgs = np.random.randint(0, 255, (num_row, 64, 64), dtype=np.uint8)
for i in range(num_col):
for j in range(num_row):
self.state_mapping.append({"vec": col_vecs[i], "img": row_imgs[j].reshape(self.img_size)})
def get_state_mapping(self) -> Dict[str, np.ndarray]:
"""
Uses the state to get the observation mapping.
:return: observation dict {'vec': ..., 'img': ...}
"""
return self.state_mapping[self.state]
def init_possible_transitions(self) -> None:
"""
Initializes the transitions of the environment
The environment exploits the cardinal directions of the grid by noting that
they correspond to simple addition and subtraction from the cell id within the grid
- up => means moving up a row => means subtracting the length of a column
- down => means moving down a row => means adding the length of a column
- left => means moving left by one => means subtracting 1
- right => means moving right by one => means adding 1
Thus one only needs to specify in which states each action is possible
in order to define the transitions of the environment
"""
self.left_possible = [1, 2, 3, 13, 14, 15]
self.down_possible = [0, 4, 8, 3, 7, 11]
self.right_possible = [0, 1, 2, 12, 13, 14]
self.up_possible = [4, 8, 12, 7, 11, 15]
def step(self, action: Union[int, np.ndarray]) -> GymStepReturn:
"""
Run one timestep of the environment's dynamics. When end of
episode is reached, you are responsible for calling `reset()`
to reset this environment's state.
Accepts an action and returns a tuple (observation, reward, terminated, truncated, info).
:param action: The action index (or, for continuous actions, a 4-dim array whose argmax is used)
:return: tuple (observation, reward, terminated, truncated, info).
"""
if not self.discrete_actions:
action = np.argmax(action) # type: ignore[assignment]
self.count += 1
prev_state = self.state
reward = -0.1
# define state transition
if self.state in self.left_possible and action == 0: # left
self.state -= 1
elif self.state in self.down_possible and action == 1: # down
self.state += self.num_col
elif self.state in self.right_possible and action == 2: # right
self.state += 1
elif self.state in self.up_possible and action == 3: # up
self.state -= self.num_col
got_to_end = self.state == self.max_state
reward = 1.0 if got_to_end else reward
truncated = self.count > self.max_count
terminated = got_to_end
self.log = f"Went {self.action2str[action]} in state {prev_state}, got to state {self.state}"
return self.get_state_mapping(), reward, terminated, truncated, {"got_to_end": got_to_end}
def render(self, mode: str = "human") -> None:
"""
Prints the log of the environment.
:param mode: Unused; kept for API compatibility.
"""
print(self.log)
def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[Dict[str, np.ndarray], Dict]:
"""
Resets the environment state and step count and returns reset observation.
:param seed: Optional seed for the random number generator.
:return: observation dict {'vec': ..., 'img': ...}
"""
if seed is not None:
super().reset(seed=seed)
self.count = 0
if not self.random_start:
self.state = 0
else:
self.state = np.random.randint(0, self.max_state)
return self.state_mapping[self.state], {}
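A short, hedged sketch of stepping the grid world above and inspecting its dict observation (action indices follow the [left, down, right, up] convention from the docstring):

from stable_baselines3.common.envs import SimpleMultiObsEnv

env = SimpleMultiObsEnv(random_start=False)  # deterministic start in state 0
obs, _ = env.reset(seed=0)
print(obs["vec"].shape, obs["img"].shape)  # (5,) and (64, 64, 1) with the default channel_last=True
obs, reward, terminated, truncated, info = env.step(1)  # action 1 = "down": state 0 -> state 4
env.render()  # prints "Went down in state 0, got to state 4"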