From 693b120f3dbae50408a9c6b6c9c180423ca93ba3 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:32:34 +0200
Subject: [PATCH 1/6] fix: add TimeLimit wrapper in PPO load() to prevent
 infinite playback

Without this wrapper, play_ppo.py ran indefinitely on a converged model.
The trained policy produces near-zero reward, so the reward_limit
threshold was never crossed and no termination signal fired.

TimeLimit caps episodes at max_episode_steps=3000, matching the training
configuration used in __init__ and resume().
---
 src/crane_controller/ppo_agent.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/crane_controller/ppo_agent.py b/src/crane_controller/ppo_agent.py
index 36e1e40..2634160 100644
--- a/src/crane_controller/ppo_agent.py
+++ b/src/crane_controller/ppo_agent.py
@@ -95,7 +95,13 @@ def load(
             Agent configured for inference with VecNormalize in evaluation mode.
         """
         instance = object.__new__(cls)
-        raw_vec_env = make_vec_env(env_id=env, n_envs=1, env_kwargs=env_kwargs)
+        raw_vec_env = make_vec_env(
+            env_id=env,
+            n_envs=1,
+            env_kwargs=env_kwargs,
+            wrapper_class=TimeLimit,
+            wrapper_kwargs={"max_episode_steps": 3000},
+        )
         stats_path = cls._stats_path(str(model_path))
         if stats_path.exists():
             instance.vec_env = VecNormalize.load(str(stats_path), raw_vec_env)

From e439e458165f573a036475a273ac1b4de622c536 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:33:34 +0200
Subject: [PATCH 2/6] feat: add configurable gamma parameter to PPO agent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Exposes the PPO discount factor as gamma (default 0.99) on
ProximalPolicyOptimizationAgent and as --gamma in train_ppo.py.

Tested with gamma=0.999: it converges more slowly on this task. The
pendulum control problem is reactive and local, so a longer planning
horizon adds value-function complexity without improving policy
quality. gamma=0.99 remains the default.
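
As a back-of-envelope check (standard geometric-series reasoning, not
project code), rewards influence the value estimate over an effective
horizon of roughly 1/(1 - gamma) steps:

    for gamma in (0.99, 0.999):
        # effective horizon 1/(1 - gamma): ~100 steps vs ~1000 steps
        print(gamma, round(1 / (1 - gamma)))

A tenfold longer horizon is more than this reactive task needs, which
is consistent with the slower convergence observed here.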
""" def __init__( @@ -57,6 +61,7 @@ def __init__( env_kwargs: dict[str, Any] | None = None, save_path: str | None = None, max_episode_steps: int = 3000, + gamma: float = 0.99, ) -> None: """Set up the agent for training. Use :meth:`load` for inference.""" self.save_path = save_path @@ -68,7 +73,7 @@ def __init__( wrapper_kwargs={"max_episode_steps": max_episode_steps}, ) self.vec_env = VecNormalize(raw_vec_env, norm_obs=True, norm_reward=True) - self.model = PPO("MlpPolicy", self.vec_env, verbose=1 if n_envs == 1 else 0) + self.model = PPO("MlpPolicy", self.vec_env, gamma=gamma, verbose=1 if n_envs == 1 else 0) self.env: AntiPendulumEnv = self.vec_env.venv.envs[0] # type: ignore[attr-defined] @classmethod From 650c3787d2ceb4e7cb28e4c9caa1fb28295624a0 Mon Sep 17 00:00:00 2001 From: Babic Date: Thu, 30 Apr 2026 08:34:15 +0200 Subject: [PATCH 3/6] fix: disable explicit time penalty for PPO (reward_fac[2] = 0.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reward_fac[2] term adds -self.time * 0.001 to the reward each step. self.time grows continuously but is not part of the observation, violating the Markov property: two identical crane/pendulum states at different episode times produce different rewards. PPO's value function V(s) cannot condition on hidden state, so it must average across the time distribution at each state — producing noisy gradient estimates and erratic convergence. Time preference is already encoded implicitly through the discount factor γ: V(s) = r_t + γ*r_{t+1} + γ²*r_{t+2} + … so the policy naturally prefers faster solutions without any extra signal. An explicit time penalty is both redundant and harmful for function-approximation methods like PPO. Q-learning tolerates it because dt=0.1 makes the penalty 10× smaller, and the tabular value function averages over many revisits to the same state bucket, smoothing out the non-stationarity. Set reward_fac[2] = 0.0 in both train_ppo.py and play_ppo.py. The reward_fac parameter remains on the environment so Q-learning can still use it; only the PPO scripts override it explicitly. --- scripts/play_ppo.py | 1 + scripts/train_ppo.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/scripts/play_ppo.py b/scripts/play_ppo.py index 6b5bfde..a7dbbf2 100644 --- a/scripts/play_ppo.py +++ b/scripts/play_ppo.py @@ -34,6 +34,7 @@ def main() -> None: "crane": build_crane, "start_speed": 1.0, "render_mode": args.render_mode, + "reward_fac": (1.0, 0.0015, 0.0), }, ) diff --git a/scripts/train_ppo.py b/scripts/train_ppo.py index 77b677c..dcef917 100644 --- a/scripts/train_ppo.py +++ b/scripts/train_ppo.py @@ -73,6 +73,7 @@ def main() -> None: "crane": build_crane, "start_speed": 1.0, "render_mode": args.render_mode, + "reward_fac": (1.0, 0.0015, 0.0), }, save_path=args.save_path, n_envs=args.n_envs, @@ -90,6 +91,7 @@ def main() -> None: "crane": build_crane, "start_speed": 1.0, "render_mode": args.render_mode, + "reward_fac": (1.0, 0.0015, 0.0), }, save_path=args.save_path, gamma=args.gamma, From fd2500aca12cdfdbaf990c0140be7bc882fb5c16 Mon Sep 17 00:00:00 2001 From: Babic Date: Thu, 30 Apr 2026 08:41:07 +0200 Subject: [PATCH 4/6] fix: trigger show_plot from render() so plot appears after episode render() had no handler for render_mode='plot'. show_plot() was only called from reset() at the start of the next episode, so with a single episode run the plot never appeared. 
---
 scripts/play_ppo.py  | 1 +
 scripts/train_ppo.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/scripts/play_ppo.py b/scripts/play_ppo.py
index 6b5bfde..a7dbbf2 100644
--- a/scripts/play_ppo.py
+++ b/scripts/play_ppo.py
@@ -34,6 +34,7 @@ def main() -> None:
             "crane": build_crane,
             "start_speed": 1.0,
             "render_mode": args.render_mode,
+            "reward_fac": (1.0, 0.0015, 0.0),
         },
     )
 
diff --git a/scripts/train_ppo.py b/scripts/train_ppo.py
index 77b677c..dcef917 100644
--- a/scripts/train_ppo.py
+++ b/scripts/train_ppo.py
@@ -73,6 +73,7 @@ def main() -> None:
                 "crane": build_crane,
                 "start_speed": 1.0,
                 "render_mode": args.render_mode,
+                "reward_fac": (1.0, 0.0015, 0.0),
             },
             save_path=args.save_path,
             n_envs=args.n_envs,
@@ -90,6 +91,7 @@ def main() -> None:
                 "crane": build_crane,
                 "start_speed": 1.0,
                 "render_mode": args.render_mode,
+                "reward_fac": (1.0, 0.0015, 0.0),
             },
             save_path=args.save_path,
             gamma=args.gamma,

From fd2500aca12cdfdbaf990c0140be7bc882fb5c16 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:41:07 +0200
Subject: [PATCH 4/6] fix: trigger show_plot from render() so plot appears
 after episode

render() had no handler for render_mode='plot'. show_plot() was only
called from reset() at the start of the next episode, so in a
single-episode run the plot never appeared.
---
 src/crane_controller/envs/controlled_crane_pendulum.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py
index 78c6c69..364d0d8 100644
--- a/src/crane_controller/envs/controlled_crane_pendulum.py
+++ b/src/crane_controller/envs/controlled_crane_pendulum.py
@@ -485,5 +485,7 @@ def step(self, action: int) -> tuple[tuple[int, ...] | np.ndarray, float, bool,
 
     def render(self) -> None:
         """Render the current episode."""
-        if self.render_mode == "play-back":  # show the animation
+        if self.render_mode == "play-back":
             self.show_animation()
+        elif self.render_mode == "plot":
+            self.show_plot(self.nresets)

From d8fde64ddafa3429dfb986dd74488ffbec6afea5 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:41:50 +0200
Subject: [PATCH 5/6] fix: improve show_plot visualization (layout, legends,
 figure title)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Switch from 2×2 grid to 4×1 vertical layout (figsize 16×12): gives
  each subplot full width and a shared time axis, making crane-pendulum
  interactions easier to align visually.

- Combine primary and twin-axis legend handles so load speed, damping,
  and crane speed (on ax1y2/ax2y2) appear in the legend. ax.legend()
  alone only captured primary-axis lines.

- Use plt.suptitle() instead of plt.title() for the figure-level title.
  plt.title() attached to the last active axes (ax4), placing the text
  between subplots. suptitle() places it above all subplots.

- Add loc="upper left" to the ax2 legend to avoid overlap with the twin
  y-axis spine on the right.

- Add a suptitle stub to stubs/matplotlib-stubs/pyplot.pyi (pyright did
  not recognise plt.suptitle without it).

Also updates CHANGELOG.md with all changes from this branch.
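
For reference, the legend half of this in isolation (generic
matplotlib, not the project plotting code):

    import matplotlib.pyplot as plt

    _, ax = plt.subplots()
    ax2 = ax.twinx()
    ax.plot([0, 1], [0, 1], label="primary")
    ax2.plot([0, 1], [1, 0], label="twin", color="red")
    # ax.legend() alone would list only "primary"; merge both axes' handles
    h1, l1 = ax.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()
    ax.legend(h1 + h2, l1 + l2)
    plt.show()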
---
 CHANGELOG.md                          | 20 ++++++++++++++++++++
 .../envs/controlled_crane_pendulum.py | 12 ++++++++----
 stubs/matplotlib-stubs/pyplot.pyi     |  1 +
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 542f881..5f2d90a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,26 @@ The changelog format is based on [Keep a Changelog](https://keepachangelog.com/e
 
 ## [Unreleased]
 ### Added
+* `gamma` parameter on `ProximalPolicyOptimizationAgent` (default 0.99) and `--gamma` CLI flag in
+  `train_ppo.py` to configure the PPO discount factor without editing source code.
+
+### Fixed
+* `ProximalPolicyOptimizationAgent.load()` now applies a `TimeLimit` wrapper (max 3000 steps),
+  matching the training configuration. Without it, `play_ppo.py` ran indefinitely on a converged
+  model whose near-zero reward never crossed the termination threshold.
+* `AntiPendulumEnv.render()` now handles `render_mode='plot'` by calling `show_plot()` directly,
+  so the episode plot appears when running `play_ppo.py --render-mode plot`.
+* `show_plot()` legend now includes lines from twin y-axes (load speed, crane speed, damping) by
+  combining handles from both axes with `get_legend_handles_labels()`.
+* `show_plot()` title moved from `plt.title()` (attached to last axes) to `plt.suptitle()`
+  (figure-level), preventing the title from appearing between subplots.
+* `show_plot()` switched from 2×2 grid to 4×1 vertical layout (16×12 in) so all subplots share
+  a common time axis and each has full width.
+* Disabled explicit time penalty (`reward_fac[2] = 0.0`) in PPO training and playback scripts.
+  The term `−self.time × 0.001` uses hidden state absent from the observation, violating the
+  Markov property and destabilising PPO's value function. Time preference is already encoded
+  implicitly through the discount factor γ.
+
 * `ProximalPolicyOptimizationAgent.resume()` classmethod to continue training from a saved
   checkpoint. Restores VecNormalize statistics and keeps normalization in training mode,
   consistent with SB3's `PPO.load()` + `.learn(reset_num_timesteps=False)` pattern.
diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py
index 364d0d8..04992e1 100644
--- a/src/crane_controller/envs/controlled_crane_pendulum.py
+++ b/src/crane_controller/envs/controlled_crane_pendulum.py
@@ -242,7 +242,7 @@ def show_plot(self, episode: int) -> None:
         episode : int
             Episode number used in the plot title.
         """
-        _, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
+        _, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(16, 12))
         times = self.dt * np.arange(len(self.traces["c_x"]))
         damping = self.traces["l_v"][0] * np.exp(-times / self.wire.damping_time)
         ax1.plot(times, self.traces["l_x"], label="load angle", color="blue")
@@ -254,11 +254,15 @@ def show_plot(self, episode: int) -> None:
         ax2y2.plot(times, self.traces["c_v"], label="crane speed", color="red")
         ax3.plot(times[: len(self.rewards)], self.rewards, label="rewards")
         ax4.plot(times, self.traces["acc"], label="x-acceleration", color="green")
-        _ = ax1.legend()
-        _ = ax2.legend()
+        lines1, labels1 = ax1.get_legend_handles_labels()
+        lines2, labels2 = ax1y2.get_legend_handles_labels()
+        _ = ax1.legend(lines1 + lines2, labels1 + labels2)
+        lines3, labels3 = ax2.get_legend_handles_labels()
+        lines4, labels4 = ax2y2.get_legend_handles_labels()
+        _ = ax2.legend(lines3 + lines4, labels3 + labels4, loc="upper left")
         _ = ax3.legend()
         _ = ax4.legend()
-        _ = plt.title(f"Detailed plot of episode {episode}, reward:{self.reward}")
+        _ = plt.suptitle(f"Detailed plot of episode {episode}, reward:{self.reward}")
         plt.show()
         for key in self.traces:
             self.traces[key] = []
diff --git a/stubs/matplotlib-stubs/pyplot.pyi b/stubs/matplotlib-stubs/pyplot.pyi
index 78298f6..6ae08f6 100644
--- a/stubs/matplotlib-stubs/pyplot.pyi
+++ b/stubs/matplotlib-stubs/pyplot.pyi
@@ -66,6 +66,7 @@ def title(
     y: float | None = None,
     **kwargs: Any,
 ) -> Text: ...
+def suptitle(t: str, **kwargs: Any) -> Text: ...
 def plot(
     *args: Any,
     scalex: bool = True,

From 448312f9c83fcd05f682ba885584e81abceb98a3 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 09:43:23 +0200
Subject: [PATCH 6/6] fix: add missing type: ignore[arg-type] on TimeLimit in
 load()

---
 src/crane_controller/ppo_agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crane_controller/ppo_agent.py b/src/crane_controller/ppo_agent.py
index a017713..b9533a8 100644
--- a/src/crane_controller/ppo_agent.py
+++ b/src/crane_controller/ppo_agent.py
@@ -104,7 +104,7 @@ def load(
             env_id=env,
             n_envs=1,
             env_kwargs=env_kwargs,
-            wrapper_class=TimeLimit,
+            wrapper_class=TimeLimit,  # type: ignore[arg-type]
             wrapper_kwargs={"max_episode_steps": 3000},
         )
         stats_path = cls._stats_path(str(model_path))