From 693b120f3dbae50408a9c6b6c9c180423ca93ba3 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:32:34 +0200
Subject: [PATCH 1/6] fix: add TimeLimit wrapper in PPO load() to prevent
 infinite playback

Without this wrapper, play_ppo.py ran indefinitely on a converged model.
The trained policy produces near-zero reward, so the reward_limit
threshold was never crossed and no termination signal fired.

TimeLimit caps episodes at max_episode_steps=3000, matching the training
configuration used in __init__ and resume().
---
 src/crane_controller/ppo_agent.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/crane_controller/ppo_agent.py b/src/crane_controller/ppo_agent.py
index 36e1e40..2634160 100644
--- a/src/crane_controller/ppo_agent.py
+++ b/src/crane_controller/ppo_agent.py
@@ -95,7 +95,13 @@ def load(
             Agent configured for inference with VecNormalize in evaluation mode.
         """
         instance = object.__new__(cls)
-        raw_vec_env = make_vec_env(env_id=env, n_envs=1, env_kwargs=env_kwargs)
+        raw_vec_env = make_vec_env(
+            env_id=env,
+            n_envs=1,
+            env_kwargs=env_kwargs,
+            wrapper_class=TimeLimit,
+            wrapper_kwargs={"max_episode_steps": 3000},
+        )
         stats_path = cls._stats_path(str(model_path))
         if stats_path.exists():
             instance.vec_env = VecNormalize.load(str(stats_path), raw_vec_env)

From e439e458165f573a036475a273ac1b4de622c536 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:33:34 +0200
Subject: [PATCH 2/6] feat: add configurable gamma parameter to PPO agent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Exposes the PPO discount factor as gamma (default 0.99) on
ProximalPolicyOptimizationAgent and as --gamma in train_ppo.py.

Tested with gamma=0.999: it converges more slowly on this task. The
pendulum control problem is reactive and local, so a longer planning
horizon adds value-function complexity without improving policy
quality. gamma=0.99 remains the default.
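
As a back-of-envelope check (standard geometric-series reasoning, not
project code), rewards influence the value estimate over an effective
horizon of roughly 1/(1 - gamma) steps:

    for gamma in (0.99, 0.999):
        # effective horizon 1/(1 - gamma): ~100 steps vs ~1000 steps
        print(gamma, round(1 / (1 - gamma)))

A tenfold longer horizon is more than this reactive task needs, which
is consistent with the slower convergence observed here.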
""" def __init__( @@ -57,6 +61,7 @@ def __init__( env_kwargs: dict[str, Any] | None = None, save_path: str | None = None, max_episode_steps: int = 3000, + gamma: float = 0.99, ) -> None: """Set up the agent for training. Use :meth:`load` for inference.""" self.save_path = save_path @@ -68,7 +73,7 @@ def __init__( wrapper_kwargs={"max_episode_steps": max_episode_steps}, ) self.vec_env = VecNormalize(raw_vec_env, norm_obs=True, norm_reward=True) - self.model = PPO("MlpPolicy", self.vec_env, verbose=1 if n_envs == 1 else 0) + self.model = PPO("MlpPolicy", self.vec_env, gamma=gamma, verbose=1 if n_envs == 1 else 0) self.env: AntiPendulumEnv = self.vec_env.venv.envs[0] # type: ignore[attr-defined] @classmethod From 650c3787d2ceb4e7cb28e4c9caa1fb28295624a0 Mon Sep 17 00:00:00 2001 From: Babic Date: Thu, 30 Apr 2026 08:34:15 +0200 Subject: [PATCH 3/6] fix: disable explicit time penalty for PPO (reward_fac[2] = 0.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reward_fac[2] term adds -self.time * 0.001 to the reward each step. self.time grows continuously but is not part of the observation, violating the Markov property: two identical crane/pendulum states at different episode times produce different rewards. PPO's value function V(s) cannot condition on hidden state, so it must average across the time distribution at each state — producing noisy gradient estimates and erratic convergence. Time preference is already encoded implicitly through the discount factor γ: V(s) = r_t + γ*r_{t+1} + γ²*r_{t+2} + … so the policy naturally prefers faster solutions without any extra signal. An explicit time penalty is both redundant and harmful for function-approximation methods like PPO. Q-learning tolerates it because dt=0.1 makes the penalty 10× smaller, and the tabular value function averages over many revisits to the same state bucket, smoothing out the non-stationarity. Set reward_fac[2] = 0.0 in both train_ppo.py and play_ppo.py. The reward_fac parameter remains on the environment so Q-learning can still use it; only the PPO scripts override it explicitly. --- scripts/play_ppo.py | 1 + scripts/train_ppo.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/scripts/play_ppo.py b/scripts/play_ppo.py index 6b5bfde..a7dbbf2 100644 --- a/scripts/play_ppo.py +++ b/scripts/play_ppo.py @@ -34,6 +34,7 @@ def main() -> None: "crane": build_crane, "start_speed": 1.0, "render_mode": args.render_mode, + "reward_fac": (1.0, 0.0015, 0.0), }, ) diff --git a/scripts/train_ppo.py b/scripts/train_ppo.py index 77b677c..dcef917 100644 --- a/scripts/train_ppo.py +++ b/scripts/train_ppo.py @@ -73,6 +73,7 @@ def main() -> None: "crane": build_crane, "start_speed": 1.0, "render_mode": args.render_mode, + "reward_fac": (1.0, 0.0015, 0.0), }, save_path=args.save_path, n_envs=args.n_envs, @@ -90,6 +91,7 @@ def main() -> None: "crane": build_crane, "start_speed": 1.0, "render_mode": args.render_mode, + "reward_fac": (1.0, 0.0015, 0.0), }, save_path=args.save_path, gamma=args.gamma, From fd2500aca12cdfdbaf990c0140be7bc882fb5c16 Mon Sep 17 00:00:00 2001 From: Babic Date: Thu, 30 Apr 2026 08:41:07 +0200 Subject: [PATCH 4/6] fix: trigger show_plot from render() so plot appears after episode render() had no handler for render_mode='plot'. show_plot() was only called from reset() at the start of the next episode, so with a single episode run the plot never appeared. 
---
 scripts/play_ppo.py  | 1 +
 scripts/train_ppo.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/scripts/play_ppo.py b/scripts/play_ppo.py
index 6b5bfde..a7dbbf2 100644
--- a/scripts/play_ppo.py
+++ b/scripts/play_ppo.py
@@ -34,6 +34,7 @@ def main() -> None:
             "crane": build_crane,
             "start_speed": 1.0,
             "render_mode": args.render_mode,
+            "reward_fac": (1.0, 0.0015, 0.0),
         },
     )
 
diff --git a/scripts/train_ppo.py b/scripts/train_ppo.py
index 77b677c..dcef917 100644
--- a/scripts/train_ppo.py
+++ b/scripts/train_ppo.py
@@ -73,6 +73,7 @@ def main() -> None:
                 "crane": build_crane,
                 "start_speed": 1.0,
                 "render_mode": args.render_mode,
+                "reward_fac": (1.0, 0.0015, 0.0),
             },
             save_path=args.save_path,
             n_envs=args.n_envs,
@@ -90,6 +91,7 @@ def main() -> None:
                 "crane": build_crane,
                 "start_speed": 1.0,
                 "render_mode": args.render_mode,
+                "reward_fac": (1.0, 0.0015, 0.0),
             },
             save_path=args.save_path,
             gamma=args.gamma,

From fd2500aca12cdfdbaf990c0140be7bc882fb5c16 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:41:07 +0200
Subject: [PATCH 4/6] fix: trigger show_plot from render() so plot appears
 after episode

render() had no handler for render_mode='plot'. show_plot() was only
called from reset() at the start of the next episode, so in a
single-episode run the plot never appeared.
---
 src/crane_controller/envs/controlled_crane_pendulum.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py
index 78c6c69..364d0d8 100644
--- a/src/crane_controller/envs/controlled_crane_pendulum.py
+++ b/src/crane_controller/envs/controlled_crane_pendulum.py
@@ -485,5 +485,7 @@ def step(self, action: int) -> tuple[tuple[int, ...] | np.ndarray, float, bool,
 
     def render(self) -> None:
         """Render the current episode."""
-        if self.render_mode == "play-back":  # show the animation
+        if self.render_mode == "play-back":
             self.show_animation()
+        elif self.render_mode == "plot":
+            self.show_plot(self.nresets)

From d8fde64ddafa3429dfb986dd74488ffbec6afea5 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 08:41:50 +0200
Subject: [PATCH 5/6] fix: improve show_plot visualization (layout, legends,
 figure title)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Switch from 2×2 grid to 4×1 vertical layout (figsize 16×12): gives
  each subplot full width and a shared time axis, making crane-pendulum
  interactions easier to align visually.

- Combine primary and twin-axis legend handles so load speed, damping,
  and crane speed (on ax1y2/ax2y2) appear in the legend. ax.legend()
  alone only captured primary-axis lines.

- Use plt.suptitle() instead of plt.title() for the figure-level title.
  plt.title() attached to the last active axes (ax4), placing the text
  between subplots. suptitle() places it above all subplots.

- Add loc="upper left" to the ax2 legend to avoid overlap with the twin
  y-axis spine on the right.

- Add a suptitle stub to stubs/matplotlib-stubs/pyplot.pyi (pyright did
  not recognise plt.suptitle without it).

Also updates CHANGELOG.md with all changes from this branch.
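
For reference, the legend half of this in isolation (generic
matplotlib, not the project plotting code):

    import matplotlib.pyplot as plt

    _, ax = plt.subplots()
    ax2 = ax.twinx()
    ax.plot([0, 1], [0, 1], label="primary")
    ax2.plot([0, 1], [1, 0], label="twin", color="red")
    # ax.legend() alone would list only "primary"; merge both axes' handles
    h1, l1 = ax.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()
    ax.legend(h1 + h2, l1 + l2)
    plt.show()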
---
 CHANGELOG.md                          | 20 ++++++++++++++++++++
 .../envs/controlled_crane_pendulum.py | 12 ++++++++----
 stubs/matplotlib-stubs/pyplot.pyi     |  1 +
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 542f881..5f2d90a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,26 @@ The changelog format is based on [Keep a Changelog](https://keepachangelog.com/e
 
 ## [Unreleased]
 ### Added
+* `gamma` parameter on `ProximalPolicyOptimizationAgent` (default 0.99) and `--gamma` CLI flag in
+  `train_ppo.py` to configure the PPO discount factor without editing source code.
+
+### Fixed
+* `ProximalPolicyOptimizationAgent.load()` now applies a `TimeLimit` wrapper (max 3000 steps),
+  matching the training configuration. Without it, `play_ppo.py` ran indefinitely on a converged
+  model whose near-zero reward never crossed the termination threshold.
+* `AntiPendulumEnv.render()` now handles `render_mode='plot'` by calling `show_plot()` directly,
+  so the episode plot appears when running `play_ppo.py --render-mode plot`.
+* `show_plot()` legend now includes lines from twin y-axes (load speed, crane speed, damping) by
+  combining handles from both axes with `get_legend_handles_labels()`.
+* `show_plot()` title moved from `plt.title()` (attached to last axes) to `plt.suptitle()`
+  (figure-level), preventing the title from appearing between subplots.
+* `show_plot()` switched from 2×2 grid to 4×1 vertical layout (16×12 in) so all subplots share
+  a common time axis and each has full width.
+* Disabled explicit time penalty (`reward_fac[2] = 0.0`) in PPO training and playback scripts.
+  The term `−self.time × 0.001` uses hidden state absent from the observation, violating the
+  Markov property and destabilising PPO's value function. Time preference is already encoded
+  implicitly through the discount factor γ.
+
 * `ProximalPolicyOptimizationAgent.resume()` classmethod to continue training from a saved
   checkpoint. Restores VecNormalize statistics and keeps normalization in training mode,
   consistent with SB3's `PPO.load()` + `.learn(reset_num_timesteps=False)` pattern.
diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py
index 364d0d8..04992e1 100644
--- a/src/crane_controller/envs/controlled_crane_pendulum.py
+++ b/src/crane_controller/envs/controlled_crane_pendulum.py
@@ -242,7 +242,7 @@ def show_plot(self, episode: int) -> None:
         episode : int
             Episode number used in the plot title.
         """
-        _, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
+        _, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(16, 12))
         times = self.dt * np.arange(len(self.traces["c_x"]))
         damping = self.traces["l_v"][0] * np.exp(-times / self.wire.damping_time)
         ax1.plot(times, self.traces["l_x"], label="load angle", color="blue")
@@ -254,11 +254,15 @@ def show_plot(self, episode: int) -> None:
         ax2y2.plot(times, self.traces["c_v"], label="crane speed", color="red")
         ax3.plot(times[: len(self.rewards)], self.rewards, label="rewards")
         ax4.plot(times, self.traces["acc"], label="x-acceleration", color="green")
-        _ = ax1.legend()
-        _ = ax2.legend()
+        lines1, labels1 = ax1.get_legend_handles_labels()
+        lines2, labels2 = ax1y2.get_legend_handles_labels()
+        _ = ax1.legend(lines1 + lines2, labels1 + labels2)
+        lines3, labels3 = ax2.get_legend_handles_labels()
+        lines4, labels4 = ax2y2.get_legend_handles_labels()
+        _ = ax2.legend(lines3 + lines4, labels3 + labels4, loc="upper left")
         _ = ax3.legend()
         _ = ax4.legend()
-        _ = plt.title(f"Detailed plot of episode {episode}, reward:{self.reward}")
+        _ = plt.suptitle(f"Detailed plot of episode {episode}, reward:{self.reward}")
         plt.show()
         for key in self.traces:
             self.traces[key] = []
diff --git a/stubs/matplotlib-stubs/pyplot.pyi b/stubs/matplotlib-stubs/pyplot.pyi
index 78298f6..6ae08f6 100644
--- a/stubs/matplotlib-stubs/pyplot.pyi
+++ b/stubs/matplotlib-stubs/pyplot.pyi
@@ -66,6 +66,7 @@ def title(
     y: float | None = None,
     **kwargs: Any,
 ) -> Text: ...
+def suptitle(t: str, **kwargs: Any) -> Text: ...
 def plot(
     *args: Any,
     scalex: bool = True,

From 448312f9c83fcd05f682ba885584e81abceb98a3 Mon Sep 17 00:00:00 2001
From: Babic
Date: Thu, 30 Apr 2026 09:43:23 +0200
Subject: [PATCH 6/6] fix: add missing type: ignore[arg-type] on TimeLimit in
 load()

---
 src/crane_controller/ppo_agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crane_controller/ppo_agent.py b/src/crane_controller/ppo_agent.py
index a017713..b9533a8 100644
--- a/src/crane_controller/ppo_agent.py
+++ b/src/crane_controller/ppo_agent.py
@@ -104,7 +104,7 @@ def load(
             env_id=env,
             n_envs=1,
             env_kwargs=env_kwargs,
-            wrapper_class=TimeLimit,
+            wrapper_class=TimeLimit,  # type: ignore[arg-type]
             wrapper_kwargs={"max_episode_steps": 3000},
         )
         stats_path = cls._stats_path(str(model_path))