From 2516cb2105663b874429f68430fa5f0c7799f2a8 Mon Sep 17 00:00:00 2001 From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com> Date: Tue, 9 Jun 2026 19:35:40 +0300 Subject: [PATCH 1/5] feat: add hermes_alive heartbeat metric for self-monitoring --- pyproject.toml | 1 + src/agentic_node_ops/processor.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5f1063b..8def41c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ requires-python = ">=3.12" dependencies = [ "httpx>=0.27", "pyyaml>=6.0", + "prometheus-client>=0.20", ] [project.optional-dependencies] diff --git a/src/agentic_node_ops/processor.py b/src/agentic_node_ops/processor.py index 4f7cc6a..e493366 100644 --- a/src/agentic_node_ops/processor.py +++ b/src/agentic_node_ops/processor.py @@ -13,6 +13,8 @@ from pathlib import Path from typing import Optional +from prometheus_client import Gauge, start_http_server + from .context import build_hermes_context from .database import Database from .dispatcher import NotificationDispatcher @@ -24,6 +26,19 @@ ALERT_OFFSET_PATH = os.environ.get( "ALERT_OFFSET_PATH", "/var/hermes/alerts.jsonl.offset" ) +METRICS_PORT = int(os.environ.get("METRICS_PORT", "8091")) + +# Prometheus metrics +HERMES_ALIVE = Gauge("hermes_alive", "Hermes agent heartbeat (1 = alive, 0 = silent)") + + +def _start_metrics_server() -> None: + """Start Prometheus metrics HTTP server in a background thread.""" + try: + start_http_server(METRICS_PORT) + log.info("Prometheus metrics server started on port %d", METRICS_PORT) + except Exception as e: + log.error("Failed to start metrics server: %s", e) def read_offset(path: str) -> int: @@ -186,4 +201,20 @@ def run_processor_loop( poll_interval: Seconds to wait between polling cycles when queue is empty """ log.info("Starting alert processor loop (poll interval: %ss)", poll_interval) + + # Start Prometheus metrics server and set initial heartbeat + _start_metrics_server() + HERMES_ALIVE.set(1) + asyncio.run(_run_loop_async(db, dispatcher, poll_interval)) + + # Clear heartbeat on shutdown + HERMES_ALIVE.set(0) + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + ) + run_processor_loop() From 0cd761cd00ab18f2202886eea164ea46024ac10c Mon Sep 17 00:00:00 2001 From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com> Date: Tue, 9 Jun 2026 19:46:39 +0300 Subject: [PATCH 2/5] fix: remove trailing whitespace in processor.py to satisfy ruff format --- src/agentic_node_ops/processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentic_node_ops/processor.py b/src/agentic_node_ops/processor.py index e493366..b475136 100644 --- a/src/agentic_node_ops/processor.py +++ b/src/agentic_node_ops/processor.py @@ -201,13 +201,13 @@ def run_processor_loop( poll_interval: Seconds to wait between polling cycles when queue is empty """ log.info("Starting alert processor loop (poll interval: %ss)", poll_interval) - + # Start Prometheus metrics server and set initial heartbeat _start_metrics_server() HERMES_ALIVE.set(1) - + asyncio.run(_run_loop_async(db, dispatcher, poll_interval)) - + # Clear heartbeat on shutdown HERMES_ALIVE.set(0) From 9ff143b852d68f05f7628ba77526e98ac2699894 Mon Sep 17 00:00:00 2001 From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com> Date: Wed, 10 Jun 2026 02:08:12 +0300 Subject: [PATCH 3/5] refactor: remove entry point block (moved to infra PR #18) --- src/agentic_node_ops/processor.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/agentic_node_ops/processor.py b/src/agentic_node_ops/processor.py index b475136..264850c 100644 --- a/src/agentic_node_ops/processor.py +++ b/src/agentic_node_ops/processor.py @@ -210,11 +210,3 @@ def run_processor_loop( # Clear heartbeat on shutdown HERMES_ALIVE.set(0) - - -if __name__ == "__main__": - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", - ) - run_processor_loop() From 374a55ab1ff68b23066bdd5d518e6b9bb425bafc Mon Sep 17 00:00:00 2001 From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:42:39 +0300 Subject: [PATCH 4/5] fix: add periodic heartbeat in async loop to detect event loop deadlocks --- src/agentic_node_ops/processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/agentic_node_ops/processor.py b/src/agentic_node_ops/processor.py index 264850c..9e0c931 100644 --- a/src/agentic_node_ops/processor.py +++ b/src/agentic_node_ops/processor.py @@ -176,6 +176,9 @@ async def _run_loop_async( """Internal async loop that processes alerts continuously.""" while True: try: + # Periodic heartbeat to prove event loop is alive and not deadlocked + HERMES_ALIVE.set(1) + count = await process_alerts_async(db=db, dispatcher=dispatcher) if count == 0: await asyncio.sleep(poll_interval) From 7574fb04223872356e9615b7af994f7eb0c8c087 Mon Sep 17 00:00:00 2001 From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:48:47 +0300 Subject: [PATCH 5/5] style: fix trailing whitespace in processor.py to satisfy ruff format --- src/agentic_node_ops/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentic_node_ops/processor.py b/src/agentic_node_ops/processor.py index 2d4c3b6..427a1de 100644 --- a/src/agentic_node_ops/processor.py +++ b/src/agentic_node_ops/processor.py @@ -178,7 +178,7 @@ async def _run_loop_async( try: # Periodic heartbeat to prove event loop is alive and not deadlocked HERMES_ALIVE.set(1) - + count = await process_alerts_async(db=db, dispatcher=dispatcher) if count == 0: await asyncio.sleep(poll_interval)