Add a Prometheus heartbeat gauge to the alert processor.

* pyproject.toml: add the prometheus-client>=0.20 dependency.
* processor.py: serve metrics on METRICS_PORT (default 8091), set the
  hermes_alive gauge to 1 at startup and at the top of every polling cycle,
  and reset it to 0 in a finally block when the processor loop exits.

Note: the last two processor.py hunks were edited after the index line was
generated; regenerate the post-image blob hash before relying on it.

diff --git a/pyproject.toml b/pyproject.toml
index 5f1063b..8def41c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,6 +10,7 @@ requires-python = ">=3.12"
 dependencies = [
     "httpx>=0.27",
     "pyyaml>=6.0",
+    "prometheus-client>=0.20",
 ]
 
 [project.optional-dependencies]
diff --git a/src/agentic_node_ops/processor.py b/src/agentic_node_ops/processor.py
index a253510..427a1de 100644
--- a/src/agentic_node_ops/processor.py
+++ b/src/agentic_node_ops/processor.py
@@ -13,6 +13,8 @@
 from pathlib import Path
 from typing import Optional
 
+from prometheus_client import Gauge, start_http_server
+
 from .context import build_hermes_context
 from .database import Database
 from .dispatcher import NotificationDispatcher
@@ -24,6 +26,19 @@
 ALERT_OFFSET_PATH = os.environ.get(
     "ALERT_OFFSET_PATH", "/var/hermes/alerts.jsonl.offset"
 )
+METRICS_PORT = int(os.environ.get("METRICS_PORT", "8091"))
+
+# Prometheus metrics
+HERMES_ALIVE = Gauge("hermes_alive", "Hermes agent heartbeat (1 = alive, 0 = silent)")
+
+
+def _start_metrics_server() -> None:
+    """Start Prometheus metrics HTTP server in a background thread."""
+    try:
+        start_http_server(METRICS_PORT)
+        log.info("Prometheus metrics server started on port %d", METRICS_PORT)
+    except Exception as e:
+        log.error("Failed to start metrics server: %s", e)
 
 
 def read_offset(path: str) -> int:
@@ -161,6 +176,11 @@ async def _run_loop_async(
     """Internal async loop that processes alerts continuously."""
     while True:
         try:
+            # Heartbeat: refreshed at the top of every polling cycle.
+            # NOTE(review): the gauge keeps its last value, so a stalled loop
+            # still reports 1 - consider Gauge.set_to_current_time() instead.
+            HERMES_ALIVE.set(1)
+
             count = await process_alerts_async(db=db, dispatcher=dispatcher)
             if count == 0:
                 await asyncio.sleep(poll_interval)
@@ -186,8 +206,18 @@ def run_processor_loop(
         poll_interval: Seconds to wait between polling cycles when queue is empty
     """
     log.info("Starting alert processor loop (poll interval: %ss)", poll_interval)
-    asyncio.run(_run_loop_async(db, dispatcher, poll_interval))
+
+    # Start Prometheus metrics server and set initial heartbeat
+    _start_metrics_server()
+    HERMES_ALIVE.set(1)
+
+    try:
+        asyncio.run(_run_loop_async(db, dispatcher, poll_interval))
+    finally:
+        # Clear the heartbeat on every exit path: asyncio.run() may exit by
+        # raising (e.g. KeyboardInterrupt), which would skip a plain statement.
+        HERMES_ALIVE.set(0)
 
 
 if __name__ == "__main__":
     run_processor_loop()