From 0bc26aac7723f26768ff00553728f4c9cae48b72 Mon Sep 17 00:00:00 2001 From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com> Date: Wed, 10 Jun 2026 23:57:36 +0300 Subject: [PATCH 1/2] docs: add functional testing runbook and alertmanager routing config --- docs/alertmanager-hermes-routing.yml | 37 ++++++ docs/functional-testing-runbook.md | 161 +++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 docs/alertmanager-hermes-routing.yml create mode 100644 docs/functional-testing-runbook.md diff --git a/docs/alertmanager-hermes-routing.yml b/docs/alertmanager-hermes-routing.yml new file mode 100644 index 0000000..08ebeba --- /dev/null +++ b/docs/alertmanager-hermes-routing.yml @@ -0,0 +1,37 @@ +# Alertmanager Routing Configuration for Hermes +# +# Add this to your eth-docker Alertmanager configuration (e.g., alertmanager/config.yml) +# to route all alerts to the Hermes webhook receiver for normalization and reasoning. + +route: + # Default receiver for all alerts + receiver: 'hermes-webhook' + + # Optional: Group alerts to reduce notification spam + group_by: ['alertname', 'host'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + + routes: + # Route all alerts to Hermes. + # You can add specific matchers here if you only want certain alerts processed. 
+ - matchers: + - alertname=~".+" + receiver: 'hermes-webhook' + +receivers: + - name: 'hermes-webhook' + webhook_configs: + # Adjust the URL if webhook-receiver is deployed on a different host/port + - url: 'http://webhook-receiver:8090/webhook' + send_resolved: true + # Optional: Add basic auth or headers if your webhook receiver requires it + # http_config: + # basic_auth: + # username: 'hermes' + # password: 'your-secret' + +# Global settings (ensure these match your existing Alertmanager config) +global: + resolve_timeout: 5m \ No newline at end of file diff --git a/docs/functional-testing-runbook.md b/docs/functional-testing-runbook.md new file mode 100644 index 0000000..b43ea31 --- /dev/null +++ b/docs/functional-testing-runbook.md @@ -0,0 +1,161 @@ +# Functional Testing Runbook + +> **Objective:** Validate the end-to-end agentic node ops pipeline on a real `eth-docker` deployment before accumulating production data for Phase 5. +> **Prerequisite:** An active `eth-docker` deployment with Prometheus, Alertmanager, and Loki running. + +--- + +## 1. Pre-Flight Checks + +### 1.1 Verify Environment Variables +Ensure the following are set in your `.env` file (or passed via environment) before deployment: +```bash +DISCORD_WEBHOOK_URL="https://discord.com/api/webhooks/..." +NTFY_TOPIC="https://ntfy.sh/your-secret-topic" +``` + +### 1.2 Verify Alertmanager Routing +Add the following route to your `eth-docker` Alertmanager configuration (`alertmanager/config.yml`) to ensure alerts are routed to the webhook receiver: + +```yaml +route: + receiver: 'hermes-webhook' + routes: + - matchers: + - alertname=~".+" + receiver: 'hermes-webhook' + +receivers: + - name: 'hermes-webhook' + webhook_configs: + - url: 'http://webhook-receiver:8090/webhook' + send_resolved: true +``` +*Note: Adjust the URL if deploying outside the same Docker network, or ensure `webhook-receiver` is resolvable.* + +### 1.3 Verify Prometheus Rules +Ensure the self-monitoring rules are loaded. 
If using `eth-docker`, place `monitoring/rules/hermes.yml` in the Prometheus rules directory and reload Prometheus. + +--- + +## 2. Deployment + +### 2.1 Build and Start the Stack +From the `agentic-node-ops` root directory: +```bash +# Ensure the eth-docker network exists +docker network ls | grep ethd_default + +# Build and start the Hermes stack +docker compose up -d --build +``` + +### 2.2 Verify Health Endpoints +```bash +# Check webhook receiver health +curl http://localhost:8090/health + +# Check hermes-agent metrics (should return Prometheus text format) +curl http://localhost:8091/metrics | grep hermes_alive +``` +*Expected:* `hermes_alive 1.0` + +--- + +## 3. Test Scenarios + +### Scenario A: Basic Alert Processing & Notification (Tier 1) +**Goal:** Verify alert ingestion, context assembly, and two-tier notification. + +1. **Trigger:** Manually fire a test alert via Alertmanager or Prometheus: + ```yaml + # In Prometheus UI -> Alerting -> Rules, or via alertmanager API + alertname: "TestConsensusDesync" + severity: "high" + host: "test-host" + ``` +2. **Validate Webhook Receiver:** + ```bash + docker compose logs webhook-receiver | grep "Processed alert" + ``` + *Expected:* Log showing the alert was written to `alerts.jsonl`. +3. **Validate Hermes Agent:** + ```bash + docker compose logs hermes-agent | grep "Processed alert id=" + ``` + *Expected:* Log showing the alert was read, context was built, and dispatched. +4. **Validate Notifications:** Check Discord and ntfy.sh for the incoming alert. The message should include the `hermes_context` summary. + +### Scenario B: Storm Protection +**Goal:** Verify that rapid, repeated alerts are bundled. + +1. **Trigger:** Fire 4+ identical alerts for the same `host` within 30 seconds. +2. **Validate:** Check `webhook-receiver` logs: + ```bash + docker compose logs webhook-receiver | grep "storm_single_host" + ``` + *Expected:* A single bundled alert is written to `alerts.jsonl` with `alert_type: storm_single_host`. 
+ +### Scenario C: Deduplication +**Goal:** Verify that an alert re-fired within its cooldown window is deduplicated, and that it is processed again once the cooldown expires or its severity escalates. + +1. **Trigger:** Fire an alert, let it process, then resolve it (send `status: resolved`). +2. **Trigger Again:** Fire the exact same alert immediately. +3. **Validate:** Check `webhook-receiver` logs for `Deduplicated` or `skipped`. + *Expected:* The second firing is skipped. If you wait past the cooldown (e.g., 1 hour for high severity) or change severity to `critical`, it should process again. + +### Scenario D: Approval Flow & Execution (Tier 2/3) +**Goal:** Verify the state machine, fatigue prevention, and safe execution. + +1. **Trigger:** Fire an alert that matches a runbook with `requires_approval: true` (e.g., `client_crash`). +2. **Validate Proposal:** Check `hermes-agent` logs for `Proposing action`. +3. **Validate Database:** Inspect the SQLite DB for the pending proposal: + ```bash + docker compose exec hermes-agent sqlite3 /var/hermes/hermes.db \ + "SELECT id, alert_type, action_id, status FROM action_proposals WHERE status = 'pending';" + ``` +4. **Simulate Approval:** (In a real scenario, this is done via UI/API. For testing, update the DB directly or use the approval endpoint if exposed): + ```bash + docker compose exec hermes-agent sqlite3 /var/hermes/hermes.db \ + "UPDATE action_proposals SET status = 'approved' WHERE status = 'pending';" + ``` +5. **Validate Execution:** Check `hermes-agent` logs for `Executing action` and verify the `executor.py` successfully ran the command (e.g., `docker restart execution`). +6. **Validate Outcome:** Verify a record was written to the `runbook_outcomes` table. + +### Scenario E: Self-Monitoring Failure +**Goal:** Verify the agent detects its own silence. + +1. **Trigger:** Stop the hermes-agent container: + ```bash + docker compose stop hermes-agent + ``` +2. **Validate:** Wait 2 minutes. Check Prometheus for the `HermesAgentSilent` alert firing.
+ *Expected:* Alertmanager routes this to the webhook receiver, and you receive a critical notification. + +--- + +## 4. Post-Test Cleanup & Data Verification + +### 4.1 Verify Phase 5 Data Accumulation +Ensure the database is correctly logging outcomes for future synthesis: +```bash +docker compose exec hermes-agent sqlite3 /var/hermes/hermes.db \ + "SELECT COUNT(*) FROM incidents; SELECT COUNT(*) FROM runbook_outcomes;" +``` +*Expected:* Counts > 0 if Scenario A and D were successful. + +### 4.2 Teardown (Optional) +```bash +docker compose down -v # Removes volumes (WARNING: deletes accumulated test data) +``` + +--- + +## 5. Troubleshooting + +| Symptom | Likely Cause | Fix | +|---|---|---| +| `webhook-receiver` returns 500 | Prometheus/Loki unreachable | Verify `PROMETHEUS_URL` and network connectivity (`docker compose exec webhook-receiver curl http://prometheus:9090`) | +| `hermes-agent` fails to execute action | Nginx proxy blocking POST | Check `hermes-docker-proxy` logs. Ensure `nginx-docker-filter.conf` allows `POST /containers/{name}/restart` | +| Notifications not arriving | Missing env vars | Verify `DISCORD_WEBHOOK_URL` and `NTFY_TOPIC` are set in `.env` | +| `hermes_alive` is 0 | Agent crashed or deadlocked | Check `docker compose logs hermes-agent` for tracebacks | From 3a60df4184a75d999bc1f6a0b3781f4a610f0b43 Mon Sep 17 00:00:00 2001 From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 00:05:23 +0300 Subject: [PATCH 2/2] docs: fix functional testing runbook (correct SQL, trigger method, DB path, and Scenario E expectations) --- docs/alertmanager-hermes-routing.yml | 12 ++---- docs/functional-testing-runbook.md | 61 ++++++++++++++++++---------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/docs/alertmanager-hermes-routing.yml b/docs/alertmanager-hermes-routing.yml index 08ebeba..7fdd17c 100644 --- a/docs/alertmanager-hermes-routing.yml +++ 
b/docs/alertmanager-hermes-routing.yml @@ -6,20 +6,14 @@ route: # Default receiver for all alerts receiver: 'hermes-webhook' - + # Optional: Group alerts to reduce notification spam - group_by: ['alertname', 'host'] + # Note: eth-docker uses 'instance' or 'job' labels, not 'host' + group_by: ['alertname', 'instance'] group_wait: 30s group_interval: 5m repeat_interval: 4h - routes: - # Route all alerts to Hermes. - # You can add specific matchers here if you only want certain alerts processed. - - matchers: - - alertname=~".+" - receiver: 'hermes-webhook' - receivers: - name: 'hermes-webhook' webhook_configs: diff --git a/docs/functional-testing-runbook.md b/docs/functional-testing-runbook.md index b43ea31..9a2dcab 100644 --- a/docs/functional-testing-runbook.md +++ b/docs/functional-testing-runbook.md @@ -15,15 +15,15 @@ NTFY_TOPIC="https://ntfy.sh/your-secret-topic" ``` ### 1.2 Verify Alertmanager Routing -Add the following route to your `eth-docker` Alertmanager configuration (`alertmanager/config.yml`) to ensure alerts are routed to the webhook receiver: +Add the following route to your `eth-docker` Alertmanager configuration (`alertmanager/config.yml`) to ensure alerts are routed to the webhook receiver. (See `docs/alertmanager-hermes-routing.yml` for the full configuration). ```yaml route: receiver: 'hermes-webhook' - routes: - - matchers: - - alertname=~".+" - receiver: 'hermes-webhook' + group_by: ['alertname', 'instance'] + group_wait: 30s + group_interval: 5m + repeat_interval: 4h receivers: - name: 'hermes-webhook' @@ -67,12 +67,25 @@ curl http://localhost:8091/metrics | grep hermes_alive ### Scenario A: Basic Alert Processing & Notification (Tier 1) **Goal:** Verify alert ingestion, context assembly, and two-tier notification. -1. **Trigger:** Manually fire a test alert via Alertmanager or Prometheus: - ```yaml - # In Prometheus UI -> Alerting -> Rules, or via alertmanager API - alertname: "TestConsensusDesync" - severity: "high" - host: "test-host" +1. 
**Trigger:** Manually fire a test alert directly to the webhook receiver (isolates the pipeline from Alertmanager configuration): + ```bash + curl -X POST http://localhost:8090/webhook \ + -H "Content-Type: application/json" \ + -d '{ + "receiver": "hermes-webhook", + "status": "firing", + "alerts": [{ + "status": "firing", + "labels": { + "alertname": "ConsensusDesync", + "severity": "high", + "instance": "test-host" + }, + "annotations": {"summary": "Test alert"}, + "startsAt": "2025-01-01T00:00:00Z", + "fingerprint": "test-001" + }] + }' ``` 2. **Validate Webhook Receiver:** ```bash @@ -109,28 +122,34 @@ curl http://localhost:8091/metrics | grep hermes_alive 1. **Trigger:** Fire an alert that matches a runbook with `requires_approval: true` (e.g., `client_crash`). 2. **Validate Proposal:** Check `hermes-agent` logs for `Proposing action`. -3. **Validate Database:** Inspect the SQLite DB for the pending proposal: +3. **Validate Database:** Inspect the SQLite DB for the pending proposal (note: `outcome` is `NULL` when pending): ```bash - docker compose exec hermes-agent sqlite3 /var/hermes/hermes.db \ - "SELECT id, alert_type, action_id, status FROM action_proposals WHERE status = 'pending';" + docker compose exec hermes-agent sqlite3 /var/hermes/incidents.db \ + "SELECT ap.id, i.alert_type, ap.action_id, ap.outcome FROM action_proposals ap JOIN incidents i ON ap.incident_id = i.id WHERE ap.outcome IS NULL;" ``` -4. **Simulate Approval:** (In a real scenario, this is done via UI/API. For testing, update the DB directly or use the approval endpoint if exposed): +4. **Simulate Approval:** (In a real scenario, this is done via UI/API. 
For testing, update the DB directly): ```bash - docker compose exec hermes-agent sqlite3 /var/hermes/hermes.db \ - "UPDATE action_proposals SET status = 'approved' WHERE status = 'pending';" + docker compose exec hermes-agent sqlite3 /var/hermes/incidents.db \ + "UPDATE action_proposals SET outcome = 'approved', resolved_at = datetime('now') WHERE outcome IS NULL;" ``` 5. **Validate Execution:** Check `hermes-agent` logs for `Executing action` and verify the `executor.py` successfully ran the command (e.g., `docker restart execution`). 6. **Validate Outcome:** Verify a record was written to the `runbook_outcomes` table. ### Scenario E: Self-Monitoring Failure -**Goal:** Verify the agent detects its own silence. +**Goal:** Verify that Prometheus detects the agent's silence and that the agent drains the queued backlog after it restarts. 1. **Trigger:** Stop the hermes-agent container: ```bash docker compose stop hermes-agent ``` -2. **Validate:** Wait 2 minutes. Check Prometheus for the `HermesAgentSilent` alert firing. - *Expected:* Alertmanager routes this to the webhook receiver, and you receive a critical notification. +2. **Validate Queue Survival:** Wait 2 minutes. Check Prometheus for the `HermesAgentSilent` alert firing. Alertmanager will route this to the webhook receiver, which will append it to `alerts.jsonl`. +3. **Validate Backlog Drain:** Restart the agent: + ```bash + docker compose start hermes-agent + ``` + *Expected:* Upon restart, `hermes-agent` reads the backlogged `HermesAgentSilent` alert from the queue and sends the critical notification. This validates that the queue survived the outage and the backlog is drained correctly. + + > **Design Limitation Note:** Self-monitoring alerts cannot notify you in real time if `hermes-agent` itself is down, because the agent is the component that dispatches notifications.
If real-time silence alerting is required, you must configure a secondary notification path (e.g., Alertmanager routing `HermesAgentSilent` directly to a Discord webhook in addition to the Hermes receiver). --- @@ -139,7 +158,7 @@ curl http://localhost:8091/metrics | grep hermes_alive ### 4.1 Verify Phase 5 Data Accumulation Ensure the database is correctly logging outcomes for future synthesis: ```bash -docker compose exec hermes-agent sqlite3 /var/hermes/hermes.db \ +docker compose exec hermes-agent sqlite3 /var/hermes/incidents.db \ "SELECT COUNT(*) FROM incidents; SELECT COUNT(*) FROM runbook_outcomes;" ``` *Expected:* Counts > 0 if Scenario A and D were successful.