From fa0ed19aae20fd38acf835ace234fb3b418412b3 Mon Sep 17 00:00:00 2001
From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com>
Date: Tue, 9 Jun 2026 19:35:38 +0300
Subject: [PATCH 1/3] feat: add missing runbooks and prometheus alert rules

---
 docs/prometheus-alerts.yml          | 17 +++++++++++++++++
 runbooks/client_crash.yaml          | 21 +++++++++++++++++++++
 runbooks/validator_duty_misses.yaml | 17 +++++++++++++++++
 3 files changed, 55 insertions(+)
 create mode 100644 docs/prometheus-alerts.yml
 create mode 100644 runbooks/client_crash.yaml
 create mode 100644 runbooks/validator_duty_misses.yaml

diff --git a/docs/prometheus-alerts.yml b/docs/prometheus-alerts.yml
new file mode 100644
index 0000000..a7497f9
--- /dev/null
+++ b/docs/prometheus-alerts.yml
@@ -0,0 +1,17 @@
+groups:
+  - name: hermes-self-monitoring
+    rules:
+      - alert: HermesAgentSilent
+        expr: absent(hermes_alive) or hermes_alive == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Hermes agent not responding"
+      - alert: WebhookReceiverDown
+        expr: absent(webhook_receiver_up) or webhook_receiver_up == 0
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Alertmanager webhook receiver is down"
diff --git a/runbooks/client_crash.yaml b/runbooks/client_crash.yaml
new file mode 100644
index 0000000..9082204
--- /dev/null
+++ b/runbooks/client_crash.yaml
@@ -0,0 +1,21 @@
+id: client_crash
+description: Ethereum client container has crashed or exited unexpectedly.
+trigger:
+  alert_type: client_crash
+diagnostics:
+  - id: check_container_status
+    cmd: "docker inspect --format='{{.State.Status}}' {{ container }}"
+    description: "Verify current container state"
+    timeout: 5s
+  - id: check_container_logs
+    cmd: "docker logs {{ container }} --tail 100"
+    description: "Inspect last 100 lines of container logs"
+    timeout: 10s
+actions:
+  - id: restart_client
+    description: "Restart the crashed client container"
+    cmd: "docker start {{ container }}"
+    risk: low
+    reversible: true
+    requires_approval: true
+    approval_timeout: 10m
diff --git a/runbooks/validator_duty_misses.yaml b/runbooks/validator_duty_misses.yaml
new file mode 100644
index 0000000..7f229e1
--- /dev/null
+++ b/runbooks/validator_duty_misses.yaml
@@ -0,0 +1,17 @@
+id: validator_duty_misses
+description: Validator is missing attestation or proposal duties.
+trigger:
+  alert_type: validator_duty_misses
+diagnostics:
+  - id: check_validator_status
+    cmd: "curl -s http://validator:5042/eth/v1/validator/status"
+    description: "Check validator client status"
+    timeout: 5s
+actions:
+  - id: restart_validator
+    description: "Restart the validator client"
+    cmd: "docker restart validator"
+    risk: low
+    reversible: true
+    requires_approval: true
+    approval_timeout: 15m

From 7e218dbeb7c67ad592aa341f2ab585856778387a Mon Sep 17 00:00:00 2001
From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:33:04 +0300
Subject: [PATCH 2/3] fix: correct runbook schema (triggers/suggested_actions), hardcode service names, fix VC port, and move alerts to monitoring/rules/

---
 .../rules/hermes.yml                |  0
 runbooks/client_crash.yaml          | 15 ++++++++-------
 runbooks/validator_duty_misses.yaml | 11 ++++++-----
 3 files changed, 14 insertions(+), 12 deletions(-)
 rename docs/prometheus-alerts.yml => monitoring/rules/hermes.yml (100%)

diff --git a/docs/prometheus-alerts.yml b/monitoring/rules/hermes.yml
similarity index 100%
rename from docs/prometheus-alerts.yml
rename to monitoring/rules/hermes.yml
diff --git a/runbooks/client_crash.yaml b/runbooks/client_crash.yaml
index 9082204..8677460 100644
--- a/runbooks/client_crash.yaml
+++ b/runbooks/client_crash.yaml
@@ -1,21 +1,22 @@
 id: client_crash
 description: Ethereum client container has crashed or exited unexpectedly.
-trigger:
-  alert_type: client_crash
+triggers:
+  - alert_type: client_crash
+    min_severity: low
 diagnostics:
   - id: check_container_status
-    cmd: "docker inspect --format='{{.State.Status}}' {{ container }}"
+    cmd: "docker inspect --format='{{{{.State.Status}}}}' execution"
     description: "Verify current container state"
     timeout: 5s
   - id: check_container_logs
-    cmd: "docker logs {{ container }} --tail 100"
+    cmd: "docker logs execution --tail 100"
     description: "Inspect last 100 lines of container logs"
     timeout: 10s
-actions:
+suggested_actions:
   - id: restart_client
     description: "Restart the crashed client container"
-    cmd: "docker start {{ container }}"
+    cmd: "docker restart execution"
     risk: low
     reversible: true
     requires_approval: true
-    approval_timeout: 10m
+    approval_timeout: 10m
\ No newline at end of file
diff --git a/runbooks/validator_duty_misses.yaml b/runbooks/validator_duty_misses.yaml
index 7f229e1..4455af7 100644
--- a/runbooks/validator_duty_misses.yaml
+++ b/runbooks/validator_duty_misses.yaml
@@ -1,17 +1,18 @@
 id: validator_duty_misses
 description: Validator is missing attestation or proposal duties.
-trigger:
-  alert_type: validator_duty_misses
+triggers:
+  - alert_type: validator_duty_misses
+    min_severity: low
 diagnostics:
   - id: check_validator_status
-    cmd: "curl -s http://validator:5042/eth/v1/validator/status"
+    cmd: "curl -s http://validator:5062/eth/v1/validator/status"
     description: "Check validator client status"
     timeout: 5s
-actions:
+suggested_actions:
   - id: restart_validator
     description: "Restart the validator client"
     cmd: "docker restart validator"
     risk: low
     reversible: true
     requires_approval: true
-    approval_timeout: 15m
+    approval_timeout: 15m
\ No newline at end of file

From cea96faad13cc44a935b9bbf94bfd7336ae7bca4 Mon Sep 17 00:00:00 2001
From: "loki-hermes-agent[bot]" <3150032+loki-hermes-agent[bot]@users.noreply.github.com>
Date: Wed, 10 Jun 2026 23:36:43 +0300
Subject: [PATCH 3/3] fix: correct docker inspect format string (single braces for Go template)

---
 runbooks/client_crash.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runbooks/client_crash.yaml b/runbooks/client_crash.yaml
index 8677460..60d505a 100644
--- a/runbooks/client_crash.yaml
+++ b/runbooks/client_crash.yaml
@@ -5,7 +5,7 @@ triggers:
     min_severity: low
 diagnostics:
   - id: check_container_status
-    cmd: "docker inspect --format='{{{{.State.Status}}}}' execution"
+    cmd: "docker inspect --format='{{.State.Status}}' execution"
     description: "Verify current container state"
     timeout: 5s
   - id: check_container_logs