diff --git a/monitoring/rules/hermes.yml b/monitoring/rules/hermes.yml
new file mode 100644
index 0000000..a7497f9
--- /dev/null
+++ b/monitoring/rules/hermes.yml
@@ -0,0 +1,20 @@
+# Self-monitoring alerts for the Hermes agent and the webhook receiver.
+groups:
+  - name: hermes-self-monitoring
+    rules:
+      # Fires when hermes_alive is missing or equal to 0 for 2 minutes.
+      - alert: HermesAgentSilent
+        expr: absent(hermes_alive) or hermes_alive == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Hermes agent not responding"
+      # Fires when webhook_receiver_up is missing or equal to 0 for 30 seconds.
+      - alert: WebhookReceiverDown
+        expr: absent(webhook_receiver_up) or webhook_receiver_up == 0
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Alertmanager webhook receiver is down"
diff --git a/runbooks/client_crash.yaml b/runbooks/client_crash.yaml
new file mode 100644
index 0000000..60d505a
--- /dev/null
+++ b/runbooks/client_crash.yaml
@@ -0,0 +1,24 @@
+# Runbook for the client_crash alert: inspect the `execution` container,
+# then suggest a restart that requires approval.
+id: client_crash
+description: Ethereum client container has crashed or exited unexpectedly.
+triggers:
+  - alert_type: client_crash
+    min_severity: low
+diagnostics:
+  - id: check_container_status
+    cmd: "docker inspect --format='{{.State.Status}}' execution"
+    description: "Verify current container state"
+    timeout: 5s
+  - id: check_container_logs
+    cmd: "docker logs execution --tail 100"
+    description: "Inspect last 100 lines of container logs"
+    timeout: 10s
+suggested_actions:
+  - id: restart_client
+    description: "Restart the crashed client container"
+    cmd: "docker restart execution"
+    risk: low
+    reversible: true
+    requires_approval: true
+    approval_timeout: 10m
diff --git a/runbooks/validator_duty_misses.yaml b/runbooks/validator_duty_misses.yaml
new file mode 100644
index 0000000..4455af7
--- /dev/null
+++ b/runbooks/validator_duty_misses.yaml
@@ -0,0 +1,20 @@
+# Runbook for the validator_duty_misses alert: query the validator status
+# endpoint, then suggest a restart that requires approval.
+id: validator_duty_misses
+description: Validator is missing attestation or proposal duties.
+triggers:
+  - alert_type: validator_duty_misses
+    min_severity: low
+diagnostics:
+  - id: check_validator_status
+    cmd: "curl -s http://validator:5062/eth/v1/validator/status"
+    description: "Check validator client status"
+    timeout: 5s
+suggested_actions:
+  - id: restart_validator
+    description: "Restart the validator client"
+    cmd: "docker restart validator"
+    risk: low
+    reversible: true
+    requires_approval: true
+    approval_timeout: 15m