From 492be6ee39990aef7c1a6db83e529611f946b326 Mon Sep 17 00:00:00 2001 From: Evan Nemerson Date: Tue, 9 Jun 2026 23:48:07 -0400 Subject: [PATCH] CP-42614: Add monitoring discovery model (enabled + discovery.method) The chart's monitoring integration could advertise the agent's metrics two ways at once -- prometheus.io/* annotations and ServiceMonitor CRDs -- so the same targets could be scraped twice. This finalizes the still-in-validation monitoring configuration into an explicit, opt-in model. Implementation Approach: The chart emits two kinds of monitoring resources -- Prometheus Operator CRDs (ServiceMonitor + PrometheusRule) and prometheus.io/* annotations on the agent Services. Both are driven by two values resolved in helm/templates/_helpers.tpl and consumed by the Service, ServiceMonitor, and PrometheusRule templates. There is no cluster auto-detection; the configuration is explicit. This finalizes the still-in-validation monitoring integration added in CP-34935, so its defaults change: the default install emits nothing, and enabled:true selects one mechanism via discovery.method rather than emitting annotations and ServiceMonitors together. Functional Requirements: 1. Monitoring must be off by default and turned on explicitly. Added components.monitoring.enabled as a plain boolean defaulting to false. The four servicemonitor-*.yaml templates, prometheusrule.yaml, and the prometheus.io/* annotations on the three Service templates are all gated on it via helpers in _helpers.tpl. 2. When enabled, the operator must be able to choose the discovery mechanism. Added components.monitoring.discovery.method (auto | serviceMonitors | annotations, default auto). auto resolves to serviceMonitors today and is left as an enum so a future mechanism can extend it; annotations is an explicit opt-in. serviceMonitors emits monitoring.coreos.com/v1 CRDs, so the install fails if the Prometheus Operator is absent; annotations needs no CRDs. 3. 
The ServiceMonitors and the PrometheusRule must form a single bundle; annotation-based discovery is discovery-only. serviceMonitorsActive and rulesActive resolve together on the serviceMonitors path, so method=annotations yields no ServiceMonitors and no alert rules. 4. The new values must validate and be documented for users. Updated helm/values.schema.yaml (enabled: boolean; discovery.method enum) and regenerated helm/values.schema.json; set defaults and comments in helm/values.yaml; rewrote helm/docs/monitoring-infrastructure.md as user-facing reference. Validation: - helm lint passing - helm unit tests passing; helm/tests/monitoring_integration_test.yaml and defaults_service_test.yaml were rewritten for the new model - helm schema tests passing, including new fixtures under tests/helm/schema for the default, serviceMonitors, annotations, none, and invalid-value cases. kubeconform validates the rendered ServiceMonitor and PrometheusRule against the CRD catalog - golden manifests regenerated; a default `helm template` emits no ServiceMonitors, no PrometheusRule, and no prometheus.io/* annotations - helm subchart tests passing Co-Authored-By: Claude Opus 4.8 --- app/functions/helmless/default-values.yaml | 39 ++- helm/docs/monitoring-infrastructure.md | 168 +++---------- helm/templates/_helpers.tpl | 74 +++--- helm/templates/agent-service.yaml | 6 +- helm/templates/aggregator-service.yaml | 6 +- helm/templates/prometheusrule.yaml | 6 +- helm/templates/servicemonitor-agent.yaml | 2 +- helm/templates/servicemonitor-collector.yaml | 2 +- helm/templates/servicemonitor-shipper.yaml | 2 +- helm/templates/servicemonitor-webhook.yaml | 2 +- helm/templates/webhook-service.yaml | 6 +- helm/tests/defaults_service_test.yaml | 70 ++++++ helm/tests/monitoring_integration_test.yaml | 231 ++---------------- helm/values.schema.json | 24 +- helm/values.schema.yaml | 54 ++-- helm/values.yaml | 39 ++- ...monitoring.discovery.annotations.pass.yaml | 9 + 
.../monitoring.discovery.default.pass.yaml | 3 + ...itoring.discovery.method.invalid.fail.yaml | 6 + .../monitoring.discovery.none.pass.yaml | 5 + ...toring.discovery.servicemonitors.pass.yaml | 10 + .../monitoring.enabled.invalid-type.fail.yaml | 5 + tests/helm/template/alloy.yaml | 18 +- tests/helm/template/cert-manager.yaml | 18 +- tests/helm/template/federated.yaml | 18 +- tests/helm/template/istio.yaml | 18 +- tests/helm/template/kubestate.yaml | 18 +- tests/helm/template/manifest.yaml | 18 +- 28 files changed, 331 insertions(+), 546 deletions(-) create mode 100644 tests/helm/schema/monitoring.discovery.annotations.pass.yaml create mode 100644 tests/helm/schema/monitoring.discovery.default.pass.yaml create mode 100644 tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml create mode 100644 tests/helm/schema/monitoring.discovery.none.pass.yaml create mode 100644 tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml create mode 100644 tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml diff --git a/app/functions/helmless/default-values.yaml b/app/functions/helmless/default-values.yaml index d3762feb1..cbe33c397 100644 --- a/app/functions/helmless/default-values.yaml +++ b/app/functions/helmless/default-values.yaml @@ -867,29 +867,26 @@ components: # https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - # Prometheus Operator monitoring integration. + # Monitoring integration with the customer's Prometheus stack. # - # When enabled, the chart creates ServiceMonitor and PrometheusRule CRDs that - # allow the Prometheus Operator to automatically discover and scrape CloudZero - # Agent metrics, and to evaluate pre-configured alert rules. - # - # This follows the same null/true/false pattern as integrations.istio: - # - # - null (default): Follow the release default. Currently maps to "false" - # (disabled) while this feature is being validated. In a future release, - # null will map to "auto". 
Use null unless you have a reason to override. - # - "auto": Auto-detect. Creates monitoring CRDs only if the Prometheus - # Operator CRDs (monitoring.coreos.com/v1) are available in the cluster. - # - true: Always create monitoring CRDs. Helm will fail if the Prometheus - # Operator CRDs are not installed in the cluster. - # - false: Never create monitoring CRDs. - # - # To opt in now, set to "auto" or true. - # - # Regardless of this setting, prometheus.io/* annotations are always added to - # Services for customers using standard Prometheus service discovery. + # The chart integrates two ways: via the Prometheus Operator (ServiceMonitor + + # PrometheusRule CRDs) and/or via prometheus.io/* annotations on Services. monitoring: - enabled: null + # Whether to create the monitoring resources. Off by default; set to true to + # enable. With the default discovery method (serviceMonitors), this creates + # monitoring.coreos.com/v1 CRDs, so the install fails if the Prometheus + # Operator is not present -- use discovery.method: annotations otherwise. + enabled: false + + discovery: + # How your Prometheus discovers the agent's metrics (when monitoring is on). + # + # - auto (default): use ServiceMonitors. + # - serviceMonitors: create ServiceMonitor resources for the Prometheus + # Operator, along with the alert rules. + # - annotations: add prometheus.io/* annotations to the Services instead, for + # a Prometheus that scrapes by annotation. No alert rules. + method: auto # Namespace override for PrometheusRule and ServiceMonitor CRDs. # null (default) = same namespace as the agent installation. 
diff --git a/helm/docs/monitoring-infrastructure.md b/helm/docs/monitoring-infrastructure.md index d346d131d..686f492d1 100644 --- a/helm/docs/monitoring-infrastructure.md +++ b/helm/docs/monitoring-infrastructure.md @@ -1,37 +1,48 @@ # CloudZero Agent Monitoring Infrastructure This document describes the monitoring resources shipped with the CloudZero -Agent Helm chart: what they are, what they cover, what they don't cover, and -how they were validated. +Agent Helm chart: what they are, what they cover, and what they don't cover. ## Overview -The chart provides two categories of monitoring integration: +The chart can expose the CloudZero Agent's own metrics to your Prometheus and +ship a set of alert rules. It is off by default; two values turn it on and choose +how. -1. **Prometheus `prometheus.io/*` annotations** on all Services (always enabled). - These allow standard Prometheus installations using `kubernetes_sd_configs` - to auto-discover and scrape CloudZero Agent metrics without any CRDs. +**`components.monitoring.enabled`** — whether the monitoring resources are created: -2. **Prometheus Operator CRDs** (opt-in via `components.monitoring.enabled`). - When enabled, the chart creates `ServiceMonitor` and `PrometheusRule` - resources that the Prometheus Operator automatically picks up. +- `false` (default): off. +- `true`: on. -These resources are designed to be useful regardless of the customer's -monitoring stack. The `ServiceMonitor` and `PrometheusRule` CRDs are the -standard interoperability format understood by the Prometheus Operator, but -also by compatible tools like Victoria Metrics Operator, Datadog (via its -Prometheus integration), Grafana Agent, and others. +**`components.monitoring.discovery.method`** — how your Prometheus discovers the +agent's metrics, when monitoring is on: + +- `auto` (default): use ServiceMonitors. +- `serviceMonitors`: create `ServiceMonitor` resources for the Prometheus Operator, + along with the alert rules. 
These are Prometheus Operator resources, so the + install fails if the Operator isn't present. +- `annotations`: add `prometheus.io/*` annotations to the agent's Services instead, + for a Prometheus configured to scrape by annotation. No alert rules. + +To turn monitoring on, set `enabled: true`. Keep the default `method` if you run the +Prometheus Operator; use `annotations` if your Prometheus discovers targets by +annotation instead. ## Configuration ```yaml components: monitoring: - # null = auto-detect (install CRDs if Prometheus Operator is present) - # true = always install CRDs (fails if Prometheus Operator absent) - # false = never install CRDs (default while feature is being validated) + # Whether to create the monitoring resources. false (default) = off; true = on. enabled: false + discovery: + # How Prometheus discovers the agent's metrics: + # auto (default) -> ServiceMonitors + # serviceMonitors -> ServiceMonitor resources + alert rules + # annotations -> prometheus.io/* annotations only (no alert rules) + method: auto + # Override namespace for CRDs (default: same as agent namespace) namespace: "" @@ -177,126 +188,3 @@ installation, the following may be absent until their trigger condition occurs: Alerts that reference absent metrics evaluate to "no data" rather than firing, which is the correct behavior (absence of the failure counter means no failures have occurred). - -## Validation Results - -Tested on the `bach` cluster (GKE, `kube-prometheus-stack` installed) with -an intentionally invalid API key to trigger failure-path alerts. 
- -### Scrape Targets - -All four targets verified as `UP` in Prometheus: - -| Target | Endpoint | Status | -| ------------------------ | ------------------------ | ------------------------ | -| `cz-agent-cz-server` | `http-metrics` (9090) | UP | -| `cz-agent-cz-aggregator` | `metrics` (8080) | UP | -| `cz-agent-cz-aggregator` | `shipper-metrics` (8081) | UP | -| `cz-agent-cz-webhook` | `http` (8443) | UP (when pod is healthy) | - -### Metric Availability - -All metrics referenced by alert rules were verified present in Prometheus: - -| Metric | Source | Time Series Count | Notes | -| ---------------------------------------------------------------- | ------------------ | ----------------- | -------------------------------------------------------- | -| `metrics_received_total` | Collector | 4 | Present immediately | -| `metrics_received_cost_total` | Collector | 1 | Present immediately | -| `czo_webhook_types_total` | Webhook | 41 | Present immediately (multiple label combinations) | -| `function_execution_seconds_bucket` | Webhook | 33 | Present immediately (histogram buckets) | -| `prometheus_remote_storage_queue_highest_timestamp_seconds` | Server | 1 | Present immediately | -| `prometheus_remote_storage_queue_highest_sent_timestamp_seconds` | Server | 1 | Present immediately | -| `shipper_run_fail_total` | Shipper | 3 | Appeared after first failed upload (invalid API key) | -| `shipper_handle_request_success_total` | Shipper | 0 | Expected absent (no successful uploads with invalid key) | -| `remote_write_failures_total` | Webhook | 0 | Counter, appears after first failure event | -| `container_memory_working_set_bytes` | kubelet/cAdvisor | 244 | Standard Kubernetes metric (cluster-wide) | -| `kube_pod_container_resource_limits` | kube-state-metrics | 315 | Standard Kubernetes metric (cluster-wide) | - -### Alert Firing Behavior - -Validated using multiple test scenarios on the `bach` cluster: - -**Scenario 1: Invalid API key (natural failure path)** - -| Alert 
| State | Notes | -| ------------------------------------- | ------------------ | --------------------------------------------------------- | -| `CloudZeroShipperUploadFailures` | **Pending/Firing** | Correctly detected `error_status_code="err-unauthorized"` | -| `CloudZeroWebhookNoEvents` | **Firing** | Webhook pod on unhealthy node; no admission events | -| `CloudZeroWebhookRemoteWriteFailures` | **Firing** | Webhook unable to push metadata | -| `CloudZeroRemoteWriteLag` | **Pending** | Remote-write queue falling behind | - -**Scenario 2: Unschedulable pod (memory request set to 64Gi)** - -| Alert | State | Notes | -| -------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `CloudZeroAgentDown` | **Pending** | `absent(up{...})` correctly detected missing target. Original `up == 0` expression would NOT have fired because the metric becomes absent (not zero) when a pod disappears entirely. Fixed to use `up == 0 or absent(up{...})`. | - -**Scenario 3: Memory pressure (memory limit set to 65Mi vs 73.7Mi working set)** - -| Alert | State | Notes | -| ----------------------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `CloudZeroAgentHighMemoryUsage` | **Pending** | Memory ratio at 93.9%. Required `on(namespace, pod, container)` join -- without it, the division silently produced zero results due to label mismatch between cAdvisor and kube-state-metrics. | -| `CloudZeroAgentContainerRestarting` | **Pending** | Container restarting from memory pressure. Caught restart loop alongside the memory alert. 
| - -**Scenario 4: OOM kill (memory limit set to 30Mi vs 73.7Mi working set)** - -| Alert | State | Notes | -| ----------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `CloudZeroAgentOOMKilled` | **Firing** | `kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}` correctly detected OOM kill within 1 minute. Standard KSM metric, no configuration changes required. | -| `CloudZeroAgentContainerRestarting` | **Pending** | Caught the CrashLoopBackOff from repeated OOM kills. | -| `CloudZeroAgentDown` | **Pending** | Server target gone due to crash loop. | - -**Scenario 5: Collector starved (server + webhook scaled to 0)** - -| Alert | State | Notes | -| ----------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `CloudZeroCollectorNoMetrics` | **Pending** | Correctly detected that `rate(metrics_received_total[10m]) == 0`. Note: scaling only the server to 0 is insufficient because the webhook also remote-writes to the collector. Both data sources must be stopped for this alert to fire. | - -**Alerts validated as correctly inactive during normal operation:** - -| Alert | Notes | -| ----------------------------------- | --------------------------------------------------------------------------------------------------------------- | -| `CloudZeroWebhookServerHighLatency` | p99 latency well below 500ms threshold. Would require sustained load to trigger. | -| `CloudZeroMissingContainerMetrics` | Container-level cAdvisor metrics present (bach uses containerd CRI). Would require Docker-based CRI to trigger. 
| - -**Alerts with intentional design limitations:** - -| Alert | Notes | -| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `CloudZeroShipperNoSuccessfulUploads` | Detects "was working, then stopped" -- NOT "never worked at all." When the success counter has never been incremented, it is absent in Prometheus and `rate()` returns no data rather than 0. This is intentional: the "never worked" case is covered by `CloudZeroShipperUploadFailures` (which fires on upload errors) and the OOM/restart/down alerts (which catch container-level failures). | - -### Bugs Found and Fixed During Testing - -1. **`absent()` bug in AgentDown/WebhookDown**: When a pod is completely - absent (unschedulable, scaled to 0), the `up` metric becomes absent rather - than zero. The original `up == 0` expression never fires. Fixed to use - `up == 0 or absent(up{...})`. - -2. **Label join bug in HighMemoryUsage**: `container_memory_working_set_bytes` - (from cAdvisor/kubelet) and `kube_pod_container_resource_limits` (from - kube-state-metrics) have different label sets. Without an explicit - `on(namespace, pod, container)` join, Prometheus's implicit matching - silently produces zero results. Fixed by adding the explicit join. - -3. **Job label mismatch in collector/shipper alerts**: The Prometheus Operator - assigns the `job` label from the Service name, not the ServiceMonitor name. - Both collector and shipper share the aggregator Service, so they get the - same `job` label. Fixed by adding `endpoint` label selectors to - distinguish them. 
- -### Auto-Detection Logic - -Tested via `helm template` with all three modes: - -| `components.monitoring.enabled` | ServiceMonitors | PrometheusRules | `prometheus.io/*` annotations | -| ------------------------------- | --------------- | --------------- | ----------------------------- | -| `null` (no CRDs in cluster) | 0 | 0 | 3 (always) | -| `true` | 4 | 1 | 3 (always) | -| `false` | 0 | 0 | 3 (always) | - -### Test Suite - -- Helm lint: passing -- Helm unit tests: 498/498 passing -- Helm schema tests: all passing (includes `components.monitoring.enabled` - null/true/false validation) diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index 8847b54b2..a765ef3ec 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -1679,42 +1679,52 @@ checks: {{ $checks | toYaml | nindent 2 -}} {{- end -}} {{/* -Prometheus Operator Monitoring Enabled Helper +Prometheus monitoring resolution helpers. + +The chart integrates with a customer's Prometheus stack through the Prometheus +Operator (ServiceMonitor + PrometheusRule CRDs) and/or prometheus.io/* annotations. + + components.monitoring.enabled (true|false, default false) -- whether any + monitoring resources are created. Explicit; there is no auto-detection. + components.monitoring.discovery.method (auto|serviceMonitors|annotations) -- when + enabled, the discovery mechanism: + auto (default) = resolves to serviceMonitors today; left as an enum so a + future mechanism can extend it without changing the default + serviceMonitors = ServiceMonitor CRDs (+ the PrometheusRule alert bundle). + These are monitoring.coreos.com/v1 CRDs, so the install + fails if the Prometheus Operator is not present. + annotations = prometheus.io/* annotations only (no Operator CRDs, no alerts) + +The ServiceMonitors and the PrometheusRule are one Operator bundle: both ride the +serviceMonitors path. Choosing annotations is discovery-only (no alert bundle). +Each helper returns "true"/"". 
The `dig` default mirrors values.yaml so values +files predating `discovery` still resolve. +*/}} +{{/* Resolved discovery method: "annotations" only when explicitly chosen; "auto" and + "serviceMonitors" both resolve to serviceMonitors (auto is extensible in future). */}} +{{- define "cloudzero-agent.monitoring.method" -}} +{{- if eq (toString (dig "discovery" "method" "auto" .Values.components.monitoring)) "annotations" -}} +{{- "annotations" -}} +{{- else -}} +{{- "serviceMonitors" -}} +{{- end -}} +{{- end -}} -Determines whether Prometheus Operator CRDs (ServiceMonitor, PrometheusRule) should -be created. +{{- define "cloudzero-agent.monitoring.serviceMonitorsActive" -}} +{{- if dig "enabled" false .Values.components.monitoring -}} +{{- if eq (include "cloudzero-agent.monitoring.method" .) "serviceMonitors" -}}{{- true -}}{{- end -}} +{{- end -}} +{{- end -}} - - null (default): Follow the release default. Currently maps to "false" - (disabled) while the feature is being validated in customer environments. - In a future release, null will map to "auto". - - "auto": Auto-detect via CRD presence in the cluster. Creates monitoring - resources only if the Prometheus Operator CRDs (monitoring.coreos.com/v1) - are available. - - true: Force enable (will fail if CRDs are not installed) - - false: Force disable +{{/* The PrometheusRule alert bundle ships with the ServiceMonitor (operator) path. */}} +{{- define "cloudzero-agent.monitoring.rulesActive" -}} +{{- include "cloudzero-agent.monitoring.serviceMonitorsActive" . -}} +{{- end -}} -Usage: {{ if include "cloudzero-agent.monitoring.enabled" . }}...{{ end }} -Returns: "true" (truthy) when enabled, empty string (falsy) when disabled -*/}} -{{- define "cloudzero-agent.monitoring.enabled" -}} -{{- $monitoringSetting := .Values.components.monitoring.enabled -}} -{{- if kindIs "invalid" $monitoringSetting -}} - {{- /* null/not set = release default. Currently: disabled. 
- Change this block to auto-detect when promoting to GA: - if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" - true - end - */ -}} -{{- else if eq (toString $monitoringSetting) "auto" -}} - {{- /* "auto" = detect Prometheus Operator CRDs */ -}} - {{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" -}} - {{- true -}} - {{- end -}} -{{- else if eq (toString $monitoringSetting) "true" -}} - {{- /* true = force enabled */ -}} - {{- true -}} +{{- define "cloudzero-agent.monitoring.annotationsActive" -}} +{{- if dig "enabled" false .Values.components.monitoring -}} +{{- if eq (include "cloudzero-agent.monitoring.method" .) "annotations" -}}{{- true -}}{{- end -}} {{- end -}} -{{- /* false = force disabled, returns empty string */ -}} {{- end -}} {{/* diff --git a/helm/templates/agent-service.yaml b/helm/templates/agent-service.yaml index eff24e69f..3df51b7a0 100644 --- a/helm/templates/agent-service.yaml +++ b/helm/templates/agent-service.yaml @@ -11,11 +11,15 @@ metadata: .Values.commonMetaLabels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if include "cloudzero-agent.monitoring.annotationsActive" . -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . "annotations" (list .Values.defaults.annotations - (dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics") + $promAnnotations ) ) | nindent 2 }} spec: diff --git a/helm/templates/aggregator-service.yaml b/helm/templates/aggregator-service.yaml index be32b42ba..d9ed2f528 100644 --- a/helm/templates/aggregator-service.yaml +++ b/helm/templates/aggregator-service.yaml @@ -12,12 +12,16 @@ metadata: .Values.components.aggregator.labels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if include "cloudzero-agent.monitoring.annotationsActive" . 
-}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . "annotations" (list .Values.defaults.annotations .Values.components.aggregator.annotations - (dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics") + $promAnnotations ) ) | nindent 2 }} spec: diff --git a/helm/templates/prometheusrule.yaml b/helm/templates/prometheusrule.yaml index 92d2928e0..18145dc99 100644 --- a/helm/templates/prometheusrule.yaml +++ b/helm/templates/prometheusrule.yaml @@ -6,13 +6,13 @@ up. These alerts cover the critical failure modes documented in the "Monitoring the CloudZero Agent" wiki page. Requires the Prometheus Operator CRDs (monitoring.coreos.com/v1) to be installed -in the cluster. Controlled by components.monitoring.enabled (null=auto-detect, -true=force, false=disable). +in the cluster. Rendered only when components.monitoring.enabled is true and +discovery.method resolves to serviceMonitors; without the Operator CRDs, apply fails. Alert expressions use the Helm-generated service names to construct job label selectors, so they work regardless of the Helm release name. */}} -{{- if include "cloudzero-agent.monitoring.enabled" . }} +{{- if include "cloudzero-agent.monitoring.rulesActive" . }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: diff --git a/helm/templates/servicemonitor-agent.yaml b/helm/templates/servicemonitor-agent.yaml index c22ddcd90..3498525db 100644 --- a/helm/templates/servicemonitor-agent.yaml +++ b/helm/templates/servicemonitor-agent.yaml @@ -7,7 +7,7 @@ prometheus_remote_storage_* counters, and other Prometheus native metrics. See: Monitoring the CloudZero Agent > Monitoring Prometheus Agent Scrape Health */}} -{{- if include "cloudzero-agent.monitoring.enabled" . 
}} +{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: diff --git a/helm/templates/servicemonitor-collector.yaml b/helm/templates/servicemonitor-collector.yaml index 44719d59d..02992536d 100644 --- a/helm/templates/servicemonitor-collector.yaml +++ b/helm/templates/servicemonitor-collector.yaml @@ -7,7 +7,7 @@ metrics_received_cost_total, and http_request_duration_seconds. See: Monitoring the CloudZero Agent > Monitoring Data Pipeline Health */}} -{{- if include "cloudzero-agent.monitoring.enabled" . }} +{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: diff --git a/helm/templates/servicemonitor-shipper.yaml b/helm/templates/servicemonitor-shipper.yaml index 5f2371eb2..be47046be 100644 --- a/helm/templates/servicemonitor-shipper.yaml +++ b/helm/templates/servicemonitor-shipper.yaml @@ -8,7 +8,7 @@ shipper_disk_* gauges. See: Monitoring the CloudZero Agent > Monitoring Shipper Health */}} -{{- if include "cloudzero-agent.monitoring.enabled" . }} +{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: diff --git a/helm/templates/servicemonitor-webhook.yaml b/helm/templates/servicemonitor-webhook.yaml index 3ee47911b..c3ece3e85 100644 --- a/helm/templates/servicemonitor-webhook.yaml +++ b/helm/templates/servicemonitor-webhook.yaml @@ -18,7 +18,7 @@ which means Prometheus connects directly to the webhook (bypassing the sidecar). See: Monitoring the CloudZero Agent > Monitoring Webhook Event Processing Monitoring the CloudZero Agent > Monitoring Webhook Metadata Delivery */}} -{{- if include "cloudzero-agent.monitoring.enabled" . }} +{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . 
}} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: diff --git a/helm/templates/webhook-service.yaml b/helm/templates/webhook-service.yaml index b3703de4d..b3d367d7d 100644 --- a/helm/templates/webhook-service.yaml +++ b/helm/templates/webhook-service.yaml @@ -11,13 +11,17 @@ metadata: .Values.components.webhookServer.labels ) ) | nindent 2 }} + {{- $promAnnotations := dict -}} + {{- if include "cloudzero-agent.monitoring.annotationsActive" . -}} + {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https" -}} + {{- end -}} {{- include "cloudzero-agent.generateAnnotations" (dict "root" . "annotations" (list .Values.defaults.annotations .Values.components.webhookServer.annotations (dict "nginx.ingress.kubernetes.io/ssl-redirect" "false") - (dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https") + $promAnnotations ) ) | nindent 2 }} namespace: {{ .Release.Namespace }} diff --git a/helm/tests/defaults_service_test.yaml b/helm/tests/defaults_service_test.yaml index 9aa083d58..8653a86d2 100644 --- a/helm/tests/defaults_service_test.yaml +++ b/helm/tests/defaults_service_test.yaml @@ -6,11 +6,16 @@ # Services only support metadata-level defaults (labels and annotations). # PodSpec defaults (affinity, tolerations, etc.) do not apply to Services. # +# Also validates that monitoring.discovery.method controls the +# prometheus.io/* annotations on Services. 
+# # Templates tested: +# - agent-service.yaml # - aggregator-service.yaml # - webhook-service.yaml suite: defaults.* properties apply to Service resources templates: + - agent-service.yaml - aggregator-service.yaml - webhook-service.yaml tests: @@ -91,3 +96,68 @@ tests: - equal: path: metadata.annotations.test-defaults-annotation value: sentinel-value-annotation + + # ============================================================================ + # monitoring.discovery.method: prometheus.io/* annotation gating. + # Annotations render only when monitoring is enabled AND method=annotations. + # enabled defaults to false, so by default the annotations are omitted. + # ============================================================================ + - it: should omit prometheus.io annotations on agent-service by default + template: agent-service.yaml + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] + + - it: should omit prometheus.io annotations on aggregator-service by default + template: aggregator-service.yaml + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] + + - it: should omit prometheus.io annotations on webhook-service by default + template: webhook-service.yaml + set: + insightsController.enabled: true + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] + + - it: should add prometheus.io annotations on agent-service when enabled + method=annotations + template: agent-service.yaml + set: + components.monitoring.enabled: true + components.monitoring.discovery.method: annotations + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should add prometheus.io annotations on aggregator-service when enabled + method=annotations + template: aggregator-service.yaml + set: + components.monitoring.enabled: true + components.monitoring.discovery.method: annotations + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: 
"true" + + - it: should add prometheus.io annotations on webhook-service when enabled + method=annotations + template: webhook-service.yaml + set: + insightsController.enabled: true + components.monitoring.enabled: true + components.monitoring.discovery.method: annotations + asserts: + - equal: + path: metadata.annotations["prometheus.io/scrape"] + value: "true" + + - it: should NOT add prometheus.io annotations when method=serviceMonitors (operator path) + template: agent-service.yaml + set: + components.monitoring.enabled: true + components.monitoring.discovery.method: serviceMonitors + asserts: + - isNull: + path: metadata.annotations["prometheus.io/scrape"] diff --git a/helm/tests/monitoring_integration_test.yaml b/helm/tests/monitoring_integration_test.yaml index f7209bedd..a28b08483 100644 --- a/helm/tests/monitoring_integration_test.yaml +++ b/helm/tests/monitoring_integration_test.yaml @@ -7,71 +7,10 @@ templates: - servicemonitor-webhook.yaml tests: # ============================================================================ - # enabled=null (release default, currently maps to disabled) + # enabled gates everything (plain boolean, default false) # ============================================================================ - - it: "null: should NOT create PrometheusRule (release default is disabled)" + - it: "enabled=false (default): no PrometheusRule" template: prometheusrule.yaml - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: null - asserts: - - hasDocuments: - count: 0 - - - it: "null: should NOT create ServiceMonitor for agent" - template: servicemonitor-agent.yaml - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: null - asserts: - - hasDocuments: - count: 0 - - - it: "null: should NOT create ServiceMonitor for collector" - template: servicemonitor-collector.yaml - set: - apiKey: "test-key" - existingSecretName: 
null - clusterName: "test-cluster" - components.monitoring.enabled: null - asserts: - - hasDocuments: - count: 0 - - - it: "null: should NOT create ServiceMonitor for shipper" - template: servicemonitor-shipper.yaml - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: null - asserts: - - hasDocuments: - count: 0 - - - it: "null: should NOT create ServiceMonitor for webhook" - template: servicemonitor-webhook.yaml - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: null - asserts: - - hasDocuments: - count: 0 - - # ============================================================================ - # enabled=false (explicit disable) - # ============================================================================ - - it: "false: should NOT create PrometheusRule even with CRDs present" - template: prometheusrule.yaml - capabilities: - apiVersions: - - monitoring.coreos.com/v1 set: apiKey: "test-key" existingSecretName: null @@ -81,11 +20,8 @@ tests: - hasDocuments: count: 0 - - it: "false: should NOT create any ServiceMonitors even with CRDs present" + - it: "enabled=false (default): no ServiceMonitor" template: servicemonitor-agent.yaml - capabilities: - apiVersions: - - monitoring.coreos.com/v1 set: apiKey: "test-key" existingSecretName: null @@ -95,10 +31,7 @@ tests: - hasDocuments: count: 0 - # ============================================================================ - # enabled=true (force enable) - # ============================================================================ - - it: "true: should create PrometheusRule" + - it: "enabled=true: creates PrometheusRule (method auto -> serviceMonitors)" template: prometheusrule.yaml set: apiKey: "test-key" @@ -111,7 +44,7 @@ tests: - isKind: of: PrometheusRule - - it: "true: should create ServiceMonitor for agent" + - it: "enabled=true: creates ServiceMonitor (method auto -> 
serviceMonitors)" template: servicemonitor-agent.yaml set: apiKey: "test-key" @@ -124,39 +57,41 @@ tests: - isKind: of: ServiceMonitor - - it: "true: should create ServiceMonitor for collector" - template: servicemonitor-collector.yaml + # ============================================================================ + # discovery.method = annotations: discovery-only, no Operator bundle + # ============================================================================ + - it: "method=annotations: no ServiceMonitor" + template: servicemonitor-agent.yaml set: apiKey: "test-key" existingSecretName: null clusterName: "test-cluster" components.monitoring.enabled: true + components.monitoring.discovery.method: annotations asserts: - hasDocuments: - count: 1 - - isKind: - of: ServiceMonitor + count: 0 - - it: "true: should create ServiceMonitor for shipper" - template: servicemonitor-shipper.yaml + - it: "method=annotations: no PrometheusRule" + template: prometheusrule.yaml set: apiKey: "test-key" existingSecretName: null clusterName: "test-cluster" components.monitoring.enabled: true + components.monitoring.discovery.method: annotations asserts: - hasDocuments: - count: 1 - - isKind: - of: ServiceMonitor + count: 0 - - it: "true: should create ServiceMonitor for webhook" - template: servicemonitor-webhook.yaml + - it: "method=serviceMonitors (explicit): creates ServiceMonitor" + template: servicemonitor-collector.yaml set: apiKey: "test-key" existingSecretName: null clusterName: "test-cluster" components.monitoring.enabled: true + components.monitoring.discovery.method: serviceMonitors asserts: - hasDocuments: count: 1 @@ -164,71 +99,7 @@ tests: of: ServiceMonitor # ============================================================================ - # enabled="auto" with CRDs present (should create) - # ============================================================================ - - it: "auto: should create PrometheusRule when Prometheus Operator CRDs present" - template: 
prometheusrule.yaml - capabilities: - apiVersions: - - monitoring.coreos.com/v1 - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: auto - asserts: - - hasDocuments: - count: 1 - - isKind: - of: PrometheusRule - - - it: "auto: should create all ServiceMonitors when CRDs present" - template: servicemonitor-agent.yaml - capabilities: - apiVersions: - - monitoring.coreos.com/v1 - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: auto - asserts: - - hasDocuments: - count: 1 - - isKind: - of: ServiceMonitor - - # ============================================================================ - # enabled="auto" without CRDs (should NOT create) - # ============================================================================ - - it: "auto: should NOT create PrometheusRule when no CRDs" - template: prometheusrule.yaml - capabilities: - apiVersions: [] - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: auto - asserts: - - hasDocuments: - count: 0 - - - it: "auto: should NOT create ServiceMonitors when no CRDs" - template: servicemonitor-agent.yaml - capabilities: - apiVersions: [] - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: auto - asserts: - - hasDocuments: - count: 0 - - # ============================================================================ - # PrometheusRule content validation + # PrometheusRule content validation (enabled=true; method auto -> serviceMonitors) # ============================================================================ - it: "should use Helm-generated service names in alert job labels" template: prometheusrule.yaml @@ -257,9 +128,6 @@ tests: path: spec.groups[0].rules count: 14 - # ============================================================================ - # Aggregator Down 
alert - # ============================================================================ - it: "should include CloudZeroAggregatorDown alert with correct job label" template: prometheusrule.yaml release: @@ -277,9 +145,6 @@ tests: path: spec.groups[0].rules[3].alert value: CloudZeroAggregatorDown - # ============================================================================ - # Dynamic shipper thresholds (based on costMaxInterval) - # ============================================================================ - it: "shipper alerts should use 2x and 1.5x of default costMaxInterval (30m)" template: prometheusrule.yaml set: @@ -369,7 +234,7 @@ tests: value: "450s" # ============================================================================ - # Custom labels and namespace + # Custom labels and namespace (enabled=true; method auto -> serviceMonitors) # ============================================================================ - it: "should apply custom labels to ServiceMonitor" template: servicemonitor-agent.yaml @@ -557,60 +422,6 @@ tests: path: spec.namespaceSelector.matchNames[0] value: cloudzero - - it: "namespace null: collector ServiceMonitor should use release namespace" - template: servicemonitor-collector.yaml - release: - namespace: cloudzero - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: true - components.monitoring.namespace: null - asserts: - - equal: - path: metadata.namespace - value: cloudzero - - equal: - path: spec.namespaceSelector.matchNames[0] - value: cloudzero - - - it: "namespace null: shipper ServiceMonitor should use release namespace" - template: servicemonitor-shipper.yaml - release: - namespace: cloudzero - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: true - components.monitoring.namespace: null - asserts: - - equal: - path: metadata.namespace - value: cloudzero - - equal: - path: 
spec.namespaceSelector.matchNames[0] - value: cloudzero - - - it: "namespace null: webhook ServiceMonitor should use release namespace" - template: servicemonitor-webhook.yaml - release: - namespace: cloudzero - set: - apiKey: "test-key" - existingSecretName: null - clusterName: "test-cluster" - components.monitoring.enabled: true - components.monitoring.namespace: null - asserts: - - equal: - path: metadata.namespace - value: cloudzero - - equal: - path: spec.namespaceSelector.matchNames[0] - value: cloudzero - # Empty string should behave identically to null - it: "namespace empty string: PrometheusRule should use release namespace" template: prometheusrule.yaml diff --git a/helm/values.schema.json b/helm/values.schema.json index 215f47dcf..d1317ad62 100644 --- a/helm/values.schema.json +++ b/helm/values.schema.json @@ -6309,20 +6309,20 @@ "monitoring": { "additionalProperties": false, "properties": { - "enabled": { - "default": null, - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "enum": ["auto"], + "discovery": { + "additionalProperties": false, + "properties": { + "method": { + "default": "auto", + "enum": ["auto", "serviceMonitors", "annotations"], "type": "string" } - ] + }, + "type": "object" + }, + "enabled": { + "default": false, + "type": "boolean" }, "labels": { "additionalProperties": { diff --git a/helm/values.schema.yaml b/helm/values.schema.yaml index d593841f3..d1905a087 100644 --- a/helm/values.schema.yaml +++ b/helm/values.schema.yaml @@ -1342,36 +1342,46 @@ properties: monitoring: description: | - Prometheus Operator monitoring integration. + Monitoring integration with the customer's Prometheus stack. - When enabled, the chart creates ServiceMonitor and PrometheusRule CRDs - that allow the Prometheus Operator to automatically discover and scrape - CloudZero Agent metrics and evaluate pre-configured alert rules. 
- - Regardless of this setting, prometheus.io/* annotations are always added - to Services for customers using standard Prometheus service discovery. + The chart integrates two ways: via the Prometheus Operator (ServiceMonitor + + PrometheusRule CRDs) or via prometheus.io/* annotations on Services. + monitoring.enabled turns it on; monitoring.discovery.method selects which. additionalProperties: false type: object properties: enabled: description: | - Controls creation of Prometheus Operator CRDs (ServiceMonitor, PrometheusRule). - - - null (default): Follow the release default. Currently maps to false - while this feature is being validated. In a future release, null will - map to "auto". - - "auto": Auto-detect. Creates monitoring CRDs only if the Prometheus - Operator CRDs (monitoring.coreos.com/v1) are available in the cluster. - - true: Always create monitoring CRDs. Helm will fail if the Prometheus - Operator CRDs are not installed. - - false: Never create monitoring CRDs. - default: null - oneOf: - - type: "null" - - type: boolean - - type: string + Whether to create the monitoring resources. Off by default; set to + true to enable. With the default discovery method (auto -> serviceMonitors), + this creates monitoring.coreos.com/v1 CRDs, so the install fails if the + Prometheus Operator is not present -- use discovery.method: annotations + otherwise. + default: false + type: boolean + discovery: + description: | + How the agent's metrics are advertised when monitoring.enabled is on. + additionalProperties: false + type: object + properties: + method: + description: | + Discovery mechanism used when monitoring.enabled is true. + + - "auto" (default): Pick automatically. Resolves to "serviceMonitors" + today; kept as an enum so a future auto-detectable backend can extend + it without changing the default. + - "serviceMonitors": Emit ServiceMonitor CRDs plus the PrometheusRule + alert bundle for the Prometheus Operator.
+ - "annotations": Emit prometheus.io/* annotations only -- no Operator + CRDs and no alert bundle. Explicit opt-in; never auto-selected. + default: auto + type: string enum: - auto + - serviceMonitors + - annotations namespace: description: | Namespace override for PrometheusRule and ServiceMonitor CRDs. diff --git a/helm/values.yaml b/helm/values.yaml index d7b5d5713..dd4da1c76 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -867,29 +867,26 @@ components: # https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - # Prometheus Operator monitoring integration. + # Monitoring integration with the customer's Prometheus stack. # - # When enabled, the chart creates ServiceMonitor and PrometheusRule CRDs that - # allow the Prometheus Operator to automatically discover and scrape CloudZero - # Agent metrics, and to evaluate pre-configured alert rules. - # - # This follows the same null/true/false pattern as integrations.istio: - # - # - null (default): Follow the release default. Currently maps to "false" - # (disabled) while this feature is being validated. In a future release, - # null will map to "auto". Use null unless you have a reason to override. - # - "auto": Auto-detect. Creates monitoring CRDs only if the Prometheus - # Operator CRDs (monitoring.coreos.com/v1) are available in the cluster. - # - true: Always create monitoring CRDs. Helm will fail if the Prometheus - # Operator CRDs are not installed in the cluster. - # - false: Never create monitoring CRDs. - # - # To opt in now, set to "auto" or true. - # - # Regardless of this setting, prometheus.io/* annotations are always added to - # Services for customers using standard Prometheus service discovery. + # The chart integrates two ways: via the Prometheus Operator (ServiceMonitor + + # PrometheusRule CRDs) and/or via prometheus.io/* annotations on Services. monitoring: - enabled: null + # Whether to create the monitoring resources. 
Off by default; set to true to + # enable. With the default discovery method (serviceMonitors), this creates + # monitoring.coreos.com/v1 CRDs, so the install fails if the Prometheus + # Operator is not present -- use discovery.method: annotations otherwise. + enabled: false + + discovery: + # How your Prometheus discovers the agent's metrics (when monitoring is on). + # + # - auto (default): use ServiceMonitors. + # - serviceMonitors: create ServiceMonitor resources for the Prometheus + # Operator, along with the alert rules. + # - annotations: add prometheus.io/* annotations to the Services instead, for + # a Prometheus that scrapes by annotation. No alert rules. + method: auto # Namespace override for PrometheusRule and ServiceMonitor CRDs. # null (default) = same namespace as the agent installation. diff --git a/tests/helm/schema/monitoring.discovery.annotations.pass.yaml b/tests/helm/schema/monitoring.discovery.annotations.pass.yaml new file mode 100644 index 000000000..6779dc3b8 --- /dev/null +++ b/tests/helm/schema/monitoring.discovery.annotations.pass.yaml @@ -0,0 +1,9 @@ +# Annotation discovery opt-in: gate forced on + method=annotations renders the +# prometheus.io/* annotations only (no Operator CRDs, no alert bundle), so it +# applies cleanly on a cluster without the Prometheus Operator. +clusterName: "test-cluster" +components: + monitoring: + enabled: true + discovery: + method: annotations diff --git a/tests/helm/schema/monitoring.discovery.default.pass.yaml b/tests/helm/schema/monitoring.discovery.default.pass.yaml new file mode 100644 index 000000000..e7cdcef82 --- /dev/null +++ b/tests/helm/schema/monitoring.discovery.default.pass.yaml @@ -0,0 +1,3 @@ +# Chart defaults (enabled: false). Monitoring is off, so nothing +# monitoring-related renders. Validates the default install. 
+clusterName: "test-cluster" diff --git a/tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml b/tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml new file mode 100644 index 000000000..16b149d5a --- /dev/null +++ b/tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml @@ -0,0 +1,6 @@ +# Invalid: discovery.method must be one of auto | serviceMonitors | annotations. +clusterName: "test-cluster" +components: + monitoring: + discovery: + method: bogus diff --git a/tests/helm/schema/monitoring.discovery.none.pass.yaml b/tests/helm/schema/monitoring.discovery.none.pass.yaml new file mode 100644 index 000000000..ae7d97efb --- /dev/null +++ b/tests/helm/schema/monitoring.discovery.none.pass.yaml @@ -0,0 +1,5 @@ +# Gate off -> no Operator CRDs, no annotations. The chart still renders cleanly. +clusterName: "test-cluster" +components: + monitoring: + enabled: false diff --git a/tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml b/tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml new file mode 100644 index 000000000..f84608594 --- /dev/null +++ b/tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml @@ -0,0 +1,10 @@ +# Turn monitoring on (enabled: true) with the serviceMonitors path so the +# ServiceMonitors + PrometheusRule render; the chart performs no Operator +# detection, so `helm template` emits them without a cluster. kubeconform validates +# these monitoring.coreos.com/v1 CRDs against the datreeio CRDs-catalog. +clusterName: "test-cluster" +components: + monitoring: + enabled: true + discovery: + method: serviceMonitors diff --git a/tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml b/tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml new file mode 100644 index 000000000..441a24f8d --- /dev/null +++ b/tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml @@ -0,0 +1,5 @@ +# Invalid: enabled is a plain boolean -- "auto" (and null) are no longer accepted.
+clusterName: "test-cluster" +components: + monitoring: + enabled: auto diff --git a/tests/helm/template/alloy.yaml b/tests/helm/template/alloy.yaml index fa506732c..3ec38f36c 100644 --- a/tests/helm/template/alloy.yaml +++ b/tests/helm/template/alloy.yaml @@ -1103,7 +1103,9 @@ data: memory: 64Mi securityContext: {} monitoring: - enabled: null + discovery: + method: auto + enabled: false labels: {} namespace: null sharedSecret: false @@ -2334,10 +2336,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: "9090" - prometheus.io/scrape: "true" + spec: type: ClusterIP ports: @@ -2362,10 +2361,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: '"8080"' - prometheus.io/scrape: "true" + spec: selector: app.kubernetes.io/name: aggregator @@ -2395,10 +2391,6 @@ metadata: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: nginx.ingress.kubernetes.io/ssl-redirect: "false" - prometheus.io/path: /metrics - prometheus.io/port: "8443" - prometheus.io/scheme: https - prometheus.io/scrape: "true" namespace: cz-agent spec: type: ClusterIP diff --git a/tests/helm/template/cert-manager.yaml b/tests/helm/template/cert-manager.yaml index 46a9eb676..25d386f7c 100644 --- a/tests/helm/template/cert-manager.yaml +++ b/tests/helm/template/cert-manager.yaml @@ -1018,7 +1018,9 @@ data: memory: 64Mi securityContext: {} monitoring: - enabled: null + discovery: + method: auto + enabled: false labels: {} namespace: null sharedSecret: false @@ -2249,10 +2251,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: "9090" - prometheus.io/scrape: "true" + spec: 
type: ClusterIP ports: @@ -2277,10 +2276,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: '"8080"' - prometheus.io/scrape: "true" + spec: selector: app.kubernetes.io/name: aggregator @@ -2310,10 +2306,6 @@ metadata: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: nginx.ingress.kubernetes.io/ssl-redirect: "false" - prometheus.io/path: /metrics - prometheus.io/port: "8443" - prometheus.io/scheme: https - prometheus.io/scrape: "true" namespace: cz-agent spec: type: ClusterIP diff --git a/tests/helm/template/federated.yaml b/tests/helm/template/federated.yaml index 2ea82b3d3..2b037d766 100644 --- a/tests/helm/template/federated.yaml +++ b/tests/helm/template/federated.yaml @@ -1106,7 +1106,9 @@ data: memory: 64Mi securityContext: {} monitoring: - enabled: null + discovery: + method: auto + enabled: false labels: {} namespace: null sharedSecret: false @@ -2337,10 +2339,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: "9090" - prometheus.io/scrape: "true" + spec: type: ClusterIP ports: @@ -2365,10 +2364,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: '"8080"' - prometheus.io/scrape: "true" + spec: selector: app.kubernetes.io/name: aggregator @@ -2398,10 +2394,6 @@ metadata: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: nginx.ingress.kubernetes.io/ssl-redirect: "false" - prometheus.io/path: /metrics - prometheus.io/port: "8443" - prometheus.io/scheme: https - prometheus.io/scrape: "true" namespace: cz-agent spec: type: ClusterIP diff --git a/tests/helm/template/istio.yaml b/tests/helm/template/istio.yaml 
index 00eac588e..6bee829f5 100644 --- a/tests/helm/template/istio.yaml +++ b/tests/helm/template/istio.yaml @@ -1033,7 +1033,9 @@ data: memory: 64Mi securityContext: {} monitoring: - enabled: null + discovery: + method: auto + enabled: false labels: {} namespace: null sharedSecret: false @@ -2264,10 +2266,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: "9090" - prometheus.io/scrape: "true" + spec: type: ClusterIP ports: @@ -2292,10 +2291,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: '"8080"' - prometheus.io/scrape: "true" + spec: selector: app.kubernetes.io/name: aggregator @@ -2325,10 +2321,6 @@ metadata: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: nginx.ingress.kubernetes.io/ssl-redirect: "false" - prometheus.io/path: /metrics - prometheus.io/port: "8443" - prometheus.io/scheme: https - prometheus.io/scrape: "true" namespace: cz-agent spec: type: ClusterIP diff --git a/tests/helm/template/kubestate.yaml b/tests/helm/template/kubestate.yaml index 20c54ceaf..4b1cdebef 100644 --- a/tests/helm/template/kubestate.yaml +++ b/tests/helm/template/kubestate.yaml @@ -1070,7 +1070,9 @@ data: memory: 64Mi securityContext: {} monitoring: - enabled: null + discovery: + method: auto + enabled: false labels: {} namespace: null sharedSecret: false @@ -1874,10 +1876,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: "9090" - prometheus.io/scrape: "true" + spec: type: ClusterIP ports: @@ -1902,10 +1901,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: 
cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: '"8080"' - prometheus.io/scrape: "true" + spec: selector: app.kubernetes.io/name: aggregator @@ -1935,10 +1931,6 @@ metadata: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: nginx.ingress.kubernetes.io/ssl-redirect: "false" - prometheus.io/path: /metrics - prometheus.io/port: "8443" - prometheus.io/scheme: https - prometheus.io/scrape: "true" namespace: cz-agent spec: type: ClusterIP diff --git a/tests/helm/template/manifest.yaml b/tests/helm/template/manifest.yaml index 51582f517..9fcaf326d 100644 --- a/tests/helm/template/manifest.yaml +++ b/tests/helm/template/manifest.yaml @@ -1033,7 +1033,9 @@ data: memory: 64Mi securityContext: {} monitoring: - enabled: null + discovery: + method: auto + enabled: false labels: {} namespace: null sharedSecret: false @@ -2264,10 +2266,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: "9090" - prometheus.io/scrape: "true" + spec: type: ClusterIP ports: @@ -2292,10 +2291,7 @@ metadata: app.kubernetes.io/part-of: cloudzero-agent app.kubernetes.io/version: v3.10.0 helm.sh/chart: cloudzero-agent-1.1.0-dev - annotations: - prometheus.io/path: /metrics - prometheus.io/port: '"8080"' - prometheus.io/scrape: "true" + spec: selector: app.kubernetes.io/name: aggregator @@ -2325,10 +2321,6 @@ metadata: helm.sh/chart: cloudzero-agent-1.1.0-dev annotations: nginx.ingress.kubernetes.io/ssl-redirect: "false" - prometheus.io/path: /metrics - prometheus.io/port: "8443" - prometheus.io/scheme: https - prometheus.io/scrape: "true" namespace: cz-agent spec: type: ClusterIP