From 492be6ee39990aef7c1a6db83e529611f946b326 Mon Sep 17 00:00:00 2001
From: Evan Nemerson <evan.nemerson@cloudzero.com>
Date: Tue, 9 Jun 2026 23:48:07 -0400
Subject: [PATCH] CP-42614: Add monitoring discovery model (enabled +
 discovery.method)

The chart's monitoring integration could advertise the agent's metrics two ways
at once -- prometheus.io/* annotations and ServiceMonitor CRDs -- so a Prometheus
honoring both would scrape the same targets twice. This change finalizes the
still-in-validation monitoring configuration into an explicit, opt-in model.

Implementation Approach:

The chart emits two kinds of monitoring resources -- Prometheus Operator CRDs
(ServiceMonitor + PrometheusRule) and prometheus.io/* annotations on the agent
Services. Both are driven by two values resolved in helm/templates/_helpers.tpl
and consumed by the Service, ServiceMonitor, and PrometheusRule templates. There
is no cluster auto-detection; the configuration is explicit. This finalizes the
still-in-validation monitoring integration added in CP-34935, so its defaults
change: the default install emits nothing, and enabled:true selects one mechanism
via discovery.method rather than emitting annotations and ServiceMonitors together.

Functional Requirements:

1. Monitoring must be off by default and turned on explicitly.

   Added components.monitoring.enabled as a plain boolean defaulting to false.
   The four servicemonitor-*.yaml templates, prometheusrule.yaml, and the
   prometheus.io/* annotations on the three Service templates are all gated on it
   via helpers in _helpers.tpl.

2. When enabled, the operator must be able to choose the discovery mechanism.

   Added components.monitoring.discovery.method (auto | serviceMonitors |
   annotations, default auto). auto resolves to serviceMonitors today and is left
   as an enum so a future mechanism can extend it; annotations is an explicit
   opt-in. serviceMonitors emits monitoring.coreos.com/v1 CRDs, so the install
   fails if the Prometheus Operator is absent; annotations needs no CRDs.

3. The ServiceMonitors and the PrometheusRule must form a single bundle;
   annotation-based discovery is discovery-only.

   serviceMonitorsActive and rulesActive resolve together on the serviceMonitors
   path, so method=annotations yields no ServiceMonitors and no alert rules.

4. The new values must validate and be documented for users.

   Updated helm/values.schema.yaml (enabled: boolean; discovery.method enum) and
   regenerated helm/values.schema.json; set defaults and comments in
   helm/values.yaml; rewrote helm/docs/monitoring-infrastructure.md as user-facing
   reference.

Validation:

- helm lint passing
- helm unit tests passing; helm/tests/monitoring_integration_test.yaml and
  defaults_service_test.yaml were rewritten for the new model
- helm schema tests passing, including new fixtures under tests/helm/schema for
  the default, serviceMonitors, annotations, none, and invalid-value cases.
  kubeconform validates the rendered ServiceMonitor and PrometheusRule against the
  CRD catalog
- golden manifests regenerated; a default `helm template` emits no ServiceMonitors,
  no PrometheusRule, and no prometheus.io/* annotations
- helm subchart tests passing

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 app/functions/helmless/default-values.yaml    |  39 ++-
 helm/docs/monitoring-infrastructure.md        | 168 +++----------
 helm/templates/_helpers.tpl                   |  74 +++---
 helm/templates/agent-service.yaml             |   6 +-
 helm/templates/aggregator-service.yaml        |   6 +-
 helm/templates/prometheusrule.yaml            |   6 +-
 helm/templates/servicemonitor-agent.yaml      |   2 +-
 helm/templates/servicemonitor-collector.yaml  |   2 +-
 helm/templates/servicemonitor-shipper.yaml    |   2 +-
 helm/templates/servicemonitor-webhook.yaml    |   2 +-
 helm/templates/webhook-service.yaml           |   6 +-
 helm/tests/defaults_service_test.yaml         |  70 ++++++
 helm/tests/monitoring_integration_test.yaml   | 231 ++----------------
 helm/values.schema.json                       |  24 +-
 helm/values.schema.yaml                       |  54 ++--
 helm/values.yaml                              |  39 ++-
 ...monitoring.discovery.annotations.pass.yaml |   9 +
 .../monitoring.discovery.default.pass.yaml    |   3 +
 ...itoring.discovery.method.invalid.fail.yaml |   6 +
 .../monitoring.discovery.none.pass.yaml       |   5 +
 ...toring.discovery.servicemonitors.pass.yaml |  10 +
 .../monitoring.enabled.invalid-type.fail.yaml |   5 +
 tests/helm/template/alloy.yaml                |  18 +-
 tests/helm/template/cert-manager.yaml         |  18 +-
 tests/helm/template/federated.yaml            |  18 +-
 tests/helm/template/istio.yaml                |  18 +-
 tests/helm/template/kubestate.yaml            |  18 +-
 tests/helm/template/manifest.yaml             |  18 +-
 28 files changed, 331 insertions(+), 546 deletions(-)
 create mode 100644 tests/helm/schema/monitoring.discovery.annotations.pass.yaml
 create mode 100644 tests/helm/schema/monitoring.discovery.default.pass.yaml
 create mode 100644 tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml
 create mode 100644 tests/helm/schema/monitoring.discovery.none.pass.yaml
 create mode 100644 tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml
 create mode 100644 tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml

diff --git a/app/functions/helmless/default-values.yaml b/app/functions/helmless/default-values.yaml
index d3762feb1..cbe33c397 100644
--- a/app/functions/helmless/default-values.yaml
+++ b/app/functions/helmless/default-values.yaml
@@ -867,29 +867,26 @@ components:
       # https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
       securityContext: {}
 
-  # Prometheus Operator monitoring integration.
+  # Monitoring integration with the customer's Prometheus stack.
   #
-  # When enabled, the chart creates ServiceMonitor and PrometheusRule CRDs that
-  # allow the Prometheus Operator to automatically discover and scrape CloudZero
-  # Agent metrics, and to evaluate pre-configured alert rules.
-  #
-  # This follows the same null/true/false pattern as integrations.istio:
-  #
-  # - null (default): Follow the release default. Currently maps to "false"
-  #   (disabled) while this feature is being validated. In a future release,
-  #   null will map to "auto". Use null unless you have a reason to override.
-  # - "auto": Auto-detect. Creates monitoring CRDs only if the Prometheus
-  #   Operator CRDs (monitoring.coreos.com/v1) are available in the cluster.
-  # - true: Always create monitoring CRDs. Helm will fail if the Prometheus
-  #   Operator CRDs are not installed in the cluster.
-  # - false: Never create monitoring CRDs.
-  #
-  # To opt in now, set to "auto" or true.
-  #
-  # Regardless of this setting, prometheus.io/* annotations are always added to
-  # Services for customers using standard Prometheus service discovery.
+  # The chart integrates two ways: via the Prometheus Operator (ServiceMonitor +
+  # PrometheusRule CRDs) and/or via prometheus.io/* annotations on Services.
   monitoring:
-    enabled: null
+    # Whether to create the monitoring resources. Off by default; set to true to
+    # enable. With the default discovery method (serviceMonitors), this creates
+    # monitoring.coreos.com/v1 CRDs, so the install fails if the Prometheus
+    # Operator is not present -- use discovery.method: annotations otherwise.
+    enabled: false
+
+    discovery:
+      # How your Prometheus discovers the agent's metrics (when monitoring is on).
+      #
+      # - auto (default): use ServiceMonitors.
+      # - serviceMonitors: create ServiceMonitor resources for the Prometheus
+      #   Operator, along with the alert rules.
+      # - annotations: add prometheus.io/* annotations to the Services instead, for
+      #   a Prometheus that scrapes by annotation. No alert rules.
+      method: auto
 
     # Namespace override for PrometheusRule and ServiceMonitor CRDs.
     # null (default) = same namespace as the agent installation.
diff --git a/helm/docs/monitoring-infrastructure.md b/helm/docs/monitoring-infrastructure.md
index d346d131d..686f492d1 100644
--- a/helm/docs/monitoring-infrastructure.md
+++ b/helm/docs/monitoring-infrastructure.md
@@ -1,37 +1,48 @@
 # CloudZero Agent Monitoring Infrastructure
 
 This document describes the monitoring resources shipped with the CloudZero
-Agent Helm chart: what they are, what they cover, what they don't cover, and
-how they were validated.
+Agent Helm chart: what they are, what they cover, and what they don't cover.
 
 ## Overview
 
-The chart provides two categories of monitoring integration:
+The chart can expose the CloudZero Agent's own metrics to your Prometheus and
+ship a set of alert rules. It is off by default; two values turn it on and choose
+how.
 
-1. **Prometheus `prometheus.io/*` annotations** on all Services (always enabled).
-   These allow standard Prometheus installations using `kubernetes_sd_configs`
-   to auto-discover and scrape CloudZero Agent metrics without any CRDs.
+**`components.monitoring.enabled`** — whether the monitoring resources are created:
 
-2. **Prometheus Operator CRDs** (opt-in via `components.monitoring.enabled`).
-   When enabled, the chart creates `ServiceMonitor` and `PrometheusRule`
-   resources that the Prometheus Operator automatically picks up.
+- `false` (default): off.
+- `true`: on.
 
-These resources are designed to be useful regardless of the customer's
-monitoring stack. The `ServiceMonitor` and `PrometheusRule` CRDs are the
-standard interoperability format understood by the Prometheus Operator, but
-also by compatible tools like Victoria Metrics Operator, Datadog (via its
-Prometheus integration), Grafana Agent, and others.
+**`components.monitoring.discovery.method`** — how your Prometheus discovers the
+agent's metrics, when monitoring is on:
+
+- `auto` (default): currently the same as `serviceMonitors`.
+- `serviceMonitors`: create `ServiceMonitor` resources for the Prometheus Operator,
+  along with the alert rules. These are Prometheus Operator resources, so the
+  install fails if the Operator isn't present.
+- `annotations`: add `prometheus.io/*` annotations to the agent's Services instead,
+  for a Prometheus configured to scrape by annotation. No alert rules.
+
+To turn monitoring on, set `enabled: true`. Keep the default `method` if you run the
+Prometheus Operator; use `annotations` if your Prometheus discovers targets by
+annotation instead.
 
 ## Configuration
 
 ```yaml
 components:
   monitoring:
-    # null = auto-detect (install CRDs if Prometheus Operator is present)
-    # true = always install CRDs (fails if Prometheus Operator absent)
-    # false = never install CRDs (default while feature is being validated)
+    # Whether to create the monitoring resources. false (default) = off; true = on.
     enabled: false
 
+    discovery:
+      # How Prometheus discovers the agent's metrics:
+      #   auto (default)  -> same as serviceMonitors (today)
+      #   serviceMonitors -> ServiceMonitor resources + alert rules
+      #   annotations     -> prometheus.io/* annotations only (no alert rules)
+      method: auto
+
     # Override namespace for CRDs (default: same as agent namespace)
     namespace: ""
 
@@ -177,126 +188,3 @@ installation, the following may be absent until their trigger condition occurs:
 Alerts that reference absent metrics evaluate to "no data" rather than firing,
 which is the correct behavior (absence of the failure counter means no failures
 have occurred).
-
-## Validation Results
-
-Tested on the `bach` cluster (GKE, `kube-prometheus-stack` installed) with
-an intentionally invalid API key to trigger failure-path alerts.
-
-### Scrape Targets
-
-All four targets verified as `UP` in Prometheus:
-
-| Target                   | Endpoint                 | Status                   |
-| ------------------------ | ------------------------ | ------------------------ |
-| `cz-agent-cz-server`     | `http-metrics` (9090)    | UP                       |
-| `cz-agent-cz-aggregator` | `metrics` (8080)         | UP                       |
-| `cz-agent-cz-aggregator` | `shipper-metrics` (8081) | UP                       |
-| `cz-agent-cz-webhook`    | `http` (8443)            | UP (when pod is healthy) |
-
-### Metric Availability
-
-All metrics referenced by alert rules were verified present in Prometheus:
-
-| Metric                                                           | Source             | Time Series Count | Notes                                                    |
-| ---------------------------------------------------------------- | ------------------ | ----------------- | -------------------------------------------------------- |
-| `metrics_received_total`                                         | Collector          | 4                 | Present immediately                                      |
-| `metrics_received_cost_total`                                    | Collector          | 1                 | Present immediately                                      |
-| `czo_webhook_types_total`                                        | Webhook            | 41                | Present immediately (multiple label combinations)        |
-| `function_execution_seconds_bucket`                              | Webhook            | 33                | Present immediately (histogram buckets)                  |
-| `prometheus_remote_storage_queue_highest_timestamp_seconds`      | Server             | 1                 | Present immediately                                      |
-| `prometheus_remote_storage_queue_highest_sent_timestamp_seconds` | Server             | 1                 | Present immediately                                      |
-| `shipper_run_fail_total`                                         | Shipper            | 3                 | Appeared after first failed upload (invalid API key)     |
-| `shipper_handle_request_success_total`                           | Shipper            | 0                 | Expected absent (no successful uploads with invalid key) |
-| `remote_write_failures_total`                                    | Webhook            | 0                 | Counter, appears after first failure event               |
-| `container_memory_working_set_bytes`                             | kubelet/cAdvisor   | 244               | Standard Kubernetes metric (cluster-wide)                |
-| `kube_pod_container_resource_limits`                             | kube-state-metrics | 315               | Standard Kubernetes metric (cluster-wide)                |
-
-### Alert Firing Behavior
-
-Validated using multiple test scenarios on the `bach` cluster:
-
-**Scenario 1: Invalid API key (natural failure path)**
-
-| Alert                                 | State              | Notes                                                     |
-| ------------------------------------- | ------------------ | --------------------------------------------------------- |
-| `CloudZeroShipperUploadFailures`      | **Pending/Firing** | Correctly detected `error_status_code="err-unauthorized"` |
-| `CloudZeroWebhookNoEvents`            | **Firing**         | Webhook pod on unhealthy node; no admission events        |
-| `CloudZeroWebhookRemoteWriteFailures` | **Firing**         | Webhook unable to push metadata                           |
-| `CloudZeroRemoteWriteLag`             | **Pending**        | Remote-write queue falling behind                         |
-
-**Scenario 2: Unschedulable pod (memory request set to 64Gi)**
-
-| Alert                | State       | Notes                                                                                                                                                                                                                           |
-| -------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `CloudZeroAgentDown` | **Pending** | `absent(up{...})` correctly detected missing target. Original `up == 0` expression would NOT have fired because the metric becomes absent (not zero) when a pod disappears entirely. Fixed to use `up == 0 or absent(up{...})`. |
-
-**Scenario 3: Memory pressure (memory limit set to 65Mi vs 73.7Mi working set)**
-
-| Alert                               | State       | Notes                                                                                                                                                                                          |
-| ----------------------------------- | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `CloudZeroAgentHighMemoryUsage`     | **Pending** | Memory ratio at 93.9%. Required `on(namespace, pod, container)` join -- without it, the division silently produced zero results due to label mismatch between cAdvisor and kube-state-metrics. |
-| `CloudZeroAgentContainerRestarting` | **Pending** | Container restarting from memory pressure. Caught restart loop alongside the memory alert.                                                                                                     |
-
-**Scenario 4: OOM kill (memory limit set to 30Mi vs 73.7Mi working set)**
-
-| Alert                               | State       | Notes                                                                                                                                                                       |
-| ----------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `CloudZeroAgentOOMKilled`           | **Firing**  | `kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}` correctly detected OOM kill within 1 minute. Standard KSM metric, no configuration changes required. |
-| `CloudZeroAgentContainerRestarting` | **Pending** | Caught the CrashLoopBackOff from repeated OOM kills.                                                                                                                        |
-| `CloudZeroAgentDown`                | **Pending** | Server target gone due to crash loop.                                                                                                                                       |
-
-**Scenario 5: Collector starved (server + webhook scaled to 0)**
-
-| Alert                         | State       | Notes                                                                                                                                                                                                                                   |
-| ----------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `CloudZeroCollectorNoMetrics` | **Pending** | Correctly detected that `rate(metrics_received_total[10m]) == 0`. Note: scaling only the server to 0 is insufficient because the webhook also remote-writes to the collector. Both data sources must be stopped for this alert to fire. |
-
-**Alerts validated as correctly inactive during normal operation:**
-
-| Alert                               | Notes                                                                                                           |
-| ----------------------------------- | --------------------------------------------------------------------------------------------------------------- |
-| `CloudZeroWebhookServerHighLatency` | p99 latency well below 500ms threshold. Would require sustained load to trigger.                                |
-| `CloudZeroMissingContainerMetrics`  | Container-level cAdvisor metrics present (bach uses containerd CRI). Would require Docker-based CRI to trigger. |
-
-**Alerts with intentional design limitations:**
-
-| Alert                                 | Notes                                                                                                                                                                                                                                                                                                                                                                                            |
-| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `CloudZeroShipperNoSuccessfulUploads` | Detects "was working, then stopped" -- NOT "never worked at all." When the success counter has never been incremented, it is absent in Prometheus and `rate()` returns no data rather than 0. This is intentional: the "never worked" case is covered by `CloudZeroShipperUploadFailures` (which fires on upload errors) and the OOM/restart/down alerts (which catch container-level failures). |
-
-### Bugs Found and Fixed During Testing
-
-1. **`absent()` bug in AgentDown/WebhookDown**: When a pod is completely
-   absent (unschedulable, scaled to 0), the `up` metric becomes absent rather
-   than zero. The original `up == 0` expression never fires. Fixed to use
-   `up == 0 or absent(up{...})`.
-
-2. **Label join bug in HighMemoryUsage**: `container_memory_working_set_bytes`
-   (from cAdvisor/kubelet) and `kube_pod_container_resource_limits` (from
-   kube-state-metrics) have different label sets. Without an explicit
-   `on(namespace, pod, container)` join, Prometheus's implicit matching
-   silently produces zero results. Fixed by adding the explicit join.
-
-3. **Job label mismatch in collector/shipper alerts**: The Prometheus Operator
-   assigns the `job` label from the Service name, not the ServiceMonitor name.
-   Both collector and shipper share the aggregator Service, so they get the
-   same `job` label. Fixed by adding `endpoint` label selectors to
-   distinguish them.
-
-### Auto-Detection Logic
-
-Tested via `helm template` with all three modes:
-
-| `components.monitoring.enabled` | ServiceMonitors | PrometheusRules | `prometheus.io/*` annotations |
-| ------------------------------- | --------------- | --------------- | ----------------------------- |
-| `null` (no CRDs in cluster)     | 0               | 0               | 3 (always)                    |
-| `true`                          | 4               | 1               | 3 (always)                    |
-| `false`                         | 0               | 0               | 3 (always)                    |
-
-### Test Suite
-
-- Helm lint: passing
-- Helm unit tests: 498/498 passing
-- Helm schema tests: all passing (includes `components.monitoring.enabled`
-  null/true/false validation)
diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl
index 8847b54b2..a765ef3ec 100644
--- a/helm/templates/_helpers.tpl
+++ b/helm/templates/_helpers.tpl
@@ -1679,42 +1679,52 @@ checks: {{ $checks | toYaml | nindent 2 -}}
 {{- end -}}
 
 {{/*
-Prometheus Operator Monitoring Enabled Helper
+Prometheus monitoring resolution helpers.
+
+The chart integrates with a customer's Prometheus stack through either the Prometheus
+Operator (ServiceMonitor + PrometheusRule CRDs) or prometheus.io/* annotations, never both.
+
+  components.monitoring.enabled (true|false, default false) -- whether any
+      monitoring resources are created. Explicit; there is no auto-detection.
+  components.monitoring.discovery.method (auto|serviceMonitors|annotations) -- when
+      enabled, the discovery mechanism:
+        auto (default)  = resolves to serviceMonitors today; left as an enum so a
+                          future mechanism can extend it without changing the default
+        serviceMonitors = ServiceMonitor CRDs (+ the PrometheusRule alert bundle).
+                          These are monitoring.coreos.com/v1 CRDs, so the install
+                          fails if the Prometheus Operator is not present.
+        annotations     = prometheus.io/* annotations only (no Operator CRDs, no alerts)
+
+The ServiceMonitors and the PrometheusRule are one Operator bundle: both ride the
+serviceMonitors path. Choosing annotations is discovery-only (no alert bundle).
+The *Active helpers return "true"/"" (method returns the resolved name). The `dig`
+defaults mirror values.yaml so values files predating `discovery` still resolve.
+*/}}
+{{/* Resolved discovery method: "annotations" only when explicitly chosen; "auto" and
+     "serviceMonitors" both resolve to serviceMonitors (auto is extensible in future). */}}
+{{- define "cloudzero-agent.monitoring.method" -}}
+{{- if eq (toString (dig "discovery" "method" "auto" .Values.components.monitoring)) "annotations" -}}
+{{- "annotations" -}}
+{{- else -}}
+{{- "serviceMonitors" -}}
+{{- end -}}
+{{- end -}}
 
-Determines whether Prometheus Operator CRDs (ServiceMonitor, PrometheusRule) should
-be created.
+{{- define "cloudzero-agent.monitoring.serviceMonitorsActive" -}}
+{{- if dig "enabled" false .Values.components.monitoring -}}
+{{- if eq (include "cloudzero-agent.monitoring.method" .) "serviceMonitors" -}}{{- true -}}{{- end -}}
+{{- end -}}
+{{- end -}}
 
-  - null (default): Follow the release default. Currently maps to "false"
-    (disabled) while the feature is being validated in customer environments.
-    In a future release, null will map to "auto".
-  - "auto": Auto-detect via CRD presence in the cluster. Creates monitoring
-    resources only if the Prometheus Operator CRDs (monitoring.coreos.com/v1)
-    are available.
-  - true: Force enable (will fail if CRDs are not installed)
-  - false: Force disable
+{{/* The PrometheusRule alert bundle ships with the ServiceMonitor (operator) path. */}}
+{{- define "cloudzero-agent.monitoring.rulesActive" -}}
+{{- include "cloudzero-agent.monitoring.serviceMonitorsActive" . -}}
+{{- end -}}
 
-Usage: {{ if include "cloudzero-agent.monitoring.enabled" . }}...{{ end }}
-Returns: "true" (truthy) when enabled, empty string (falsy) when disabled
-*/}}
-{{- define "cloudzero-agent.monitoring.enabled" -}}
-{{- $monitoringSetting := .Values.components.monitoring.enabled -}}
-{{- if kindIs "invalid" $monitoringSetting -}}
-  {{- /* null/not set = release default. Currently: disabled.
-         Change this block to auto-detect when promoting to GA:
-           if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1"
-             true
-           end
-  */ -}}
-{{- else if eq (toString $monitoringSetting) "auto" -}}
-  {{- /* "auto" = detect Prometheus Operator CRDs */ -}}
-  {{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1" -}}
-    {{- true -}}
-  {{- end -}}
-{{- else if eq (toString $monitoringSetting) "true" -}}
-  {{- /* true = force enabled */ -}}
-  {{- true -}}
+{{- define "cloudzero-agent.monitoring.annotationsActive" -}}
+{{- if dig "enabled" false .Values.components.monitoring -}}
+{{- if eq (include "cloudzero-agent.monitoring.method" .) "annotations" -}}{{- true -}}{{- end -}}
 {{- end -}}
-{{- /* false = force disabled, returns empty string */ -}}
 {{- end -}}
 
 {{/*
diff --git a/helm/templates/agent-service.yaml b/helm/templates/agent-service.yaml
index eff24e69f..3df51b7a0 100644
--- a/helm/templates/agent-service.yaml
+++ b/helm/templates/agent-service.yaml
@@ -11,11 +11,15 @@ metadata:
         .Values.commonMetaLabels
       )
     ) | nindent 2 }}
+  {{- $promAnnotations := dict -}}
+  {{- if include "cloudzero-agent.monitoring.annotationsActive" . -}}
+  {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics" -}}
+  {{- end -}}
   {{- include "cloudzero-agent.generateAnnotations" (dict
       "root" .
       "annotations" (list
         .Values.defaults.annotations
-        (dict "prometheus.io/scrape" "true" "prometheus.io/port" "9090" "prometheus.io/path" "/metrics")
+        $promAnnotations
       )
     ) | nindent 2 }}
 spec:
diff --git a/helm/templates/aggregator-service.yaml b/helm/templates/aggregator-service.yaml
index be32b42ba..d9ed2f528 100644
--- a/helm/templates/aggregator-service.yaml
+++ b/helm/templates/aggregator-service.yaml
@@ -12,12 +12,16 @@ metadata:
         .Values.components.aggregator.labels
       )
     ) | nindent 2 }}
+  {{- $promAnnotations := dict -}}
+  {{- if include "cloudzero-agent.monitoring.annotationsActive" . -}}
+  {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics" -}}
+  {{- end -}}
   {{- include "cloudzero-agent.generateAnnotations" (dict
       "root" .
       "annotations" (list
         .Values.defaults.annotations
         .Values.components.aggregator.annotations
-        (dict "prometheus.io/scrape" "true" "prometheus.io/port" (.Values.aggregator.collector.port | quote) "prometheus.io/path" "/metrics")
+        $promAnnotations
       )
     ) | nindent 2 }}
 spec:
diff --git a/helm/templates/prometheusrule.yaml b/helm/templates/prometheusrule.yaml
index 92d2928e0..18145dc99 100644
--- a/helm/templates/prometheusrule.yaml
+++ b/helm/templates/prometheusrule.yaml
@@ -6,13 +6,13 @@ up. These alerts cover the critical failure modes documented in the "Monitoring
 the CloudZero Agent" wiki page.
 
 Requires the Prometheus Operator CRDs (monitoring.coreos.com/v1) to be installed
-in the cluster. Controlled by components.monitoring.enabled (null=auto-detect,
-true=force, false=disable).
+in the cluster. Rendered only when components.monitoring.enabled is true and
+discovery.method is auto or serviceMonitors; without the Operator CRDs, apply fails.
 
 Alert expressions use the Helm-generated service names to construct job label
 selectors, so they work regardless of the Helm release name.
 */}}
-{{- if include "cloudzero-agent.monitoring.enabled" . }}
+{{- if include "cloudzero-agent.monitoring.rulesActive" . }}
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
diff --git a/helm/templates/servicemonitor-agent.yaml b/helm/templates/servicemonitor-agent.yaml
index c22ddcd90..3498525db 100644
--- a/helm/templates/servicemonitor-agent.yaml
+++ b/helm/templates/servicemonitor-agent.yaml
@@ -7,7 +7,7 @@ prometheus_remote_storage_* counters, and other Prometheus native metrics.
 
 See: Monitoring the CloudZero Agent > Monitoring Prometheus Agent Scrape Health
 */}}
-{{- if include "cloudzero-agent.monitoring.enabled" . }}
+{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . }}
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
diff --git a/helm/templates/servicemonitor-collector.yaml b/helm/templates/servicemonitor-collector.yaml
index 44719d59d..02992536d 100644
--- a/helm/templates/servicemonitor-collector.yaml
+++ b/helm/templates/servicemonitor-collector.yaml
@@ -7,7 +7,7 @@ metrics_received_cost_total, and http_request_duration_seconds.
 
 See: Monitoring the CloudZero Agent > Monitoring Data Pipeline Health
 */}}
-{{- if include "cloudzero-agent.monitoring.enabled" . }}
+{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . }}
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
diff --git a/helm/templates/servicemonitor-shipper.yaml b/helm/templates/servicemonitor-shipper.yaml
index 5f2371eb2..be47046be 100644
--- a/helm/templates/servicemonitor-shipper.yaml
+++ b/helm/templates/servicemonitor-shipper.yaml
@@ -8,7 +8,7 @@ shipper_disk_* gauges.
 
 See: Monitoring the CloudZero Agent > Monitoring Shipper Health
 */}}
-{{- if include "cloudzero-agent.monitoring.enabled" . }}
+{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . }}
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
diff --git a/helm/templates/servicemonitor-webhook.yaml b/helm/templates/servicemonitor-webhook.yaml
index 3ee47911b..c3ece3e85 100644
--- a/helm/templates/servicemonitor-webhook.yaml
+++ b/helm/templates/servicemonitor-webhook.yaml
@@ -18,7 +18,7 @@ which means Prometheus connects directly to the webhook (bypassing the sidecar).
 See: Monitoring the CloudZero Agent > Monitoring Webhook Event Processing
      Monitoring the CloudZero Agent > Monitoring Webhook Metadata Delivery
 */}}
-{{- if include "cloudzero-agent.monitoring.enabled" . }}
+{{- if include "cloudzero-agent.monitoring.serviceMonitorsActive" . }}
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
diff --git a/helm/templates/webhook-service.yaml b/helm/templates/webhook-service.yaml
index b3703de4d..b3d367d7d 100644
--- a/helm/templates/webhook-service.yaml
+++ b/helm/templates/webhook-service.yaml
@@ -11,13 +11,17 @@ metadata:
         .Values.components.webhookServer.labels
       )
     ) | nindent 2 }}
+  {{- $promAnnotations := dict -}}
+  {{- if include "cloudzero-agent.monitoring.annotationsActive" . -}}
+  {{- $promAnnotations = dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https" -}}
+  {{- end -}}
   {{- include "cloudzero-agent.generateAnnotations" (dict
       "root" .
       "annotations" (list
         .Values.defaults.annotations
         .Values.components.webhookServer.annotations
         (dict "nginx.ingress.kubernetes.io/ssl-redirect" "false")
-        (dict "prometheus.io/scrape" "true" "prometheus.io/port" "8443" "prometheus.io/path" "/metrics" "prometheus.io/scheme" "https")
+        $promAnnotations
       )
     ) | nindent 2 }}
   namespace: {{ .Release.Namespace }}
diff --git a/helm/tests/defaults_service_test.yaml b/helm/tests/defaults_service_test.yaml
index 9aa083d58..8653a86d2 100644
--- a/helm/tests/defaults_service_test.yaml
+++ b/helm/tests/defaults_service_test.yaml
@@ -6,11 +6,16 @@
 # Services only support metadata-level defaults (labels and annotations).
 # PodSpec defaults (affinity, tolerations, etc.) do not apply to Services.
 #
+# Also validates that components.monitoring.discovery.method controls the
+# prometheus.io/* annotations on Services.
+#
 # Templates tested:
+# - agent-service.yaml
 # - aggregator-service.yaml
 # - webhook-service.yaml
 suite: defaults.* properties apply to Service resources
 templates:
+  - agent-service.yaml
   - aggregator-service.yaml
   - webhook-service.yaml
 tests:
@@ -91,3 +96,68 @@ tests:
       - equal:
           path: metadata.annotations.test-defaults-annotation
           value: sentinel-value-annotation
+
+  # ============================================================================
+  # monitoring.discovery.method: prometheus.io/* annotation gating.
+  # Annotations render only when monitoring is enabled AND method=annotations.
+  # enabled defaults to false, so by default the annotations are omitted.
+  # ============================================================================
+  - it: should omit prometheus.io annotations on agent-service by default
+    template: agent-service.yaml
+    asserts:
+      - isNull:
+          path: metadata.annotations["prometheus.io/scrape"]
+
+  - it: should omit prometheus.io annotations on aggregator-service by default
+    template: aggregator-service.yaml
+    asserts:
+      - isNull:
+          path: metadata.annotations["prometheus.io/scrape"]
+
+  - it: should omit prometheus.io annotations on webhook-service by default
+    template: webhook-service.yaml
+    set:
+      insightsController.enabled: true
+    asserts:
+      - isNull:
+          path: metadata.annotations["prometheus.io/scrape"]
+
+  - it: should add prometheus.io annotations on agent-service when enabled + method=annotations
+    template: agent-service.yaml
+    set:
+      components.monitoring.enabled: true
+      components.monitoring.discovery.method: annotations
+    asserts:
+      - equal:
+          path: metadata.annotations["prometheus.io/scrape"]
+          value: "true"
+
+  - it: should add prometheus.io annotations on aggregator-service when enabled + method=annotations
+    template: aggregator-service.yaml
+    set:
+      components.monitoring.enabled: true
+      components.monitoring.discovery.method: annotations
+    asserts:
+      - equal:
+          path: metadata.annotations["prometheus.io/scrape"]
+          value: "true"
+
+  - it: should add prometheus.io annotations on webhook-service when enabled + method=annotations
+    template: webhook-service.yaml
+    set:
+      insightsController.enabled: true
+      components.monitoring.enabled: true
+      components.monitoring.discovery.method: annotations
+    asserts:
+      - equal:
+          path: metadata.annotations["prometheus.io/scrape"]
+          value: "true"
+
+  - it: should NOT add prometheus.io annotations when method=serviceMonitors (operator path)
+    template: agent-service.yaml
+    set:
+      components.monitoring.enabled: true
+      components.monitoring.discovery.method: serviceMonitors
+    asserts:
+      - isNull:
+          path: metadata.annotations["prometheus.io/scrape"]
diff --git a/helm/tests/monitoring_integration_test.yaml b/helm/tests/monitoring_integration_test.yaml
index f7209bedd..a28b08483 100644
--- a/helm/tests/monitoring_integration_test.yaml
+++ b/helm/tests/monitoring_integration_test.yaml
@@ -7,71 +7,10 @@ templates:
   - servicemonitor-webhook.yaml
 tests:
   # ============================================================================
-  # enabled=null (release default, currently maps to disabled)
+  # enabled gates everything (plain boolean, default false)
   # ============================================================================
-  - it: "null: should NOT create PrometheusRule (release default is disabled)"
+  - it: "enabled=false (default): no PrometheusRule"
     template: prometheusrule.yaml
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: null
-    asserts:
-      - hasDocuments:
-          count: 0
-
-  - it: "null: should NOT create ServiceMonitor for agent"
-    template: servicemonitor-agent.yaml
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: null
-    asserts:
-      - hasDocuments:
-          count: 0
-
-  - it: "null: should NOT create ServiceMonitor for collector"
-    template: servicemonitor-collector.yaml
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: null
-    asserts:
-      - hasDocuments:
-          count: 0
-
-  - it: "null: should NOT create ServiceMonitor for shipper"
-    template: servicemonitor-shipper.yaml
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: null
-    asserts:
-      - hasDocuments:
-          count: 0
-
-  - it: "null: should NOT create ServiceMonitor for webhook"
-    template: servicemonitor-webhook.yaml
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: null
-    asserts:
-      - hasDocuments:
-          count: 0
-
-  # ============================================================================
-  # enabled=false (explicit disable)
-  # ============================================================================
-  - it: "false: should NOT create PrometheusRule even with CRDs present"
-    template: prometheusrule.yaml
-    capabilities:
-      apiVersions:
-        - monitoring.coreos.com/v1
     set:
       apiKey: "test-key"
       existingSecretName: null
@@ -81,11 +20,8 @@ tests:
       - hasDocuments:
           count: 0
 
-  - it: "false: should NOT create any ServiceMonitors even with CRDs present"
+  - it: "enabled=false (default): no ServiceMonitor"
     template: servicemonitor-agent.yaml
-    capabilities:
-      apiVersions:
-        - monitoring.coreos.com/v1
     set:
       apiKey: "test-key"
       existingSecretName: null
@@ -95,10 +31,7 @@ tests:
       - hasDocuments:
           count: 0
 
-  # ============================================================================
-  # enabled=true (force enable)
-  # ============================================================================
-  - it: "true: should create PrometheusRule"
+  - it: "enabled=true: creates PrometheusRule (method auto -> serviceMonitors)"
     template: prometheusrule.yaml
     set:
       apiKey: "test-key"
@@ -111,7 +44,7 @@ tests:
       - isKind:
           of: PrometheusRule
 
-  - it: "true: should create ServiceMonitor for agent"
+  - it: "enabled=true: creates ServiceMonitor (method auto -> serviceMonitors)"
     template: servicemonitor-agent.yaml
     set:
       apiKey: "test-key"
@@ -124,39 +57,41 @@ tests:
       - isKind:
           of: ServiceMonitor
 
-  - it: "true: should create ServiceMonitor for collector"
-    template: servicemonitor-collector.yaml
+  # ============================================================================
+  # discovery.method = annotations: discovery-only, no Operator bundle
+  # ============================================================================
+  - it: "method=annotations: no ServiceMonitor"
+    template: servicemonitor-agent.yaml
     set:
       apiKey: "test-key"
       existingSecretName: null
       clusterName: "test-cluster"
       components.monitoring.enabled: true
+      components.monitoring.discovery.method: annotations
     asserts:
       - hasDocuments:
-          count: 1
-      - isKind:
-          of: ServiceMonitor
+          count: 0
 
-  - it: "true: should create ServiceMonitor for shipper"
-    template: servicemonitor-shipper.yaml
+  - it: "method=annotations: no PrometheusRule"
+    template: prometheusrule.yaml
     set:
       apiKey: "test-key"
       existingSecretName: null
       clusterName: "test-cluster"
       components.monitoring.enabled: true
+      components.monitoring.discovery.method: annotations
     asserts:
       - hasDocuments:
-          count: 1
-      - isKind:
-          of: ServiceMonitor
+          count: 0
 
-  - it: "true: should create ServiceMonitor for webhook"
-    template: servicemonitor-webhook.yaml
+  - it: "method=serviceMonitors (explicit): creates ServiceMonitor"
+    template: servicemonitor-collector.yaml
     set:
       apiKey: "test-key"
       existingSecretName: null
       clusterName: "test-cluster"
       components.monitoring.enabled: true
+      components.monitoring.discovery.method: serviceMonitors
     asserts:
       - hasDocuments:
           count: 1
@@ -164,71 +99,7 @@ tests:
           of: ServiceMonitor
 
   # ============================================================================
-  # enabled="auto" with CRDs present (should create)
-  # ============================================================================
-  - it: "auto: should create PrometheusRule when Prometheus Operator CRDs present"
-    template: prometheusrule.yaml
-    capabilities:
-      apiVersions:
-        - monitoring.coreos.com/v1
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: auto
-    asserts:
-      - hasDocuments:
-          count: 1
-      - isKind:
-          of: PrometheusRule
-
-  - it: "auto: should create all ServiceMonitors when CRDs present"
-    template: servicemonitor-agent.yaml
-    capabilities:
-      apiVersions:
-        - monitoring.coreos.com/v1
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: auto
-    asserts:
-      - hasDocuments:
-          count: 1
-      - isKind:
-          of: ServiceMonitor
-
-  # ============================================================================
-  # enabled="auto" without CRDs (should NOT create)
-  # ============================================================================
-  - it: "auto: should NOT create PrometheusRule when no CRDs"
-    template: prometheusrule.yaml
-    capabilities:
-      apiVersions: []
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: auto
-    asserts:
-      - hasDocuments:
-          count: 0
-
-  - it: "auto: should NOT create ServiceMonitors when no CRDs"
-    template: servicemonitor-agent.yaml
-    capabilities:
-      apiVersions: []
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: auto
-    asserts:
-      - hasDocuments:
-          count: 0
-
-  # ============================================================================
-  # PrometheusRule content validation
+  # PrometheusRule content validation (enabled=true; method auto -> serviceMonitors)
   # ============================================================================
   - it: "should use Helm-generated service names in alert job labels"
     template: prometheusrule.yaml
@@ -257,9 +128,6 @@ tests:
           path: spec.groups[0].rules
           count: 14
 
-  # ============================================================================
-  # Aggregator Down alert
-  # ============================================================================
   - it: "should include CloudZeroAggregatorDown alert with correct job label"
     template: prometheusrule.yaml
     release:
@@ -277,9 +145,6 @@ tests:
           path: spec.groups[0].rules[3].alert
           value: CloudZeroAggregatorDown
 
-  # ============================================================================
-  # Dynamic shipper thresholds (based on costMaxInterval)
-  # ============================================================================
   - it: "shipper alerts should use 2x and 1.5x of default costMaxInterval (30m)"
     template: prometheusrule.yaml
     set:
@@ -369,7 +234,7 @@ tests:
           value: "450s"
 
   # ============================================================================
-  # Custom labels and namespace
+  # Custom labels and namespace (enabled=true; method auto -> serviceMonitors)
   # ============================================================================
   - it: "should apply custom labels to ServiceMonitor"
     template: servicemonitor-agent.yaml
@@ -557,60 +422,6 @@ tests:
           path: spec.namespaceSelector.matchNames[0]
           value: cloudzero
 
-  - it: "namespace null: collector ServiceMonitor should use release namespace"
-    template: servicemonitor-collector.yaml
-    release:
-      namespace: cloudzero
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: true
-      components.monitoring.namespace: null
-    asserts:
-      - equal:
-          path: metadata.namespace
-          value: cloudzero
-      - equal:
-          path: spec.namespaceSelector.matchNames[0]
-          value: cloudzero
-
-  - it: "namespace null: shipper ServiceMonitor should use release namespace"
-    template: servicemonitor-shipper.yaml
-    release:
-      namespace: cloudzero
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: true
-      components.monitoring.namespace: null
-    asserts:
-      - equal:
-          path: metadata.namespace
-          value: cloudzero
-      - equal:
-          path: spec.namespaceSelector.matchNames[0]
-          value: cloudzero
-
-  - it: "namespace null: webhook ServiceMonitor should use release namespace"
-    template: servicemonitor-webhook.yaml
-    release:
-      namespace: cloudzero
-    set:
-      apiKey: "test-key"
-      existingSecretName: null
-      clusterName: "test-cluster"
-      components.monitoring.enabled: true
-      components.monitoring.namespace: null
-    asserts:
-      - equal:
-          path: metadata.namespace
-          value: cloudzero
-      - equal:
-          path: spec.namespaceSelector.matchNames[0]
-          value: cloudzero
-
   # Empty string should behave identically to null
   - it: "namespace empty string: PrometheusRule should use release namespace"
     template: prometheusrule.yaml
diff --git a/helm/values.schema.json b/helm/values.schema.json
index 215f47dcf..d1317ad62 100644
--- a/helm/values.schema.json
+++ b/helm/values.schema.json
@@ -6309,20 +6309,20 @@
         "monitoring": {
           "additionalProperties": false,
           "properties": {
-            "enabled": {
-              "default": null,
-              "oneOf": [
-                {
-                  "type": "null"
-                },
-                {
-                  "type": "boolean"
-                },
-                {
-                  "enum": ["auto"],
+            "discovery": {
+              "additionalProperties": false,
+              "properties": {
+                "method": {
+                  "default": "auto",
+                  "enum": ["auto", "serviceMonitors", "annotations"],
                   "type": "string"
                 }
-              ]
+              },
+              "type": "object"
+            },
+            "enabled": {
+              "default": false,
+              "type": "boolean"
             },
             "labels": {
               "additionalProperties": {
diff --git a/helm/values.schema.yaml b/helm/values.schema.yaml
index d593841f3..d1905a087 100644
--- a/helm/values.schema.yaml
+++ b/helm/values.schema.yaml
@@ -1342,36 +1342,46 @@ properties:
 
       monitoring:
         description: |
-          Prometheus Operator monitoring integration.
+          Monitoring integration with the customer's Prometheus stack.
 
-          When enabled, the chart creates ServiceMonitor and PrometheusRule CRDs
-          that allow the Prometheus Operator to automatically discover and scrape
-          CloudZero Agent metrics and evaluate pre-configured alert rules.
-
-          Regardless of this setting, prometheus.io/* annotations are always added
-          to Services for customers using standard Prometheus service discovery.
+          The chart integrates two ways: via the Prometheus Operator (ServiceMonitor
+          + PrometheusRule CRDs) or via prometheus.io/* annotations on Services, never
+          both. monitoring.enabled turns it on; monitoring.discovery.method selects which.
         additionalProperties: false
         type: object
         properties:
           enabled:
             description: |
-              Controls creation of Prometheus Operator CRDs (ServiceMonitor, PrometheusRule).
-
-              - null (default): Follow the release default. Currently maps to false
-                while this feature is being validated. In a future release, null will
-                map to "auto".
-              - "auto": Auto-detect. Creates monitoring CRDs only if the Prometheus
-                Operator CRDs (monitoring.coreos.com/v1) are available in the cluster.
-              - true: Always create monitoring CRDs. Helm will fail if the Prometheus
-                Operator CRDs are not installed.
-              - false: Never create monitoring CRDs.
-            default: null
-            oneOf:
-              - type: "null"
-              - type: boolean
-              - type: string
+              Whether to create the monitoring resources. Off by default; set to
+              true to enable. With the default discovery method (auto, which
+              resolves to serviceMonitors), this creates monitoring.coreos.com/v1
+              CRDs, so the install fails if the Prometheus Operator is not present
+              -- use discovery.method: annotations otherwise.
+            default: false
+            type: boolean
+          discovery:
+            description: |
+              How the agent's metrics are advertised when monitoring.enabled is on.
+            additionalProperties: false
+            type: object
+            properties:
+              method:
+                description: |
+                  Discovery mechanism used when monitoring.enabled is true.
+
+                  - "auto" (default): Pick automatically. Resolves to "serviceMonitors"
+                    today; kept as an enum so a future auto-detectable backend can extend
+                    it without changing the default.
+                  - "serviceMonitors": Emit ServiceMonitor CRDs plus the PrometheusRule
+                    alert bundle for the Prometheus Operator.
+                  - "annotations": Emit prometheus.io/* annotations only -- no Operator
+                    CRDs and no alert bundle. Explicit opt-in; never auto-selected.
+                default: auto
+                type: string
                 enum:
                   - auto
+                  - serviceMonitors
+                  - annotations
           namespace:
             description: |
               Namespace override for PrometheusRule and ServiceMonitor CRDs.
diff --git a/helm/values.yaml b/helm/values.yaml
index d7b5d5713..dd4da1c76 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -867,29 +867,26 @@ components:
       # https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
       securityContext: {}
 
-  # Prometheus Operator monitoring integration.
+  # Monitoring integration with the customer's Prometheus stack.
   #
-  # When enabled, the chart creates ServiceMonitor and PrometheusRule CRDs that
-  # allow the Prometheus Operator to automatically discover and scrape CloudZero
-  # Agent metrics, and to evaluate pre-configured alert rules.
-  #
-  # This follows the same null/true/false pattern as integrations.istio:
-  #
-  # - null (default): Follow the release default. Currently maps to "false"
-  #   (disabled) while this feature is being validated. In a future release,
-  #   null will map to "auto". Use null unless you have a reason to override.
-  # - "auto": Auto-detect. Creates monitoring CRDs only if the Prometheus
-  #   Operator CRDs (monitoring.coreos.com/v1) are available in the cluster.
-  # - true: Always create monitoring CRDs. Helm will fail if the Prometheus
-  #   Operator CRDs are not installed in the cluster.
-  # - false: Never create monitoring CRDs.
-  #
-  # To opt in now, set to "auto" or true.
-  #
-  # Regardless of this setting, prometheus.io/* annotations are always added to
-  # Services for customers using standard Prometheus service discovery.
+  # The chart integrates two ways: via the Prometheus Operator (ServiceMonitor +
+  # PrometheusRule CRDs) or via prometheus.io/* annotations on Services (never both).
   monitoring:
-    enabled: null
+    # Whether to create the monitoring resources. Off by default; set to true to
+    # enable. With the default discovery method (auto -> serviceMonitors), this
+    # creates monitoring.coreos.com/v1 CRDs, so the install fails if the Prometheus
+    # Operator is not present -- use discovery.method: annotations otherwise.
+    enabled: false
+
+    discovery:
+      # How your Prometheus discovers the agent's metrics (when monitoring is on).
+      #
+      # - auto (default): use ServiceMonitors.
+      # - serviceMonitors: create ServiceMonitor resources for the Prometheus
+      #   Operator, along with the alert rules.
+      # - annotations: add prometheus.io/* annotations to the Services instead, for
+      #   a Prometheus that scrapes by annotation. No alert rules.
+      method: auto
 
     # Namespace override for PrometheusRule and ServiceMonitor CRDs.
     # null (default) = same namespace as the agent installation.
diff --git a/tests/helm/schema/monitoring.discovery.annotations.pass.yaml b/tests/helm/schema/monitoring.discovery.annotations.pass.yaml
new file mode 100644
index 000000000..6779dc3b8
--- /dev/null
+++ b/tests/helm/schema/monitoring.discovery.annotations.pass.yaml
@@ -0,0 +1,9 @@
+# Annotation discovery opt-in: gate forced on + method=annotations renders the
+# prometheus.io/* annotations only (no Operator CRDs, no alert bundle), so it
+# applies cleanly on a cluster without the Prometheus Operator.
+clusterName: "test-cluster"
+components:
+  monitoring:
+    enabled: true
+    discovery:
+      method: annotations
diff --git a/tests/helm/schema/monitoring.discovery.default.pass.yaml b/tests/helm/schema/monitoring.discovery.default.pass.yaml
new file mode 100644
index 000000000..e7cdcef82
--- /dev/null
+++ b/tests/helm/schema/monitoring.discovery.default.pass.yaml
@@ -0,0 +1,3 @@
+# Chart defaults (enabled: false). Monitoring is off, so nothing
+# monitoring-related renders. Validates the default install.
+clusterName: "test-cluster"
diff --git a/tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml b/tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml
new file mode 100644
index 000000000..16b149d5a
--- /dev/null
+++ b/tests/helm/schema/monitoring.discovery.method.invalid.fail.yaml
@@ -0,0 +1,6 @@
+# Invalid: discovery.method must be one of auto | serviceMonitors | annotations.
+clusterName: "test-cluster"
+components:
+  monitoring:
+    discovery:
+      method: bogus
diff --git a/tests/helm/schema/monitoring.discovery.none.pass.yaml b/tests/helm/schema/monitoring.discovery.none.pass.yaml
new file mode 100644
index 000000000..ae7d97efb
--- /dev/null
+++ b/tests/helm/schema/monitoring.discovery.none.pass.yaml
@@ -0,0 +1,5 @@
+# Gate off -> no Operator CRDs, no annotations. The chart still renders cleanly.
+clusterName: "test-cluster"
+components:
+  monitoring:
+    enabled: false
diff --git a/tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml b/tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml
new file mode 100644
index 000000000..f84608594
--- /dev/null
+++ b/tests/helm/schema/monitoring.discovery.servicemonitors.pass.yaml
@@ -0,0 +1,10 @@
+# Turn monitoring on (enabled: true) with the serviceMonitors method so the
+# ServiceMonitors + PrometheusRule render; the chart does no Operator detection, so
+# `helm template` emits them as-is. kubeconform validates these
+# monitoring.coreos.com/v1 resources against the datreeio CRDs-catalog.
+clusterName: "test-cluster"
+components:
+  monitoring:
+    enabled: true
+    discovery:
+      method: serviceMonitors
diff --git a/tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml b/tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml
new file mode 100644
index 000000000..441a24f8d
--- /dev/null
+++ b/tests/helm/schema/monitoring.enabled.invalid-type.fail.yaml
@@ -0,0 +1,5 @@
+# Invalid: enabled is a plain boolean -- "auto" (and null) are no longer accepted.
+clusterName: "test-cluster"
+components:
+  monitoring:
+    enabled: auto
diff --git a/tests/helm/template/alloy.yaml b/tests/helm/template/alloy.yaml
index fa506732c..3ec38f36c 100644
--- a/tests/helm/template/alloy.yaml
+++ b/tests/helm/template/alloy.yaml
@@ -1103,7 +1103,9 @@ data:
               memory: 64Mi
           securityContext: {}
       monitoring:
-        enabled: null
+        discovery:
+          method: auto
+        enabled: false
         labels: {}
         namespace: null
         sharedSecret: false
@@ -2334,10 +2336,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: "9090"
-    prometheus.io/scrape: "true"
+  
 spec:
   type: ClusterIP
   ports:
@@ -2362,10 +2361,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: '"8080"'
-    prometheus.io/scrape: "true"
+  
 spec:
   selector:
     app.kubernetes.io/name: aggregator
@@ -2395,10 +2391,6 @@ metadata:
     helm.sh/chart: cloudzero-agent-1.1.0-dev
   annotations:
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
-    prometheus.io/path: /metrics
-    prometheus.io/port: "8443"
-    prometheus.io/scheme: https
-    prometheus.io/scrape: "true"
   namespace: cz-agent
 spec:
   type: ClusterIP
diff --git a/tests/helm/template/cert-manager.yaml b/tests/helm/template/cert-manager.yaml
index 46a9eb676..25d386f7c 100644
--- a/tests/helm/template/cert-manager.yaml
+++ b/tests/helm/template/cert-manager.yaml
@@ -1018,7 +1018,9 @@ data:
               memory: 64Mi
           securityContext: {}
       monitoring:
-        enabled: null
+        discovery:
+          method: auto
+        enabled: false
         labels: {}
         namespace: null
         sharedSecret: false
@@ -2249,10 +2251,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: "9090"
-    prometheus.io/scrape: "true"
+  
 spec:
   type: ClusterIP
   ports:
@@ -2277,10 +2276,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: '"8080"'
-    prometheus.io/scrape: "true"
+  
 spec:
   selector:
     app.kubernetes.io/name: aggregator
@@ -2310,10 +2306,6 @@ metadata:
     helm.sh/chart: cloudzero-agent-1.1.0-dev
   annotations:
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
-    prometheus.io/path: /metrics
-    prometheus.io/port: "8443"
-    prometheus.io/scheme: https
-    prometheus.io/scrape: "true"
   namespace: cz-agent
 spec:
   type: ClusterIP
diff --git a/tests/helm/template/federated.yaml b/tests/helm/template/federated.yaml
index 2ea82b3d3..2b037d766 100644
--- a/tests/helm/template/federated.yaml
+++ b/tests/helm/template/federated.yaml
@@ -1106,7 +1106,9 @@ data:
               memory: 64Mi
           securityContext: {}
       monitoring:
-        enabled: null
+        discovery:
+          method: auto
+        enabled: false
         labels: {}
         namespace: null
         sharedSecret: false
@@ -2337,10 +2339,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: "9090"
-    prometheus.io/scrape: "true"
+  
 spec:
   type: ClusterIP
   ports:
@@ -2365,10 +2364,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: '"8080"'
-    prometheus.io/scrape: "true"
+  
 spec:
   selector:
     app.kubernetes.io/name: aggregator
@@ -2398,10 +2394,6 @@ metadata:
     helm.sh/chart: cloudzero-agent-1.1.0-dev
   annotations:
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
-    prometheus.io/path: /metrics
-    prometheus.io/port: "8443"
-    prometheus.io/scheme: https
-    prometheus.io/scrape: "true"
   namespace: cz-agent
 spec:
   type: ClusterIP
diff --git a/tests/helm/template/istio.yaml b/tests/helm/template/istio.yaml
index 00eac588e..6bee829f5 100644
--- a/tests/helm/template/istio.yaml
+++ b/tests/helm/template/istio.yaml
@@ -1033,7 +1033,9 @@ data:
               memory: 64Mi
           securityContext: {}
       monitoring:
-        enabled: null
+        discovery:
+          method: auto
+        enabled: false
         labels: {}
         namespace: null
         sharedSecret: false
@@ -2264,10 +2266,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: "9090"
-    prometheus.io/scrape: "true"
+  
 spec:
   type: ClusterIP
   ports:
@@ -2292,10 +2291,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: '"8080"'
-    prometheus.io/scrape: "true"
+  
 spec:
   selector:
     app.kubernetes.io/name: aggregator
@@ -2325,10 +2321,6 @@ metadata:
     helm.sh/chart: cloudzero-agent-1.1.0-dev
   annotations:
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
-    prometheus.io/path: /metrics
-    prometheus.io/port: "8443"
-    prometheus.io/scheme: https
-    prometheus.io/scrape: "true"
   namespace: cz-agent
 spec:
   type: ClusterIP
diff --git a/tests/helm/template/kubestate.yaml b/tests/helm/template/kubestate.yaml
index 20c54ceaf..4b1cdebef 100644
--- a/tests/helm/template/kubestate.yaml
+++ b/tests/helm/template/kubestate.yaml
@@ -1070,7 +1070,9 @@ data:
               memory: 64Mi
           securityContext: {}
       monitoring:
-        enabled: null
+        discovery:
+          method: auto
+        enabled: false
         labels: {}
         namespace: null
         sharedSecret: false
@@ -1874,10 +1876,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: "9090"
-    prometheus.io/scrape: "true"
+  
 spec:
   type: ClusterIP
   ports:
@@ -1902,10 +1901,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: '"8080"'
-    prometheus.io/scrape: "true"
+  
 spec:
   selector:
     app.kubernetes.io/name: aggregator
@@ -1935,10 +1931,6 @@ metadata:
     helm.sh/chart: cloudzero-agent-1.1.0-dev
   annotations:
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
-    prometheus.io/path: /metrics
-    prometheus.io/port: "8443"
-    prometheus.io/scheme: https
-    prometheus.io/scrape: "true"
   namespace: cz-agent
 spec:
   type: ClusterIP
diff --git a/tests/helm/template/manifest.yaml b/tests/helm/template/manifest.yaml
index 51582f517..9fcaf326d 100644
--- a/tests/helm/template/manifest.yaml
+++ b/tests/helm/template/manifest.yaml
@@ -1033,7 +1033,9 @@ data:
               memory: 64Mi
           securityContext: {}
       monitoring:
-        enabled: null
+        discovery:
+          method: auto
+        enabled: false
         labels: {}
         namespace: null
         sharedSecret: false
@@ -2264,10 +2266,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: "9090"
-    prometheus.io/scrape: "true"
+  
 spec:
   type: ClusterIP
   ports:
@@ -2292,10 +2291,7 @@ metadata:
     app.kubernetes.io/part-of: cloudzero-agent
     app.kubernetes.io/version: v3.10.0
     helm.sh/chart: cloudzero-agent-1.1.0-dev
-  annotations:
-    prometheus.io/path: /metrics
-    prometheus.io/port: '"8080"'
-    prometheus.io/scrape: "true"
+  
 spec:
   selector:
     app.kubernetes.io/name: aggregator
@@ -2325,10 +2321,6 @@ metadata:
     helm.sh/chart: cloudzero-agent-1.1.0-dev
   annotations:
     nginx.ingress.kubernetes.io/ssl-redirect: "false"
-    prometheus.io/path: /metrics
-    prometheus.io/port: "8443"
-    prometheus.io/scheme: https
-    prometheus.io/scrape: "true"
   namespace: cz-agent
 spec:
   type: ClusterIP