From 42dfb5e26a97e7021b606a7c6189f50cd42b2180 Mon Sep 17 00:00:00 2001 From: haseeb Date: Thu, 2 Apr 2026 15:25:43 +0530 Subject: [PATCH 1/9] feat: enable site clusters to run Nautobot Celery workers with mTLS Sites need to run background task processing locally to reduce cross-cluster latency and scale worker capacity independently. Workers connect back to the global PostgreSQL and Redis, so cross-cluster connections require stronger auth than passwords alone. Adds a site-scoped ArgoCD Application that deploys only the Celery worker portion of the Nautobot Helm chart. The web server, Redis, and PostgreSQL remain on the global cluster. All cross-cluster connections use end-to-end mTLS: - nautobot_config.py gains conditional SSL/mTLS logic for both PostgreSQL (NAUTOBOT_DB_SSLMODE) and Redis (auto-detected from mounted CA cert) - nautobot-worker component values disable everything except celery - envoy-configs gateway template supports gatewayPort on TLS passthrough listeners for non-443 ports (5432, 6379) - envoy-configs schema adds gatewayPort to the tls route type - Deploy guide documents the full architecture, step-by-step site onboarding, certificate infrastructure, and troubleshooting --- .../application-nautobot-worker.yaml | 50 ++ charts/argocd-understack/values.yaml | 6 + .../templates/gw-external.yaml.tpl | 5 +- components/envoy-configs/values.schema.json | 6 + components/nautobot-worker/values.yaml | 69 +++ components/nautobot/nautobot_config.py | 49 ++ .../components/nautobot-worker.md | 547 ++++++++++++++++++ mkdocs.yml | 1 + 8 files changed, 730 insertions(+), 3 deletions(-) create mode 100644 charts/argocd-understack/templates/application-nautobot-worker.yaml create mode 100644 components/nautobot-worker/values.yaml create mode 100644 docs/deploy-guide/components/nautobot-worker.md diff --git a/charts/argocd-understack/templates/application-nautobot-worker.yaml b/charts/argocd-understack/templates/application-nautobot-worker.yaml new file mode 100644 
index 000000000..36165b393 --- /dev/null +++ b/charts/argocd-understack/templates/application-nautobot-worker.yaml @@ -0,0 +1,50 @@ +{{- if eq (include "understack.isEnabled" (list $.Values.site "nautobot_worker")) "true" }} +--- +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: {{ printf "%s-%s" $.Release.Name "nautobot-worker" }} + finalizers: + - resources-finalizer.argocd.argoproj.io + annotations: + argocd.argoproj.io/compare-options: ServerSideDiff=true,IncludeMutationWebhook=true +spec: + destination: + namespace: nautobot + server: {{ $.Values.cluster_server }} + project: understack + sources: + - chart: nautobot + helm: + fileParameters: + - name: nautobot.config + path: $understack/components/nautobot/nautobot_config.py + ignoreMissingValueFiles: true + releaseName: nautobot-worker + valueFiles: + - $understack/components/nautobot-worker/values.yaml + - $deploy/{{ include "understack.deploy_path" $ }}/nautobot-worker/values.yaml + repoURL: https://nautobot.github.io/helm-charts/ + targetRevision: 2.5.6 + + - path: components/nautobot-worker + ref: understack + repoURL: {{ include "understack.understack_url" $ }} + targetRevision: {{ include "understack.understack_ref" $ }} + - path: {{ include "understack.deploy_path" $ }}/nautobot-worker + ref: deploy + repoURL: {{ include "understack.deploy_url" $ }} + targetRevision: {{ include "understack.deploy_ref" $ }} + syncPolicy: + automated: + prune: true + selfHeal: true + managedNamespaceMetadata: + annotations: + argocd.argoproj.io/sync-options: Delete=false + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + - RespectIgnoreDifferences=true + - ApplyOutOfSyncOnly=true +{{- end }} diff --git a/charts/argocd-understack/values.yaml b/charts/argocd-understack/values.yaml index 0ffa5b048..5f36997bb 100644 --- a/charts/argocd-understack/values.yaml +++ b/charts/argocd-understack/values.yaml @@ -556,6 +556,12 @@ site: # @default -- false enabled: false + # -- Nautobot Celery 
workers (site-level, connects to global Nautobot) + nautobot_worker: + # -- Enable/disable deploying Nautobot workers at the site level + # @default -- false + enabled: false + # -- SNMP exporter for network device monitoring snmp_exporter: # -- Enable/disable deploying SNMP exporter diff --git a/components/envoy-configs/templates/gw-external.yaml.tpl b/components/envoy-configs/templates/gw-external.yaml.tpl index dca86126b..97331a421 100644 --- a/components/envoy-configs/templates/gw-external.yaml.tpl +++ b/components/envoy-configs/templates/gw-external.yaml.tpl @@ -35,13 +35,11 @@ spec: {{- range .Values.routes.tls }} {{- $listenerName := .name | default (index (splitList "." .fqdn) 0) }} - name: {{ $listenerName }} - port: {{ $.Values.gateways.external.port | default 443 }} + port: {{ .gatewayPort | default ($.Values.gateways.external.port | default 443) }} protocol: TLS hostname: {{ .fqdn | quote }} tls: mode: Passthrough - certificateRefs: - - name: {{ $listenerName }}-tls allowedRoutes: namespaces: {{- if .selector }} @@ -52,6 +50,7 @@ spec: from: {{ .from | default "All" }} {{- end }} {{- end }} + {{- if .Values.gateways.external.serviceAnnotations }} infrastructure: parametersRef: diff --git a/components/envoy-configs/values.schema.json b/components/envoy-configs/values.schema.json index 5ba888206..02bf09fbb 100644 --- a/components/envoy-configs/values.schema.json +++ b/components/envoy-configs/values.schema.json @@ -180,6 +180,12 @@ "type": "string", "description": "Namespace where the httproute will be installed (same as backend service)" }, + "gatewayPort": { + "type": "integer", + "minimum": 1, + "maximum": 65535, + "description": "Port exposed on the gateway for this TLS passthrough listener. Defaults to the external gateway port (443) if not specified." 
+ }, "service": { "type": "object", "description": "Kubernetes service backend configuration for the route", diff --git a/components/nautobot-worker/values.yaml b/components/nautobot-worker/values.yaml new file mode 100644 index 000000000..2679e9c16 --- /dev/null +++ b/components/nautobot-worker/values.yaml @@ -0,0 +1,69 @@ +# Nautobot Worker (site-level) +# +# Deploys only Celery workers that connect back to the global Nautobot +# database and Redis. The web server is disabled because it lives on +# the global cluster. Redis and PostgreSQL are disabled because the +# workers reach the global instances over the network. +--- + +# Disable the Nautobot web server — workers only +nautobot: + enabled: false + replicaCount: 0 + + db: + engine: "django.db.backends.postgresql" + # Override in deploy repo values to point at the global CNPG service + host: "" + port: 5432 + name: "app" + user: "app" + existingSecret: "nautobot-db" + existingSecretPasswordKey: "password" + + django: + existingSecret: nautobot-django + + superUser: + enabled: false + + redis: + # Override in deploy repo values to point at the global Redis service + host: "" + port: 6379 + ssl: false + username: "" + +celery: + enabled: true + concurrency: 2 + replicaCount: 1 + extraEnvVarsSecret: + - nautobot-django + livenessProbe: + initialDelaySeconds: 60 + periodSeconds: 120 + timeoutSeconds: 60 + readinessProbe: + initialDelaySeconds: 60 + periodSeconds: 120 + timeoutSeconds: 60 + +# Disable celery beat — scheduling runs on the global cluster only +workers: + beat: + enabled: false + +# Do not deploy local Redis — use the global instance +redis: + enabled: false + +# Do not deploy local PostgreSQL — use the global CNPG instance +postgresql: + enabled: false + +ingress: + enabled: false + +metrics: + enabled: false diff --git a/components/nautobot/nautobot_config.py b/components/nautobot/nautobot_config.py index 612c9d30b..f4c15bbd1 100644 --- a/components/nautobot/nautobot_config.py +++ 
b/components/nautobot/nautobot_config.py @@ -64,6 +64,55 @@ if DATABASES["default"]["ENGINE"].endswith("mysql"): # noqa F405 DATABASES["default"]["OPTIONS"] = {"charset": "utf8mb4"} # noqa F405 +# SSL/mTLS options for PostgreSQL connections. +# When NAUTOBOT_DB_SSLMODE is set to "verify-ca" or "verify-full", the client +# certificate, key, and CA root cert must be present at the configured paths. +_db_sslcert = os.getenv("NAUTOBOT_DB_SSLCERT", "/etc/nautobot/mtls/tls.crt") +_db_sslkey = os.getenv("NAUTOBOT_DB_SSLKEY", "/etc/nautobot/mtls/tls.key") +_db_sslrootcert = os.getenv("NAUTOBOT_DB_SSLROOTCERT", "/etc/nautobot/mtls/ca.crt") +_db_sslmode = os.getenv("NAUTOBOT_DB_SSLMODE", "") + +if _db_sslmode in ("verify-ca", "verify-full"): + for _path, _label in [ + (_db_sslcert, "NAUTOBOT_DB_SSLCERT"), + (_db_sslkey, "NAUTOBOT_DB_SSLKEY"), + (_db_sslrootcert, "NAUTOBOT_DB_SSLROOTCERT"), + ]: + if not os.path.isfile(_path): + raise FileNotFoundError( + f"SSL certificate file required by {_label} not found: {_path}" + ) + DATABASES["default"]["OPTIONS"] = { # noqa F405 + "sslmode": _db_sslmode, + "sslcert": _db_sslcert, + "sslkey": _db_sslkey, + "sslrootcert": _db_sslrootcert, + } + +# SSL/mTLS options for Redis connections. +# When NAUTOBOT_REDIS_SSL env var is "true" (set by Helm `nautobot.redis.ssl`), +# the Helm chart switches the URL scheme to rediss://. We still need to tell +# the Python redis client *which* certs to use for mutual TLS. 
+import ssl as _ssl # noqa: E402 + +_redis_ca = os.getenv("NAUTOBOT_REDIS_SSL_CA_CERTS", "/etc/nautobot/mtls/ca.crt") +_redis_cert = os.getenv("NAUTOBOT_REDIS_SSL_CERTFILE", "/etc/nautobot/mtls/tls.crt") +_redis_key = os.getenv("NAUTOBOT_REDIS_SSL_KEYFILE", "/etc/nautobot/mtls/tls.key") + +if os.path.isfile(_redis_ca): + _redis_ssl_kwargs = { + "ssl_cert_reqs": _ssl.CERT_REQUIRED, + "ssl_ca_certs": _redis_ca, + "ssl_certfile": _redis_cert, + "ssl_keyfile": _redis_key, + } + CACHES["default"].setdefault("OPTIONS", {}) # noqa F405 + CACHES["default"]["OPTIONS"].setdefault("CONNECTION_POOL_KWARGS", {}) # noqa F405 + CACHES["default"]["OPTIONS"]["CONNECTION_POOL_KWARGS"].update(_redis_ssl_kwargs) # noqa F405 + CELERY_BROKER_USE_SSL = _redis_ssl_kwargs # noqa F405 + CELERY_REDIS_BACKEND_USE_SSL = _redis_ssl_kwargs # noqa F405 + CELERY_BROKER_TRANSPORT_OPTIONS = {"ssl": _redis_ssl_kwargs} # noqa F405 + # This key is used for secure generation of random numbers and strings. It must never be exposed outside of this file. # For optimal security, SECRET_KEY should be at least 50 characters in length and contain a mix of letters, numbers, and # symbols. Nautobot will not run without this defined. For more information, see diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md new file mode 100644 index 000000000..06a7e03df --- /dev/null +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -0,0 +1,547 @@ +--- +charts: +- nautobot +kustomize_paths: +- components/nautobot-worker +deploy_overrides: + helm: + mode: values + kustomize: + mode: second_source +--- + +# nautobot-worker + +Site-level Nautobot Celery workers that connect to the global Nautobot +database and Redis. This component deploys only the Celery worker +portion of the Nautobot Helm chart on site clusters, allowing sites to +process background tasks locally without running the full Nautobot web +application. 
The web server, Redis, and PostgreSQL all remain on the +global cluster -- site workers connect back to those shared services +over the network. + +## Deployment Scope + +- Cluster scope: site +- Values key: `site.nautobot_worker` +- ArgoCD Application template: `charts/argocd-understack/templates/application-nautobot-worker.yaml` + +## How ArgoCD Builds It + +{{ component_argocd_builds() }} + +## How to Enable + +Enable this component in your site deployment values file: + +```yaml title="$CLUSTER_NAME/deploy.yaml" +site: + nautobot_worker: + enabled: true +``` + +## Architecture + +Site workers connect to the global cluster's PostgreSQL (CNPG) and Redis +through the Envoy Gateway. Both connections use mutual TLS (mTLS) with +TLS passthrough at the gateway, so the cryptographic handshake happens +directly between the worker pod and the database/Redis server. + +```text +Site Cluster Global Cluster ++------------------+ +---------------------------+ +| Worker Pod | TLS+ClientCert | Envoy Gateway | +| - celery | ---------------> | port 5432 (passthrough) | --> CNPG PostgreSQL +| - mtls certs | ---------------> | port 6379 (passthrough) | --> Redis ++------------------+ +---------------------------+ +``` + +The worker pods mount a client certificate (issued by a dedicated +internal CA via cert-manager) and present it during the TLS handshake. +PostgreSQL and Redis on the global cluster verify the client certificate +against the same CA before accepting the connection. + +### Why mTLS? + +Site workers run on remote clusters and connect to the global database +and Redis over the network. Password-only authentication is insufficient +for cross-cluster connections -- if a credential leaks, any host with +network access could connect to the production database. mTLS ensures +that even with a leaked password, connections without a valid client +certificate are rejected. Traffic is encrypted end-to-end between the +worker pod and the server. 
+ +## Connection Security + +### PostgreSQL (CNPG) + +The global CNPG cluster is configured with: + +- `spec.certificates.serverTLSSecret` and `spec.certificates.serverCASecret` + for server-side TLS. PostgreSQL uses the CA in `serverCASecret` to + verify client certificates presented during `pg_hba cert` authentication. + `clientCASecret` is intentionally NOT set -- CNPG uses that field + internally to sign replication client certificates, which requires the + CA private key. CNPG manages its own replication client CA. +- `pg_hba` rules that require `hostssl ... cert` for remote connections + and allow `host ... scram-sha-256` for local pods on the global cluster + +Site workers connect with `sslmode=verify-ca`, presenting their client +certificate, key, and the CA root cert via Django's `DATABASES` OPTIONS. + +The `nautobot_config.py` SSL logic is conditional on the +`NAUTOBOT_DB_SSLMODE` environment variable. When set to `verify-ca` or +`verify-full`, it reads the cert paths from environment variables (with +defaults pointing to `/etc/nautobot/mtls/`) and sets +`DATABASES["default"]["OPTIONS"]`. When the env var is unset or empty +(as on the global cluster), no SSL options are applied and pods connect +with password-only auth. + +#### pg_hba Rule Order + +The CNPG `pg_hba` rules are evaluated top-to-bottom: + +1. `host all all 10.0.0.0/8 scram-sha-256` -- local pods on the global + cluster connect with password only (no TLS required) +2. `hostssl all all 0.0.0.0/0 cert` -- remote connections with a valid + client certificate are accepted (cert CN maps to DB user) +3. `hostssl all all 0.0.0.0/0 scram-sha-256` -- transitional rule: + remote connections over TLS with password only (no client cert). + Remove this rule once all sites have mTLS deployed. + +### Redis + +The global Redis instance has TLS enabled with `authClients: true` +(Bitnami Redis subchart), requiring client certificates from all +connections -- including local pods on the global cluster. 
+ +The `nautobot_config.py` Redis mTLS logic checks if the CA cert file +exists at the default path (`/etc/nautobot/mtls/ca.crt`). If present, +it configures `ssl_cert_reqs`, `ssl_ca_certs`, `ssl_certfile`, and +`ssl_keyfile` on the Redis connection pool, Celery broker, and Celery +result backend. Both global and site pods automatically pick up Redis +mTLS when the cert volume is mounted. + +Because `authClients: true` applies to all connections (Redis has no +equivalent of `pg_hba` to distinguish local vs remote), a +`nautobot-mtls-client` Certificate resource is also deployed on the +global cluster so that local Nautobot web and Celery pods can present +a valid client cert. + +### Envoy Gateway + +Both PostgreSQL (port 5432) and Redis (port 6379) use `routes.tls` +entries with TLS passthrough mode. The gateway routes traffic based on +SNI hostname without terminating TLS, preserving end-to-end mTLS. + +## Certificate Infrastructure + +### Global Cluster + +The global cluster hosts the mTLS CA hierarchy (managed by cert-manager): + +| Resource | Kind | Purpose | +|---|---|---| +| `mtls-selfsigned` | Issuer | Bootstraps the self-signed root | +| `mtls-ca` | Certificate | Root CA (ECDSA P-256, 10yr duration, 1yr renewBefore) | +| `mtls-ca-issuer` | Issuer | Signs all client and server certificates | +| `mtls-ca-cert` | Certificate | CA public cert secret used by CNPG and Redis for client verification | +| `nautobot-cluster-server-tls` | Certificate | PostgreSQL server certificate | +| `nautobot-redis-server-tls` | Certificate | Redis server certificate | +| `nautobot-mtls-client` | Certificate | Client certificate for global Nautobot/Celery pods (needed because Redis `authClients: true` applies to all connections) | + +All resources live in the `nautobot` namespace. + +### Site Clusters + +Each site cluster needs: + +1. The mTLS CA key pair distributed via your external secrets provider + (secret name: `mtls-ca-key-pair`) +2. 
An `mtls-ca-issuer` Issuer referencing that secret +3. A `nautobot-mtls-client` Certificate resource that cert-manager uses + to issue the client certificate (ECDSA P-256, 1yr duration, 30d + auto-renewal) + +The client certificate is mounted into worker pods at +`/etc/nautobot/mtls/` containing `tls.crt`, `tls.key`, and `ca.crt`. + +## Adding a New Site + +This section walks through configuring `nautobot-worker` for a new site +cluster. All files go in `/nautobot-worker/` in the deploy +repo. + +### Prerequisites + +Before starting, ensure the global cluster already has: + +- The mTLS CA hierarchy deployed (issuers, root CA, CA issuer) +- Server TLS certificates for PostgreSQL and Redis +- A global `nautobot-mtls-client` certificate (for Redis `authClients`) +- CNPG configured with `serverTLSSecret`, `serverCASecret`, and `pg_hba` +- Redis TLS enabled with `authClients: true` +- Envoy Gateway TLS passthrough routes on ports 5432 and 6379 + +You also need the mTLS CA key pair stored in your external secrets +provider so the site cluster can pull it. + +### Step 1: Create the site directory + +```text +/nautobot-worker/ +``` + +### Step 2: Create ExternalSecrets for credentials + +Create ExternalSecret resources that pull credentials from your secrets +provider into the `nautobot` namespace. You need four: + +| ExternalSecret | Target Secret | Purpose | +|---|---|---| +| `externalsecret-nautobot-django.yaml` | `nautobot-django` | Django `SECRET_KEY` -- must match the global instance | +| `externalsecret-nautobot-db.yaml` | `nautobot-db` | CNPG app user password (satisfies Helm chart requirement) | +| `externalsecret-nautobot-worker-redis.yaml` | `nautobot-redis` | Redis password | +| `externalsecret-dockerconfigjson-github-com.yaml` | `dockerconfigjson-github-com` | Container registry credentials | + +Each ExternalSecret should reference your `ClusterSecretStore` and map +the credential into the key format the Nautobot Helm chart expects. 
+ +### Step 3: Create the mTLS CA key pair ExternalSecret + +Create `externalsecret-mtls-ca-key-pair.yaml` to distribute the mTLS CA +certificate and private key to this site cluster. The resulting secret +must be a `kubernetes.io/tls` type with these keys: + +| Key | Content | +|---|---| +| `tls.crt` | CA certificate (PEM) | +| `tls.key` | CA private key (PEM) | +| `ca.crt` | CA certificate (PEM, same as `tls.crt`) | + +cert-manager's CA Issuer reads `tls.crt` and `tls.key` from this secret +to sign client certificates. + +### Step 4: Create the cert-manager CA Issuer + +Create `issuer-mtls-ca-issuer.yaml`: + +```yaml +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: mtls-ca-issuer + namespace: nautobot +spec: + ca: + secretName: mtls-ca-key-pair +``` + +### Step 5: Create the client certificate + +Create `certificate-nautobot-mtls.yaml`. The `commonName` must match the +PostgreSQL database user (typically `app`) because `pg_hba cert` maps +the certificate CN to the DB user. + +```yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nautobot-mtls-client + namespace: nautobot +spec: + secretName: nautobot-mtls-client + duration: 8760h # 1 year + renewBefore: 720h # 30 days + commonName: app + usages: + - client auth + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: mtls-ca-issuer + kind: Issuer +``` + +### Step 6: Create the kustomization + +Create `kustomization.yaml` listing all resources: + +```yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - externalsecret-nautobot-django.yaml + - externalsecret-nautobot-db.yaml + - externalsecret-nautobot-worker-redis.yaml + - externalsecret-dockerconfigjson-github-com.yaml + - externalsecret-mtls-ca-key-pair.yaml + - issuer-mtls-ca-issuer.yaml + - certificate-nautobot-mtls.yaml +``` + +### Step 7: Create the values file + +Create `values.yaml` with the site-specific overrides. 
Replace +`` with your environment identifier and `` with +the site's partition name. + +```yaml +nautobot: + db: + host: "nautobot-db..undercloud.rackspace.net" + redis: + host: "nautobot-redis..undercloud.rackspace.net" + ssl: true + image: + registry: "ghcr.io" + repository: "/" + tag: "latest" + pullPolicy: "Always" + pullSecrets: + - dockerconfigjson-github-com + +celery: + extraEnvVars: + - name: NAUTOBOT_CONFIG + value: /opt/nautobot/nautobot_config.py + - name: UC_PARTITION + value: + - name: NAUTOBOT_DB_SSLMODE + value: verify-ca + - name: NAUTOBOT_REDIS_SSL_CERT_REQS + value: required + - name: NAUTOBOT_REDIS_SSL_CA_CERTS + value: /etc/nautobot/mtls/ca.crt + - name: NAUTOBOT_REDIS_SSL_CERTFILE + value: /etc/nautobot/mtls/tls.crt + - name: NAUTOBOT_REDIS_SSL_KEYFILE + value: /etc/nautobot/mtls/tls.key + - name: SSL_CERT_FILE + value: /etc/nautobot/mtls/ca.crt + - name: REQUESTS_CA_BUNDLE + value: /etc/nautobot/mtls/ca.crt + extraVolumes: + - name: mtls-certs + secret: + secretName: nautobot-mtls-client + defaultMode: 256 + extraVolumeMounts: + - name: mtls-certs + mountPath: /etc/nautobot/mtls + readOnly: true +``` + +### Step 8: Enable in deploy.yaml + +Add `nautobot_worker` to the site's `deploy.yaml`: + +```yaml +site: + nautobot_worker: + enabled: true +``` + +### Step 9: Verify + +After ArgoCD syncs, verify the worker is running and connected: + +```bash +# Check the certificate was issued +kubectl get certificate nautobot-mtls-client -n nautobot + +# Check the worker pod is running +kubectl get pods -n nautobot -l app.kubernetes.io/component=nautobot-celery + +# Check worker logs for successful DB/Redis connections +kubectl logs -n nautobot -l app.kubernetes.io/component=nautobot-celery --tail=50 +``` + +### Final directory structure + +```text +/nautobot-worker/ + certificate-nautobot-mtls.yaml + externalsecret-dockerconfigjson-github-com.yaml + externalsecret-mtls-ca-key-pair.yaml + externalsecret-nautobot-db.yaml + 
externalsecret-nautobot-django.yaml + externalsecret-nautobot-worker-redis.yaml + issuer-mtls-ca-issuer.yaml + kustomization.yaml + values.yaml +``` + +## Environment Variable Reference + +| Variable | Where Set | Purpose | +|---|---|---| +| `NAUTOBOT_DB_SSLMODE` | Site worker values | Controls PostgreSQL SSL mode. Set to `verify-ca` for mTLS. Unset on global cluster. | +| `NAUTOBOT_DB_SSLCERT` | Optional override | Path to client cert for PG (default: `/etc/nautobot/mtls/tls.crt`) | +| `NAUTOBOT_DB_SSLKEY` | Optional override | Path to client key for PG (default: `/etc/nautobot/mtls/tls.key`) | +| `NAUTOBOT_DB_SSLROOTCERT` | Optional override | Path to CA cert for PG (default: `/etc/nautobot/mtls/ca.crt`) | +| `NAUTOBOT_REDIS_SSL_CERT_REQS` | Site worker values | Set to `required` to enforce Redis server cert verification | +| `NAUTOBOT_REDIS_SSL_CA_CERTS` | Site worker values | Path to CA cert for Redis | +| `NAUTOBOT_REDIS_SSL_CERTFILE` | Site worker values | Path to client cert for Redis | +| `NAUTOBOT_REDIS_SSL_KEYFILE` | Site worker values | Path to client key for Redis | +| `SSL_CERT_FILE` | Site worker values | System-wide CA bundle override for outbound HTTPS | +| `REQUESTS_CA_BUNDLE` | Site worker values | Python requests library CA bundle override | +| `NAUTOBOT_CONFIG` | Both global and site | Path to `nautobot_config.py` | +| `UC_PARTITION` | Site worker values | Site partition identifier for Celery task routing | + +## Design Decisions + +- The cert-manager CA hierarchy (self-signed bootstrap -> root CA -> + CA issuer) handles issuance and renewal on both global and site + clusters without manual intervention. + +- CNPG's native TLS support (`serverTLSSecret`, `serverCASecret`) + integrates directly with cert-manager secrets. No sidecar proxies or + custom TLS termination needed. PostgreSQL verifies external client + certificates using the CA chain from `serverCASecret` when processing + `pg_hba cert` rules. 
+ +- The `routes.tls` type in the Envoy Gateway template uses a + `gatewayPort` field to support non-443 ports for TLS passthrough. + PostgreSQL (5432) and Redis (6379) both use this route type. + +- The `pg_hba cert` method with CN-to-user mapping means the client + certificate CN (e.g. `app`) maps directly to the PostgreSQL user, so + no additional user mapping configuration is needed. + +- The CA key pair is distributed to site clusters via the external + secrets provider, following the existing credential distribution + pattern. + +- The `nautobot_config.py` SSL logic is conditional on + `NAUTOBOT_DB_SSLMODE`, so the same config file works for both global + pods (no mTLS) and site workers (mTLS enabled). + +- The Redis mTLS logic in `nautobot_config.py` auto-detects the CA cert + file at the default mount path. If the cert volume is mounted, Redis + mTLS is configured automatically. + +## Known Gotchas + +- **clientCASecret is NOT for external client verification.** CNPG's + `clientCASecret` field is used internally to sign replication client + certificates between PostgreSQL instances. It expects a secret with + both `ca.crt` and `ca.key`. Only `serverTLSSecret` and + `serverCASecret` should be set. PostgreSQL verifies external client + certificates using the CA chain from `serverCASecret` when processing + `pg_hba cert` rules. + +- **SSL config must be conditional.** Setting `sslmode` unconditionally + in `nautobot_config.py` would break global cluster pods, which connect + to CNPG via local password-only auth. The SSL config is gated on the + `NAUTOBOT_DB_SSLMODE` env var -- global pods don't set it, so they + are unaffected. + +- **mtls-ca-cert secret contains a private key.** cert-manager + Certificate resources always produce `tls.crt`, `tls.key`, and + `ca.crt`. CNPG only reads `ca.crt` from the referenced secret, so + the extra fields are harmless but not ideal. 
A future improvement + could use cert-manager `trust-manager` Bundle to distribute only the + CA cert. + +- **ExternalSecret regex splitting is fragile.** If your external + secrets provider stores the CA cert and key concatenated in a single + field, the ExternalSecret template uses regex to split them. Changes + to the credential format can break the regex. + +- **Redis authClients affects all connections.** Redis + `authClients: true` requires ALL clients (including global Nautobot + pods) to present client certificates. The global Nautobot values must + mount the mTLS client cert into both the web server and celery pods, + not just site workers. + +- **pg_hba rule ordering matters.** The transitional `pg_hba` rules + (`hostssl ... cert` and `hostssl ... scram-sha-256` for remote) are + ordered so that cert-based auth is tried first. Sites without client + certs fall through to password-only over TLS. Once all sites have + mTLS deployed, the `scram-sha-256` remote rule should be removed. + +- **defaultMode 256 vs 0400.** The `defaultMode: 256` (octal 0400) on + the cert secret volume mount is correct but easy to get wrong. YAML + interprets `0400` as octal (decimal 256) -- writing `256` explicitly + avoids ambiguity. + +- **Client cert CN must match the DB user.** When using `pg_hba cert` + auth, PostgreSQL maps the client certificate CN to the database user. + The site worker client cert must use `commonName: app` to match the + CNPG app user. If the CN doesn't match, the connection is rejected + even with a valid cert. + +## Troubleshooting + +### Worker pod fails to start with FileNotFoundError + +The `nautobot_config.py` validates that cert files exist when +`NAUTOBOT_DB_SSLMODE` is `verify-ca` or `verify-full`. 
If the +`nautobot-mtls-client` secret doesn't exist or the volume mount is +misconfigured, the pod will crash with: + +```text +FileNotFoundError: SSL certificate file required by NAUTOBOT_DB_SSLCERT not found: /etc/nautobot/mtls/tls.crt +``` + +Check that: + +1. The `certificate-nautobot-mtls.yaml` Certificate resource exists and + is in `Ready` state: `kubectl get certificate -n nautobot` +2. The `nautobot-mtls-client` secret was created by cert-manager: + `kubectl get secret nautobot-mtls-client -n nautobot` +3. The `mtls-ca-key-pair` secret exists (needed by the Issuer): + `kubectl get secret mtls-ca-key-pair -n nautobot` +4. The `mtls-ca-issuer` Issuer is in `Ready` state: + `kubectl get issuer -n nautobot` + +### PostgreSQL rejects connection with "certificate verify failed" + +The client cert is not signed by the CA that CNPG trusts. Verify the +CA chain: + +```bash +# On the site cluster, check the client cert's issuer +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -noout -issuer + +# On the global cluster, check the CA cert that CNPG uses +kubectl get secret mtls-ca-cert -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -subject +``` + +The issuer of the client cert should match the subject of the CA cert. + +### PostgreSQL rejects with "no pg_hba.conf entry" + +The connection doesn't match any `pg_hba` rule. Common causes: + +- The client is connecting without TLS but the only matching rule + requires `hostssl` +- The client cert CN doesn't match the DB user (for `cert` auth) +- The source IP doesn't match any rule's CIDR + +### Redis connection refused with TLS error + +If Redis has `authClients: true` and the connecting pod doesn't present +a client cert, the TLS handshake fails. Ensure the pod has the mTLS +cert volume mounted and the Redis SSL env vars are set. 
+ +### Envoy Gateway not routing traffic + +If the gateway listener doesn't appear or traffic isn't reaching the +backend: + +```bash +# Check gateway status +kubectl get gateway -n envoy-gateway -o yaml + +# Check TLSRoute status +kubectl get tlsroute -n nautobot -o yaml +``` + +Verify the `fqdn` in the TLS route matches the SNI hostname the client +is connecting to. For PostgreSQL, the `nautobot.db.host` in the worker +values must match the `fqdn` in the envoy-configs route. diff --git a/mkdocs.yml b/mkdocs.yml index f2c0b2967..75760605d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -191,6 +191,7 @@ nav: - deploy-guide/components/nautobot-site.md - deploy-guide/components/nautobot.md - deploy-guide/components/nautobotop.md + - deploy-guide/components/nautobot-worker.md - deploy-guide/components/neutron.md - deploy-guide/components/nova.md - deploy-guide/components/octavia.md From 7a17317540479c222b29962115d404ad9a0c365e Mon Sep 17 00:00:00 2001 From: haseeb Date: Thu, 16 Apr 2026 18:35:21 +0530 Subject: [PATCH 2/9] set appLabels `UNDERSTACK_PARTITION`, to site job queues --- .../templates/application-nautobot-worker.yaml | 15 +++++++++++++++ components/nautobot-worker/kustomization.yaml | 10 ++++++++++ components/nautobot-worker/values.yaml | 2 ++ 3 files changed, 27 insertions(+) create mode 100644 components/nautobot-worker/kustomization.yaml diff --git a/charts/argocd-understack/templates/application-nautobot-worker.yaml b/charts/argocd-understack/templates/application-nautobot-worker.yaml index 36165b393..7253b9d83 100644 --- a/charts/argocd-understack/templates/application-nautobot-worker.yaml +++ b/charts/argocd-understack/templates/application-nautobot-worker.yaml @@ -8,6 +8,7 @@ metadata: - resources-finalizer.argocd.argoproj.io annotations: argocd.argoproj.io/compare-options: ServerSideDiff=true,IncludeMutationWebhook=true +{{- include "understack.appLabelsBlock" $ | nindent 2 }} spec: destination: namespace: nautobot @@ -24,6 +25,11 @@ spec: valueFiles: - 
$understack/components/nautobot-worker/values.yaml - $deploy/{{ include "understack.deploy_path" $ }}/nautobot-worker/values.yaml + {{- with index $.Values.appLabels "understack.rackspace.com/partition" }} + values: | + celery: + taskQueues: {{ . | quote }} + {{- end }} repoURL: https://nautobot.github.io/helm-charts/ targetRevision: 2.5.6 @@ -31,6 +37,15 @@ spec: ref: understack repoURL: {{ include "understack.understack_url" $ }} targetRevision: {{ include "understack.understack_ref" $ }} + kustomize: + patches: + - patch: | + - op: replace + path: /data/UNDERSTACK_PARTITION + value: "{{ index $.Values.appLabels "understack.rackspace.com/partition" | default "" }}" + target: + kind: ConfigMap + name: cluster-data - path: {{ include "understack.deploy_path" $ }}/nautobot-worker ref: deploy repoURL: {{ include "understack.deploy_url" $ }} diff --git a/components/nautobot-worker/kustomization.yaml b/components/nautobot-worker/kustomization.yaml new file mode 100644 index 000000000..79325fa8b --- /dev/null +++ b/components/nautobot-worker/kustomization.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - name: cluster-data + literals: + - UNDERSTACK_PARTITION="" + options: + disableNameSuffixHash: true diff --git a/components/nautobot-worker/values.yaml b/components/nautobot-worker/values.yaml index 2679e9c16..f2a8499a8 100644 --- a/components/nautobot-worker/values.yaml +++ b/components/nautobot-worker/values.yaml @@ -38,6 +38,8 @@ celery: enabled: true concurrency: 2 replicaCount: 1 + extraEnvVarsCM: + - cluster-data extraEnvVarsSecret: - nautobot-django livenessProbe: From e4bfc153c20c7ed262a65bef91464f11837da202 Mon Sep 17 00:00:00 2001 From: haseeb Date: Thu, 16 Apr 2026 20:16:29 +0530 Subject: [PATCH 3/9] refactoring --- components/nautobot-worker/values.yaml | 3 --- components/nautobot/nautobot_config.py | 8 ++++---- docs/deploy-guide/components/nautobot-worker.md | 2 ++ 3 files changed, 6 
insertions(+), 7 deletions(-) diff --git a/components/nautobot-worker/values.yaml b/components/nautobot-worker/values.yaml index f2a8499a8..13bf23982 100644 --- a/components/nautobot-worker/values.yaml +++ b/components/nautobot-worker/values.yaml @@ -66,6 +66,3 @@ postgresql: ingress: enabled: false - -metrics: - enabled: false diff --git a/components/nautobot/nautobot_config.py b/components/nautobot/nautobot_config.py index f4c15bbd1..b0906b282 100644 --- a/components/nautobot/nautobot_config.py +++ b/components/nautobot/nautobot_config.py @@ -64,7 +64,7 @@ if DATABASES["default"]["ENGINE"].endswith("mysql"): # noqa F405 DATABASES["default"]["OPTIONS"] = {"charset": "utf8mb4"} # noqa F405 -# SSL/mTLS options for PostgreSQL connections. +# mTLS options for PostgreSQL connections. # When NAUTOBOT_DB_SSLMODE is set to "verify-ca" or "verify-full", the client # certificate, key, and CA root cert must be present at the configured paths. _db_sslcert = os.getenv("NAUTOBOT_DB_SSLCERT", "/etc/nautobot/mtls/tls.crt") @@ -89,11 +89,11 @@ "sslrootcert": _db_sslrootcert, } -# SSL/mTLS options for Redis connections. +# mTLS options for Redis connections. # When NAUTOBOT_REDIS_SSL env var is "true" (set by Helm `nautobot.redis.ssl`), # the Helm chart switches the URL scheme to rediss://. We still need to tell # the Python redis client *which* certs to use for mutual TLS. 
-import ssl as _ssl # noqa: E402 +from ssl import CERT_REQUIRED # noqa: E402 _redis_ca = os.getenv("NAUTOBOT_REDIS_SSL_CA_CERTS", "/etc/nautobot/mtls/ca.crt") _redis_cert = os.getenv("NAUTOBOT_REDIS_SSL_CERTFILE", "/etc/nautobot/mtls/tls.crt") @@ -101,7 +101,7 @@ if os.path.isfile(_redis_ca): _redis_ssl_kwargs = { - "ssl_cert_reqs": _ssl.CERT_REQUIRED, + "ssl_cert_reqs": CERT_REQUIRED, "ssl_ca_certs": _redis_ca, "ssl_certfile": _redis_cert, "ssl_keyfile": _redis_key, diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md index 06a7e03df..c0cd644b7 100644 --- a/docs/deploy-guide/components/nautobot-worker.md +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -58,6 +58,8 @@ Site Cluster Global Cluster The worker pods mount a client certificate (issued by a dedicated internal CA via cert-manager) and present it during the TLS handshake. +See [Certificate Infrastructure](#certificate-infrastructure) for +details on the CA hierarchy and how certificates are provisioned. PostgreSQL and Redis on the global cluster verify the client certificate against the same CA before accepting the connection. From 7427547240024224e8fae5198ed1a851cc54e7e4 Mon Sep 17 00:00:00 2001 From: haseeb Date: Thu, 16 Apr 2026 20:46:44 +0530 Subject: [PATCH 4/9] issue client certificates on the global cluster and transfer only the issued cert+key to sites via the external secrets provider. 
--- .../application-nautobot-worker.yaml | 5 +- .../components/nautobot-worker.md | 302 ++++++++++++------ 2 files changed, 213 insertions(+), 94 deletions(-) diff --git a/charts/argocd-understack/templates/application-nautobot-worker.yaml b/charts/argocd-understack/templates/application-nautobot-worker.yaml index 7253b9d83..b7c4ba472 100644 --- a/charts/argocd-understack/templates/application-nautobot-worker.yaml +++ b/charts/argocd-understack/templates/application-nautobot-worker.yaml @@ -27,8 +27,9 @@ spec: - $deploy/{{ include "understack.deploy_path" $ }}/nautobot-worker/values.yaml {{- with index $.Values.appLabels "understack.rackspace.com/partition" }} values: | - celery: - taskQueues: {{ . | quote }} + workers: + default: + taskQueues: {{ . | quote }} {{- end }} repoURL: https://nautobot.github.io/helm-charts/ targetRevision: 2.5.6 diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md index c0cd644b7..c384d3ea6 100644 --- a/docs/deploy-guide/components/nautobot-worker.md +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -156,17 +156,26 @@ All resources live in the `nautobot` namespace. ### Site Clusters -Each site cluster needs: +Client certificates are issued on the global cluster by cert-manager +and distributed to site clusters through your external secrets provider. +The CA private key never leaves the global cluster -- a compromised +site cannot forge certificates for other sites. -1. The mTLS CA key pair distributed via your external secrets provider - (secret name: `mtls-ca-key-pair`) -2. An `mtls-ca-issuer` Issuer referencing that secret -3. A `nautobot-mtls-client` Certificate resource that cert-manager uses - to issue the client certificate (ECDSA P-256, 1yr duration, 30d - auto-renewal) +Each site needs two credentials from the secrets provider: -The client certificate is mounted into worker pods at -`/etc/nautobot/mtls/` containing `tls.crt`, `tls.key`, and `ca.crt`. 
+| Credential | Content | Scope | +|---|---|---| +| Client cert+key | The issued `tls.crt` and `tls.key` for this site | Per-site | +| CA public cert | The `ca.crt` from the mTLS CA | Shared across all sites | + +The ExternalSecret on the site cluster combines these into a single +`nautobot-mtls-client` secret (type `kubernetes.io/tls`) with `tls.crt`, +`tls.key`, and `ca.crt`. This secret is mounted into worker pods at +`/etc/nautobot/mtls/`. + +Note: if your secrets provider stores PEM data with `\r\n` line endings, +the ExternalSecret template must strip carriage returns +(`| replace "\r" ""`) or OpenSSL will fail to parse the certificates. ## Adding a New Site @@ -185,19 +194,74 @@ Before starting, ensure the global cluster already has: - Redis TLS enabled with `authClients: true` - Envoy Gateway TLS passthrough routes on ports 5432 and 6379 -You also need the mTLS CA key pair stored in your external secrets -provider so the site cluster can pull it. +You also need the pre-issued client certificate stored in your external +secrets provider (see Step 1). -### Step 1: Create the site directory +### Step 1: Issue the client certificate on the global cluster + +Create a cert-manager Certificate resource on the global cluster for +this site. The `commonName` must match the PostgreSQL database user +(typically `app`) because `pg_hba cert` maps the certificate CN to the +DB user. + +```yaml title="global-cluster/nautobot/certificate-nautobot-mtls-client-.yaml" +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nautobot-mtls-client- + namespace: nautobot +spec: + secretName: nautobot-mtls-client- + duration: 8760h # 1 year + renewBefore: 720h # 30 days + commonName: app + usages: + - client auth + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: mtls-ca-issuer + kind: Issuer +``` + +Add it to the global nautobot kustomization. After ArgoCD syncs, +cert-manager issues the certificate into a Kubernetes secret. 
+ +Then extract the cert material and upload it to your secrets provider +as two separate credentials: + +```bash +# Extract the client cert + key (per-site credential) +kubectl get secret nautobot-mtls-client- -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d > /tmp/tls.crt +kubectl get secret nautobot-mtls-client- -n nautobot \ + -o jsonpath='{.data.tls\.key}' | base64 -d > /tmp/tls.key + +# Upload to your secrets provider as a single credential with +# the cert and key concatenated in one field. + +# Extract the CA public cert (shared across all sites, one-time) +kubectl get secret mtls-ca-cert -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d > /tmp/ca.crt + +# Upload to your secrets provider as a separate credential. +# This only needs to be done once -- all sites share the same CA cert. +``` + +The CA private key stays in the `mtls-ca-key-pair` secret on the global +cluster and is never extracted or distributed. + +### Step 2: Create the site directory ```text /nautobot-worker/ ``` -### Step 2: Create ExternalSecrets for credentials +### Step 3: Create ExternalSecrets for credentials Create ExternalSecret resources that pull credentials from your secrets -provider into the `nautobot` namespace. You need four: +provider into the `nautobot` namespace. You need five: | ExternalSecret | Target Secret | Purpose | |---|---|---| @@ -205,68 +269,63 @@ provider into the `nautobot` namespace. 
You need four: | `externalsecret-nautobot-db.yaml` | `nautobot-db` | CNPG app user password (satisfies Helm chart requirement) | | `externalsecret-nautobot-worker-redis.yaml` | `nautobot-redis` | Redis password | | `externalsecret-dockerconfigjson-github-com.yaml` | `dockerconfigjson-github-com` | Container registry credentials | +| `externalsecret-nautobot-mtls-client.yaml` | `nautobot-mtls-client` | mTLS client cert + CA cert (two credentials combined) | -Each ExternalSecret should reference your `ClusterSecretStore` and map -the credential into the key format the Nautobot Helm chart expects. - -### Step 3: Create the mTLS CA key pair ExternalSecret - -Create `externalsecret-mtls-ca-key-pair.yaml` to distribute the mTLS CA -certificate and private key to this site cluster. The resulting secret -must be a `kubernetes.io/tls` type with these keys: +The mTLS ExternalSecret pulls from two separate credentials in your +secrets provider -- the per-site client cert+key and the shared CA +public cert -- and combines them into a single `kubernetes.io/tls` +secret with `tls.crt`, `tls.key`, and `ca.crt`. -| Key | Content | -|---|---| -| `tls.crt` | CA certificate (PEM) | -| `tls.key` | CA private key (PEM) | -| `ca.crt` | CA certificate (PEM, same as `tls.crt`) | - -cert-manager's CA Issuer reads `tls.crt` and `tls.key` from this secret -to sign client certificates. - -### Step 4: Create the cert-manager CA Issuer - -Create `issuer-mtls-ca-issuer.yaml`: +If both credentials have the same field name (e.g. `password`), use +`dataFrom` with `rewrite` to prefix the keys and avoid collision: ```yaml -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: mtls-ca-issuer - namespace: nautobot -spec: - ca: - secretName: mtls-ca-key-pair -``` - -### Step 5: Create the client certificate - -Create `certificate-nautobot-mtls.yaml`. The `commonName` must match the -PostgreSQL database user (typically `app`) because `pg_hba cert` maps -the certificate CN to the DB user. 
- -```yaml -apiVersion: cert-manager.io/v1 -kind: Certificate +apiVersion: external-secrets.io/v1 +kind: ExternalSecret metadata: name: nautobot-mtls-client - namespace: nautobot spec: - secretName: nautobot-mtls-client - duration: 8760h # 1 year - renewBefore: 720h # 30 days - commonName: app - usages: - - client auth - privateKey: - algorithm: ECDSA - size: 256 - issuerRef: - name: mtls-ca-issuer - kind: Issuer + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: + target: + creationPolicy: Owner + deletionPolicy: Retain + template: + engineVersion: v2 + type: kubernetes.io/tls + data: + tls.crt: >- + {{ .client_password + | regexFind "-----BEGIN CERTIFICATE-----[\\s\\S]*?-----END CERTIFICATE-----" + | replace "\r" "" }} + tls.key: >- + {{ .client_password + | regexFind "-----BEGIN EC PRIVATE KEY-----[\\s\\S]*?-----END EC PRIVATE KEY-----" + | replace "\r" "" }} + ca.crt: >- + {{ .ca_password | replace "\r" "" }} + dataFrom: + - extract: + key: "" + rewrite: + - regexp: + source: "(.*)" + target: "client_$1" + - extract: + key: "" + rewrite: + - regexp: + source: "(.*)" + target: "ca_$1" ``` -### Step 6: Create the kustomization +The `replace "\r" ""` strips carriage returns that some secrets +providers add to PEM data. Without this, OpenSSL will fail to parse +the certificates. + +### Step 4: Create the kustomization Create `kustomization.yaml` listing all resources: @@ -278,12 +337,10 @@ resources: - externalsecret-nautobot-db.yaml - externalsecret-nautobot-worker-redis.yaml - externalsecret-dockerconfigjson-github-com.yaml - - externalsecret-mtls-ca-key-pair.yaml - - issuer-mtls-ca-issuer.yaml - - certificate-nautobot-mtls.yaml + - externalsecret-nautobot-mtls-client.yaml ``` -### Step 7: Create the values file +### Step 5: Create the values file Create `values.yaml` with the site-specific overrides. 
Replace `` with your environment identifier and `` with @@ -335,7 +392,7 @@ celery: readOnly: true ``` -### Step 8: Enable in deploy.yaml +### Step 6: Enable in deploy.yaml Add `nautobot_worker` to the site's `deploy.yaml`: @@ -345,13 +402,13 @@ site: enabled: true ``` -### Step 9: Verify +### Step 7: Verify After ArgoCD syncs, verify the worker is running and connected: ```bash -# Check the certificate was issued -kubectl get certificate nautobot-mtls-client -n nautobot +# Check the client cert secret was pulled from the secrets provider +kubectl get secret nautobot-mtls-client -n nautobot # Check the worker pod is running kubectl get pods -n nautobot -l app.kubernetes.io/component=nautobot-celery @@ -364,17 +421,48 @@ kubectl logs -n nautobot -l app.kubernetes.io/component=nautobot-celery --tail=5 ```text /nautobot-worker/ - certificate-nautobot-mtls.yaml externalsecret-dockerconfigjson-github-com.yaml - externalsecret-mtls-ca-key-pair.yaml externalsecret-nautobot-db.yaml externalsecret-nautobot-django.yaml + externalsecret-nautobot-mtls-client.yaml externalsecret-nautobot-worker-redis.yaml - issuer-mtls-ca-issuer.yaml kustomization.yaml values.yaml ``` +## Certificate Renewal + +Client certificates have a 1-year duration with 30-day auto-renewal by +cert-manager on the global cluster. When cert-manager renews a +certificate, the updated cert+key must be re-uploaded to your secrets +provider and the site ExternalSecret will pick it up on its next +refresh cycle. + +This is a manual process by default. Approaches to automate it: + +- **PushSecret (External Secrets Operator):** Use a + [PushSecret](https://external-secrets.io/latest/guides/pushsecrets/) + resource on the global cluster to automatically push the renewed cert + to your secrets provider whenever the Kubernetes secret changes. This + is event-driven and requires no CronJob. 
+ +- **CronJob on the global cluster:** A Kubernetes CronJob that runs + periodically, reads the cert secret, and pushes it to your secrets + provider via its API. + +- **Cross-cluster secret replication:** Use a tool like + [Kubernetes Replicator](https://github.com/mittwald/kubernetes-replicator) + to copy the cert secret directly from the global cluster to site + clusters, bypassing the secrets provider entirely. + +- **CertificateRequest from site clusters:** The site cluster creates a + cert-manager + [CertificateRequest](https://cert-manager.io/docs/usage/certificaterequest/), + an operator on the global cluster approves and signs it, and the + signed cert is returned. This is similar to how kubelet certificate + management works in Kubernetes. Most complex to set up but fully + automated with no intermediate secrets provider. + ## Environment Variable Reference | Variable | Where Set | Purpose | @@ -412,9 +500,10 @@ kubectl logs -n nautobot -l app.kubernetes.io/component=nautobot-celery --tail=5 certificate CN (e.g. `app`) maps directly to the PostgreSQL user, so no additional user mapping configuration is needed. -- The CA key pair is distributed to site clusters via the external - secrets provider, following the existing credential distribution - pattern. +- Client certificates are issued on the global cluster by cert-manager + and distributed to site clusters via the external secrets provider. + The CA private key never leaves the global cluster, so a compromised + site cannot forge certificates for other sites. - The `nautobot_config.py` SSL logic is conditional on `NAUTOBOT_DB_SSLMODE`, so the same config file works for both global @@ -447,10 +536,25 @@ kubectl logs -n nautobot -l app.kubernetes.io/component=nautobot-celery --tail=5 could use cert-manager `trust-manager` Bundle to distribute only the CA cert. 
-- **ExternalSecret regex splitting is fragile.** If your external - secrets provider stores the CA cert and key concatenated in a single - field, the ExternalSecret template uses regex to split them. Changes - to the credential format can break the regex. +- **ca.crt must be the CA cert, not the client cert.** The `ca.crt` + field in the `nautobot-mtls-client` secret must contain the mTLS CA + certificate (`CN=understack-mtls-ca`), not the client certificate. + If `ca.crt` contains the client cert, the worker will fail with + `[SSL: CERTIFICATE_VERIFY_FAILED] self-signed certificate in + certificate chain` because it can't verify the server's cert chain. + The CA cert credential in your secrets provider is shared across all + sites and only needs to be created once. + +- **PEM data with carriage returns.** Some secrets providers store text + with `\r\n` line endings. PEM certificates with `\r` characters will + fail OpenSSL parsing with `[SSL] PEM lib`. The ExternalSecret template + must strip carriage returns using `| replace "\r" ""`. + +- **ExternalSecret format depends on your secrets provider.** The + ExternalSecret for the mTLS client cert on site clusters must produce + a `kubernetes.io/tls` secret with `tls.crt`, `tls.key`, and `ca.crt`. + How you template this depends on how your secrets provider stores the + credential. - **Redis authClients affects all connections.** Redis `authClients: true` requires ALL clients (including global Nautobot @@ -490,14 +594,13 @@ FileNotFoundError: SSL certificate file required by NAUTOBOT_DB_SSLCERT not foun Check that: -1. The `certificate-nautobot-mtls.yaml` Certificate resource exists and - is in `Ready` state: `kubectl get certificate -n nautobot` -2. The `nautobot-mtls-client` secret was created by cert-manager: +1. The `nautobot-mtls-client` secret exists on the site cluster: `kubectl get secret nautobot-mtls-client -n nautobot` -3. 
The `mtls-ca-key-pair` secret exists (needed by the Issuer): - `kubectl get secret mtls-ca-key-pair -n nautobot` -4. The `mtls-ca-issuer` Issuer is in `Ready` state: - `kubectl get issuer -n nautobot` +2. The ExternalSecret is syncing successfully: + `kubectl get externalsecret nautobot-mtls-client -n nautobot` +3. The secret contains `tls.crt`, `tls.key`, and `ca.crt` keys +4. On the global cluster, verify the source certificate is issued: + `kubectl get certificate -n nautobot | grep mtls-client` ### PostgreSQL rejects connection with "certificate verify failed" @@ -525,6 +628,21 @@ The connection doesn't match any `pg_hba` rule. Common causes: - The client cert CN doesn't match the DB user (for `cert` auth) - The source IP doesn't match any rule's CIDR +### Redis connection refused with "certificate verify failed" + +The `ca.crt` mounted in the pod is not the CA that signed the Redis +server certificate. Verify: + +```bash +# Should show CN=understack-mtls-ca (the CA), NOT CN=app (the client cert) +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -subject +``` + +If it shows the client cert CN, the CA cert credential in your secrets +provider has the wrong content. Update it with the actual CA certificate +from the global cluster's `mtls-ca-cert` secret. 
+ ### Redis connection refused with TLS error If Redis has `authClients: true` and the connecting pod doesn't present From 2e9443d80b124f453fe38f90442b5e570d81bac8 Mon Sep 17 00:00:00 2001 From: haseeb Date: Fri, 17 Apr 2026 08:39:46 +0530 Subject: [PATCH 5/9] site specific non default queues --- .../application-nautobot-worker.yaml | 3 + .../components/nautobot-worker.md | 9 + docs/operator-guide/nautobot-celery-queues.md | 260 ++++++++++++++++++ mkdocs.yml | 1 + 4 files changed, 273 insertions(+) create mode 100644 docs/operator-guide/nautobot-celery-queues.md diff --git a/charts/argocd-understack/templates/application-nautobot-worker.yaml b/charts/argocd-understack/templates/application-nautobot-worker.yaml index b7c4ba472..652693bfa 100644 --- a/charts/argocd-understack/templates/application-nautobot-worker.yaml +++ b/charts/argocd-understack/templates/application-nautobot-worker.yaml @@ -29,6 +29,9 @@ spec: values: | workers: default: + enabled: false + {{ . }}: + enabled: true taskQueues: {{ . | quote }} {{- end }} repoURL: https://nautobot.github.io/helm-charts/ diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md index c384d3ea6..8cad0c401 100644 --- a/docs/deploy-guide/components/nautobot-worker.md +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -20,6 +20,11 @@ application. The web server, Redis, and PostgreSQL all remain on the global cluster -- site workers connect back to those shared services over the network. +For details on how Celery task queues are configured per site and how to +route jobs to site-specific workers, see the +[Nautobot Celery Queues](../../operator-guide/nautobot-celery-queues.md) +operator guide. + ## Deployment Scope - Cluster scope: site @@ -279,6 +284,8 @@ secret with `tls.crt`, `tls.key`, and `ca.crt`. If both credentials have the same field name (e.g. 
`password`), use `dataFrom` with `rewrite` to prefix the keys and avoid collision: +{% raw %} + ```yaml apiVersion: external-secrets.io/v1 kind: ExternalSecret @@ -321,6 +328,8 @@ spec: target: "ca_$1" ``` +{% endraw %} + The `replace "\r" ""` strips carriage returns that some secrets providers add to PEM data. Without this, OpenSSL will fail to parse the certificates. diff --git a/docs/operator-guide/nautobot-celery-queues.md b/docs/operator-guide/nautobot-celery-queues.md new file mode 100644 index 000000000..f9f6fa763 --- /dev/null +++ b/docs/operator-guide/nautobot-celery-queues.md @@ -0,0 +1,260 @@ +# Nautobot Celery Queues + +This guide covers how Celery task queues work in the understack +nautobot-worker deployment, how the queue name is derived from the +site partition, and how to route jobs to site-specific queues +programmatically. + +## How the Queue Name is Set + +The ArgoCD Application template for `nautobot-worker` automatically +sets the Celery queue name to match the site's partition label +(`understack.rackspace.com/partition`). The relevant section in +`application-nautobot-worker.yaml`: + +{% raw %} + +```yaml +{{- with index $.Values.appLabels "understack.rackspace.com/partition" }} +values: | + workers: + default: + enabled: false + {{ . }}: + enabled: true + taskQueues: {{ . | quote }} +{{- end }} +``` + +{% endraw %} + +For a site with partition `rax-dev`, this renders as: + +```yaml +workers: + default: + enabled: false + rax-dev: + enabled: true + taskQueues: "rax-dev" +``` + +This produces a Deployment named `nautobot-worker-celery-rax-dev` with +the label `app.kubernetes.io/component: nautobot-celery-rax-dev` and +the environment variable `CELERY_TASK_QUEUES=rax-dev`. + +### Why workers.default must be disabled + +The upstream Nautobot Helm chart defines `workers.default.taskQueues: +"default"` in its own `values.yaml`. The chart's `nautobot.workers` +helper merges worker-specific values on top of the `celery` defaults. 
+If you only set `celery.taskQueues`, the chart's `workers.default` +overrides it because worker-level values take precedence. Disabling +`workers.default` and creating a new worker key avoids this conflict. + +## Nautobot JobQueue Setup + +Before any job can be dispatched to a site queue, a `JobQueue` record +must exist in Nautobot's database. Without it, the API rejects the +request with a validation error. + +### Create via the UI + +Navigate to Jobs > Job Queues > Add and create a queue with: + +- Name: `rax-dev` (must match the worker's `taskQueues` value) +- Queue Type: `celery` + +### Create via the REST API + +```bash +curl -X POST \ + -H "Authorization: Token $TOKEN" \ + -H "Content-Type: application/json" \ + https://nautobot.example.com/api/extras/job-queues/ \ + --data '{"name": "rax-dev", "queue_type": "celery"}' +``` + +### Create via pynautobot + +```python +import pynautobot + +nb = pynautobot.api("https://nautobot.example.com", token="your-token") +nb.extras.job_queues.create(name="rax-dev", queue_type="celery") +``` + +### Automate via Ansible + +The `ansible/roles/jobs/tasks/main.yml` role enables Rackspace jobs +but does not currently create JobQueues. You can extend it: + +{% raw %} + +```yaml +- name: "Ensure partition JobQueue exists" + ansible.builtin.uri: + url: "{{ nautobot_url }}/api/extras/job-queues/" + method: POST + headers: + Authorization: "Token {{ nautobot_token }}" + body_format: json + body: + name: "{{ partition }}" + queue_type: "celery" + status_code: [200, 201, 400] +``` + +{% endraw %} + +## Assigning Jobs to Queues + +A job must list the queue in its allowed queues before it can be +dispatched there. There are three ways to do this. + +### Option 1: In the Job class (code) + +Set `task_queues` in the Job's Meta class. This is baked into the +job's source code and applies everywhere the job is installed. 
+ +```python +from nautobot.apps.jobs import Job + +class SyncSiteConfig(Job): + class Meta: + name = "Sync Site Config" + task_queues = ["rax-dev", "default"] +``` + +### Option 2: Via the Nautobot UI + +Navigate to Jobs > Jobs, select the job, click Edit, and add the +desired JobQueue(s) under the Job Queues field. Check "Override +job queues" to use the UI-configured queues instead of the ones +defined in code. + +### Option 3: Via the REST API + +```bash +curl -X PATCH \ + -H "Authorization: Token $TOKEN" \ + -H "Content-Type: application/json" \ + https://nautobot.example.com/api/extras/jobs/$JOB_ID/ \ + --data '{ + "job_queues": [{"name": "rax-dev"}, {"name": "default"}], + "job_queues_override": true + }' +``` + +## Running Jobs on a Specific Queue + +### Via pynautobot + +```python +import pynautobot + +nb = pynautobot.api("https://nautobot.example.com", token="your-token") + +job = nb.extras.jobs.get(name="my_app.jobs.SyncSiteConfig") + +# Run on the rax-dev site worker +result = job.run(data={"device": "server-01"}, task_queue="rax-dev") +``` + +The `task_queue` parameter (or `job_queue` -- both are accepted in +Nautobot 2.4+) tells Nautobot to dispatch the Celery task to the +specified queue. The site worker listening on that queue picks it up. + +### Via the REST API + +```bash +curl -X POST \ + -H "Authorization: Token $TOKEN" \ + -H "Content-Type: application/json" \ + https://nautobot.example.com/api/extras/jobs/$JOB_ID/run/ \ + --data '{ + "data": {"device": "server-01"}, + "task_queue": "rax-dev" + }' +``` + +### Via the Nautobot UI + +When running a job from the web UI, if the job has multiple queues +configured, a dropdown appears allowing you to select the target +queue before clicking "Run Job". + +### Default behavior + +If `task_queue` is not specified, Nautobot dispatches the job to the +job's `default_job_queue`. If no default is configured, it falls back +to `CELERY_TASK_DEFAULT_QUEUE` (typically `"default"`). 
+ +## Validation + +Nautobot validates two things before accepting a job run request: + +1. The requested queue must be in the job's allowed queues list. + If not, the API returns: + `{"task_queue": ["\"rax-dev\" is not a valid choice."]}` + +2. At least one Celery worker must be actively listening on the + requested queue. If no worker is found, the API returns a + `CeleryWorkerNotRunningException`. This check uses Celery's + `inspect` to count active workers on the queue. + +## Verifying Workers are Listening + +To confirm a site worker is consuming from the correct queue: + +```bash +# Check the CELERY_TASK_QUEUES env var in the running pod +kubectl get deploy -n nautobot \ + -l app.kubernetes.io/component=nautobot-celery-rax-dev \ + -o jsonpath='{.items[0].spec.template.spec.containers[0].env}' \ + | python3 -m json.tool | grep -A1 CELERY_TASK_QUEUES + +# Check worker logs for the queue binding +kubectl logs -n nautobot \ + -l app.kubernetes.io/component=nautobot-celery-rax-dev \ + --tail=20 | grep "ready" +``` + +## Multiple Sites + +Each site gets its own queue named after its partition. For example: + +| Site | Partition | Queue Name | Deployment | +|---|---|---|---| +| DC1 Staging | dc1-staging | dc1-staging | nautobot-worker-celery-dc1-staging | +| DC1 Prod | dc1-prod | dc1-prod | nautobot-worker-celery-dc1-prod | +| DC2 Prod | dc2-prod | dc2-prod | nautobot-worker-celery-dc2-prod | +| DC3 Prod | dc3-prod | dc3-prod | nautobot-worker-celery-dc3-prod | + +Each site's worker only processes tasks from its own queue. The global +Nautobot instance dispatches jobs to the appropriate queue based on the +`task_queue` parameter in the API call. + +## Troubleshooting + +### "is not a valid choice" when running a job + +The job does not have the requested queue in its allowed queues. 
Either: + +- Add the queue to the job's `task_queues` in code, or +- Add the JobQueue to the job via the UI/API with `job_queues_override: true` + +### CeleryWorkerNotRunningException + +No worker is listening on the requested queue. Check: + +- The site's nautobot-worker ArgoCD Application is synced and healthy +- The worker pod is running: `kubectl get pods -n nautobot -l app.kubernetes.io/component=nautobot-celery-` +- The `CELERY_TASK_QUEUES` env var matches the queue name + +### Job runs but nothing happens + +The job was dispatched to a queue that no worker is consuming. This +can happen if `task_queue` was not specified and the job defaulted to +`"default"`, but the site worker is listening on `"rax-dev"`. Always +pass `task_queue` explicitly when targeting a site worker. diff --git a/mkdocs.yml b/mkdocs.yml index 75760605d..aec09ea02 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -241,6 +241,7 @@ nav: - operator-guide/rook-ceph.md - operator-guide/nautobot.md - operator-guide/nautobotop.md + - operator-guide/nautobot-celery-queues.md - operator-guide/troubleshooting-osh.md - operator-guide/logging.md - operator-guide/ansible-local-usage.md From 805b776a4de4408ac39db27bb4224aacdc00b7f8 Mon Sep 17 00:00:00 2001 From: haseeb Date: Mon, 20 Apr 2026 17:31:37 +0530 Subject: [PATCH 6/9] NAUTOBOT_EXTRA_PLUGINS and NAUTOBOT_EXTRA_PLUGINS_CONFIG --- components/nautobot/nautobot_config.py | 104 +++++++- .../components/nautobot-worker.md | 90 ++----- docs/deploy-guide/components/nautobot.md | 80 ++++++ .../nautobot-mtls-certificate-renewal.md | 126 ++++++++++ docs/operator-guide/nautobot.md | 234 ++++++++++++++++++ mkdocs.yml | 1 + 6 files changed, 561 insertions(+), 74 deletions(-) create mode 100644 docs/operator-guide/nautobot-mtls-certificate-renewal.md diff --git a/components/nautobot/nautobot_config.py b/components/nautobot/nautobot_config.py index b0906b282..10b6b2b00 100644 --- a/components/nautobot/nautobot_config.py +++ b/components/nautobot/nautobot_config.py 
@@ -1,4 +1,7 @@ +import json as _json import os +import re as _re +from ssl import CERT_REQUIRED from nautobot.core.settings import * # noqa F401,F403 from nautobot.core.settings_funcs import is_truthy @@ -93,8 +96,6 @@ # When NAUTOBOT_REDIS_SSL env var is "true" (set by Helm `nautobot.redis.ssl`), # the Helm chart switches the URL scheme to rediss://. We still need to tell # the Python redis client *which* certs to use for mutual TLS. -from ssl import CERT_REQUIRED # noqa: E402 - _redis_ca = os.getenv("NAUTOBOT_REDIS_SSL_CA_CERTS", "/etc/nautobot/mtls/ca.crt") _redis_cert = os.getenv("NAUTOBOT_REDIS_SSL_CERTFILE", "/etc/nautobot/mtls/tls.crt") _redis_key = os.getenv("NAUTOBOT_REDIS_SSL_KEYFILE", "/etc/nautobot/mtls/tls.key") @@ -401,6 +402,11 @@ os.getenv("NAUTOBOT_INSTALLATION_METRICS_ENABLED", "True") ) +# Partition identifier used by computed fields (e.g. device URN generation). +# Populated from the cluster-data ConfigMap which is patched by ArgoCD from +# the appLabels["understack.rackspace.com/partition"] value. +UNDERSTACK_PARTITION = os.environ.get("UNDERSTACK_PARTITION", "") + # Storage backend to use for Job input files and Job output files. # # Note: the default is for backwards compatibility and it is recommended to change it if possible for your deployment. @@ -460,8 +466,32 @@ # PER_PAGE_DEFAULTS = [25, 50, 100, 250, 500, 1000] # Enable installed plugins. Add the name of each plugin to the list. -# -# PLUGINS = [] +# Use try/except to only load plugins that are installed in this container, +# since different deployments may have different plugin sets. +# +PLUGINS = [] +for _plugin_name in [ + "nautobot_plugin_nornir", + "nautobot_golden_config", +]: + try: + __import__(_plugin_name) + PLUGINS.append(_plugin_name) + except ImportError: + pass + +# Allow additional plugins to be specified via the NAUTOBOT_EXTRA_PLUGINS +# environment variable (comma-separated list of plugin module names). 
+# This lets private deployments add their own plugins without modifying +# this file. +_extra_plugins = os.getenv("NAUTOBOT_EXTRA_PLUGINS", "") +for _plugin_name in (p.strip() for p in _extra_plugins.split(",") if p.strip()): + try: + __import__(_plugin_name) + if _plugin_name not in PLUGINS: + PLUGINS.append(_plugin_name) + except ImportError: + pass # Plugins configuration settings. These settings are used by various plugins that the user may have installed. # Each key in the dictionary is the name of an installed plugin and its value is a dictionary of settings. @@ -472,13 +502,67 @@ # 'buzz': 'bazz' # } # } -PLUGINS_CONFIG = { - "vni_custom_model": { - "FORCE_UNIQUE_VLANS": is_truthy( - os.getenv("VNI_CUSTOM_MODEL_FORCE_UNIQUE_VLANS", "false") - ) +PLUGINS_CONFIG = {} + +# Configuration for open-source plugins (only applied when the plugin is loaded). +if "nautobot_plugin_nornir" in PLUGINS: + PLUGINS_CONFIG["nautobot_plugin_nornir"] = { + "nornir_settings": { + "credentials": "nautobot_plugin_nornir.plugins.credentials.nautobot_secrets.CredentialsNautobotSecrets", + "runner": { + "plugin": "threaded", + "options": { + "num_workers": 20, + }, + }, + }, + "use_config_context": { + "connection_options": True, + }, + } + +if "nautobot_golden_config" in PLUGINS: + PLUGINS_CONFIG["nautobot_golden_config"] = { + "per_feature_bar_width": 0.15, + "per_feature_width": 13, + "per_feature_height": 4, + "enable_backup": True, + "enable_compliance": True, + "enable_intended": True, + "enable_sotagg": True, + "sot_agg_transposer": None, + "enable_postprocessing": True, + "postprocessing_callables": [], + "postprocessing_subscribed": [], + "platform_slug_map": None, } -} + + +# Allow plugin configuration via the NAUTOBOT_EXTRA_PLUGINS_CONFIG environment +# variable. Value must be a JSON object whose keys are plugin names and values +# are config dicts. Supports ${ENV_VAR} syntax for referencing environment +# variables in string values (useful for secrets). 
+def _interpolate_env(obj): + """Recursively replace ${VAR} patterns with environment variable values.""" + if isinstance(obj, str): + return _re.sub( + r"\$\{(\w+)\}", + lambda m: os.environ.get(m.group(1), ""), + obj, + ) + if isinstance(obj, dict): + return {k: _interpolate_env(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_interpolate_env(v) for v in obj] + return obj + + +_extra_cfg = os.getenv("NAUTOBOT_EXTRA_PLUGINS_CONFIG", "") +if _extra_cfg: + try: + PLUGINS_CONFIG.update(_interpolate_env(_json.loads(_extra_cfg))) + except (ValueError, TypeError): + pass # Prefer IPv6 addresses or IPv4 addresses in selecting a device's primary IP address? # diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md index 8cad0c401..24ee4bb00 100644 --- a/docs/deploy-guide/components/nautobot-worker.md +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -78,6 +78,15 @@ that even with a leaked password, connections without a valid client certificate are rejected. Traffic is encrypted end-to-end between the worker pod and the server. +## Plugin Loading + +The shared `nautobot_config.py` supports a generic plugin loading +mechanism described in the +[Nautobot Plugin Loading](../../operator-guide/nautobot.md#plugin-loading) +operator guide. Site workers use the same mechanism -- open-source +plugins are loaded automatically, and additional plugins can be added +via the `NAUTOBOT_EXTRA_PLUGINS` environment variable. + ## Connection Security ### PostgreSQL (CNPG) @@ -118,22 +127,10 @@ The CNPG `pg_hba` rules are evaluated top-to-bottom: ### Redis -The global Redis instance has TLS enabled with `authClients: true` -(Bitnami Redis subchart), requiring client certificates from all -connections -- including local pods on the global cluster. - -The `nautobot_config.py` Redis mTLS logic checks if the CA cert file -exists at the default path (`/etc/nautobot/mtls/ca.crt`). 
If present, -it configures `ssl_cert_reqs`, `ssl_ca_certs`, `ssl_certfile`, and -`ssl_keyfile` on the Redis connection pool, Celery broker, and Celery -result backend. Both global and site pods automatically pick up Redis -mTLS when the cert volume is mounted. - -Because `authClients: true` applies to all connections (Redis has no -equivalent of `pg_hba` to distinguish local vs remote), a -`nautobot-mtls-client` Certificate resource is also deployed on the -global cluster so that local Nautobot web and Celery pods can present -a valid client cert. +The global Redis mTLS configuration is described in the +[global nautobot deploy guide](nautobot.md#redis-mtls). Site workers +use the same auto-detection mechanism -- when the mTLS cert volume is +mounted, Redis SSL is configured automatically. ### Envoy Gateway @@ -145,19 +142,8 @@ SNI hostname without terminating TLS, preserving end-to-end mTLS. ### Global Cluster -The global cluster hosts the mTLS CA hierarchy (managed by cert-manager): - -| Resource | Kind | Purpose | -|---|---|---| -| `mtls-selfsigned` | Issuer | Bootstraps the self-signed root | -| `mtls-ca` | Certificate | Root CA (ECDSA P-256, 10yr duration, 1yr renewBefore) | -| `mtls-ca-issuer` | Issuer | Signs all client and server certificates | -| `mtls-ca-cert` | Certificate | CA public cert secret used by CNPG and Redis for client verification | -| `nautobot-cluster-server-tls` | Certificate | PostgreSQL server certificate | -| `nautobot-redis-server-tls` | Certificate | Redis server certificate | -| `nautobot-mtls-client` | Certificate | Client certificate for global Nautobot/Celery pods (needed because Redis `authClients: true` applies to all connections) | - -All resources live in the `nautobot` namespace. +The global cluster hosts the mTLS CA hierarchy described in the +[global nautobot deploy guide](nautobot.md#mtls-certificate-infrastructure). 
### Site Clusters @@ -217,8 +203,8 @@ metadata: namespace: nautobot spec: secretName: nautobot-mtls-client- - duration: 8760h # 1 year - renewBefore: 720h # 30 days + duration: 26280h # 3 years + renewBefore: 2160h # 90 days commonName: app usages: - client auth @@ -374,8 +360,8 @@ celery: extraEnvVars: - name: NAUTOBOT_CONFIG value: /opt/nautobot/nautobot_config.py - - name: UC_PARTITION - value: + - name: NAUTOBOT_EXTRA_PLUGINS + value: '' - name: NAUTOBOT_DB_SSLMODE value: verify-ca - name: NAUTOBOT_REDIS_SSL_CERT_REQS @@ -441,36 +427,10 @@ kubectl logs -n nautobot -l app.kubernetes.io/component=nautobot-celery --tail=5 ## Certificate Renewal -Client certificates have a 1-year duration with 30-day auto-renewal by -cert-manager on the global cluster. When cert-manager renews a -certificate, the updated cert+key must be re-uploaded to your secrets -provider and the site ExternalSecret will pick it up on its next -refresh cycle. - -This is a manual process by default. Approaches to automate it: - -- **PushSecret (External Secrets Operator):** Use a - [PushSecret](https://external-secrets.io/latest/guides/pushsecrets/) - resource on the global cluster to automatically push the renewed cert - to your secrets provider whenever the Kubernetes secret changes. This - is event-driven and requires no CronJob. - -- **CronJob on the global cluster:** A Kubernetes CronJob that runs - periodically, reads the cert secret, and pushes it to your secrets - provider via its API. - -- **Cross-cluster secret replication:** Use a tool like - [Kubernetes Replicator](https://github.com/mittwald/kubernetes-replicator) - to copy the cert secret directly from the global cluster to site - clusters, bypassing the secrets provider entirely. 
- -- **CertificateRequest from site clusters:** The site cluster creates a - cert-manager - [CertificateRequest](https://cert-manager.io/docs/usage/certificaterequest/), - an operator on the global cluster approves and signs it, and the - signed cert is returned. This is similar to how kubelet certificate - management works in Kubernetes. Most complex to set up but fully - automated with no intermediate secrets provider. +For details on how mTLS client certificates are renewed and distributed +to site clusters, see the +[mTLS Certificate Renewal](../../operator-guide/nautobot-mtls-certificate-renewal.md) +operator guide. ## Environment Variable Reference @@ -487,7 +447,9 @@ This is a manual process by default. Approaches to automate it: | `SSL_CERT_FILE` | Site worker values | System-wide CA bundle override for outbound HTTPS | | `REQUESTS_CA_BUNDLE` | Site worker values | Python requests library CA bundle override | | `NAUTOBOT_CONFIG` | Both global and site | Path to `nautobot_config.py` | -| `UC_PARTITION` | Site worker values | Site partition identifier for Celery task routing | +| `NAUTOBOT_EXTRA_PLUGINS` | Both global and site values | Comma-separated list of additional plugin module names to load (beyond the open-source defaults). Plugins are loaded only if installed in the container. | +| `NAUTOBOT_EXTRA_PLUGINS_CONFIG` | Both global and site values | JSON object with plugin configuration. Supports `${ENV_VAR}` syntax for referencing environment variables in string values (useful for secrets). Merged into `PLUGINS_CONFIG`. | +| `UNDERSTACK_PARTITION` | `cluster-data` ConfigMap (patched by ArgoCD from `appLabels`) | Site partition identifier used by computed fields (e.g. device URN generation). Exposed as a Django setting. 
| ## Design Decisions diff --git a/docs/deploy-guide/components/nautobot.md b/docs/deploy-guide/components/nautobot.md index 4a6b78a7e..c0f84f53d 100644 --- a/docs/deploy-guide/components/nautobot.md +++ b/docs/deploy-guide/components/nautobot.md @@ -34,6 +34,71 @@ global: enabled: true ``` +## Configuration Architecture + +The `nautobot_config.py` file is managed in git at +`components/nautobot/nautobot_config.py` and injected into pods via the +Helm chart's `fileParameters` feature. ArgoCD reads the file, the Helm +chart creates a ConfigMap, and pods mount it at +`/opt/nautobot/nautobot_config.py`. The `NAUTOBOT_CONFIG` environment +variable tells Nautobot to load from that path. + +The effective configuration is built from four layers: Nautobot defaults, +the component config, Helm chart env vars from the base values, and +deploy repo value overrides. + +For the full details on how `fileParameters` works, why the baked-in +image config is not used, config layering, and the Helm list replacement +gotcha, see the +[Configuration Architecture](../../operator-guide/nautobot.md#configuration-architecture) +operator guide. + +## Plugin Loading + +For details on how plugins are loaded, configured via environment +variables, and how to add custom plugins, see the +[Plugin Loading](../../operator-guide/nautobot.md#plugin-loading) +operator guide. 
+ +## mTLS Certificate Infrastructure + +The global cluster hosts the mTLS CA hierarchy (managed by cert-manager) +used by both the global Nautobot deployment and site-level workers: + +| Resource | Kind | Purpose | +|---|---|---| +| `mtls-selfsigned` | Issuer | Bootstraps the self-signed root | +| `mtls-ca` | Certificate | Root CA (ECDSA P-256, 10yr duration, 1yr renewBefore) | +| `mtls-ca-issuer` | Issuer | Signs all client and server certificates | +| `mtls-ca-cert` | Certificate | CA public cert secret used by CNPG and Redis for client verification | +| `nautobot-cluster-server-tls` | Certificate | PostgreSQL server certificate | +| `nautobot-redis-server-tls` | Certificate | Redis server certificate | +| `nautobot-mtls-client` | Certificate | Client certificate for global Nautobot/Celery pods (needed because Redis `authClients: true` applies to all connections) | + +All resources live in the `nautobot` namespace. + +For certificate renewal and distribution to site clusters, see the +[mTLS Certificate Renewal](../../operator-guide/nautobot-mtls-certificate-renewal.md) +operator guide. + +## Redis mTLS + +The global Redis instance has TLS enabled with `authClients: true` +(Bitnami Redis subchart), requiring client certificates from all +connections -- including local pods on the global cluster. + +The `nautobot_config.py` Redis mTLS logic checks if the CA cert file +exists at the default path (`/etc/nautobot/mtls/ca.crt`). If present, +it configures `ssl_cert_reqs`, `ssl_ca_certs`, `ssl_certfile`, and +`ssl_keyfile` on the Redis connection pool, Celery broker, and Celery +result backend. Both global and site pods automatically pick up Redis +mTLS when the cert volume is mounted. + +Because `authClients: true` applies to all connections (Redis has no +equivalent of `pg_hba` to distinguish local vs remote), the global +Nautobot deploy values must mount the `nautobot-mtls-client` cert into +both the web server and celery pods. 
+ ## Deployment Repo Content {{ secrets_disclaimer }} @@ -53,3 +118,18 @@ Optional additions: - `nautobot-custom-env` Secret: Add any extra environment variables the deployment should inject into Nautobot, such as integration credentials or DSNs. - `Database cluster and backup manifests`: Add a CloudNativePG cluster, backup schedule, or similar database resources if this deployment owns its own PostgreSQL cluster. - `Catalog and bootstrap content`: Add app definitions, device types, location types, locations, rack groups, or racks if you want Nautobot preloaded with inventory metadata. + +## Known Gotchas + +- **Helm list values are replaced, not merged.** When the deploy repo + values set `extraVolumes` or `extraVolumeMounts`, they completely + replace the base values from `components/nautobot/values.yaml`. If + the base values include volumes (e.g. SSO secret mounts), the deploy + values must re-include them alongside any new volumes. Forgetting this + will silently break features like SSO login. + +- **Redis authClients affects all connections.** Redis + `authClients: true` requires ALL clients (including global Nautobot + pods) to present client certificates. The global Nautobot values must + mount the mTLS client cert into both the web server and celery pods, + not just site workers. diff --git a/docs/operator-guide/nautobot-mtls-certificate-renewal.md b/docs/operator-guide/nautobot-mtls-certificate-renewal.md new file mode 100644 index 000000000..515843307 --- /dev/null +++ b/docs/operator-guide/nautobot-mtls-certificate-renewal.md @@ -0,0 +1,126 @@ +# Nautobot mTLS Certificate Renewal + +This guide covers how mTLS client certificates used by site-level +Nautobot workers are renewed and distributed across clusters. + +For background on the mTLS architecture and certificate infrastructure, +see the [nautobot-worker deploy guide](../deploy-guide/components/nautobot-worker.md). 
+ +## How Certificates Are Issued + +Client certificates are issued by cert-manager on the global cluster +using the `mtls-ca-issuer` (backed by a self-signed root CA). Each site +gets its own Certificate resource: + +```yaml +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nautobot-mtls-client-<site> + namespace: nautobot +spec: + secretName: nautobot-mtls-client-<site> + duration: 26280h # 3 years + renewBefore: 2160h # 90 days + commonName: app + usages: + - client auth + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: mtls-ca-issuer + kind: Issuer +``` + +cert-manager automatically renews the certificate 90 days before +expiry, updating the Kubernetes secret on the global cluster. + +## The Distribution Problem + +cert-manager handles renewal on the global cluster automatically. The +challenge is getting the renewed certificate to the site cluster. The +site cluster pulls the cert from an external secrets provider via an +ExternalSecret resource. When cert-manager renews the cert, the updated +material must be pushed to the secrets provider so the site +ExternalSecret picks it up on its next refresh cycle. + +By default, this is a manual process: an operator extracts the renewed +cert from the global cluster and uploads it to the secrets provider. + +## Automation Approaches + +### PushSecret (External Secrets Operator) + +Use a [PushSecret](https://external-secrets.io/latest/guides/pushsecrets/) +resource on the global cluster to automatically push the renewed cert +to your secrets provider whenever the Kubernetes secret changes. This +is event-driven and requires no CronJob. + +This is the recommended approach if your secrets provider is supported +by the External Secrets Operator. + +### CronJob on the Global Cluster + +A Kubernetes CronJob that runs periodically, reads the cert secret, and +pushes it to your secrets provider via its API.
Simple to implement but +introduces a delay between renewal and distribution (up to the CronJob +interval). + +### Cross-Cluster Secret Replication + +Use a tool like +[Kubernetes Replicator](https://github.com/mittwald/kubernetes-replicator) +to copy the cert secret directly from the global cluster to site +clusters, bypassing the secrets provider entirely. Requires network +connectivity between clusters and appropriate RBAC. + +### CertificateRequest from Site Clusters + +The site cluster creates a cert-manager +[CertificateRequest](https://cert-manager.io/docs/usage/certificaterequest/), +an operator on the global cluster approves and signs it, and the signed +cert is returned. This is similar to how kubelet certificate management +works in Kubernetes. Most complex to set up but fully automated with no +intermediate secrets provider. + +## Monitoring Certificate Expiry + +Check certificate status on the global cluster: + +```bash +# List all mTLS client certificates and their expiry +kubectl get certificate -n nautobot -o custom-columns=\ +NAME:.metadata.name,\ +READY:.status.conditions[0].status,\ +EXPIRY:.status.notAfter,\ +RENEWAL:.status.renewalTime + +# Check a specific site's certificate +kubectl describe certificate nautobot-mtls-client-<site> -n nautobot +``` + +On the site cluster, verify the ExternalSecret is syncing: + +```bash +kubectl get externalsecret nautobot-mtls-client -n nautobot +``` + +If the ExternalSecret shows `SecretSyncedError`, the credential in +your secrets provider may be stale or missing.
+ +## What Happens When a Certificate Expires + +If a site worker's client certificate expires before it is renewed and +distributed: + +- PostgreSQL connections fail with `SSL error: certificate has expired` +- Redis connections fail with `[SSL: CERTIFICATE_VERIFY_FAILED]` +- The worker pod stays running but all tasks fail +- The health check reports Redis as unavailable + +To recover, manually extract the renewed cert from the global cluster +and upload it to your secrets provider. The site ExternalSecret will +pick it up on the next refresh cycle, and the worker pods will +automatically get the new cert on their next restart (or when the +secret volume is refreshed by kubelet). diff --git a/docs/operator-guide/nautobot.md b/docs/operator-guide/nautobot.md index a066fe8f5..e5d1141f9 100644 --- a/docs/operator-guide/nautobot.md +++ b/docs/operator-guide/nautobot.md @@ -1,5 +1,239 @@ # Nautobot +## Related Guides + +- [Nautobot Celery Queues](nautobot-celery-queues.md) -- configuring + per-site Celery task queues and routing jobs to site-specific workers +- [mTLS Certificate Renewal](nautobot-mtls-certificate-renewal.md) -- + how mTLS client certificates for site workers are renewed and + distributed across clusters + +## Configuration Architecture + +Nautobot requires a `nautobot_config.py` file that defines Django +settings, plugin loading, database options, and authentication +backends. In understack, this file lives at +`components/nautobot/nautobot_config.py` and is injected into pods +using the Helm chart's `fileParameters` feature. + +### How fileParameters Works + +Both the `nautobot` and `nautobot-worker` ArgoCD Applications use a +multi-source setup. The Helm chart source includes: + +```yaml +helm: + fileParameters: + - name: nautobot.config + path: $understack/components/nautobot/nautobot_config.py +``` + +ArgoCD reads the file content from the understack git repo and passes +it as the `nautobot.config` Helm value. 
The Nautobot Helm chart then +creates a ConfigMap from that content and mounts it into pods at +`/opt/nautobot/nautobot_config.py`. The `NAUTOBOT_CONFIG` environment +variable (set in the deploy repo values) tells Nautobot to load its +configuration from that path. + +This approach means: + +- The config file is version-controlled in git alongside the component + it configures +- Changes to the config trigger ArgoCD syncs and pod restarts + automatically (the Helm chart checksums the ConfigMap) +- The same config file is shared by both the global nautobot deployment + and site-level workers, avoiding drift + +### Why Not Use the Baked-In Config? + +Container images may include their own `nautobot_config.py` at build +time (e.g. at `/opt/nautobot_config/nautobot_config.py`). While this +works for simple deployments, it has limitations: + +- Config changes require rebuilding and redeploying the container image +- Different deployments (global vs site workers) may need different + settings (e.g. mTLS, plugin sets) but share the same image +- Private deployment-specific settings (plugin credentials, SSO config) + get baked into the image + +The Helm `fileParameters` approach decouples the config from the image. +The image provides the runtime (Nautobot + installed plugins), while +the git-managed config and deploy-repo environment variables control +behavior. This separation allows: + +- The same container image to be used across global and site deployments + with different configurations +- mTLS, SSL, and other connection settings to be conditional on + environment variables rather than hardcoded +- Private plugin configuration to be injected via environment variables + in the deploy repo without modifying the public config file + +### Config Layering + +The effective configuration is built from multiple layers: + +1. **Nautobot defaults** -- `from nautobot.core.settings import *` + provides all default Django and Nautobot settings +2. 
**Component config** -- `components/nautobot/nautobot_config.py` + overrides defaults with understack-specific settings (mTLS, plugin + loading, SSO, partition identifier) +3. **Helm chart env vars** -- the base `components/nautobot/values.yaml` + sets database, Redis, and other connection parameters as environment + variables that the config reads via `os.getenv()` +4. **Deploy repo values** -- site-specific overrides (hostnames, image + tags, extra plugins, credentials) that Helm merges on top of the + base values + +### Important: Helm List Replacement + +Helm merges scalar and map values from multiple value files, but +**replaces lists entirely**. If the base `components/nautobot/values.yaml` +defines: + +```yaml +nautobot: + extraVolumes: + - name: nautobot-sso + secret: + secretName: nautobot-sso +``` + +And the deploy repo values set: + +```yaml +nautobot: + extraVolumes: + - name: mtls-certs + secret: + secretName: nautobot-mtls-client +``` + +The result is **only** `mtls-certs` -- the `nautobot-sso` volume is +gone. The deploy values must re-include any base volumes they need to +preserve. + +## Plugin Loading + +The shared `nautobot_config.py` (mounted via Helm `fileParameters`) +uses a generic plugin loading mechanism that works across different +container images and deployments: + +1. Open-source plugins (`nautobot_plugin_nornir`, `nautobot_golden_config`) + are loaded automatically if installed in the container image. +2. Additional plugins can be specified via the `NAUTOBOT_EXTRA_PLUGINS` + environment variable (comma-separated module names). Each plugin is + loaded only if it's actually installed in the container -- missing + plugins are silently skipped. +3. Plugin configuration is provided via the `NAUTOBOT_EXTRA_PLUGINS_CONFIG` + environment variable as a JSON object. 
This supports `${ENV_VAR}` + syntax for referencing environment variables in string values, which + is useful for injecting secrets at runtime without hardcoding them in + the config. + +This design allows the same `nautobot_config.py` to be used by both +the global Nautobot deployment (which may have additional private +plugins) and site workers (which may have a different plugin set), +without any deployment-specific code in the public repository. + +Example deploy values for adding custom plugins: + +```yaml +nautobot: + extraEnvVars: + - name: NAUTOBOT_EXTRA_PLUGINS + value: 'my_custom_plugin,another_plugin' + - name: NAUTOBOT_EXTRA_PLUGINS_CONFIG + value: '{"my_custom_plugin":{"API_KEY":"${MY_API_KEY}"}}' +``` + +### Current Limitations + +The `NAUTOBOT_EXTRA_PLUGINS_CONFIG` environment variable works but has +ergonomic drawbacks as the number of plugins grows: + +- All plugin config is a single JSON string in the deploy values, which + becomes hard to read and review in PRs +- JSON cannot express Python-native types like `None` or call functions + like `is_truthy()` -- only plain JSON types (`null`, `false`, etc.) +- Adding or removing a plugin means editing a long inline JSON blob + +### Future Improvement: Per-Plugin Config Files + +A cleaner approach for deployments with many plugins is to store each +plugin's configuration as a separate JSON file in the deploy repo, +managed via a Kustomize `configMapGenerator`, and mounted into the pod +as a directory. The `nautobot_config.py` would then glob that directory +and load each file into `PLUGINS_CONFIG`. 
+ +Example structure in the deploy repo: + +```text +/nautobot/plugin-configs/ + nautobot_golden_config.json + my_custom_plugin.json + vni_custom_model.json +``` + +Each file contains the plugin's config as a JSON object: + +```json title="my_custom_plugin.json" +{ + "API_KEY": "${MY_API_KEY}", + "TIMEOUT": 30 +} +``` + +A Kustomize `configMapGenerator` creates a ConfigMap from the directory: + +```yaml title="kustomization.yaml" +configMapGenerator: + - name: nautobot-plugin-configs + files: + - plugin-configs/nautobot_golden_config.json + - plugin-configs/my_custom_plugin.json + options: + disableNameSuffixHash: true +``` + +The deploy values mount it as a volume: + +```yaml +nautobot: + extraVolumes: + - name: plugin-configs + configMap: + name: nautobot-plugin-configs + extraVolumeMounts: + - name: plugin-configs + mountPath: /etc/nautobot/plugin-configs + readOnly: true +``` + +And the `nautobot_config.py` loads all files from the directory: + +```python +import glob, json, os, re + +def _interpolate_env(obj): + if isinstance(obj, str): + return re.sub(r"\$\{(\w+)\}", lambda m: os.environ.get(m.group(1), ""), obj) + if isinstance(obj, dict): + return {k: _interpolate_env(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_interpolate_env(v) for v in obj] + return obj + +for _path in sorted(glob.glob("/etc/nautobot/plugin-configs/*.json")): + _name = os.path.splitext(os.path.basename(_path))[0] + with open(_path) as _f: + PLUGINS_CONFIG[_name] = _interpolate_env(json.load(_f)) +``` + +This gives each plugin its own readable file, makes PRs easy to review, +and keeps the `${ENV_VAR}` interpolation for secrets. It can be +implemented alongside the current env var approach without breaking +existing deployments. 
+ ## Nautobot Django shell You can access the Nautobot Django shell by connecting to the pod and running the diff --git a/mkdocs.yml b/mkdocs.yml index aec09ea02..c791d971a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -242,6 +242,7 @@ nav: - operator-guide/nautobot.md - operator-guide/nautobotop.md - operator-guide/nautobot-celery-queues.md + - operator-guide/nautobot-mtls-certificate-renewal.md - operator-guide/troubleshooting-osh.md - operator-guide/logging.md - operator-guide/ansible-local-usage.md From 7423e9a7ce827d6dc33923c7b0c1d0af0ccf0369 Mon Sep 17 00:00:00 2001 From: haseeb Date: Mon, 20 Apr 2026 23:51:31 +0530 Subject: [PATCH 7/9] authenticate CNPG with certs --- components/nautobot/nautobot_config.py | 20 +- .../components/nautobot-worker.md | 142 ++++++++----- docs/deploy-guide/components/nautobot.md | 38 +++- .../nautobot-mtls-certificate-renewal.md | 11 +- docs/operator-guide/nautobot.md | 190 ++++++++++++++++++ 5 files changed, 344 insertions(+), 57 deletions(-) diff --git a/components/nautobot/nautobot_config.py b/components/nautobot/nautobot_config.py index 10b6b2b00..15dd9873d 100644 --- a/components/nautobot/nautobot_config.py +++ b/components/nautobot/nautobot_config.py @@ -67,9 +67,19 @@ if DATABASES["default"]["ENGINE"].endswith("mysql"): # noqa F405 DATABASES["default"]["OPTIONS"] = {"charset": "utf8mb4"} # noqa F405 -# mTLS options for PostgreSQL connections. -# When NAUTOBOT_DB_SSLMODE is set to "verify-ca" or "verify-full", the client -# certificate, key, and CA root cert must be present at the configured paths. +# SSL/mTLS options for PostgreSQL connections. +# +# Supported NAUTOBOT_DB_SSLMODE values: +# "require" -- encrypt the connection but skip server CA and client cert +# verification. Suitable for same-cluster pods that just need +# to satisfy hostssl pg_hba rules. 
+# "verify-ca" -- encrypt and verify the server certificate against the CA +# "verify-full" -- like verify-ca but also checks the server hostname +# +# When sslmode is "verify-ca" or "verify-full", the client certificate, key, +# and CA root cert must be present at the configured paths (full mTLS). +# When sslmode is "require", only encryption is enforced -- no cert files are +# needed and no client certificate is presented. _db_sslcert = os.getenv("NAUTOBOT_DB_SSLCERT", "/etc/nautobot/mtls/tls.crt") _db_sslkey = os.getenv("NAUTOBOT_DB_SSLKEY", "/etc/nautobot/mtls/tls.key") _db_sslrootcert = os.getenv("NAUTOBOT_DB_SSLROOTCERT", "/etc/nautobot/mtls/ca.crt") @@ -91,6 +101,10 @@ "sslkey": _db_sslkey, "sslrootcert": _db_sslrootcert, } +elif _db_sslmode == "require": + DATABASES["default"]["OPTIONS"] = { # noqa F405 + "sslmode": "require", + } # mTLS options for Redis connections. # When NAUTOBOT_REDIS_SSL env var is "true" (set by Helm `nautobot.redis.ssl`), diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md index 24ee4bb00..c684a3822 100644 --- a/docs/deploy-guide/components/nautobot-worker.md +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -94,36 +94,45 @@ via the `NAUTOBOT_EXTRA_PLUGINS` environment variable. The global CNPG cluster is configured with: - `spec.certificates.serverTLSSecret` and `spec.certificates.serverCASecret` - for server-side TLS. PostgreSQL uses the CA in `serverCASecret` to - verify client certificates presented during `pg_hba cert` authentication. - `clientCASecret` is intentionally NOT set -- CNPG uses that field - internally to sign replication client certificates, which requires the - CA private key. CNPG manages its own replication client CA. -- `pg_hba` rules that require `hostssl ... cert` for remote connections - and allow `host ... 
scram-sha-256` for local pods on the global cluster - -Site workers connect with `sslmode=verify-ca`, presenting their client -certificate, key, and the CA root cert via Django's `DATABASES` OPTIONS. + for server-side TLS. +- `spec.certificates.clientCASecret` set to the CA public cert secret + (`mtls-ca-cert`). CNPG uses this to populate PostgreSQL's + `ssl_ca_file`, which is what PostgreSQL checks when verifying client + certificates during `pg_hba cert` authentication. The secret only + needs `ca.crt` (the root CA public cert). +- `spec.certificates.replicationTLSSecret` set to a cert-manager + Certificate (`nautobot-cluster-replication`) with + `commonName: streaming_replica`. This provides the client cert CNPG + uses for streaming replication between PostgreSQL instances. When + `replicationTLSSecret` is provided, CNPG does not need the CA private + key in `clientCASecret`, which is why we can use `mtls-ca-cert` + (which only has `ca.crt`) instead of `mtls-ca-key-pair`. +- `pg_hba` rules that require `hostssl ... cert` for all connections, + enforcing client certificate authentication over TLS + +Both global pods and site workers connect with `sslmode=verify-ca`, +presenting their client certificate, key, and the CA root cert via +Django's `DATABASES` OPTIONS. The `nautobot_config.py` SSL logic is conditional on the -`NAUTOBOT_DB_SSLMODE` environment variable. When set to `verify-ca` or -`verify-full`, it reads the cert paths from environment variables (with -defaults pointing to `/etc/nautobot/mtls/`) and sets -`DATABASES["default"]["OPTIONS"]`. When the env var is unset or empty -(as on the global cluster), no SSL options are applied and pods connect -with password-only auth. +`NAUTOBOT_DB_SSLMODE` environment variable: + +- `verify-ca` or `verify-full`: reads cert paths from environment + variables (defaults to `/etc/nautobot/mtls/`) and sets full mTLS + options on `DATABASES["default"]["OPTIONS"]`. Used by both global + pods and site workers. 
+- `require`: sets `sslmode=require` only -- encrypts the connection + without presenting a client certificate or verifying the server CA. +- Unset or empty: no SSL options are applied and pods connect with + password-only auth over plain TCP. -#### pg_hba Rule Order +#### pg_hba Rule -The CNPG `pg_hba` rules are evaluated top-to-bottom: +The CNPG cluster uses a single `pg_hba` rule: -1. `host all all 10.0.0.0/8 scram-sha-256` -- local pods on the global - cluster connect with password only (no TLS required) -2. `hostssl all all 0.0.0.0/0 cert` -- remote connections with a valid - client certificate are accepted (cert CN maps to DB user) -3. `hostssl all all 0.0.0.0/0 scram-sha-256` -- transitional rule: - remote connections over TLS with password only (no client cert). - Remove this rule once all sites have mTLS deployed. +1. `hostssl all all 0.0.0.0/0 cert` -- all connections must use TLS + and present a valid client certificate. The certificate CN maps to + the PostgreSQL user (must be `app`). ### Redis @@ -181,7 +190,7 @@ Before starting, ensure the global cluster already has: - The mTLS CA hierarchy deployed (issuers, root CA, CA issuer) - Server TLS certificates for PostgreSQL and Redis - A global `nautobot-mtls-client` certificate (for Redis `authClients`) -- CNPG configured with `serverTLSSecret`, `serverCASecret`, and `pg_hba` +- CNPG configured with `serverTLSSecret`, `serverCASecret`, `clientCASecret`, and `pg_hba` - Redis TLS enabled with `authClients: true` - Envoy Gateway TLS passthrough routes on ports 5432 and 6379 @@ -436,7 +445,7 @@ operator guide. | Variable | Where Set | Purpose | |---|---|---| -| `NAUTOBOT_DB_SSLMODE` | Site worker values | Controls PostgreSQL SSL mode. Set to `verify-ca` for mTLS. Unset on global cluster. | +| `NAUTOBOT_DB_SSLMODE` | Both global and site values | Controls PostgreSQL SSL mode. Set to `verify-ca` for mTLS on all pods. 
| | `NAUTOBOT_DB_SSLCERT` | Optional override | Path to client cert for PG (default: `/etc/nautobot/mtls/tls.crt`) | | `NAUTOBOT_DB_SSLKEY` | Optional override | Path to client key for PG (default: `/etc/nautobot/mtls/tls.key`) | | `NAUTOBOT_DB_SSLROOTCERT` | Optional override | Path to CA cert for PG (default: `/etc/nautobot/mtls/ca.crt`) | @@ -457,11 +466,14 @@ operator guide. CA issuer) handles issuance and renewal on both global and site clusters without manual intervention. -- CNPG's native TLS support (`serverTLSSecret`, `serverCASecret`) - integrates directly with cert-manager secrets. No sidecar proxies or - custom TLS termination needed. PostgreSQL verifies external client - certificates using the CA chain from `serverCASecret` when processing - `pg_hba cert` rules. +- CNPG's native TLS support (`serverTLSSecret`, `serverCASecret`, + `clientCASecret`, `replicationTLSSecret`) integrates directly with + cert-manager secrets. No sidecar proxies or custom TLS termination + needed. `clientCASecret` populates PostgreSQL's `ssl_ca_file` for + client cert verification during `pg_hba cert` auth. It points to the + CA public cert secret (`mtls-ca-cert`). `replicationTLSSecret` + provides the streaming replication client cert so CNPG does not need + the CA private key in `clientCASecret`. - The `routes.tls` type in the Envoy Gateway template uses a `gatewayPort` field to support non-443 ports for TLS passthrough. @@ -478,7 +490,8 @@ operator guide. - The `nautobot_config.py` SSL logic is conditional on `NAUTOBOT_DB_SSLMODE`, so the same config file works for both global - pods (no mTLS) and site workers (mTLS enabled). + pods and site workers. All pods set `verify-ca` to present client + certificates for `pg_hba cert` authentication. - The Redis mTLS logic in `nautobot_config.py` auto-detects the CA cert file at the default mount path. If the cert volume is mounted, Redis @@ -486,19 +499,22 @@ operator guide. 
## Known Gotchas -- **clientCASecret is NOT for external client verification.** CNPG's - `clientCASecret` field is used internally to sign replication client - certificates between PostgreSQL instances. It expects a secret with - both `ca.crt` and `ca.key`. Only `serverTLSSecret` and - `serverCASecret` should be set. PostgreSQL verifies external client - certificates using the CA chain from `serverCASecret` when processing - `pg_hba cert` rules. - -- **SSL config must be conditional.** Setting `sslmode` unconditionally - in `nautobot_config.py` would break global cluster pods, which connect - to CNPG via local password-only auth. The SSL config is gated on the - `NAUTOBOT_DB_SSLMODE` env var -- global pods don't set it, so they - are unaffected. +- **clientCASecret is required for client cert verification.** CNPG + uses `clientCASecret` to populate PostgreSQL's `ssl_ca_file`, which + is what verifies client certificates during `pg_hba cert` auth. + `serverCASecret` only provides the CA cert sent to clients for server + verification -- it does NOT populate `ssl_ca_file`. Without + `clientCASecret`, CNPG auto-generates its own internal replication CA + and uses that for `ssl_ca_file`, causing `tlsv1 alert unknown ca` + errors for external client certs. When providing `clientCASecret`, + you must also set `replicationTLSSecret` so CNPG does not need the + CA private key (`ca.key`) in the `clientCASecret` secret. + +- **SSL config must be conditional.** The mTLS config in + `nautobot_config.py` is gated on the `NAUTOBOT_DB_SSLMODE` env var. + Both global pods and site workers must set it to `verify-ca`. If the + env var is unset, no SSL options are applied and the connection will + be rejected by the `hostssl ... cert` pg_hba rule. - **mtls-ca-cert secret contains a private key.** cert-manager Certificate resources always produce `tls.crt`, `tls.key`, and @@ -533,11 +549,11 @@ operator guide. 
mount the mTLS client cert into both the web server and celery pods, not just site workers. -- **pg_hba rule ordering matters.** The transitional `pg_hba` rules - (`hostssl ... cert` and `hostssl ... scram-sha-256` for remote) are - ordered so that cert-based auth is tried first. Sites without client - certs fall through to password-only over TLS. Once all sites have - mTLS deployed, the `scram-sha-256` remote rule should be removed. +- **pg_hba uses cert auth for all connections.** The single + `hostssl all all 0.0.0.0/0 cert` rule requires every connection -- + local and remote -- to present a valid client certificate over TLS. + All pods (global and site workers) must have `NAUTOBOT_DB_SSLMODE` + set to `verify-ca` and the mTLS client cert mounted. - **defaultMode 256 vs 0400.** The `defaultMode: 256` (octal 0400) on the cert secret volume mount is correct but easy to get wrong. YAML @@ -573,6 +589,30 @@ Check that: 4. On the global cluster, verify the source certificate is issued: `kubectl get certificate -n nautobot | grep mtls-client` +### PostgreSQL rejects connection with "tlsv1 alert unknown ca" + +PostgreSQL's `ssl_ca_file` does not contain the CA that signed the +client certificate. This is a TLS-level rejection that happens before +`pg_hba` rules are evaluated. + +The most common cause is that `clientCASecret` is not set on the CNPG +Cluster resource. Without it, CNPG auto-generates its own internal +replication CA and uses that for `ssl_ca_file`. External client certs +signed by the mTLS CA will be rejected. + +Verify what CA PostgreSQL is actually using: + +```bash +kubectl exec -n nautobot nautobot-cluster-1 -c postgres -- \ + openssl x509 -noout -subject -in /controller/certificates/client-ca.crt +``` + +If it shows `CN=nautobot-cluster` (CNPG's internal CA) instead of +`CN=understack-mtls-ca`, set `clientCASecret` and +`replicationTLSSecret` on the CNPG Cluster. 
See the +[PostgreSQL mTLS](../../operator-guide/nautobot.md#postgresql-mtls) +operator guide for details. + ### PostgreSQL rejects connection with "certificate verify failed" The client cert is not signed by the CA that CNPG trusts. Verify the diff --git a/docs/deploy-guide/components/nautobot.md b/docs/deploy-guide/components/nautobot.md index c0f84f53d..04ac46449 100644 --- a/docs/deploy-guide/components/nautobot.md +++ b/docs/deploy-guide/components/nautobot.md @@ -70,10 +70,11 @@ used by both the global Nautobot deployment and site-level workers: | `mtls-selfsigned` | Issuer | Bootstraps the self-signed root | | `mtls-ca` | Certificate | Root CA (ECDSA P-256, 10yr duration, 1yr renewBefore) | | `mtls-ca-issuer` | Issuer | Signs all client and server certificates | -| `mtls-ca-cert` | Certificate | CA public cert secret used by CNPG and Redis for client verification | +| `mtls-ca-cert` | Certificate | CA public cert secret used by CNPG (`clientCASecret` and `serverCASecret`) and Redis for client verification | | `nautobot-cluster-server-tls` | Certificate | PostgreSQL server certificate | +| `nautobot-cluster-replication` | Certificate | Streaming replication client certificate (`CN=streaming_replica`). Required so CNPG does not need the CA private key in `clientCASecret`. | | `nautobot-redis-server-tls` | Certificate | Redis server certificate | -| `nautobot-mtls-client` | Certificate | Client certificate for global Nautobot/Celery pods (needed because Redis `authClients: true` applies to all connections) | +| `nautobot-mtls-client` | Certificate | Client certificate for global Nautobot/Celery pods (`CN=app`). Used for both PostgreSQL `pg_hba cert` auth and Redis `authClients`. | All resources live in the `nautobot` namespace. @@ -99,6 +100,39 @@ equivalent of `pg_hba` to distinguish local vs remote), the global Nautobot deploy values must mount the `nautobot-mtls-client` cert into both the web server and celery pods. 
+## PostgreSQL mTLS + +The global CNPG cluster enforces client certificate authentication for +all connections via a single `pg_hba` rule: + +```text +hostssl all all 0.0.0.0/0 cert +``` + +The CNPG Cluster resource configures four certificate fields: + +| Field | Secret | Purpose | +|---|---|---| +| `serverTLSSecret` | `nautobot-cluster-server-tls` | Server cert presented to clients during TLS handshake | +| `serverCASecret` | `mtls-ca-cert` | CA cert sent to clients for server verification (`sslrootcert`) | +| `clientCASecret` | `mtls-ca-cert` | CA cert used by PostgreSQL's `ssl_ca_file` to verify client certs | +| `replicationTLSSecret` | `nautobot-cluster-replication` | Client cert for streaming replication (`CN=streaming_replica`) | + +`clientCASecret` is the critical field for client cert verification. +Without it, CNPG auto-generates its own internal CA and uses that for +`ssl_ca_file`, causing `tlsv1 alert unknown ca` errors for external +client certs signed by the mTLS CA. + +`replicationTLSSecret` must be provided alongside `clientCASecret` so +CNPG does not need the CA private key (`ca.key`) in the +`clientCASecret` secret. Without it, CNPG tries to generate its own +replication cert and fails with `missing ca.key secret data`. + +Both global Nautobot pods and site workers set +`NAUTOBOT_DB_SSLMODE=verify-ca` to present their client certificates +(`CN=app`) during the TLS handshake. The `pg_hba cert` rule maps the +certificate CN to the PostgreSQL user. 
+ ## Deployment Repo Content {{ secrets_disclaimer }} diff --git a/docs/operator-guide/nautobot-mtls-certificate-renewal.md b/docs/operator-guide/nautobot-mtls-certificate-renewal.md index 515843307..c32f13905 100644 --- a/docs/operator-guide/nautobot-mtls-certificate-renewal.md +++ b/docs/operator-guide/nautobot-mtls-certificate-renewal.md @@ -33,9 +33,18 @@ spec: kind: Issuer ``` -cert-manager automatically renews the certificate 90 days before +cert-manager automatically renews the certificate 30 days before expiry, updating the Kubernetes secret on the global cluster. +The global cluster also has: + +- `nautobot-mtls-client` -- client cert for global Nautobot/Celery pods + (`CN=app`). Renewed automatically by cert-manager. +- `nautobot-cluster-replication` -- streaming replication client cert + (`CN=streaming_replica`). Renewed automatically by cert-manager. + Required so CNPG does not need the CA private key in + `clientCASecret`. + ## The Distribution Problem cert-manager handles renewal on the global cluster automatically. The diff --git a/docs/operator-guide/nautobot.md b/docs/operator-guide/nautobot.md index e5d1141f9..23f40b279 100644 --- a/docs/operator-guide/nautobot.md +++ b/docs/operator-guide/nautobot.md @@ -8,6 +8,196 @@ how mTLS client certificates for site workers are renewed and distributed across clusters +## PostgreSQL mTLS + +All PostgreSQL connections -- both from global Nautobot pods and +site-level workers -- use mutual TLS with client certificate +authentication. The CNPG cluster enforces this with a single `pg_hba` +rule: + +```text +hostssl all all 0.0.0.0/0 cert +``` + +This means every client must connect over TLS and present a valid +client certificate signed by the mTLS CA. The certificate CN is mapped +to the PostgreSQL user (`app`). + +### CNPG Certificate Configuration + +The CNPG Cluster resource has four certificate fields. 
Understanding +what each one does is critical for troubleshooting TLS errors: + +| Field | Secret | What CNPG Does With It | +|---|---|---| +| `serverTLSSecret` | `nautobot-cluster-server-tls` | Mounted as the PostgreSQL server cert. Presented to clients during the TLS handshake. | +| `serverCASecret` | `mtls-ca-cert` | The `ca.crt` from this secret is sent to clients so they can verify the server cert (`sslrootcert` on the client side). | +| `clientCASecret` | `mtls-ca-cert` | The `ca.crt` from this secret populates PostgreSQL's `ssl_ca_file`. This is what PostgreSQL uses to verify client certificates during `pg_hba cert` auth. | +| `replicationTLSSecret` | `nautobot-cluster-replication` | Client cert (`CN=streaming_replica`) used for streaming replication between PostgreSQL instances. | + +Key points: + +- `clientCASecret` is the field that controls client cert verification. + Without it, CNPG auto-generates its own internal CA and uses that for + `ssl_ca_file`. External client certs signed by the mTLS CA will be + rejected with `tlsv1 alert unknown ca`. +- `serverCASecret` does NOT populate `ssl_ca_file`. It only provides + the CA cert that clients use to verify the server. This is a common + source of confusion. +- `replicationTLSSecret` must be provided when setting `clientCASecret`. + Without it, CNPG tries to generate its own replication cert and needs + `ca.key` in the `clientCASecret` secret. Since `mtls-ca-cert` only + has `ca.crt` (not the CA private key), CNPG fails with + `missing ca.key secret data`. +- Both `clientCASecret` and `serverCASecret` can point to the same + secret (`mtls-ca-cert`) when the same CA signs both server and client + certificates. + +### How nautobot_config.py Handles SSL + +The `nautobot_config.py` SSL logic is gated on the `NAUTOBOT_DB_SSLMODE` +environment variable: + +| Value | Behavior | Use Case | +|---|---|---| +| `verify-ca` | Sets `sslmode`, `sslcert`, `sslkey`, `sslrootcert` on the Django DB connection. 
Validates cert files exist at startup. | Global pods and site workers (production). | +| `verify-full` | Same as `verify-ca` but also verifies the server hostname matches the cert. | Stricter verification if needed. | +| `require` | Sets `sslmode=require` only. Encrypts the connection but does not present a client cert or verify the server CA. | Not suitable for `pg_hba cert` -- use `verify-ca` instead. | +| Unset or empty | No SSL options applied. Plain TCP connection. | Will be rejected by `hostssl ... cert` pg_hba rule. | + +All pods (global and site) must set `NAUTOBOT_DB_SSLMODE=verify-ca` in +their `extraEnvVars` and have the mTLS client cert volume mounted at +`/etc/nautobot/mtls/`. + +### Verifying the Certificate Chain + +To confirm the CNPG cluster is using the correct CA for client cert +verification: + +```bash +# Check what CA PostgreSQL is using for ssl_ca_file +kubectl exec -n nautobot nautobot-cluster-1 -c postgres -- \ + openssl x509 -noout -subject -issuer \ + -in /controller/certificates/client-ca.crt +# Expected: subject=CN=understack-mtls-ca + +# Check the client cert CN and issuer +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d | \ + openssl x509 -noout -subject -issuer +# Expected: subject=CN=app, issuer=CN=understack-mtls-ca + +# Verify the client cert against the CA +kubectl get secret mtls-ca-cert -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d > /tmp/ca.crt +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d > /tmp/client.crt +openssl verify -CAfile /tmp/ca.crt /tmp/client.crt +# Expected: /tmp/client.crt: OK +``` + +### Common Errors + +| Error | Cause | Fix | +|---|---|---| +| `tlsv1 alert unknown ca` | `clientCASecret` not set or points to wrong secret. CNPG uses its internal CA for `ssl_ca_file`. | Set `clientCASecret: mtls-ca-cert` and `replicationTLSSecret: nautobot-cluster-replication`. 
| +| `missing ca.key secret data` | `clientCASecret` set but `replicationTLSSecret` not provided. CNPG needs CA key to generate replication certs. | Add `replicationTLSSecret` with a cert-manager Certificate (`CN=streaming_replica`). | +| `connection requires a valid client certificate` | Client connected over TLS but did not present a cert. | Set `NAUTOBOT_DB_SSLMODE=verify-ca` on the pod. | +| `certificate authentication failed for user` | Client cert CN does not match the PostgreSQL user. | Ensure cert has `commonName: app`. | +| `x509: certificate signed by unknown authority` (CNPG status) | Old replication secret signed by CNPG's internal CA, not the mTLS CA. | Delete the old secret: `kubectl delete secret nautobot-cluster-replication -n nautobot`. cert-manager recreates it. | +| `no pg_hba.conf entry` | Client is not connecting over TLS, or the source IP / auth method does not match any rule. | Ensure `NAUTOBOT_DB_SSLMODE=verify-ca` is set. Check that the pg_hba rules cover the connection type. | + +### Forcing CNPG to Reconcile + +After changing certificate fields on the CNPG Cluster resource, the +operator may not immediately pick up the change. Force a reconcile: + +```bash +kubectl annotate cluster nautobot-cluster -n nautobot \ + cnpg.io/reconcile=$(date +%s) --overwrite +``` + +Check the result: + +```bash +kubectl get cluster nautobot-cluster -n nautobot \ + -o jsonpath='{.status.phase}{"\n"}{.status.phaseReason}{"\n"}' +``` + +If the phase is healthy, the change was applied. If it shows an error, +see the Common Errors table above. + +### Handling Stale CNPG-Managed Secrets + +When adding `replicationTLSSecret`, CNPG may have already created a +secret with the same name (e.g. `nautobot-cluster-replication`) using +its internal CA. cert-manager will not overwrite a secret it did not +create. 
You must delete the old secret first: + +```bash +kubectl delete secret nautobot-cluster-replication -n nautobot +# cert-manager recreates it within seconds, signed by mtls-ca-issuer +``` + +Verify the new secret: + +```bash +kubectl get secret nautobot-cluster-replication -n nautobot +# Should show DATA=3 (tls.crt, tls.key, ca.crt) + +kubectl get secret nautobot-cluster-replication -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d | \ + openssl x509 -noout -subject -issuer +# Expected: subject=CN=streaming_replica, issuer=CN=understack-mtls-ca +``` + +Then force a CNPG reconcile (see above). + +### Restarting CNPG Pods + +If the CNPG pods have not picked up updated certificate secrets (e.g. +`client-ca.crt` still shows the old CA), restart them one at a time: + +```bash +kubectl delete pod -n nautobot nautobot-cluster-2 +# wait for ready +kubectl delete pod -n nautobot nautobot-cluster-3 +# wait for ready +kubectl delete pod -n nautobot nautobot-cluster-1 +``` + +Start with replicas, then the primary, to minimize downtime. + +### pg_hba Behavior + +pg_hba rules are evaluated top-to-bottom. PostgreSQL stops at the first +rule matching the connection type and source IP. If authentication fails +on that rule, the connection is rejected -- it does NOT fall through to +the next rule. This means two rules with the same +`hostssl all all 0.0.0.0/0` prefix make the second unreachable. Use +CIDR scoping if you need different auth methods for different source +networks. + +### Rollback to Password Auth + +To revert global pods to password-based auth while keeping cert auth +for site workers: + +1. Add back the `host` rule for local pods: + + ```yaml + postgresql: + pg_hba: + - host all all 10.0.0.0/8 scram-sha-256 + - hostssl all all 0.0.0.0/0 cert + ``` + +2. Remove `NAUTOBOT_DB_SSLMODE` from global pod `extraEnvVars` (keep + it on site workers). + +3.
Optionally remove `clientCASecret` and `replicationTLSSecret` from + the CNPG spec to let CNPG manage its own replication CA again. + ## Configuration Architecture Nautobot requires a `nautobot_config.py` file that defines Django From 37975416e238f7fe4861f033fbf7b481b3282640 Mon Sep 17 00:00:00 2001 From: haseeb Date: Tue, 21 Apr 2026 13:14:53 +0530 Subject: [PATCH 8/9] allow alternative nautobot config to be supplied Allows for a different nautobot config file to be stored in the deploy repo and supplied to Nautobot. --- .../templates/application-nautobot-worker.yaml | 2 +- .../argocd-understack/templates/application-nautobot.yaml | 2 +- charts/argocd-understack/values.yaml | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/charts/argocd-understack/templates/application-nautobot-worker.yaml b/charts/argocd-understack/templates/application-nautobot-worker.yaml index 652693bfa..dc5ce9392 100644 --- a/charts/argocd-understack/templates/application-nautobot-worker.yaml +++ b/charts/argocd-understack/templates/application-nautobot-worker.yaml @@ -19,7 +19,7 @@ spec: helm: fileParameters: - name: nautobot.config - path: $understack/components/nautobot/nautobot_config.py + path: {{ $.Values.site.nautobot_worker.nautobot_config }} ignoreMissingValueFiles: true releaseName: nautobot-worker valueFiles: diff --git a/charts/argocd-understack/templates/application-nautobot.yaml b/charts/argocd-understack/templates/application-nautobot.yaml index c6a14bf94..b65de9928 100644 --- a/charts/argocd-understack/templates/application-nautobot.yaml +++ b/charts/argocd-understack/templates/application-nautobot.yaml @@ -19,7 +19,7 @@ spec: helm: fileParameters: - name: nautobot.config - path: $understack/components/nautobot/nautobot_config.py + path: {{ $.Values.global.nautobot.nautobot_config }} ignoreMissingValueFiles: true releaseName: nautobot valueFiles: diff --git a/charts/argocd-understack/values.yaml b/charts/argocd-understack/values.yaml index 
5f36997bb..54280c479 100644 --- a/charts/argocd-understack/values.yaml +++ b/charts/argocd-understack/values.yaml @@ -144,6 +144,9 @@ global: # -- Enable/disable deploying Nautobot # @default -- false enabled: false + # -- config file to use for Nautobot scoped to either $understack or $deploy repo + # @default -- $understack/components/nautobot/nautobot_config.py + nautobot_config: '$understack/components/nautobot/nautobot_config.py' # -- Nautobot API token generation jobs nautobot_api_tokens: @@ -561,6 +564,9 @@ site: # -- Enable/disable deploying Nautobot workers at the site level # @default -- false enabled: false + # -- config file to use for Nautobot scoped to either $understack or $deploy repo + # @default -- $understack/components/nautobot/nautobot_config.py + nautobot_config: '$understack/components/nautobot/nautobot_config.py' # -- SNMP exporter for network device monitoring snmp_exporter: From f67b30f63413fbca8095d92820aef28e88133d1f Mon Sep 17 00:00:00 2001 From: haseeb Date: Tue, 21 Apr 2026 18:44:26 +0530 Subject: [PATCH 9/9] docs improvement --- .../components/nautobot-worker.md | 35 +++++++++---------- docs/operator-guide/nautobot-celery-queues.md | 6 ++-- .../nautobot-mtls-certificate-renewal.md | 6 +--- docs/operator-guide/nautobot.md | 20 +++++++---- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md index c684a3822..207bbe07c 100644 --- a/docs/deploy-guide/components/nautobot-worker.md +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -173,9 +173,11 @@ The ExternalSecret on the site cluster combines these into a single `tls.key`, and `ca.crt`. This secret is mounted into worker pods at `/etc/nautobot/mtls/`. -Note: if your secrets provider stores PEM data with `\r\n` line endings, -the ExternalSecret template must strip carriage returns -(`| replace "\r" ""`) or OpenSSL will fail to parse the certificates. 
+Note: if your secrets provider stores PEM data with `\r\n` line endings +or concatenates multiple PEM blocks in a single field, use the +[`filterPEM`](https://external-secrets.io/latest/guides/templating/#filter-pem-blocks) +template function to extract specific block types. `filterPEM` handles +carriage-return stripping automatically. ## Adding a New Site @@ -298,16 +300,9 @@ spec: engineVersion: v2 type: kubernetes.io/tls data: - tls.crt: >- - {{ .client_password - | regexFind "-----BEGIN CERTIFICATE-----[\\s\\S]*?-----END CERTIFICATE-----" - | replace "\r" "" }} - tls.key: >- - {{ .client_password - | regexFind "-----BEGIN EC PRIVATE KEY-----[\\s\\S]*?-----END EC PRIVATE KEY-----" - | replace "\r" "" }} - ca.crt: >- - {{ .ca_password | replace "\r" "" }} + tls.crt: '{{ .client_password | filterPEM "CERTIFICATE" }}' + tls.key: '{{ .client_password | filterPEM "EC PRIVATE KEY" }}' + ca.crt: '{{ .ca_password | filterPEM "CERTIFICATE" }}' dataFrom: - extract: key: "" @@ -325,9 +320,10 @@ spec: {% endraw %} -The `replace "\r" ""` strips carriage returns that some secrets -providers add to PEM data. Without this, OpenSSL will fail to parse -the certificates. +The [`filterPEM`](https://external-secrets.io/latest/guides/templating/#filter-pem-blocks) +function extracts PEM blocks by type and strips carriage returns +automatically. Pass the PEM block type without the `BEGIN`/`END` +markers (e.g. `"CERTIFICATE"`, `"EC PRIVATE KEY"`, `"PRIVATE KEY"`). ### Step 4: Create the kustomization @@ -534,8 +530,11 @@ operator guide. - **PEM data with carriage returns.** Some secrets providers store text with `\r\n` line endings. PEM certificates with `\r` characters will - fail OpenSSL parsing with `[SSL] PEM lib`. The ExternalSecret template - must strip carriage returns using `| replace "\r" ""`. + fail OpenSSL parsing with `[SSL] PEM lib`. 
Use the + [`filterPEM`](https://external-secrets.io/latest/guides/templating/#filter-pem-blocks) + template function to extract PEM blocks by type -- it handles + carriage-return stripping automatically. Avoid manual `regexFind` + + `replace "\r" ""` patterns. - **ExternalSecret format depends on your secrets provider.** The ExternalSecret for the mTLS client cert on site clusters must produce diff --git a/docs/operator-guide/nautobot-celery-queues.md b/docs/operator-guide/nautobot-celery-queues.md index f9f6fa763..18ced9edb 100644 --- a/docs/operator-guide/nautobot-celery-queues.md +++ b/docs/operator-guide/nautobot-celery-queues.md @@ -209,10 +209,8 @@ To confirm a site worker is consuming from the correct queue: ```bash # Check the CELERY_TASK_QUEUES env var in the running pod -kubectl get deploy -n nautobot \ - -l app.kubernetes.io/component=nautobot-celery-rax-dev \ - -o jsonpath='{.items[0].spec.template.spec.containers[0].env}' \ - | python3 -m json.tool | grep -A1 CELERY_TASK_QUEUES +kubectl -n nautobot get deploy nautobot-worker-celery-rax-dev \ + -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="CELERY_TASK_QUEUES")].value}' # Check worker logs for the queue binding kubectl logs -n nautobot \ diff --git a/docs/operator-guide/nautobot-mtls-certificate-renewal.md b/docs/operator-guide/nautobot-mtls-certificate-renewal.md index c32f13905..8ed9d37df 100644 --- a/docs/operator-guide/nautobot-mtls-certificate-renewal.md +++ b/docs/operator-guide/nautobot-mtls-certificate-renewal.md @@ -99,11 +99,7 @@ Check certificate status on the global cluster: ```bash # List all mTLS client certificates and their expiry -kubectl get certificate -n nautobot -o custom-columns=\ -NAME:.metadata.name,\ -READY:.status.conditions[0].status,\ -EXPIRY:.status.notAfter,\ -RENEWAL:.status.renewalTime +kubectl get certificate -n nautobot -o custom-columns='NAME:.metadata.name,READY:.status.conditions[0].status,EXPIRY:.status.notAfter,RENEWAL:.status.renewalTime' # Check a 
specific site's certificate kubectl describe certificate nautobot-mtls-client- -n nautobot diff --git a/docs/operator-guide/nautobot.md b/docs/operator-guide/nautobot.md index 23f40b279..7139a046b 100644 --- a/docs/operator-guide/nautobot.md +++ b/docs/operator-guide/nautobot.md @@ -156,17 +156,23 @@ Then force a CNPG reconcile (see above). ### Restarting CNPG Pods If the CNPG pods have not picked up updated certificate secrets (e.g. -`client-ca.crt` still shows the old CA), restart them one at a time: +`client-ca.crt` still shows the old CA), use the `cnpg` kubectl plugin +to perform a rolling restart: ```bash -kubectl delete pod -n nautobot nautobot-cluster-2 -# wait for ready -kubectl delete pod -n nautobot nautobot-cluster-3 -# wait for ready -kubectl delete pod -n nautobot nautobot-cluster-1 +kubectl cnpg restart nautobot-cluster -n nautobot ``` -Start with replicas, then the primary, to minimize downtime. +This performs a rolling restart of all instances, handling replica/primary +ordering automatically and waiting for each pod to be ready before +proceeding. + +If you only need pods to reload configuration (e.g. updated `pg_hba` +or PostgreSQL parameters) without a full restart: + +```bash +kubectl cnpg reload nautobot-cluster -n nautobot +``` ### pg_hba Behavior