diff --git a/charts/argocd-understack/templates/application-nautobot-worker.yaml b/charts/argocd-understack/templates/application-nautobot-worker.yaml new file mode 100644 index 000000000..dc5ce9392 --- /dev/null +++ b/charts/argocd-understack/templates/application-nautobot-worker.yaml @@ -0,0 +1,69 @@ +{{- if eq (include "understack.isEnabled" (list $.Values.site "nautobot_worker")) "true" }} +--- +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: {{ printf "%s-%s" $.Release.Name "nautobot-worker" }} + finalizers: + - resources-finalizer.argocd.argoproj.io + annotations: + argocd.argoproj.io/compare-options: ServerSideDiff=true,IncludeMutationWebhook=true +{{- include "understack.appLabelsBlock" $ | nindent 2 }} +spec: + destination: + namespace: nautobot + server: {{ $.Values.cluster_server }} + project: understack + sources: + - chart: nautobot + helm: + fileParameters: + - name: nautobot.config + path: {{ $.Values.site.nautobot_worker.nautobot_config }} + ignoreMissingValueFiles: true + releaseName: nautobot-worker + valueFiles: + - $understack/components/nautobot-worker/values.yaml + - $deploy/{{ include "understack.deploy_path" $ }}/nautobot-worker/values.yaml + {{- with index $.Values.appLabels "understack.rackspace.com/partition" }} + values: | + workers: + default: + enabled: false + {{ . }}: + enabled: true + taskQueues: {{ . 
| quote }} + {{- end }} + repoURL: https://nautobot.github.io/helm-charts/ + targetRevision: 2.5.6 + + - path: components/nautobot-worker + ref: understack + repoURL: {{ include "understack.understack_url" $ }} + targetRevision: {{ include "understack.understack_ref" $ }} + kustomize: + patches: + - patch: | + - op: replace + path: /data/UNDERSTACK_PARTITION + value: "{{ index $.Values.appLabels "understack.rackspace.com/partition" | default "" }}" + target: + kind: ConfigMap + name: cluster-data + - path: {{ include "understack.deploy_path" $ }}/nautobot-worker + ref: deploy + repoURL: {{ include "understack.deploy_url" $ }} + targetRevision: {{ include "understack.deploy_ref" $ }} + syncPolicy: + automated: + prune: true + selfHeal: true + managedNamespaceMetadata: + annotations: + argocd.argoproj.io/sync-options: Delete=false + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + - RespectIgnoreDifferences=true + - ApplyOutOfSyncOnly=true +{{- end }} diff --git a/charts/argocd-understack/templates/application-nautobot.yaml b/charts/argocd-understack/templates/application-nautobot.yaml index c6a14bf94..b65de9928 100644 --- a/charts/argocd-understack/templates/application-nautobot.yaml +++ b/charts/argocd-understack/templates/application-nautobot.yaml @@ -19,7 +19,7 @@ spec: helm: fileParameters: - name: nautobot.config - path: $understack/components/nautobot/nautobot_config.py + path: {{ $.Values.global.nautobot.nautobot_config }} ignoreMissingValueFiles: true releaseName: nautobot valueFiles: diff --git a/charts/argocd-understack/values.yaml b/charts/argocd-understack/values.yaml index 0ffa5b048..54280c479 100644 --- a/charts/argocd-understack/values.yaml +++ b/charts/argocd-understack/values.yaml @@ -144,6 +144,9 @@ global: # -- Enable/disable deploying Nautobot # @default -- false enabled: false + # -- config file to use for Nautobot scoped to either $understack or $deploy repo + # @default -- $understack/components/nautobot/nautobot_config.py + 
nautobot_config: '$understack/components/nautobot/nautobot_config.py' # -- Nautobot API token generation jobs nautobot_api_tokens: @@ -556,6 +559,15 @@ site: # @default -- false enabled: false + # -- Nautobot Celery workers (site-level, connects to global Nautobot) + nautobot_worker: + # -- Enable/disable deploying Nautobot workers at the site level + # @default -- false + enabled: false + # -- config file to use for Nautobot scoped to either $understack or $deploy repo + # @default -- $understack/components/nautobot/nautobot_config.py + nautobot_config: '$understack/components/nautobot/nautobot_config.py' + # -- SNMP exporter for network device monitoring snmp_exporter: # -- Enable/disable deploying SNMP exporter diff --git a/components/envoy-configs/templates/gw-external.yaml.tpl b/components/envoy-configs/templates/gw-external.yaml.tpl index dca86126b..97331a421 100644 --- a/components/envoy-configs/templates/gw-external.yaml.tpl +++ b/components/envoy-configs/templates/gw-external.yaml.tpl @@ -35,13 +35,11 @@ spec: {{- range .Values.routes.tls }} {{- $listenerName := .name | default (index (splitList "." 
.fqdn) 0) }} - name: {{ $listenerName }} - port: {{ $.Values.gateways.external.port | default 443 }} + port: {{ .gatewayPort | default ($.Values.gateways.external.port | default 443) }} protocol: TLS hostname: {{ .fqdn | quote }} tls: mode: Passthrough - certificateRefs: - - name: {{ $listenerName }}-tls allowedRoutes: namespaces: {{- if .selector }} @@ -52,6 +50,7 @@ spec: from: {{ .from | default "All" }} {{- end }} {{- end }} + {{- if .Values.gateways.external.serviceAnnotations }} infrastructure: parametersRef: diff --git a/components/envoy-configs/values.schema.json b/components/envoy-configs/values.schema.json index 5ba888206..02bf09fbb 100644 --- a/components/envoy-configs/values.schema.json +++ b/components/envoy-configs/values.schema.json @@ -180,6 +180,12 @@ "type": "string", "description": "Namespace where the httproute will be installed (same as backend service)" }, + "gatewayPort": { + "type": "integer", + "minimum": 1, + "maximum": 65535, + "description": "Port exposed on the gateway for this TLS passthrough listener. Defaults to the external gateway port (443) if not specified." 
+ }, "service": { "type": "object", "description": "Kubernetes service backend configuration for the route", diff --git a/components/nautobot-worker/kustomization.yaml b/components/nautobot-worker/kustomization.yaml new file mode 100644 index 000000000..79325fa8b --- /dev/null +++ b/components/nautobot-worker/kustomization.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +configMapGenerator: + - name: cluster-data + literals: + - UNDERSTACK_PARTITION="" + options: + disableNameSuffixHash: true diff --git a/components/nautobot-worker/values.yaml b/components/nautobot-worker/values.yaml new file mode 100644 index 000000000..13bf23982 --- /dev/null +++ b/components/nautobot-worker/values.yaml @@ -0,0 +1,68 @@ +# Nautobot Worker (site-level) +# +# Deploys only Celery workers that connect back to the global Nautobot +# database and Redis. The web server is disabled because it lives on +# the global cluster. Redis and PostgreSQL are disabled because the +# workers reach the global instances over the network. 
+--- + +# Disable the Nautobot web server — workers only +nautobot: + enabled: false + replicaCount: 0 + + db: + engine: "django.db.backends.postgresql" + # Override in deploy repo values to point at the global CNPG service + host: "" + port: 5432 + name: "app" + user: "app" + existingSecret: "nautobot-db" + existingSecretPasswordKey: "password" + + django: + existingSecret: nautobot-django + + superUser: + enabled: false + + redis: + # Override in deploy repo values to point at the global Redis service + host: "" + port: 6379 + ssl: false + username: "" + +celery: + enabled: true + concurrency: 2 + replicaCount: 1 + extraEnvVarsCM: + - cluster-data + extraEnvVarsSecret: + - nautobot-django + livenessProbe: + initialDelaySeconds: 60 + periodSeconds: 120 + timeoutSeconds: 60 + readinessProbe: + initialDelaySeconds: 60 + periodSeconds: 120 + timeoutSeconds: 60 + +# Disable celery beat — scheduling runs on the global cluster only +workers: + beat: + enabled: false + +# Do not deploy local Redis — use the global instance +redis: + enabled: false + +# Do not deploy local PostgreSQL — use the global CNPG instance +postgresql: + enabled: false + +ingress: + enabled: false diff --git a/components/nautobot/nautobot_config.py b/components/nautobot/nautobot_config.py index 612c9d30b..15dd9873d 100644 --- a/components/nautobot/nautobot_config.py +++ b/components/nautobot/nautobot_config.py @@ -1,4 +1,7 @@ +import json as _json import os +import re as _re +from ssl import CERT_REQUIRED from nautobot.core.settings import * # noqa F401,F403 from nautobot.core.settings_funcs import is_truthy @@ -64,6 +67,67 @@ if DATABASES["default"]["ENGINE"].endswith("mysql"): # noqa F405 DATABASES["default"]["OPTIONS"] = {"charset": "utf8mb4"} # noqa F405 +# SSL/mTLS options for PostgreSQL connections. +# +# Supported NAUTOBOT_DB_SSLMODE values: +# "require" -- encrypt the connection but skip server CA and client cert +# verification. 
Suitable for same-cluster pods that just need +# to satisfy hostssl pg_hba rules. +# "verify-ca" -- encrypt and verify the server certificate against the CA +# "verify-full" -- like verify-ca but also checks the server hostname +# +# When sslmode is "verify-ca" or "verify-full", the client certificate, key, +# and CA root cert must be present at the configured paths (full mTLS). +# When sslmode is "require", only encryption is enforced -- no cert files are +# needed and no client certificate is presented. +_db_sslcert = os.getenv("NAUTOBOT_DB_SSLCERT", "/etc/nautobot/mtls/tls.crt") +_db_sslkey = os.getenv("NAUTOBOT_DB_SSLKEY", "/etc/nautobot/mtls/tls.key") +_db_sslrootcert = os.getenv("NAUTOBOT_DB_SSLROOTCERT", "/etc/nautobot/mtls/ca.crt") +_db_sslmode = os.getenv("NAUTOBOT_DB_SSLMODE", "") + +if _db_sslmode in ("verify-ca", "verify-full"): + for _path, _label in [ + (_db_sslcert, "NAUTOBOT_DB_SSLCERT"), + (_db_sslkey, "NAUTOBOT_DB_SSLKEY"), + (_db_sslrootcert, "NAUTOBOT_DB_SSLROOTCERT"), + ]: + if not os.path.isfile(_path): + raise FileNotFoundError( + f"SSL certificate file required by {_label} not found: {_path}" + ) + DATABASES["default"]["OPTIONS"] = { # noqa F405 + "sslmode": _db_sslmode, + "sslcert": _db_sslcert, + "sslkey": _db_sslkey, + "sslrootcert": _db_sslrootcert, + } +elif _db_sslmode == "require": + DATABASES["default"]["OPTIONS"] = { # noqa F405 + "sslmode": "require", + } + +# mTLS options for Redis connections. +# When NAUTOBOT_REDIS_SSL env var is "true" (set by Helm `nautobot.redis.ssl`), +# the Helm chart switches the URL scheme to rediss://. We still need to tell +# the Python redis client *which* certs to use for mutual TLS. 
+_redis_ca = os.getenv("NAUTOBOT_REDIS_SSL_CA_CERTS", "/etc/nautobot/mtls/ca.crt") +_redis_cert = os.getenv("NAUTOBOT_REDIS_SSL_CERTFILE", "/etc/nautobot/mtls/tls.crt") +_redis_key = os.getenv("NAUTOBOT_REDIS_SSL_KEYFILE", "/etc/nautobot/mtls/tls.key") + +if os.path.isfile(_redis_ca): + _redis_ssl_kwargs = { + "ssl_cert_reqs": CERT_REQUIRED, + "ssl_ca_certs": _redis_ca, + "ssl_certfile": _redis_cert, + "ssl_keyfile": _redis_key, + } + CACHES["default"].setdefault("OPTIONS", {}) # noqa F405 + CACHES["default"]["OPTIONS"].setdefault("CONNECTION_POOL_KWARGS", {}) # noqa F405 + CACHES["default"]["OPTIONS"]["CONNECTION_POOL_KWARGS"].update(_redis_ssl_kwargs) # noqa F405 + CELERY_BROKER_USE_SSL = _redis_ssl_kwargs # noqa F405 + CELERY_REDIS_BACKEND_USE_SSL = _redis_ssl_kwargs # noqa F405 + CELERY_BROKER_TRANSPORT_OPTIONS = {"ssl": _redis_ssl_kwargs} # noqa F405 + # This key is used for secure generation of random numbers and strings. It must never be exposed outside of this file. # For optimal security, SECRET_KEY should be at least 50 characters in length and contain a mix of letters, numbers, and # symbols. Nautobot will not run without this defined. For more information, see @@ -352,6 +416,11 @@ os.getenv("NAUTOBOT_INSTALLATION_METRICS_ENABLED", "True") ) +# Partition identifier used by computed fields (e.g. device URN generation). +# Populated from the cluster-data ConfigMap which is patched by ArgoCD from +# the appLabels["understack.rackspace.com/partition"] value. +UNDERSTACK_PARTITION = os.environ.get("UNDERSTACK_PARTITION", "") + # Storage backend to use for Job input files and Job output files. # # Note: the default is for backwards compatibility and it is recommended to change it if possible for your deployment. @@ -411,8 +480,32 @@ # PER_PAGE_DEFAULTS = [25, 50, 100, 250, 500, 1000] # Enable installed plugins. Add the name of each plugin to the list. 
-# -# PLUGINS = [] +# Use try/except to only load plugins that are installed in this container, +# since different deployments may have different plugin sets. +# +PLUGINS = [] +for _plugin_name in [ + "nautobot_plugin_nornir", + "nautobot_golden_config", +]: + try: + __import__(_plugin_name) + PLUGINS.append(_plugin_name) + except ImportError: + pass + +# Allow additional plugins to be specified via the NAUTOBOT_EXTRA_PLUGINS +# environment variable (comma-separated list of plugin module names). +# This lets private deployments add their own plugins without modifying +# this file. +_extra_plugins = os.getenv("NAUTOBOT_EXTRA_PLUGINS", "") +for _plugin_name in (p.strip() for p in _extra_plugins.split(",") if p.strip()): + try: + __import__(_plugin_name) + if _plugin_name not in PLUGINS: + PLUGINS.append(_plugin_name) + except ImportError: + pass # Plugins configuration settings. These settings are used by various plugins that the user may have installed. # Each key in the dictionary is the name of an installed plugin and its value is a dictionary of settings. @@ -423,13 +516,67 @@ # 'buzz': 'bazz' # } # } -PLUGINS_CONFIG = { - "vni_custom_model": { - "FORCE_UNIQUE_VLANS": is_truthy( - os.getenv("VNI_CUSTOM_MODEL_FORCE_UNIQUE_VLANS", "false") - ) +PLUGINS_CONFIG = {} + +# Configuration for open-source plugins (only applied when the plugin is loaded). 
+if "nautobot_plugin_nornir" in PLUGINS: + PLUGINS_CONFIG["nautobot_plugin_nornir"] = { + "nornir_settings": { + "credentials": "nautobot_plugin_nornir.plugins.credentials.nautobot_secrets.CredentialsNautobotSecrets", + "runner": { + "plugin": "threaded", + "options": { + "num_workers": 20, + }, + }, + }, + "use_config_context": { + "connection_options": True, + }, + } + +if "nautobot_golden_config" in PLUGINS: + PLUGINS_CONFIG["nautobot_golden_config"] = { + "per_feature_bar_width": 0.15, + "per_feature_width": 13, + "per_feature_height": 4, + "enable_backup": True, + "enable_compliance": True, + "enable_intended": True, + "enable_sotagg": True, + "sot_agg_transposer": None, + "enable_postprocessing": True, + "postprocessing_callables": [], + "postprocessing_subscribed": [], + "platform_slug_map": None, } -} + + +# Allow plugin configuration via the NAUTOBOT_EXTRA_PLUGINS_CONFIG environment +# variable. Value must be a JSON object whose keys are plugin names and values +# are config dicts. Supports ${ENV_VAR} syntax for referencing environment +# variables in string values (useful for secrets). +def _interpolate_env(obj): + """Recursively replace ${VAR} patterns with environment variable values.""" + if isinstance(obj, str): + return _re.sub( + r"\$\{(\w+)\}", + lambda m: os.environ.get(m.group(1), ""), + obj, + ) + if isinstance(obj, dict): + return {k: _interpolate_env(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_interpolate_env(v) for v in obj] + return obj + + +_extra_cfg = os.getenv("NAUTOBOT_EXTRA_PLUGINS_CONFIG", "") +if _extra_cfg: + try: + PLUGINS_CONFIG.update(_interpolate_env(_json.loads(_extra_cfg))) + except (ValueError, TypeError): + pass # Prefer IPv6 addresses or IPv4 addresses in selecting a device's primary IP address? 
# diff --git a/docs/deploy-guide/components/nautobot-worker.md b/docs/deploy-guide/components/nautobot-worker.md new file mode 100644 index 000000000..207bbe07c --- /dev/null +++ b/docs/deploy-guide/components/nautobot-worker.md @@ -0,0 +1,677 @@ +--- +charts: +- nautobot +kustomize_paths: +- components/nautobot-worker +deploy_overrides: + helm: + mode: values + kustomize: + mode: second_source +--- + +# nautobot-worker + +Site-level Nautobot Celery workers that connect to the global Nautobot +database and Redis. This component deploys only the Celery worker +portion of the Nautobot Helm chart on site clusters, allowing sites to +process background tasks locally without running the full Nautobot web +application. The web server, Redis, and PostgreSQL all remain on the +global cluster -- site workers connect back to those shared services +over the network. + +For details on how Celery task queues are configured per site and how to +route jobs to site-specific workers, see the +[Nautobot Celery Queues](../../operator-guide/nautobot-celery-queues.md) +operator guide. + +## Deployment Scope + +- Cluster scope: site +- Values key: `site.nautobot_worker` +- ArgoCD Application template: `charts/argocd-understack/templates/application-nautobot-worker.yaml` + +## How ArgoCD Builds It + +{{ component_argocd_builds() }} + +## How to Enable + +Enable this component in your site deployment values file: + +```yaml title="$CLUSTER_NAME/deploy.yaml" +site: + nautobot_worker: + enabled: true +``` + +## Architecture + +Site workers connect to the global cluster's PostgreSQL (CNPG) and Redis +through the Envoy Gateway. Both connections use mutual TLS (mTLS) with +TLS passthrough at the gateway, so the cryptographic handshake happens +directly between the worker pod and the database/Redis server. 
+ +```text +Site Cluster Global Cluster ++------------------+ +---------------------------+ +| Worker Pod | TLS+ClientCert | Envoy Gateway | +| - celery | ---------------> | port 5432 (passthrough) | --> CNPG PostgreSQL +| - mtls certs | ---------------> | port 6379 (passthrough) | --> Redis ++------------------+ +---------------------------+ +``` + +The worker pods mount a client certificate (issued by a dedicated +internal CA via cert-manager) and present it during the TLS handshake. +See [Certificate Infrastructure](#certificate-infrastructure) for +details on the CA hierarchy and how certificates are provisioned. +PostgreSQL and Redis on the global cluster verify the client certificate +against the same CA before accepting the connection. + +### Why mTLS? + +Site workers run on remote clusters and connect to the global database +and Redis over the network. Password-only authentication is insufficient +for cross-cluster connections -- if a credential leaks, any host with +network access could connect to the production database. mTLS ensures +that even with a leaked password, connections without a valid client +certificate are rejected. Traffic is encrypted end-to-end between the +worker pod and the server. + +## Plugin Loading + +The shared `nautobot_config.py` supports a generic plugin loading +mechanism described in the +[Nautobot Plugin Loading](../../operator-guide/nautobot.md#plugin-loading) +operator guide. Site workers use the same mechanism -- open-source +plugins are loaded automatically, and additional plugins can be added +via the `NAUTOBOT_EXTRA_PLUGINS` environment variable. + +## Connection Security + +### PostgreSQL (CNPG) + +The global CNPG cluster is configured with: + +- `spec.certificates.serverTLSSecret` and `spec.certificates.serverCASecret` + for server-side TLS. +- `spec.certificates.clientCASecret` set to the CA public cert secret + (`mtls-ca-cert`). 
CNPG uses this to populate PostgreSQL's
+  `ssl_ca_file`, which is what PostgreSQL checks when verifying client
+  certificates during `pg_hba cert` authentication. CNPG reads only
+  `ca.crt` (the root CA public cert) from this secret.
+- `spec.certificates.replicationTLSSecret` set to a cert-manager
+  Certificate (`nautobot-cluster-replication`) with
+  `commonName: streaming_replica`. This provides the client cert CNPG
+  uses for streaming replication between PostgreSQL instances. When
+  `replicationTLSSecret` is provided, CNPG does not need the CA private
+  key in `clientCASecret`, which is why we can use `mtls-ca-cert`
+  (from which CNPG reads only `ca.crt`) instead of `mtls-ca-key-pair`.
+- `pg_hba` rules that require `hostssl ... cert` for all connections,
+  enforcing client certificate authentication over TLS
+
+Both global pods and site workers connect with `sslmode=verify-ca`,
+presenting their client certificate, key, and the CA root cert via
+Django's `DATABASES` OPTIONS.
+
+The `nautobot_config.py` SSL logic is conditional on the
+`NAUTOBOT_DB_SSLMODE` environment variable:
+
+- `verify-ca` or `verify-full`: reads cert paths from environment
+  variables (defaults to `/etc/nautobot/mtls/`) and sets full mTLS
+  options on `DATABASES["default"]["OPTIONS"]`. Used by both global
+  pods and site workers.
+- `require`: sets `sslmode=require` only -- encrypts the connection
+  without presenting a client certificate or verifying the server CA.
+- Unset or empty: no SSL options are applied and pods connect with
+  password-only auth over plain TCP.
+
+#### pg_hba Rule
+
+The CNPG cluster uses a single `pg_hba` rule:
+
+1. `hostssl all all 0.0.0.0/0 cert` -- all connections must use TLS
+   and present a valid client certificate. The certificate CN maps to
+   the PostgreSQL user (must be `app`).
+
+### Redis
+
+The global Redis mTLS configuration is described in the
+[global nautobot deploy guide](nautobot.md#redis-mtls).
Site workers +use the same auto-detection mechanism -- when the mTLS cert volume is +mounted, Redis SSL is configured automatically. + +### Envoy Gateway + +Both PostgreSQL (port 5432) and Redis (port 6379) use `routes.tls` +entries with TLS passthrough mode. The gateway routes traffic based on +SNI hostname without terminating TLS, preserving end-to-end mTLS. + +## Certificate Infrastructure + +### Global Cluster + +The global cluster hosts the mTLS CA hierarchy described in the +[global nautobot deploy guide](nautobot.md#mtls-certificate-infrastructure). + +### Site Clusters + +Client certificates are issued on the global cluster by cert-manager +and distributed to site clusters through your external secrets provider. +The CA private key never leaves the global cluster -- a compromised +site cannot forge certificates for other sites. + +Each site needs two credentials from the secrets provider: + +| Credential | Content | Scope | +|---|---|---| +| Client cert+key | The issued `tls.crt` and `tls.key` for this site | Per-site | +| CA public cert | The `ca.crt` from the mTLS CA | Shared across all sites | + +The ExternalSecret on the site cluster combines these into a single +`nautobot-mtls-client` secret (type `kubernetes.io/tls`) with `tls.crt`, +`tls.key`, and `ca.crt`. This secret is mounted into worker pods at +`/etc/nautobot/mtls/`. + +Note: if your secrets provider stores PEM data with `\r\n` line endings +or concatenates multiple PEM blocks in a single field, use the +[`filterPEM`](https://external-secrets.io/latest/guides/templating/#filter-pem-blocks) +template function to extract specific block types. `filterPEM` handles +carriage-return stripping automatically. + +## Adding a New Site + +This section walks through configuring `nautobot-worker` for a new site +cluster. All files go in `/nautobot-worker/` in the deploy +repo. 
+ +### Prerequisites + +Before starting, ensure the global cluster already has: + +- The mTLS CA hierarchy deployed (issuers, root CA, CA issuer) +- Server TLS certificates for PostgreSQL and Redis +- A global `nautobot-mtls-client` certificate (for Redis `authClients`) +- CNPG configured with `serverTLSSecret`, `serverCASecret`, `clientCASecret`, and `pg_hba` +- Redis TLS enabled with `authClients: true` +- Envoy Gateway TLS passthrough routes on ports 5432 and 6379 + +You also need the pre-issued client certificate stored in your external +secrets provider (see Step 1). + +### Step 1: Issue the client certificate on the global cluster + +Create a cert-manager Certificate resource on the global cluster for +this site. The `commonName` must match the PostgreSQL database user +(typically `app`) because `pg_hba cert` maps the certificate CN to the +DB user. + +```yaml title="global-cluster/nautobot/certificate-nautobot-mtls-client-.yaml" +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nautobot-mtls-client- + namespace: nautobot +spec: + secretName: nautobot-mtls-client- + duration: 26280h # 3 years + renewBefore: 2160h # 90 days + commonName: app + usages: + - client auth + privateKey: + algorithm: ECDSA + size: 256 + issuerRef: + name: mtls-ca-issuer + kind: Issuer +``` + +Add it to the global nautobot kustomization. After ArgoCD syncs, +cert-manager issues the certificate into a Kubernetes secret. + +Then extract the cert material and upload it to your secrets provider +as two separate credentials: + +```bash +# Extract the client cert + key (per-site credential) +kubectl get secret nautobot-mtls-client- -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d > /tmp/tls.crt +kubectl get secret nautobot-mtls-client- -n nautobot \ + -o jsonpath='{.data.tls\.key}' | base64 -d > /tmp/tls.key + +# Upload to your secrets provider as a single credential with +# the cert and key concatenated in one field. 
+ +# Extract the CA public cert (shared across all sites, one-time) +kubectl get secret mtls-ca-cert -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d > /tmp/ca.crt + +# Upload to your secrets provider as a separate credential. +# This only needs to be done once -- all sites share the same CA cert. +``` + +The CA private key stays in the `mtls-ca-key-pair` secret on the global +cluster and is never extracted or distributed. + +### Step 2: Create the site directory + +```text +/nautobot-worker/ +``` + +### Step 3: Create ExternalSecrets for credentials + +Create ExternalSecret resources that pull credentials from your secrets +provider into the `nautobot` namespace. You need five: + +| ExternalSecret | Target Secret | Purpose | +|---|---|---| +| `externalsecret-nautobot-django.yaml` | `nautobot-django` | Django `SECRET_KEY` -- must match the global instance | +| `externalsecret-nautobot-db.yaml` | `nautobot-db` | CNPG app user password (satisfies Helm chart requirement) | +| `externalsecret-nautobot-worker-redis.yaml` | `nautobot-redis` | Redis password | +| `externalsecret-dockerconfigjson-github-com.yaml` | `dockerconfigjson-github-com` | Container registry credentials | +| `externalsecret-nautobot-mtls-client.yaml` | `nautobot-mtls-client` | mTLS client cert + CA cert (two credentials combined) | + +The mTLS ExternalSecret pulls from two separate credentials in your +secrets provider -- the per-site client cert+key and the shared CA +public cert -- and combines them into a single `kubernetes.io/tls` +secret with `tls.crt`, `tls.key`, and `ca.crt`. + +If both credentials have the same field name (e.g. 
`password`), use
+`dataFrom` with `rewrite` to prefix the keys and avoid collision:
+
+{% raw %}
+
+```yaml
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: nautobot-mtls-client
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: <cluster-secret-store-name>
+  target:
+    creationPolicy: Owner
+    deletionPolicy: Retain
+    template:
+      engineVersion: v2
+      type: kubernetes.io/tls
+      data:
+        tls.crt: '{{ .client_password | filterPEM "CERTIFICATE" }}'
+        tls.key: '{{ .client_password | filterPEM "EC PRIVATE KEY" }}'
+        ca.crt: '{{ .ca_password | filterPEM "CERTIFICATE" }}'
+  dataFrom:
+    - extract:
+        key: "<per-site-client-cert-credential>"
+      rewrite:
+        - regexp:
+            source: "(.*)"
+            target: "client_$1"
+    - extract:
+        key: "<shared-ca-cert-credential>"
+      rewrite:
+        - regexp:
+            source: "(.*)"
+            target: "ca_$1"
+```
+
+{% endraw %}
+
+The [`filterPEM`](https://external-secrets.io/latest/guides/templating/#filter-pem-blocks)
+function extracts PEM blocks by type and strips carriage returns
+automatically. Pass the PEM block type without the `BEGIN`/`END`
+markers (e.g. `"CERTIFICATE"`, `"EC PRIVATE KEY"`, `"PRIVATE KEY"`).
+
+### Step 4: Create the kustomization
+
+Create `kustomization.yaml` listing all resources:
+
+```yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - externalsecret-nautobot-django.yaml
+  - externalsecret-nautobot-db.yaml
+  - externalsecret-nautobot-worker-redis.yaml
+  - externalsecret-dockerconfigjson-github-com.yaml
+  - externalsecret-nautobot-mtls-client.yaml
+```
+
+### Step 5: Create the values file
+
+Create `values.yaml` with the site-specific overrides. Replace
+`<environment>` with your environment identifier and `<partition>` with
+the site's partition name.
+
+```yaml
+nautobot:
+  db:
+    host: "nautobot-db.<environment>.undercloud.rackspace.net"
+  redis:
+    host: "nautobot-redis.<environment>.undercloud.rackspace.net"
+    ssl: true
+  image:
+    registry: "ghcr.io"
+    repository: "<github-org>/<nautobot-image>"
+    tag: "latest"
+    pullPolicy: "Always"
+    pullSecrets:
+      - dockerconfigjson-github-com
+
+celery:
+  extraEnvVars:
+    - name: NAUTOBOT_CONFIG
+      value: /opt/nautobot/nautobot_config.py
+    - name: NAUTOBOT_EXTRA_PLUGINS
+      value: ''
+    - name: NAUTOBOT_DB_SSLMODE
+      value: verify-ca
+    - name: NAUTOBOT_REDIS_SSL_CERT_REQS
+      value: required
+    - name: NAUTOBOT_REDIS_SSL_CA_CERTS
+      value: /etc/nautobot/mtls/ca.crt
+    - name: NAUTOBOT_REDIS_SSL_CERTFILE
+      value: /etc/nautobot/mtls/tls.crt
+    - name: NAUTOBOT_REDIS_SSL_KEYFILE
+      value: /etc/nautobot/mtls/tls.key
+    - name: SSL_CERT_FILE
+      value: /etc/nautobot/mtls/ca.crt
+    - name: REQUESTS_CA_BUNDLE
+      value: /etc/nautobot/mtls/ca.crt
+  extraVolumes:
+    - name: mtls-certs
+      secret:
+        secretName: nautobot-mtls-client
+        defaultMode: 256
+  extraVolumeMounts:
+    - name: mtls-certs
+      mountPath: /etc/nautobot/mtls
+      readOnly: true
+```
+
+### Step 6: Enable in deploy.yaml
+
+Add `nautobot_worker` to the site's `deploy.yaml`:
+
+```yaml
+site:
+  nautobot_worker:
+    enabled: true
+```
+
+### Step 7: Verify
+
+After ArgoCD syncs, verify the worker is running and connected:
+
+```bash
+# Check the client cert secret was pulled from the secrets provider
+kubectl get secret nautobot-mtls-client -n nautobot
+
+# Check the worker pod is running
+kubectl get pods -n nautobot -l app.kubernetes.io/component=nautobot-celery
+
+# Check worker logs for successful DB/Redis connections
+kubectl logs -n nautobot -l app.kubernetes.io/component=nautobot-celery --tail=50
+```
+
+### Final directory structure
+
+```text
+<cluster>/nautobot-worker/
+  externalsecret-dockerconfigjson-github-com.yaml
+  externalsecret-nautobot-db.yaml
+  externalsecret-nautobot-django.yaml
+  externalsecret-nautobot-mtls-client.yaml
+  externalsecret-nautobot-worker-redis.yaml
+  kustomization.yaml
values.yaml +``` + +## Certificate Renewal + +For details on how mTLS client certificates are renewed and distributed +to site clusters, see the +[mTLS Certificate Renewal](../../operator-guide/nautobot-mtls-certificate-renewal.md) +operator guide. + +## Environment Variable Reference + +| Variable | Where Set | Purpose | +|---|---|---| +| `NAUTOBOT_DB_SSLMODE` | Both global and site values | Controls PostgreSQL SSL mode. Set to `verify-ca` for mTLS on all pods. | +| `NAUTOBOT_DB_SSLCERT` | Optional override | Path to client cert for PG (default: `/etc/nautobot/mtls/tls.crt`) | +| `NAUTOBOT_DB_SSLKEY` | Optional override | Path to client key for PG (default: `/etc/nautobot/mtls/tls.key`) | +| `NAUTOBOT_DB_SSLROOTCERT` | Optional override | Path to CA cert for PG (default: `/etc/nautobot/mtls/ca.crt`) | +| `NAUTOBOT_REDIS_SSL_CERT_REQS` | Site worker values | Set to `required` to enforce Redis server cert verification | +| `NAUTOBOT_REDIS_SSL_CA_CERTS` | Site worker values | Path to CA cert for Redis | +| `NAUTOBOT_REDIS_SSL_CERTFILE` | Site worker values | Path to client cert for Redis | +| `NAUTOBOT_REDIS_SSL_KEYFILE` | Site worker values | Path to client key for Redis | +| `SSL_CERT_FILE` | Site worker values | System-wide CA bundle override for outbound HTTPS | +| `REQUESTS_CA_BUNDLE` | Site worker values | Python requests library CA bundle override | +| `NAUTOBOT_CONFIG` | Both global and site | Path to `nautobot_config.py` | +| `NAUTOBOT_EXTRA_PLUGINS` | Both global and site values | Comma-separated list of additional plugin module names to load (beyond the open-source defaults). Plugins are loaded only if installed in the container. | +| `NAUTOBOT_EXTRA_PLUGINS_CONFIG` | Both global and site values | JSON object with plugin configuration. Supports `${ENV_VAR}` syntax for referencing environment variables in string values (useful for secrets). Merged into `PLUGINS_CONFIG`. 
| +| `UNDERSTACK_PARTITION` | `cluster-data` ConfigMap (patched by ArgoCD from `appLabels`) | Site partition identifier used by computed fields (e.g. device URN generation). Exposed as a Django setting. | + +## Design Decisions + +- The cert-manager CA hierarchy (self-signed bootstrap -> root CA -> + CA issuer) handles issuance and renewal on both global and site + clusters without manual intervention. + +- CNPG's native TLS support (`serverTLSSecret`, `serverCASecret`, + `clientCASecret`, `replicationTLSSecret`) integrates directly with + cert-manager secrets. No sidecar proxies or custom TLS termination + needed. `clientCASecret` populates PostgreSQL's `ssl_ca_file` for + client cert verification during `pg_hba cert` auth. It points to the + CA public cert secret (`mtls-ca-cert`). `replicationTLSSecret` + provides the streaming replication client cert so CNPG does not need + the CA private key in `clientCASecret`. + +- The `routes.tls` type in the Envoy Gateway template uses a + `gatewayPort` field to support non-443 ports for TLS passthrough. + PostgreSQL (5432) and Redis (6379) both use this route type. + +- The `pg_hba cert` method with CN-to-user mapping means the client + certificate CN (e.g. `app`) maps directly to the PostgreSQL user, so + no additional user mapping configuration is needed. + +- Client certificates are issued on the global cluster by cert-manager + and distributed to site clusters via the external secrets provider. + The CA private key never leaves the global cluster, so a compromised + site cannot forge certificates for other sites. + +- The `nautobot_config.py` SSL logic is conditional on + `NAUTOBOT_DB_SSLMODE`, so the same config file works for both global + pods and site workers. All pods set `verify-ca` to present client + certificates for `pg_hba cert` authentication. + +- The Redis mTLS logic in `nautobot_config.py` auto-detects the CA cert + file at the default mount path. 
If the cert volume is mounted, Redis + mTLS is configured automatically. + +## Known Gotchas + +- **clientCASecret is required for client cert verification.** CNPG + uses `clientCASecret` to populate PostgreSQL's `ssl_ca_file`, which + is what verifies client certificates during `pg_hba cert` auth. + `serverCASecret` only provides the CA cert sent to clients for server + verification -- it does NOT populate `ssl_ca_file`. Without + `clientCASecret`, CNPG auto-generates its own internal replication CA + and uses that for `ssl_ca_file`, causing `tlsv1 alert unknown ca` + errors for external client certs. When providing `clientCASecret`, + you must also set `replicationTLSSecret` so CNPG does not need the + CA private key (`ca.key`) in the `clientCASecret` secret. + +- **SSL config must be conditional.** The mTLS config in + `nautobot_config.py` is gated on the `NAUTOBOT_DB_SSLMODE` env var. + Both global pods and site workers must set it to `verify-ca`. If the + env var is unset, no SSL options are applied and the connection will + be rejected by the `hostssl ... cert` pg_hba rule. + +- **mtls-ca-cert secret contains a private key.** cert-manager + Certificate resources always produce `tls.crt`, `tls.key`, and + `ca.crt`. CNPG only reads `ca.crt` from the referenced secret, so + the extra fields are harmless but not ideal. A future improvement + could use cert-manager `trust-manager` Bundle to distribute only the + CA cert. + +- **ca.crt must be the CA cert, not the client cert.** The `ca.crt` + field in the `nautobot-mtls-client` secret must contain the mTLS CA + certificate (`CN=understack-mtls-ca`), not the client certificate. + If `ca.crt` contains the client cert, the worker will fail with + `[SSL: CERTIFICATE_VERIFY_FAILED] self-signed certificate in + certificate chain` because it can't verify the server's cert chain. + The CA cert credential in your secrets provider is shared across all + sites and only needs to be created once. 
+ +- **PEM data with carriage returns.** Some secrets providers store text + with `\r\n` line endings. PEM certificates with `\r` characters will + fail OpenSSL parsing with `[SSL] PEM lib`. Use the + [`filterPEM`](https://external-secrets.io/latest/guides/templating/#filter-pem-blocks) + template function to extract PEM blocks by type -- it handles + carriage-return stripping automatically. Avoid manual `regexFind` + + `replace "\r" ""` patterns. + +- **ExternalSecret format depends on your secrets provider.** The + ExternalSecret for the mTLS client cert on site clusters must produce + a `kubernetes.io/tls` secret with `tls.crt`, `tls.key`, and `ca.crt`. + How you template this depends on how your secrets provider stores the + credential. + +- **Redis authClients affects all connections.** Redis + `authClients: true` requires ALL clients (including global Nautobot + pods) to present client certificates. The global Nautobot values must + mount the mTLS client cert into both the web server and celery pods, + not just site workers. + +- **pg_hba uses cert auth for all connections.** The single + `hostssl all all 0.0.0.0/0 cert` rule requires every connection -- + local and remote -- to present a valid client certificate over TLS. + All pods (global and site workers) must have `NAUTOBOT_DB_SSLMODE` + set to `verify-ca` and the mTLS client cert mounted. + +- **defaultMode 256 vs 0400.** The `defaultMode: 256` (octal 0400) on + the cert secret volume mount is correct but easy to get wrong. YAML + interprets `0400` as octal (decimal 256) -- writing `256` explicitly + avoids ambiguity. + +- **Client cert CN must match the DB user.** When using `pg_hba cert` + auth, PostgreSQL maps the client certificate CN to the database user. + The site worker client cert must use `commonName: app` to match the + CNPG app user. If the CN doesn't match, the connection is rejected + even with a valid cert. 
+ +## Troubleshooting + +### Worker pod fails to start with FileNotFoundError + +The `nautobot_config.py` validates that cert files exist when +`NAUTOBOT_DB_SSLMODE` is `verify-ca` or `verify-full`. If the +`nautobot-mtls-client` secret doesn't exist or the volume mount is +misconfigured, the pod will crash with: + +```text +FileNotFoundError: SSL certificate file required by NAUTOBOT_DB_SSLCERT not found: /etc/nautobot/mtls/tls.crt +``` + +Check that: + +1. The `nautobot-mtls-client` secret exists on the site cluster: + `kubectl get secret nautobot-mtls-client -n nautobot` +2. The ExternalSecret is syncing successfully: + `kubectl get externalsecret nautobot-mtls-client -n nautobot` +3. The secret contains `tls.crt`, `tls.key`, and `ca.crt` keys +4. On the global cluster, verify the source certificate is issued: + `kubectl get certificate -n nautobot | grep mtls-client` + +### PostgreSQL rejects connection with "tlsv1 alert unknown ca" + +PostgreSQL's `ssl_ca_file` does not contain the CA that signed the +client certificate. This is a TLS-level rejection that happens before +`pg_hba` rules are evaluated. + +The most common cause is that `clientCASecret` is not set on the CNPG +Cluster resource. Without it, CNPG auto-generates its own internal +replication CA and uses that for `ssl_ca_file`. External client certs +signed by the mTLS CA will be rejected. + +Verify what CA PostgreSQL is actually using: + +```bash +kubectl exec -n nautobot nautobot-cluster-1 -c postgres -- \ + openssl x509 -noout -subject -in /controller/certificates/client-ca.crt +``` + +If it shows `CN=nautobot-cluster` (CNPG's internal CA) instead of +`CN=understack-mtls-ca`, set `clientCASecret` and +`replicationTLSSecret` on the CNPG Cluster. See the +[PostgreSQL mTLS](../../operator-guide/nautobot.md#postgresql-mtls) +operator guide for details. + +### PostgreSQL rejects connection with "certificate verify failed" + +The client cert is not signed by the CA that CNPG trusts. 
Verify the +CA chain: + +```bash +# On the site cluster, check the client cert's issuer +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d | openssl x509 -noout -issuer + +# On the global cluster, check the CA cert that CNPG uses +kubectl get secret mtls-ca-cert -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -subject +``` + +The issuer of the client cert should match the subject of the CA cert. + +### PostgreSQL rejects with "no pg_hba.conf entry" + +The connection doesn't match any `pg_hba` rule. Common causes: + +- The client is connecting without TLS but the only matching rule + requires `hostssl` +- The client cert CN doesn't match the DB user (for `cert` auth) +- The source IP doesn't match any rule's CIDR + +### Redis connection refused with "certificate verify failed" + +The `ca.crt` mounted in the pod is not the CA that signed the Redis +server certificate. Verify: + +```bash +# Should show CN=understack-mtls-ca (the CA), NOT CN=app (the client cert) +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -subject +``` + +If it shows the client cert CN, the CA cert credential in your secrets +provider has the wrong content. Update it with the actual CA certificate +from the global cluster's `mtls-ca-cert` secret. + +### Redis connection refused with TLS error + +If Redis has `authClients: true` and the connecting pod doesn't present +a client cert, the TLS handshake fails. Ensure the pod has the mTLS +cert volume mounted and the Redis SSL env vars are set. 
+ +### Envoy Gateway not routing traffic + +If the gateway listener doesn't appear or traffic isn't reaching the +backend: + +```bash +# Check gateway status +kubectl get gateway -n envoy-gateway -o yaml + +# Check TLSRoute status +kubectl get tlsroute -n nautobot -o yaml +``` + +Verify the `fqdn` in the TLS route matches the SNI hostname the client +is connecting to. For PostgreSQL, the `nautobot.db.host` in the worker +values must match the `fqdn` in the envoy-configs route. diff --git a/docs/deploy-guide/components/nautobot.md b/docs/deploy-guide/components/nautobot.md index 4a6b78a7e..04ac46449 100644 --- a/docs/deploy-guide/components/nautobot.md +++ b/docs/deploy-guide/components/nautobot.md @@ -34,6 +34,105 @@ global: enabled: true ``` +## Configuration Architecture + +The `nautobot_config.py` file is managed in git at +`components/nautobot/nautobot_config.py` and injected into pods via the +Helm chart's `fileParameters` feature. ArgoCD reads the file, the Helm +chart creates a ConfigMap, and pods mount it at +`/opt/nautobot/nautobot_config.py`. The `NAUTOBOT_CONFIG` environment +variable tells Nautobot to load from that path. + +The effective configuration is built from four layers: Nautobot defaults, +the component config, Helm chart env vars from the base values, and +deploy repo value overrides. + +For the full details on how `fileParameters` works, why the baked-in +image config is not used, config layering, and the Helm list replacement +gotcha, see the +[Configuration Architecture](../../operator-guide/nautobot.md#configuration-architecture) +operator guide. + +## Plugin Loading + +For details on how plugins are loaded, configured via environment +variables, and how to add custom plugins, see the +[Plugin Loading](../../operator-guide/nautobot.md#plugin-loading) +operator guide. 
+ +## mTLS Certificate Infrastructure + +The global cluster hosts the mTLS CA hierarchy (managed by cert-manager) +used by both the global Nautobot deployment and site-level workers: + +| Resource | Kind | Purpose | +|---|---|---| +| `mtls-selfsigned` | Issuer | Bootstraps the self-signed root | +| `mtls-ca` | Certificate | Root CA (ECDSA P-256, 10yr duration, 1yr renewBefore) | +| `mtls-ca-issuer` | Issuer | Signs all client and server certificates | +| `mtls-ca-cert` | Certificate | CA public cert secret used by CNPG (`clientCASecret` and `serverCASecret`) and Redis for client verification | +| `nautobot-cluster-server-tls` | Certificate | PostgreSQL server certificate | +| `nautobot-cluster-replication` | Certificate | Streaming replication client certificate (`CN=streaming_replica`). Required so CNPG does not need the CA private key in `clientCASecret`. | +| `nautobot-redis-server-tls` | Certificate | Redis server certificate | +| `nautobot-mtls-client` | Certificate | Client certificate for global Nautobot/Celery pods (`CN=app`). Used for both PostgreSQL `pg_hba cert` auth and Redis `authClients`. | + +All resources live in the `nautobot` namespace. + +For certificate renewal and distribution to site clusters, see the +[mTLS Certificate Renewal](../../operator-guide/nautobot-mtls-certificate-renewal.md) +operator guide. + +## Redis mTLS + +The global Redis instance has TLS enabled with `authClients: true` +(Bitnami Redis subchart), requiring client certificates from all +connections -- including local pods on the global cluster. + +The `nautobot_config.py` Redis mTLS logic checks if the CA cert file +exists at the default path (`/etc/nautobot/mtls/ca.crt`). If present, +it configures `ssl_cert_reqs`, `ssl_ca_certs`, `ssl_certfile`, and +`ssl_keyfile` on the Redis connection pool, Celery broker, and Celery +result backend. Both global and site pods automatically pick up Redis +mTLS when the cert volume is mounted. 
+ +Because `authClients: true` applies to all connections (Redis has no +equivalent of `pg_hba` to distinguish local vs remote), the global +Nautobot deploy values must mount the `nautobot-mtls-client` cert into +both the web server and celery pods. + +## PostgreSQL mTLS + +The global CNPG cluster enforces client certificate authentication for +all connections via a single `pg_hba` rule: + +```text +hostssl all all 0.0.0.0/0 cert +``` + +The CNPG Cluster resource configures four certificate fields: + +| Field | Secret | Purpose | +|---|---|---| +| `serverTLSSecret` | `nautobot-cluster-server-tls` | Server cert presented to clients during TLS handshake | +| `serverCASecret` | `mtls-ca-cert` | CA cert sent to clients for server verification (`sslrootcert`) | +| `clientCASecret` | `mtls-ca-cert` | CA cert used by PostgreSQL's `ssl_ca_file` to verify client certs | +| `replicationTLSSecret` | `nautobot-cluster-replication` | Client cert for streaming replication (`CN=streaming_replica`) | + +`clientCASecret` is the critical field for client cert verification. +Without it, CNPG auto-generates its own internal CA and uses that for +`ssl_ca_file`, causing `tlsv1 alert unknown ca` errors for external +client certs signed by the mTLS CA. + +`replicationTLSSecret` must be provided alongside `clientCASecret` so +CNPG does not need the CA private key (`ca.key`) in the +`clientCASecret` secret. Without it, CNPG tries to generate its own +replication cert and fails with `missing ca.key secret data`. + +Both global Nautobot pods and site workers set +`NAUTOBOT_DB_SSLMODE=verify-ca` to present their client certificates +(`CN=app`) during the TLS handshake. The `pg_hba cert` rule maps the +certificate CN to the PostgreSQL user. + ## Deployment Repo Content {{ secrets_disclaimer }} @@ -53,3 +152,18 @@ Optional additions: - `nautobot-custom-env` Secret: Add any extra environment variables the deployment should inject into Nautobot, such as integration credentials or DSNs. 
- `Database cluster and backup manifests`: Add a CloudNativePG cluster, backup schedule, or similar database resources if this deployment owns its own PostgreSQL cluster. - `Catalog and bootstrap content`: Add app definitions, device types, location types, locations, rack groups, or racks if you want Nautobot preloaded with inventory metadata. + +## Known Gotchas + +- **Helm list values are replaced, not merged.** When the deploy repo + values set `extraVolumes` or `extraVolumeMounts`, they completely + replace the base values from `components/nautobot/values.yaml`. If + the base values include volumes (e.g. SSO secret mounts), the deploy + values must re-include them alongside any new volumes. Forgetting this + will silently break features like SSO login. + +- **Redis authClients affects all connections.** Redis + `authClients: true` requires ALL clients (including global Nautobot + pods) to present client certificates. The global Nautobot values must + mount the mTLS client cert into both the web server and celery pods, + not just site workers. diff --git a/docs/operator-guide/nautobot-celery-queues.md b/docs/operator-guide/nautobot-celery-queues.md new file mode 100644 index 000000000..18ced9edb --- /dev/null +++ b/docs/operator-guide/nautobot-celery-queues.md @@ -0,0 +1,258 @@ +# Nautobot Celery Queues + +This guide covers how Celery task queues work in the understack +nautobot-worker deployment, how the queue name is derived from the +site partition, and how to route jobs to site-specific queues +programmatically. + +## How the Queue Name is Set + +The ArgoCD Application template for `nautobot-worker` automatically +sets the Celery queue name to match the site's partition label +(`understack.rackspace.com/partition`). The relevant section in +`application-nautobot-worker.yaml`: + +{% raw %} + +```yaml +{{- with index $.Values.appLabels "understack.rackspace.com/partition" }} +values: | + workers: + default: + enabled: false + {{ . 
}}: + enabled: true + taskQueues: {{ . | quote }} +{{- end }} +``` + +{% endraw %} + +For a site with partition `rax-dev`, this renders as: + +```yaml +workers: + default: + enabled: false + rax-dev: + enabled: true + taskQueues: "rax-dev" +``` + +This produces a Deployment named `nautobot-worker-celery-rax-dev` with +the label `app.kubernetes.io/component: nautobot-celery-rax-dev` and +the environment variable `CELERY_TASK_QUEUES=rax-dev`. + +### Why workers.default must be disabled + +The upstream Nautobot Helm chart defines `workers.default.taskQueues: +"default"` in its own `values.yaml`. The chart's `nautobot.workers` +helper merges worker-specific values on top of the `celery` defaults. +If you only set `celery.taskQueues`, the chart's `workers.default` +overrides it because worker-level values take precedence. Disabling +`workers.default` and creating a new worker key avoids this conflict. + +## Nautobot JobQueue Setup + +Before any job can be dispatched to a site queue, a `JobQueue` record +must exist in Nautobot's database. Without it, the API rejects the +request with a validation error. + +### Create via the UI + +Navigate to Jobs > Job Queues > Add and create a queue with: + +- Name: `rax-dev` (must match the worker's `taskQueues` value) +- Queue Type: `celery` + +### Create via the REST API + +```bash +curl -X POST \ + -H "Authorization: Token $TOKEN" \ + -H "Content-Type: application/json" \ + https://nautobot.example.com/api/extras/job-queues/ \ + --data '{"name": "rax-dev", "queue_type": "celery"}' +``` + +### Create via pynautobot + +```python +import pynautobot + +nb = pynautobot.api("https://nautobot.example.com", token="your-token") +nb.extras.job_queues.create(name="rax-dev", queue_type="celery") +``` + +### Automate via Ansible + +The `ansible/roles/jobs/tasks/main.yml` role enables Rackspace jobs +but does not currently create JobQueues. 
You can extend it: + +{% raw %} + +```yaml +- name: "Ensure partition JobQueue exists" + ansible.builtin.uri: + url: "{{ nautobot_url }}/api/extras/job-queues/" + method: POST + headers: + Authorization: "Token {{ nautobot_token }}" + body_format: json + body: + name: "{{ partition }}" + queue_type: "celery" + status_code: [200, 201, 400] +``` + +{% endraw %} + +## Assigning Jobs to Queues + +A job must list the queue in its allowed queues before it can be +dispatched there. There are three ways to do this. + +### Option 1: In the Job class (code) + +Set `task_queues` in the Job's Meta class. This is baked into the +job's source code and applies everywhere the job is installed. + +```python +from nautobot.apps.jobs import Job + +class SyncSiteConfig(Job): + class Meta: + name = "Sync Site Config" + task_queues = ["rax-dev", "default"] +``` + +### Option 2: Via the Nautobot UI + +Navigate to Jobs > Jobs, select the job, click Edit, and add the +desired JobQueue(s) under the Job Queues field. Check "Override +job queues" to use the UI-configured queues instead of the ones +defined in code. + +### Option 3: Via the REST API + +```bash +curl -X PATCH \ + -H "Authorization: Token $TOKEN" \ + -H "Content-Type: application/json" \ + https://nautobot.example.com/api/extras/jobs/$JOB_ID/ \ + --data '{ + "job_queues": [{"name": "rax-dev"}, {"name": "default"}], + "job_queues_override": true + }' +``` + +## Running Jobs on a Specific Queue + +### Via pynautobot + +```python +import pynautobot + +nb = pynautobot.api("https://nautobot.example.com", token="your-token") + +job = nb.extras.jobs.get(name="my_app.jobs.SyncSiteConfig") + +# Run on the rax-dev site worker +result = job.run(data={"device": "server-01"}, task_queue="rax-dev") +``` + +The `task_queue` parameter (or `job_queue` -- both are accepted in +Nautobot 2.4+) tells Nautobot to dispatch the Celery task to the +specified queue. The site worker listening on that queue picks it up. 
+ +### Via the REST API + +```bash +curl -X POST \ + -H "Authorization: Token $TOKEN" \ + -H "Content-Type: application/json" \ + https://nautobot.example.com/api/extras/jobs/$JOB_ID/run/ \ + --data '{ + "data": {"device": "server-01"}, + "task_queue": "rax-dev" + }' +``` + +### Via the Nautobot UI + +When running a job from the web UI, if the job has multiple queues +configured, a dropdown appears allowing you to select the target +queue before clicking "Run Job". + +### Default behavior + +If `task_queue` is not specified, Nautobot dispatches the job to the +job's `default_job_queue`. If no default is configured, it falls back +to `CELERY_TASK_DEFAULT_QUEUE` (typically `"default"`). + +## Validation + +Nautobot validates two things before accepting a job run request: + +1. The requested queue must be in the job's allowed queues list. + If not, the API returns: + `{"task_queue": ["\"rax-dev\" is not a valid choice."]}` + +2. At least one Celery worker must be actively listening on the + requested queue. If no worker is found, the API returns a + `CeleryWorkerNotRunningException`. This check uses Celery's + `inspect` to count active workers on the queue. + +## Verifying Workers are Listening + +To confirm a site worker is consuming from the correct queue: + +```bash +# Check the CELERY_TASK_QUEUES env var in the running pod +kubectl -n nautobot get deploy nautobot-worker-celery-rax-dev \ + -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="CELERY_TASK_QUEUES")].value}' + +# Check worker logs for the queue binding +kubectl logs -n nautobot \ + -l app.kubernetes.io/component=nautobot-celery-rax-dev \ + --tail=20 | grep "ready" +``` + +## Multiple Sites + +Each site gets its own queue named after its partition. 
For example:
+
+| Site | Partition | Queue Name | Deployment |
+|---|---|---|---|
+| DC1 Staging | dc1-staging | dc1-staging | nautobot-worker-celery-dc1-staging |
+| DC1 Prod | dc1-prod | dc1-prod | nautobot-worker-celery-dc1-prod |
+| DC2 Prod | dc2-prod | dc2-prod | nautobot-worker-celery-dc2-prod |
+| DC3 Prod | dc3-prod | dc3-prod | nautobot-worker-celery-dc3-prod |
+
+Each site's worker only processes tasks from its own queue. The global
+Nautobot instance dispatches jobs to the appropriate queue based on the
+`task_queue` parameter in the API call.
+
+## Troubleshooting
+
+### "is not a valid choice" when running a job
+
+The job does not have the requested queue in its allowed queues. Either:
+
+- Add the queue to the job's `task_queues` in code, or
+- Add the JobQueue to the job via the UI/API with `job_queues_override: true`
+
+### CeleryWorkerNotRunningException
+
+No worker is listening on the requested queue. Check:
+
+- The site's nautobot-worker ArgoCD Application is synced and healthy
+- The worker pod is running: `kubectl get pods -n nautobot -l app.kubernetes.io/component=nautobot-celery-<partition>`
+- The `CELERY_TASK_QUEUES` env var matches the queue name
+
+### Job runs but nothing happens
+
+The job was dispatched to a queue that no worker is consuming. This
+can happen if `task_queue` was not specified and the job defaulted to
+`"default"`, but the site worker is listening on `"rax-dev"`. Always
+pass `task_queue` explicitly when targeting a site worker.
diff --git a/docs/operator-guide/nautobot-mtls-certificate-renewal.md b/docs/operator-guide/nautobot-mtls-certificate-renewal.md
new file mode 100644
index 000000000..8ed9d37df
--- /dev/null
+++ b/docs/operator-guide/nautobot-mtls-certificate-renewal.md
@@ -0,0 +1,131 @@
+# Nautobot mTLS Certificate Renewal
+
+This guide covers how mTLS client certificates used by site-level
+Nautobot workers are renewed and distributed across clusters.
+
+For background on the mTLS architecture and certificate infrastructure,
+see the [nautobot-worker deploy guide](../deploy-guide/components/nautobot-worker.md).
+
+## How Certificates Are Issued
+
+Client certificates are issued by cert-manager on the global cluster
+using the `mtls-ca-issuer` (backed by a self-signed root CA). Each site
+gets its own Certificate resource:
+
+```yaml
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: nautobot-mtls-client-<site>
+  namespace: nautobot
+spec:
+  secretName: nautobot-mtls-client-<site>
+  duration: 26280h  # 3 years
+  renewBefore: 720h  # 30 days
+  commonName: app
+  usages:
+    - client auth
+  privateKey:
+    algorithm: ECDSA
+    size: 256
+  issuerRef:
+    name: mtls-ca-issuer
+    kind: Issuer
+```
+
+cert-manager automatically renews the certificate 30 days before
+expiry, updating the Kubernetes secret on the global cluster.
+
+The global cluster also has:
+
+- `nautobot-mtls-client` -- client cert for global Nautobot/Celery pods
+  (`CN=app`). Renewed automatically by cert-manager.
+- `nautobot-cluster-replication` -- streaming replication client cert
+  (`CN=streaming_replica`). Renewed automatically by cert-manager.
+  Required so CNPG does not need the CA private key in
+  `clientCASecret`.
+
+## The Distribution Problem
+
+cert-manager handles renewal on the global cluster automatically. The
+challenge is getting the renewed certificate to the site cluster. The
+site cluster pulls the cert from an external secrets provider via an
+ExternalSecret resource. When cert-manager renews the cert, the updated
+material must be pushed to the secrets provider so the site
+ExternalSecret picks it up on its next refresh cycle.
+
+By default, this is a manual process: an operator extracts the renewed
+cert from the global cluster and uploads it to the secrets provider.
+ +## Automation Approaches + +### PushSecret (External Secrets Operator) + +Use a [PushSecret](https://external-secrets.io/latest/guides/pushsecrets/) +resource on the global cluster to automatically push the renewed cert +to your secrets provider whenever the Kubernetes secret changes. This +is event-driven and requires no CronJob. + +This is the recommended approach if your secrets provider is supported +by the External Secrets Operator. + +### CronJob on the Global Cluster + +A Kubernetes CronJob that runs periodically, reads the cert secret, and +pushes it to your secrets provider via its API. Simple to implement but +introduces a delay between renewal and distribution (up to the CronJob +interval). + +### Cross-Cluster Secret Replication + +Use a tool like +[Kubernetes Replicator](https://github.com/mittwald/kubernetes-replicator) +to copy the cert secret directly from the global cluster to site +clusters, bypassing the secrets provider entirely. Requires network +connectivity between clusters and appropriate RBAC. + +### CertificateRequest from Site Clusters + +The site cluster creates a cert-manager +[CertificateRequest](https://cert-manager.io/docs/usage/certificaterequest/), +an operator on the global cluster approves and signs it, and the signed +cert is returned. This is similar to how kubelet certificate management +works in Kubernetes. Most complex to set up but fully automated with no +intermediate secrets provider. 
+
+## Monitoring Certificate Expiry
+
+Check certificate status on the global cluster:
+
+```bash
+# List all mTLS client certificates and their expiry
+kubectl get certificate -n nautobot -o custom-columns='NAME:.metadata.name,READY:.status.conditions[0].status,EXPIRY:.status.notAfter,RENEWAL:.status.renewalTime'
+
+# Check a specific site's certificate
+kubectl describe certificate nautobot-mtls-client-<site> -n nautobot
+```
+
+On the site cluster, verify the ExternalSecret is syncing:
+
+```bash
+kubectl get externalsecret nautobot-mtls-client -n nautobot
+```
+
+If the ExternalSecret shows `SecretSyncedError`, the credential in
+your secrets provider may be stale or missing.
+
+## What Happens When a Certificate Expires
+
+If a site worker's client certificate expires before it is renewed and
+distributed:
+
+- PostgreSQL connections fail with `SSL error: certificate has expired`
+- Redis connections fail with `[SSL: CERTIFICATE_VERIFY_FAILED]`
+- The worker pod stays running but all tasks fail
+- The health check reports Redis as unavailable
+
+To recover, manually extract the renewed cert from the global cluster
+and upload it to your secrets provider. The site ExternalSecret will
+pick it up on the next refresh cycle, and the worker pods will
+automatically get the new cert on their next restart (or when the
+secret volume is refreshed by kubelet).
diff --git a/docs/operator-guide/nautobot.md b/docs/operator-guide/nautobot.md index a066fe8f5..7139a046b 100644 --- a/docs/operator-guide/nautobot.md +++ b/docs/operator-guide/nautobot.md @@ -1,5 +1,435 @@ # Nautobot +## Related Guides + +- [Nautobot Celery Queues](nautobot-celery-queues.md) -- configuring + per-site Celery task queues and routing jobs to site-specific workers +- [mTLS Certificate Renewal](nautobot-mtls-certificate-renewal.md) -- + how mTLS client certificates for site workers are renewed and + distributed across clusters + +## PostgreSQL mTLS + +All PostgreSQL connections -- both from global Nautobot pods and +site-level workers -- use mutual TLS with client certificate +authentication. The CNPG cluster enforces this with a single `pg_hba` +rule: + +```text +hostssl all all 0.0.0.0/0 cert +``` + +This means every client must connect over TLS and present a valid +client certificate signed by the mTLS CA. The certificate CN is mapped +to the PostgreSQL user (`app`). + +### CNPG Certificate Configuration + +The CNPG Cluster resource has four certificate fields. Understanding +what each one does is critical for troubleshooting TLS errors: + +| Field | Secret | What CNPG Does With It | +|---|---|---| +| `serverTLSSecret` | `nautobot-cluster-server-tls` | Mounted as the PostgreSQL server cert. Presented to clients during the TLS handshake. | +| `serverCASecret` | `mtls-ca-cert` | The `ca.crt` from this secret is sent to clients so they can verify the server cert (`sslrootcert` on the client side). | +| `clientCASecret` | `mtls-ca-cert` | The `ca.crt` from this secret populates PostgreSQL's `ssl_ca_file`. This is what PostgreSQL uses to verify client certificates during `pg_hba cert` auth. | +| `replicationTLSSecret` | `nautobot-cluster-replication` | Client cert (`CN=streaming_replica`) used for streaming replication between PostgreSQL instances. | + +Key points: + +- `clientCASecret` is the field that controls client cert verification. 
+ Without it, CNPG auto-generates its own internal CA and uses that for + `ssl_ca_file`. External client certs signed by the mTLS CA will be + rejected with `tlsv1 alert unknown ca`. +- `serverCASecret` does NOT populate `ssl_ca_file`. It only provides + the CA cert that clients use to verify the server. This is a common + source of confusion. +- `replicationTLSSecret` must be provided when setting `clientCASecret`. + Without it, CNPG tries to generate its own replication cert and needs + `ca.key` in the `clientCASecret` secret. Since `mtls-ca-cert` only + has `ca.crt` (not the CA private key), CNPG fails with + `missing ca.key secret data`. +- Both `clientCASecret` and `serverCASecret` can point to the same + secret (`mtls-ca-cert`) when the same CA signs both server and client + certificates. + +### How nautobot_config.py Handles SSL + +The `nautobot_config.py` SSL logic is gated on the `NAUTOBOT_DB_SSLMODE` +environment variable: + +| Value | Behavior | Use Case | +|---|---|---| +| `verify-ca` | Sets `sslmode`, `sslcert`, `sslkey`, `sslrootcert` on the Django DB connection. Validates cert files exist at startup. | Global pods and site workers (production). | +| `verify-full` | Same as `verify-ca` but also verifies the server hostname matches the cert. | Stricter verification if needed. | +| `require` | Sets `sslmode=require` only. Encrypts the connection but does not present a client cert or verify the server CA. | Not suitable for `pg_hba cert` -- use `verify-ca` instead. | +| Unset or empty | No SSL options applied. Plain TCP connection. | Will be rejected by `hostssl ... cert` pg_hba rule. | + +All pods (global and site) must set `NAUTOBOT_DB_SSLMODE=verify-ca` in +their `extraEnvVars` and have the mTLS client cert volume mounted at +`/etc/nautobot/mtls/`. 
+ +### Verifying the Certificate Chain + +To confirm the CNPG cluster is using the correct CA for client cert +verification: + +```bash +# Check what CA PostgreSQL is using for ssl_ca_file +kubectl exec -n nautobot nautobot-cluster-1 -c postgres -- \ + openssl x509 -noout -subject -issuer \ + -in /controller/certificates/client-ca.crt +# Expected: subject=CN=understack-mtls-ca + +# Check the client cert CN and issuer +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d | \ + openssl x509 -noout -subject -issuer +# Expected: subject=CN=app, issuer=CN=understack-mtls-ca + +# Verify the client cert against the CA +kubectl get secret mtls-ca-cert -n nautobot \ + -o jsonpath='{.data.ca\.crt}' | base64 -d > /tmp/ca.crt +kubectl get secret nautobot-mtls-client -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d > /tmp/client.crt +openssl verify -CAfile /tmp/ca.crt /tmp/client.crt +# Expected: /tmp/client.crt: OK +``` + +### Common Errors + +| Error | Cause | Fix | +|---|---|---| +| `tlsv1 alert unknown ca` | `clientCASecret` not set or points to wrong secret. CNPG uses its internal CA for `ssl_ca_file`. | Set `clientCASecret: mtls-ca-cert` and `replicationTLSSecret: nautobot-cluster-replication`. | +| `missing ca.key secret data` | `clientCASecret` set but `replicationTLSSecret` not provided. CNPG needs CA key to generate replication certs. | Add `replicationTLSSecret` with a cert-manager Certificate (`CN=streaming_replica`). | +| `connection requires a valid client certificate` | Client connected over TLS but did not present a cert. | Set `NAUTOBOT_DB_SSLMODE=verify-ca` on the pod. | +| `certificate authentication failed for user` | Client cert CN does not match the PostgreSQL user. | Ensure cert has `commonName: app`. | +| `x509: certificate signed by unknown authority` (CNPG status) | Old replication secret signed by CNPG's internal CA, not the mTLS CA. 
| Delete the old secret: `kubectl delete secret nautobot-cluster-replication -n nautobot`. cert-manager recreates it. | +| `no pg_hba.conf entry` | Client is not connecting over TLS, or the source IP / auth method does not match any rule. | Ensure `NAUTOBOT_DB_SSLMODE=verify-ca` is set. Check that the pg_hba rules cover the connection type. | + +### Forcing CNPG to Reconcile + +After changing certificate fields on the CNPG Cluster resource, the +operator may not immediately pick up the change. Force a reconcile: + +```bash +kubectl annotate cluster nautobot-cluster -n nautobot \ + cnpg.io/reconcile=$(date +%s) --overwrite +``` + +Check the result: + +```bash +kubectl get cluster nautobot-cluster -n nautobot \ + -o jsonpath='{.status.phase}{"\n"}{.status.phaseReason}{"\n"}' +``` + +If the phase is healthy, the change was applied. If it shows an error, +see the Common Errors table above. + +### Handling Stale CNPG-Managed Secrets + +When adding `replicationTLSSecret`, CNPG may have already created a +secret with the same name (e.g. `nautobot-cluster-replication`) using +its internal CA. cert-manager will not overwrite a secret it did not +create. You must delete the old secret first: + +```bash +kubectl delete secret nautobot-cluster-replication -n nautobot +# cert-manager recreates it within seconds, signed by mtls-ca-issuer +``` + +Verify the new secret: + +```bash +kubectl get secret nautobot-cluster-replication -n nautobot +# Should show DATA=3 (tls.crt, tls.key, ca.crt) + +kubectl get secret nautobot-cluster-replication -n nautobot \ + -o jsonpath='{.data.tls\.crt}' | base64 -d | \ + openssl x509 -noout -subject -issuer +# Expected: subject=CN=streaming_replica, issuer=CN=understack-mtls-ca +``` + +Then force a CNPG reconcile (see above). + +### Restarting CNPG Pods + +If the CNPG pods have not picked up updated certificate secrets (e.g. 
+`client-ca.crt` still shows the old CA), use the `cnpg` kubectl plugin
+to perform a rolling restart:
+
+```bash
+kubectl cnpg restart nautobot-cluster -n nautobot
+```
+
+This performs a rolling restart of all instances, handling replica/primary
+ordering automatically and waiting for each pod to be ready before
+proceeding.
+
+If you only need pods to reload configuration (e.g. updated `pg_hba`
+or PostgreSQL parameters) without a full restart:
+
+```bash
+kubectl cnpg reload nautobot-cluster -n nautobot
+```
+
+### pg_hba Behavior
+
+pg_hba rules are evaluated top-to-bottom. PostgreSQL stops at the first
+rule matching the connection type and source IP. If authentication fails
+on that rule, the connection is rejected -- it does NOT fall through to
+the next rule. This means that if two rules share the same
+`hostssl all all 0.0.0.0/0` prefix, the second is unreachable. Use
+CIDR scoping if you need different auth methods for different source
+networks.
+
+### Rollback to Password Auth
+
+To revert global pods to password-based auth while keeping cert auth
+for site workers:
+
+1. Add back the `host` rule for local pods:
+
+   ```yaml
+   postgresql:
+     pg_hba:
+       - host all all 10.0.0.0/8 scram-sha-256
+       - hostssl all all 0.0.0.0/0 cert
+   ```
+
+2. Remove `NAUTOBOT_DB_SSLMODE` from global pod `extraEnvVars` (keep
+   it on site workers).
+
+3. Optionally remove `clientCASecret` and `replicationTLSSecret` from
+   the CNPG spec to let CNPG manage its own replication CA again.
+
+## Configuration Architecture
+
+Nautobot requires a `nautobot_config.py` file that defines Django
+settings, plugin loading, database options, and authentication
+backends. In understack, this file lives at
+`components/nautobot/nautobot_config.py` and is injected into pods
+using the Helm chart's `fileParameters` feature.
+
+### How fileParameters Works
+
+Both the `nautobot` and `nautobot-worker` ArgoCD Applications use a
+multi-source setup. 
The Helm chart source includes: + +```yaml +helm: + fileParameters: + - name: nautobot.config + path: $understack/components/nautobot/nautobot_config.py +``` + +ArgoCD reads the file content from the understack git repo and passes +it as the `nautobot.config` Helm value. The Nautobot Helm chart then +creates a ConfigMap from that content and mounts it into pods at +`/opt/nautobot/nautobot_config.py`. The `NAUTOBOT_CONFIG` environment +variable (set in the deploy repo values) tells Nautobot to load its +configuration from that path. + +This approach means: + +- The config file is version-controlled in git alongside the component + it configures +- Changes to the config trigger ArgoCD syncs and pod restarts + automatically (the Helm chart checksums the ConfigMap) +- The same config file is shared by both the global nautobot deployment + and site-level workers, avoiding drift + +### Why Not Use the Baked-In Config? + +Container images may include their own `nautobot_config.py` at build +time (e.g. at `/opt/nautobot_config/nautobot_config.py`). While this +works for simple deployments, it has limitations: + +- Config changes require rebuilding and redeploying the container image +- Different deployments (global vs site workers) may need different + settings (e.g. mTLS, plugin sets) but share the same image +- Private deployment-specific settings (plugin credentials, SSO config) + get baked into the image + +The Helm `fileParameters` approach decouples the config from the image. +The image provides the runtime (Nautobot + installed plugins), while +the git-managed config and deploy-repo environment variables control +behavior. 
This separation allows: + +- The same container image to be used across global and site deployments + with different configurations +- mTLS, SSL, and other connection settings to be conditional on + environment variables rather than hardcoded +- Private plugin configuration to be injected via environment variables + in the deploy repo without modifying the public config file + +### Config Layering + +The effective configuration is built from multiple layers: + +1. **Nautobot defaults** -- `from nautobot.core.settings import *` + provides all default Django and Nautobot settings +2. **Component config** -- `components/nautobot/nautobot_config.py` + overrides defaults with understack-specific settings (mTLS, plugin + loading, SSO, partition identifier) +3. **Helm chart env vars** -- the base `components/nautobot/values.yaml` + sets database, Redis, and other connection parameters as environment + variables that the config reads via `os.getenv()` +4. **Deploy repo values** -- site-specific overrides (hostnames, image + tags, extra plugins, credentials) that Helm merges on top of the + base values + +### Important: Helm List Replacement + +Helm merges scalar and map values from multiple value files, but +**replaces lists entirely**. If the base `components/nautobot/values.yaml` +defines: + +```yaml +nautobot: + extraVolumes: + - name: nautobot-sso + secret: + secretName: nautobot-sso +``` + +And the deploy repo values set: + +```yaml +nautobot: + extraVolumes: + - name: mtls-certs + secret: + secretName: nautobot-mtls-client +``` + +The result is **only** `mtls-certs` -- the `nautobot-sso` volume is +gone. The deploy values must re-include any base volumes they need to +preserve. + +## Plugin Loading + +The shared `nautobot_config.py` (mounted via Helm `fileParameters`) +uses a generic plugin loading mechanism that works across different +container images and deployments: + +1. 
Open-source plugins (`nautobot_plugin_nornir`, `nautobot_golden_config`) + are loaded automatically if installed in the container image. +2. Additional plugins can be specified via the `NAUTOBOT_EXTRA_PLUGINS` + environment variable (comma-separated module names). Each plugin is + loaded only if it's actually installed in the container -- missing + plugins are silently skipped. +3. Plugin configuration is provided via the `NAUTOBOT_EXTRA_PLUGINS_CONFIG` + environment variable as a JSON object. This supports `${ENV_VAR}` + syntax for referencing environment variables in string values, which + is useful for injecting secrets at runtime without hardcoding them in + the config. + +This design allows the same `nautobot_config.py` to be used by both +the global Nautobot deployment (which may have additional private +plugins) and site workers (which may have a different plugin set), +without any deployment-specific code in the public repository. + +Example deploy values for adding custom plugins: + +```yaml +nautobot: + extraEnvVars: + - name: NAUTOBOT_EXTRA_PLUGINS + value: 'my_custom_plugin,another_plugin' + - name: NAUTOBOT_EXTRA_PLUGINS_CONFIG + value: '{"my_custom_plugin":{"API_KEY":"${MY_API_KEY}"}}' +``` + +### Current Limitations + +The `NAUTOBOT_EXTRA_PLUGINS_CONFIG` environment variable works but has +ergonomic drawbacks as the number of plugins grows: + +- All plugin config is a single JSON string in the deploy values, which + becomes hard to read and review in PRs +- JSON cannot express Python-native types like `None` or call functions + like `is_truthy()` -- only plain JSON types (`null`, `false`, etc.) +- Adding or removing a plugin means editing a long inline JSON blob + +### Future Improvement: Per-Plugin Config Files + +A cleaner approach for deployments with many plugins is to store each +plugin's configuration as a separate JSON file in the deploy repo, +managed via a Kustomize `configMapGenerator`, and mounted into the pod +as a directory. 
The `nautobot_config.py` would then glob that directory +and load each file into `PLUGINS_CONFIG`. + +Example structure in the deploy repo: + +```text +/nautobot/plugin-configs/ + nautobot_golden_config.json + my_custom_plugin.json + vni_custom_model.json +``` + +Each file contains the plugin's config as a JSON object: + +```json title="my_custom_plugin.json" +{ + "API_KEY": "${MY_API_KEY}", + "TIMEOUT": 30 +} +``` + +A Kustomize `configMapGenerator` creates a ConfigMap from the directory: + +```yaml title="kustomization.yaml" +configMapGenerator: + - name: nautobot-plugin-configs + files: + - plugin-configs/nautobot_golden_config.json + - plugin-configs/my_custom_plugin.json + options: + disableNameSuffixHash: true +``` + +The deploy values mount it as a volume: + +```yaml +nautobot: + extraVolumes: + - name: plugin-configs + configMap: + name: nautobot-plugin-configs + extraVolumeMounts: + - name: plugin-configs + mountPath: /etc/nautobot/plugin-configs + readOnly: true +``` + +And the `nautobot_config.py` loads all files from the directory: + +```python +import glob, json, os, re + +def _interpolate_env(obj): + if isinstance(obj, str): + return re.sub(r"\$\{(\w+)\}", lambda m: os.environ.get(m.group(1), ""), obj) + if isinstance(obj, dict): + return {k: _interpolate_env(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_interpolate_env(v) for v in obj] + return obj + +for _path in sorted(glob.glob("/etc/nautobot/plugin-configs/*.json")): + _name = os.path.splitext(os.path.basename(_path))[0] + with open(_path) as _f: + PLUGINS_CONFIG[_name] = _interpolate_env(json.load(_f)) +``` + +This gives each plugin its own readable file, makes PRs easy to review, +and keeps the `${ENV_VAR}` interpolation for secrets. It can be +implemented alongside the current env var approach without breaking +existing deployments. 
+ ## Nautobot Django shell You can access the Nautobot Django shell by connecting to the pod and running the diff --git a/mkdocs.yml b/mkdocs.yml index f2c0b2967..c791d971a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -191,6 +191,7 @@ nav: - deploy-guide/components/nautobot-site.md - deploy-guide/components/nautobot.md - deploy-guide/components/nautobotop.md + - deploy-guide/components/nautobot-worker.md - deploy-guide/components/neutron.md - deploy-guide/components/nova.md - deploy-guide/components/octavia.md @@ -240,6 +241,8 @@ nav: - operator-guide/rook-ceph.md - operator-guide/nautobot.md - operator-guide/nautobotop.md + - operator-guide/nautobot-celery-queues.md + - operator-guide/nautobot-mtls-certificate-renewal.md - operator-guide/troubleshooting-osh.md - operator-guide/logging.md - operator-guide/ansible-local-usage.md