Skip to content
25 changes: 21 additions & 4 deletions cdk/src/constructs/approval-metrics-publisher-consumer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import * as path from 'path';
import { Duration, RemovalPolicy } from 'aws-cdk-lib';
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
import { FilterCriteria, FilterRule, StartingPosition, Architecture, Runtime } from 'aws-cdk-lib/aws-lambda';
import { DynamoEventSource, SqsDlq } from 'aws-cdk-lib/aws-lambda-event-sources';
Expand Down Expand Up @@ -84,6 +85,8 @@ export interface ApprovalMetricsPublisherConsumerProps {
export class ApprovalMetricsPublisherConsumer extends Construct {
public readonly fn: lambda.NodejsFunction;
public readonly dlq: sqs.Queue;
/** CloudWatch alarm that fires when the DLQ has at least one poison-pill record. */
public readonly dlqAlarm: cloudwatch.IAlarm;

constructor(scope: Construct, id: string, props: ApprovalMetricsPublisherConsumerProps) {
super(scope, id);
Expand All @@ -93,10 +96,7 @@ export class ApprovalMetricsPublisherConsumer extends Construct {
this.dlq = new sqs.Queue(this, 'ApprovalMetricsPublisherDlq', {
// Persistent failures (malformed records the handler's
// per-record try/catch throws on three times in a row) land
// here for operator inspection. Alarm wiring is deferred to
// Chunk 10 follow-ups — until a notification channel is wired
// to SNS, an alarm on ``ApproximateNumberOfMessagesVisible``
// would fire into the void.
// here for operator inspection.
retentionPeriod: Duration.days(14),
enforceSSL: true,
});
Expand Down Expand Up @@ -153,6 +153,23 @@ export class ApprovalMetricsPublisherConsumer extends Construct {
filters: [agentMilestoneFilter],
}));

// #117: alarm on DLQ depth so poison-pill records don't silently
// accumulate without operator visibility.
this.dlqAlarm = new cloudwatch.Alarm(this, 'DlqMessageAlarm', {
metric: this.dlq.metricApproximateNumberOfMessagesVisible({
period: Duration.minutes(5),
statistic: 'Maximum',
}),
threshold: 1,
evaluationPeriods: 1,
comparisonOperator: cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
alarmDescription:
'ApprovalMetricsPublisher DLQ has at least one message; investigate poison records. ' +
'Check CloudWatch Logs for the ApprovalMetricsPublisherFn error that caused the DLQ send. ' +
'See: https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/117',
treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
});

NagSuppressions.addResourceSuppressions(this.fn, [
{
id: 'AwsSolutions-IAM4',
Expand Down
20 changes: 20 additions & 0 deletions cdk/src/constructs/fanout-consumer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import * as path from 'path';
import { Duration, RemovalPolicy } from 'aws-cdk-lib';
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
import * as iam from 'aws-cdk-lib/aws-iam';
import { StartingPosition, Architecture, Runtime } from 'aws-cdk-lib/aws-lambda';
Expand Down Expand Up @@ -106,6 +107,8 @@ export interface FanOutConsumerProps {
export class FanOutConsumer extends Construct {
public readonly fn: lambda.NodejsFunction;
public readonly dlq: sqs.Queue;
/** CloudWatch alarm that fires when the DLQ has at least one poison-pill record. */
public readonly dlqAlarm: cloudwatch.IAlarm;

constructor(scope: Construct, id: string, props: FanOutConsumerProps) {
super(scope, id);
Expand Down Expand Up @@ -185,6 +188,23 @@ export class FanOutConsumer extends Construct {
reportBatchItemFailures: true,
}));

// #117: alarm on DLQ depth so poison-pill records don't silently
// accumulate without operator visibility.
this.dlqAlarm = new cloudwatch.Alarm(this, 'DlqMessageAlarm', {
metric: this.dlq.metricApproximateNumberOfMessagesVisible({
period: Duration.minutes(5),
statistic: 'Maximum',
}),
threshold: 1,
evaluationPeriods: 1,
comparisonOperator: cloudwatch.ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
alarmDescription:
'FanOutConsumer DLQ has at least one message; investigate poison records. ' +
'Check CloudWatch Logs for the FanOutFn error that caused the DLQ send. ' +
'See: https://github.com/aws-samples/sample-autonomous-cloud-coding-agents/issues/117',
treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
});

NagSuppressions.addResourceSuppressions(this.fn, [
{
id: 'AwsSolutions-IAM4',
Expand Down
15 changes: 15 additions & 0 deletions cdk/test/constructs/approval-metrics-publisher-consumer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,19 @@ describe('ApprovalMetricsPublisherConsumer', () => {
// expect exactly one Table in the synthesized stack.
template.resourceCountIs('AWS::DynamoDB::Table', 1);
});

test('creates a CloudWatch alarm on DLQ ApproximateNumberOfMessagesVisible (#117)', () => {
const { template } = createStack();

template.hasResourceProperties('AWS::CloudWatch::Alarm', {
MetricName: 'ApproximateNumberOfMessagesVisible',
Namespace: 'AWS/SQS',
Threshold: 1,
EvaluationPeriods: 1,
ComparisonOperator: 'GreaterThanOrEqualToThreshold',
TreatMissingData: 'notBreaching',
Statistic: 'Maximum',
Period: 300,
});
});
});
20 changes: 20 additions & 0 deletions cdk/test/constructs/fanout-consumer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -170,4 +170,24 @@ describe('FanOutConsumer', () => {
expect(vars.TASK_TABLE_NAME).toBeUndefined();
}
});

test('creates a CloudWatch alarm on DLQ ApproximateNumberOfMessagesVisible (#117)', () => {
const app = new App();
const stack = new Stack(app, 'TestStack');
new FanOutConsumer(stack, 'FanOut', {
taskEventsTable: makeTaskEventsTable(stack),
});
const template = Template.fromStack(stack);

template.hasResourceProperties('AWS::CloudWatch::Alarm', {
MetricName: 'ApproximateNumberOfMessagesVisible',
Namespace: 'AWS/SQS',
Threshold: 1,
EvaluationPeriods: 1,
ComparisonOperator: 'GreaterThanOrEqualToThreshold',
TreatMissingData: 'notBreaching',
Statistic: 'Maximum',
Period: 300,
});
});
});
21 changes: 13 additions & 8 deletions docs/design/CEDAR_HITL_GATES.md
Original file line number Diff line number Diff line change
Expand Up @@ -1585,21 +1585,26 @@ Extend `TaskDashboard` (`cdk/src/constructs/task-dashboard.ts`). These are read-

Every `agent_milestone("approval_*")` event carries `trace_id` / `span_id`. A span `hitl.approval_wait` brackets the PreToolUse poll loop: `span.duration = decided_at - created_at`. `hitl.approval_race_loss` emitted when the agent's local timeout fired <5s before a late user decision (useful for tuning).

### 11.5 CloudWatch alarms — deferred (notification-channel gated)
### 11.5 CloudWatch alarms

**DLQ-depth alarms (shipped):** CloudWatch alarms on `ApproximateNumberOfMessagesVisible >= 1` (5-min period, Maximum statistic, `treatMissingData: NOT_BREACHING`) are deployed for:

- **FanOutConsumer DLQ** — poison-pill DynamoDB Stream records that failed three consecutive Lambda invocations.
- **ApprovalMetricsPublisher DLQ** — same failure mode for the metrics-publisher consumer.

These alarms transition to `ALARM` state in CloudWatch and appear in the console/dashboard, providing operator visibility into silent record loss. They ship **without an `addAlarmAction` / SNS notification target** — operators must check the CloudWatch Alarms console or configure a subscription manually. This is an intentional intermediate step: alarm state is durable and queryable even without push notifications, and prevents poison records from accumulating silently for the full 14-day DLQ retention window.

**Follow-up — notification channel wiring:** Once an operational notification channel (SNS topic → Slack / PagerDuty / email) is provisioned, add `alarm.addAlarmAction(new SnsAction(topic))` to both alarms. No metric or alarm restructuring is needed.

**Additional alarms (not yet shipped):** The following remain deferred until the notification channel exists (alarm-without-action provides limited value for rate/latency conditions that require human triage):

Operator-facing CloudWatch alarms that would page on:
- High approval-timeout rate (users not responding, notifications broken)
- Tasks stuck in AWAITING_APPROVAL beyond `timeout_s + 60s` (reconciler failure)
- High approval-write failure rate (DDB throttled or IAM drift)
- Approval-gate cap hit (suspicious retry loop)
- Publisher / fanout DLQ non-empty (persistent consumer-side poison pills)
- `MetricEmitSkipped` sustained > 0 (publisher schema mismatch — agent / publisher version skew)
- `MetricsPublisherHeartbeat` flat-line (publisher pipeline broken)

…are **out of scope for v1** because the project does not yet have a notification channel (Slack / PagerDuty / SNS topic / email distribution list) configured for operational alerts. Adding alarms without a notification channel produces CloudWatch widgets that nobody sees — no safety benefit.

**Plumbing status (post-Chunk 8):** the supporting metric data now flows as native CloudWatch metrics in namespace `ABCA/Cedar-HITL` via `ApprovalMetricsPublisherFn` (§11.3). Alarm wiring becomes a per-threshold `cloudwatch.Alarm` + `SnsAction`; no additional metric-extraction infra is needed. The remaining gap is the SNS topic + subscriber wiring itself — when that lands, the alarms above are a small bounded follow-up (not a multi-PR metrics build-out as they were pre-Chunk-8).

---

## 12. Security model
Expand Down Expand Up @@ -2074,7 +2079,7 @@ See §17.18 for the off-hours escalation future-work primitive, and §13.14 for
**Future work — polish (tracked in §17):**
- CLI inline streaming prompt (UX research first)
- `approve --defer` / allowlist revocation (`bgagent revoke-approval`)
- CloudWatch alarm plumbing (§11.5) — deferred until an operational notification channel is available
- CloudWatch alarm SNS notification wiring (§11.5) — DLQ-depth alarms ship without an action target; add `SnsAction` once a notification channel is provisioned
- More soft-deny policies in the default set based on real usage
- Persistent recent-decision cache (if container-restart telemetry justifies it)
- Persistent per-minute rate limit (if restart amplification becomes significant)
Expand Down
21 changes: 13 additions & 8 deletions docs/src/content/docs/architecture/Cedar-hitl-gates.md
Original file line number Diff line number Diff line change
Expand Up @@ -1589,21 +1589,26 @@ Extend `TaskDashboard` (`cdk/src/constructs/task-dashboard.ts`). These are read-

Every `agent_milestone("approval_*")` event carries `trace_id` / `span_id`. A span `hitl.approval_wait` brackets the PreToolUse poll loop: `span.duration = decided_at - created_at`. `hitl.approval_race_loss` emitted when the agent's local timeout fired <5s before a late user decision (useful for tuning).

### 11.5 CloudWatch alarms — deferred (notification-channel gated)
### 11.5 CloudWatch alarms

**DLQ-depth alarms (shipped):** CloudWatch alarms on `ApproximateNumberOfMessagesVisible >= 1` (5-min period, Maximum statistic, `treatMissingData: NOT_BREACHING`) are deployed for:

- **FanOutConsumer DLQ** — poison-pill DynamoDB Stream records that failed three consecutive Lambda invocations.
- **ApprovalMetricsPublisher DLQ** — same failure mode for the metrics-publisher consumer.

These alarms transition to `ALARM` state in CloudWatch and appear in the console/dashboard, providing operator visibility into silent record loss. They ship **without an `addAlarmAction` / SNS notification target** — operators must check the CloudWatch Alarms console or configure a subscription manually. This is an intentional intermediate step: alarm state is durable and queryable even without push notifications, and prevents poison records from accumulating silently for the full 14-day DLQ retention window.

**Follow-up — notification channel wiring:** Once an operational notification channel (SNS topic → Slack / PagerDuty / email) is provisioned, add `alarm.addAlarmAction(new SnsAction(topic))` to both alarms. No metric or alarm restructuring is needed.

**Additional alarms (not yet shipped):** The following remain deferred until the notification channel exists (alarm-without-action provides limited value for rate/latency conditions that require human triage):

Operator-facing CloudWatch alarms that would page on:
- High approval-timeout rate (users not responding, notifications broken)
- Tasks stuck in AWAITING_APPROVAL beyond `timeout_s + 60s` (reconciler failure)
- High approval-write failure rate (DDB throttled or IAM drift)
- Approval-gate cap hit (suspicious retry loop)
- Publisher / fanout DLQ non-empty (persistent consumer-side poison pills)
- `MetricEmitSkipped` sustained > 0 (publisher schema mismatch — agent / publisher version skew)
- `MetricsPublisherHeartbeat` flat-line (publisher pipeline broken)

…are **out of scope for v1** because the project does not yet have a notification channel (Slack / PagerDuty / SNS topic / email distribution list) configured for operational alerts. Adding alarms without a notification channel produces CloudWatch widgets that nobody sees — no safety benefit.

**Plumbing status (post-Chunk 8):** the supporting metric data now flows as native CloudWatch metrics in namespace `ABCA/Cedar-HITL` via `ApprovalMetricsPublisherFn` (§11.3). Alarm wiring becomes a per-threshold `cloudwatch.Alarm` + `SnsAction`; no additional metric-extraction infra is needed. The remaining gap is the SNS topic + subscriber wiring itself — when that lands, the alarms above are a small bounded follow-up (not a multi-PR metrics build-out as they were pre-Chunk-8).

---

## 12. Security model
Expand Down Expand Up @@ -2078,7 +2083,7 @@ See §17.18 for the off-hours escalation future-work primitive, and §13.14 for
**Future work — polish (tracked in §17):**
- CLI inline streaming prompt (UX research first)
- `approve --defer` / allowlist revocation (`bgagent revoke-approval`)
- CloudWatch alarm plumbing (§11.5) — deferred until an operational notification channel is available
- CloudWatch alarm SNS notification wiring (§11.5) — DLQ-depth alarms ship without an action target; add `SnsAction` once a notification channel is provisioned
- More soft-deny policies in the default set based on real usage
- Persistent recent-decision cache (if container-restart telemetry justifies it)
- Persistent per-minute rate limit (if restart amplification becomes significant)
Expand Down
Loading