aws-samples · nizar-lahlali · Jun 4, 2026 · Jun 4, 2026
@@ -337,6 +337,8 @@ def build_config(
     initial_approval_gate_count: int = 0,
     approval_gate_cap: int | None = None,
     attachments: list[dict] | None = None,
+    self_review_enabled: bool = False,
+    self_review_max_turns: int = 5,
 ) -> TaskConfig:
     """Build and validate configuration from explicit parameters.
 
@@ -407,6 +409,8 @@ def build_config(
         initial_approval_gate_count=initial_approval_gate_count,
         approval_gate_cap=approval_gate_cap,
         attachments=validated_attachments,
+        self_review_enabled=self_review_enabled,
+        self_review_max_turns=self_review_max_turns,
     )
 
 
@@ -431,6 +435,9 @@ def get_config() -> TaskConfig:
             # an unreachable ``traces//`` key.
             trace=os.environ.get("TRACE", "").lower() in ("1", "true", "yes"),
             user_id=os.environ.get("USER_ID", ""),
+            self_review_enabled=os.environ.get("SELF_REVIEW_ENABLED", "").lower()
+            in ("1", "true", "yes"),
+            self_review_max_turns=int(os.environ.get("SELF_REVIEW_MAX_TURNS", "5")),
         )
     except ValueError as e:
         print(f"ERROR: {e}", file=sys.stderr)

@@ -186,6 +186,9 @@ class TaskConfig(BaseModel):
     # Attachments from the orchestrator payload (Phase 3). Validated as
     # AttachmentConfig models. Empty list for tasks without attachments.
     attachments: list[AttachmentConfig] = Field(default_factory=list)
+    # Self-review: optional LLM diff critique before PR creation.
+    self_review_enabled: bool = False
+    self_review_max_turns: int = 5  # Cap on turns allocated to self-review
 
     @model_validator(mode="after")
     def _validate_trace_requires_user_id(self) -> Self:

@@ -23,12 +23,14 @@
     _extract_agent_notes,
     ensure_committed,
     ensure_pr,
+    post_self_review_comment,
     verify_build,
     verify_lint,
 )
 from progress_writer import _ProgressWriter
 from prompt_builder import build_system_prompt, discover_project_config
 from runner import run_agent
+from self_review import run_self_review
 from shell import log, log_error_cw
 from system_prompt import SYSTEM_PROMPT
 from telemetry import (
@@ -279,6 +281,8 @@ def run_task(
     trace: bool = False,
     user_id: str = "",
     attachments: list[dict] | None = None,
+    self_review_enabled: bool = False,
+    self_review_max_turns: int = 5,
 ) -> dict:
     """Run the full agent pipeline and return a serialized result dict.
 
@@ -318,6 +322,8 @@ def run_task(
         initial_approval_gate_count=initial_approval_gate_count,
         approval_gate_cap=approval_gate_cap,
         attachments=attachments,
+        self_review_enabled=self_review_enabled,
+        self_review_max_turns=self_review_max_turns,
     )
 
     # Inject Cedar policies into config for the PolicyEngine in runner.py
@@ -623,6 +629,22 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
                     "turns_attempted": agent_result.num_turns or agent_result.turns,
                 }
 
+            # Self-review phase: LLM critiques its own diff before PR creation.
+            # Runs between cancel-check and post-hooks. Fail-open: errors here
+            # never block PR creation.
+            with task_span("task.self_review"):
+                review_result = run_self_review(
+                    config, setup, agent_result, trajectory, progress
+                )
+                if review_result is not None:
+                    # Accumulate turns and cost from the review phase
+                    agent_result.turns += review_result.turns
+                    agent_result.num_turns += review_result.num_turns or review_result.turns
+                    if review_result.cost_usd is not None:
+                        agent_result.cost_usd = (
+                            (agent_result.cost_usd or 0.0) + review_result.cost_usd
+                        )
+
             # Post-hooks (agent_result is guaranteed set by the try/except above)
             with task_span("task.post_hooks") as post_span:
                 # Safety net: commit any uncommitted tracked changes (skip for read-only tasks)
@@ -643,6 +665,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
             if pr_url:
                 progress.write_agent_milestone("pr_created", pr_url)
 
+            # Post self-review summary as PR comment (if self-review ran and produced findings)
+            if pr_url and review_result is not None:
+                post_self_review_comment(setup.repo_dir, pr_url, config)
+
             # Memory write — capture task episode and repo learnings
             memory_written = False
             effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "")

@@ -327,6 +327,68 @@ def ensure_pr(
         return None
 
 
+def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool:
+    """Post the self-review summary as a comment on the pull request.
+
+    Obtains the summary from ``self_review.read_self_review_summary``, prefixes
+    it with a heading, and posts it via ``gh pr comment``.
+
+    Fail-open: every failure (summary cannot be read, PR number cannot be
+    parsed from the URL, ``gh`` is missing, times out or exits non-zero, or
+    ``subprocess`` rejects the arguments) is logged and reported as ``False``;
+    nothing is raised to the caller.
+
+    Args:
+        repo_dir: Repository checkout. Passed to the summary reader and used as
+            the working directory for ``gh``.
+        pr_url: Pull request URL; must contain ``/pull/<number>``.
+        config: Task configuration; ``config.repo_url`` is passed to
+            ``gh --repo``.
+
+    Returns:
+        True if a comment was posted, False otherwise.
+    """
+    try:
+        # Imported lazily, and inside the guard so that an import failure is
+        # handled like any other read failure instead of propagating.
+        from self_review import read_self_review_summary
+
+        summary = read_self_review_summary(repo_dir)
+    except Exception as e:
+        log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}")
+        return False
+
+    # A missing or whitespace-only summary would yield a heading-only comment.
+    if not summary or not str(summary).strip():
+        log("POST", "post_self_review_comment: no summary file found — skipping")
+        return False
+
+    # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123)
+    match = re.search(r"/pull/(\d+)", pr_url)
+    if not match:
+        log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}")
+        return False
+    pr_number = match.group(1)
+
+    comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}"
+
+    try:
+        result = subprocess.run(
+            [
+                "gh",
+                "pr",
+                "comment",
+                pr_number,
+                "--repo",
+                config.repo_url,
+                "--body",
+                comment_body,
+            ],
+            cwd=repo_dir,
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+    except (subprocess.SubprocessError, OSError, ValueError) as e:
+        # SubprocessError covers TimeoutExpired. ValueError covers arguments
+        # that subprocess refuses (e.g. an embedded NUL byte in the body) and
+        # UnicodeDecodeError raised while decoding gh's output.
+        log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}")
+        return False
+
+    if result.returncode == 0:
+        log("POST", f"Self-review summary posted as comment on PR #{pr_number}")
+        return True
+
+    # Keep the log line short: only the first 200 characters of stderr.
+    stderr = result.stderr.strip()[:200] if result.stderr else ""
+    log(
+        "WARN",
+        f"post_self_review_comment: gh pr comment failed "
+        f"(rc={result.returncode}): {stderr}",
+    )
+    return False
+
+
 def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None:
     """Extract the "## Agent notes" section from the PR body.
 

@@ -4,6 +4,7 @@
 from .new_task import NEW_TASK_WORKFLOW
 from .pr_iteration import PR_ITERATION_WORKFLOW
 from .pr_review import PR_REVIEW_WORKFLOW
+from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT
 
 _PROMPTS = {
     "new_task": BASE_PROMPT.replace("{workflow}", NEW_TASK_WORKFLOW),

@@ -0,0 +1,61 @@
+"""Self-review prompt template for pre-PR diff critique."""
+
+# Template for the self-review turn. It contains two placeholders, ``{diff}``
+# and ``{task_description}``. NOTE(review): the substitution code is not in
+# this module — if it uses ``str.format``, any literal braces added to this
+# text must be doubled.
+SELF_REVIEW_PROMPT = """\
+You are reviewing your own work before it becomes a pull request. Below is the \
+cumulative diff of all changes on this branch compared to the base branch.
+
+<diff>
+{diff}
+</diff>
+
+## Task context
+
+{task_description}
+
+## Review checklist
+
+Examine the diff carefully for:
+
+1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \
+incorrect assumptions about data shapes or API contracts.
+2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \
+race conditions.
+3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \
+insecure defaults, OWASP Top 10 issues.
+4. **Style & consistency** — Naming conventions, code style violations relative to \
+the surrounding codebase, unnecessary complexity.
+5. **Test gaps** — Important behaviour that is untested, assertions that don't \
+verify the right thing, missing edge-case coverage.
+
+## Instructions
+
+- If you find issues, fix them directly: edit the files, run the build/tests to \
+verify your fixes, and commit the changes.
+- If no issues are found, do not modify any code — do not make changes for the \
+sake of making changes. Go straight to the summary output step below.
+- Do NOT refactor code that was not part of the original diff unless it has a \
+concrete bug or security issue.
+- Keep fixes minimal and focused. Each fix should be a separate commit with a \
+clear message.
+
+## Summary output
+
+After completing your review (whether you made fixes or not), write a file \
+`.self-review-summary.md` in the repository root with your findings in this format:
+
+```markdown
+### Self-Review Summary
+
+**Findings:** <number of issues found>
+**Fixes applied:** <number of fixes committed>
+
+#### Issues found
+
+- <category>: <brief description of issue> — <fixed | not fixed (reason)>
+```
+
+If no issues were found, write the file with: "No issues found — code looks good."
+
+This file is a pipeline artifact and will be deleted automatically — it will NOT \
+appear in the pull request. Do NOT `git add` or commit it.
+"""