Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions agent/src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,8 @@ def build_config(
initial_approval_gate_count: int = 0,
approval_gate_cap: int | None = None,
attachments: list[dict] | None = None,
self_review_enabled: bool = False,
self_review_max_turns: int = 5,
) -> TaskConfig:
"""Build and validate configuration from explicit parameters.

Expand Down Expand Up @@ -407,6 +409,8 @@ def build_config(
initial_approval_gate_count=initial_approval_gate_count,
approval_gate_cap=approval_gate_cap,
attachments=validated_attachments,
self_review_enabled=self_review_enabled,
self_review_max_turns=self_review_max_turns,
)


Expand All @@ -431,6 +435,9 @@ def get_config() -> TaskConfig:
# an unreachable ``traces//`` key.
trace=os.environ.get("TRACE", "").lower() in ("1", "true", "yes"),
user_id=os.environ.get("USER_ID", ""),
self_review_enabled=os.environ.get("SELF_REVIEW_ENABLED", "").lower()
in ("1", "true", "yes"),
self_review_max_turns=int(os.environ.get("SELF_REVIEW_MAX_TURNS", "5")),
)
except ValueError as e:
print(f"ERROR: {e}", file=sys.stderr)
Expand Down
3 changes: 3 additions & 0 deletions agent/src/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ class TaskConfig(BaseModel):
# Attachments from the orchestrator payload (Phase 3). Validated as
# AttachmentConfig models. Empty list for tasks without attachments.
attachments: list[AttachmentConfig] = Field(default_factory=list)
# Self-review: optional LLM diff critique before PR creation.
self_review_enabled: bool = False
self_review_max_turns: int = 5 # Cap on turns allocated to self-review

@model_validator(mode="after")
def _validate_trace_requires_user_id(self) -> Self:
Expand Down
26 changes: 26 additions & 0 deletions agent/src/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
_extract_agent_notes,
ensure_committed,
ensure_pr,
post_self_review_comment,
verify_build,
verify_lint,
)
from progress_writer import _ProgressWriter
from prompt_builder import build_system_prompt, discover_project_config
from runner import run_agent
from self_review import run_self_review
from shell import log, log_error_cw
from system_prompt import SYSTEM_PROMPT
from telemetry import (
Expand Down Expand Up @@ -279,6 +281,8 @@ def run_task(
trace: bool = False,
user_id: str = "",
attachments: list[dict] | None = None,
self_review_enabled: bool = False,
self_review_max_turns: int = 5,
) -> dict:
"""Run the full agent pipeline and return a serialized result dict.

Expand Down Expand Up @@ -318,6 +322,8 @@ def run_task(
initial_approval_gate_count=initial_approval_gate_count,
approval_gate_cap=approval_gate_cap,
attachments=attachments,
self_review_enabled=self_review_enabled,
self_review_max_turns=self_review_max_turns,
)

# Inject Cedar policies into config for the PolicyEngine in runner.py
Expand Down Expand Up @@ -623,6 +629,22 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
"turns_attempted": agent_result.num_turns or agent_result.turns,
}

# Self-review phase: LLM critiques its own diff before PR creation.
# Runs between cancel-check and post-hooks. Fail-open: errors here
# never block PR creation.
with task_span("task.self_review"):
review_result = run_self_review(
config, setup, agent_result, trajectory, progress
)
if review_result is not None:
# Accumulate turns and cost from the review phase
agent_result.turns += review_result.turns
agent_result.num_turns += review_result.num_turns or review_result.turns
if review_result.cost_usd is not None:
agent_result.cost_usd = (
(agent_result.cost_usd or 0.0) + review_result.cost_usd
)

# Post-hooks (agent_result is guaranteed set by the try/except above)
with task_span("task.post_hooks") as post_span:
# Safety net: commit any uncommitted tracked changes (skip for read-only tasks)
Expand All @@ -643,6 +665,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
if pr_url:
progress.write_agent_milestone("pr_created", pr_url)

# Post self-review summary as PR comment (if self-review ran and produced findings)
if pr_url and review_result is not None:
post_self_review_comment(setup.repo_dir, pr_url, config)

# Memory write — capture task episode and repo learnings
memory_written = False
effective_memory_id = memory_id or os.environ.get("MEMORY_ID", "")
Expand Down
62 changes: 62 additions & 0 deletions agent/src/post_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,68 @@ def ensure_pr(
return None


def post_self_review_comment(repo_dir: str, pr_url: str, config: TaskConfig) -> bool:
    """Post the self-review summary as a PR comment.

    Reads the summary through ``self_review.read_self_review_summary``,
    prefixes it with a heading, and posts it with ``gh pr comment``.

    Fail-open: every failure mode handled here (summary unavailable, PR number
    not parseable, ``gh`` failing, timing out, or missing) is logged and
    reported as ``False`` instead of being raised.

    Args:
        repo_dir: Repository checkout; passed to the summary reader and used
            as the working directory for ``gh``.
        pr_url: URL of the pull request. Must contain ``/pull/<number>``.
        config: Task configuration; only ``config.repo_url`` is read, and it
            is handed to ``gh`` as the ``--repo`` value.

    Returns:
        True if ``gh pr comment`` exited with status 0, False otherwise.
    """
    try:
        # Kept as a function-local import, but moved inside the ``try`` so an
        # import-time failure is logged and swallowed like any other read
        # failure instead of escaping the documented fail-open contract.
        from self_review import read_self_review_summary

        summary = read_self_review_summary(repo_dir)
    except Exception as e:
        log("WARN", f"post_self_review_comment: failed to read summary: {type(e).__name__}: {e}")
        return False

    if not summary:
        log("POST", "post_self_review_comment: no summary file found — skipping")
        return False

    # Extract PR number from URL (e.g. https://github.com/owner/repo/pull/123)
    match = re.search(r"/pull/(\d+)", pr_url)
    if not match:
        log("WARN", f"post_self_review_comment: could not extract PR number from {pr_url}")
        return False
    pr_number = match.group(1)

    comment_body = f"## \U0001f50d Self-Review Summary\n\n{summary}"

    try:
        # Argument list (no shell), so the summary text is passed verbatim as
        # a single ``--body`` value.
        result = subprocess.run(
            [
                "gh",
                "pr",
                "comment",
                pr_number,
                "--repo",
                config.repo_url,
                "--body",
                comment_body,
            ],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # OSError covers a missing/unexecutable ``gh`` binary or a bad cwd.
        log("WARN", f"post_self_review_comment: {type(e).__name__}: {e}")
        return False

    if result.returncode == 0:
        log("POST", f"Self-review summary posted as comment on PR #{pr_number}")
        return True

    # Cap the stderr excerpt so a noisy failure does not flood the log.
    stderr = result.stderr.strip()[:200] if result.stderr else ""
    log(
        "WARN",
        f"post_self_review_comment: gh pr comment failed "
        f"(rc={result.returncode}): {stderr}",
    )
    return False


def _extract_agent_notes(repo_dir: str, branch: str, config: TaskConfig) -> str | None:
"""Extract the "## Agent notes" section from the PR body.

Expand Down
1 change: 1 addition & 0 deletions agent/src/prompts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .new_task import NEW_TASK_WORKFLOW
from .pr_iteration import PR_ITERATION_WORKFLOW
from .pr_review import PR_REVIEW_WORKFLOW
from .self_review import SELF_REVIEW_PROMPT as SELF_REVIEW_PROMPT

_PROMPTS = {
"new_task": BASE_PROMPT.replace("{workflow}", NEW_TASK_WORKFLOW),
Expand Down
61 changes: 61 additions & 0 deletions agent/src/prompts/self_review.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Self-review prompt template for pre-PR diff critique."""

# Prompt template for the self-review phase.
#
# The text contains two placeholders, ``{diff}`` and ``{task_description}``,
# which must be filled in before the prompt is sent; the substitution code is
# not in this module. The template has no other literal braces.
# NOTE(review): confirm whether the caller uses ``str.format`` or
# ``str.replace`` before adding any literal ``{``/``}`` to the text.
#
# A trailing backslash at the end of a line is a string-literal line
# continuation: the backslash and the newline are dropped, so each wrapped
# paragraph or bullet reaches the model as a single line.
#
# The prompt asks the agent to write its findings to
# ``.self-review-summary.md`` in the repository root.
SELF_REVIEW_PROMPT = """\
You are reviewing your own work before it becomes a pull request. Below is the \
cumulative diff of all changes on this branch compared to the base branch.

<diff>
{diff}
</diff>

## Task context

{task_description}

## Review checklist

Examine the diff carefully for:

1. **Correctness** — Logic errors, off-by-one mistakes, missing edge cases, \
incorrect assumptions about data shapes or API contracts.
2. **Bugs** — Null/undefined dereferences, unhandled error paths, resource leaks, \
race conditions.
3. **Security** — Injection vulnerabilities (SQL, command, XSS), hardcoded secrets, \
insecure defaults, OWASP Top 10 issues.
4. **Style & consistency** — Naming conventions, code style violations relative to \
the surrounding codebase, unnecessary complexity.
5. **Test gaps** — Important behaviour that is untested, assertions that don't \
verify the right thing, missing edge-case coverage.

## Instructions

- If you find issues, fix them directly: edit the files, run the build/tests to \
verify your fixes, and commit the changes.
- If no issues are found, stop immediately — do not make changes for the sake of \
making changes.
- Do NOT refactor code that was not part of the original diff unless it has a \
concrete bug or security issue.
- Keep fixes minimal and focused. Each fix should be a separate commit with a \
clear message.

## Summary output

After completing your review (whether you made fixes or not), write a file \
`.self-review-summary.md` in the repository root with your findings in this format:

```markdown
### Self-Review Summary

**Findings:** <number of issues found>
**Fixes applied:** <number of fixes committed>

#### Issues found

- <category>: <brief description of issue> — <fixed | not fixed (reason)>
```

If no issues were found, write the file with: "No issues found — code looks good."

This file is a pipeline artifact and will be deleted automatically — it will NOT \
appear in the pull request.
"""
Loading
Loading