From ca140f2be802381c054ed845155780521229b2ae Mon Sep 17 00:00:00 2001
From: nkkko <nikola.balic@gmail.com>
Date: Fri, 29 May 2026 11:32:18 +0200
Subject: [PATCH] feat: add Steel cloud browser support

Add "steel" as a third browser_mode alongside "browserbase" and "local".
The harness forwards STEEL_API_KEY to generated scripts via _browser_env,
and the prompt templates teach the agent to create a Steel cloud session
(POST https://api.steel.dev/v1/sessions with the Steel-Api-Key header) and
connect Playwright over CDP to wss://connect.steel.dev. Steel needs no
project id, only STEEL_API_KEY. Session creation uses raw httpx (already a
dependency); no new package required.

- local_workspace.py: forward STEEL_API_KEY; document the new mode
- base.yaml / task_showcase.yaml: Steel branch in the Browser Mode section
- crafted_cli.yaml: make the cloud-session helper BROWSER_MODE-aware
- README: document browser_mode and required env vars
- tests: cover _browser_env credential forwarding for steel/browserbase

Verified end-to-end against a live Steel session: create -> CDP connect ->
navigate -> screenshot -> release.
---
 README.md                                     | 12 +++++-
 src/webwright/config/base.yaml                | 41 +++++++++++++++---
 src/webwright/config/crafted_cli.yaml         | 24 ++++++++---
 src/webwright/config/task_showcase.yaml       | 14 +++---
 src/webwright/environments/local_workspace.py | 10 +++--
 tests/unit/test_browser_env.py                | 43 +++++++++++++++++++
 6 files changed, 121 insertions(+), 23 deletions(-)
 create mode 100644 tests/unit/test_browser_env.py

diff --git a/README.md b/README.md
index 22bd91e..6876bc7 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,17 @@ playwright install chromium
 Export credentials for the configured backend (for example, `OPENAI_API_KEY`
 with `model_openai.yaml` or `ANTHROPIC_API_KEY` with `model_claude.yaml`). The
 `image_qa` and `self_reflection` tools use the same configured model by default,
-so an Anthropic run does not require an OpenAI key. Then:
+so an Anthropic run does not require an OpenAI key.
+
+The browser backend is selected by `environment.browser_mode` (`local` in the shipped configs; `browserbase` if the key is omitted or empty):
+
+| `browser_mode` | What the agent's scripts do | Required env |
+|----------------|-----------------------------|--------------|
+| `local`        | Launch a local Playwright Chromium | — |
+| `browserbase`  | Create a [Browserbase](https://browserbase.com) cloud session and connect to it over CDP | `BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID` |
+| `steel`        | Create a [Steel](https://steel.dev) cloud session and connect to it over CDP | `STEEL_API_KEY` |
+
+Then:
 
 ```bash
 python -m webwright.run.cli \
diff --git a/src/webwright/config/base.yaml b/src/webwright/config/base.yaml
index e09320e..432e818 100644
--- a/src/webwright/config/base.yaml
+++ b/src/webwright/config/base.yaml
@@ -18,6 +18,7 @@
 #   - OPENAI_API_KEY    (only when the configured agent or tool model_class is openai)
 #   - ANTHROPIC_API_KEY (only when stacking model_claude.yaml)
 #   - BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID (only when browser_mode=browserbase)
+#   - STEEL_API_KEY (only when browser_mode=steel)
 
 model:
   # model_class / model_name / endpoint come from the model modifier yaml.
@@ -57,11 +58,12 @@ environment:
   command_timeout_seconds: 240
   shell: /bin/bash
   # Path to a shell file that exports credentials (BROWSERBASE_API_KEY,
-  # BROWSERBASE_PROJECT_ID, ANTHROPIC_API_KEY, OPENAI_API_KEY, ...). Leave
-  # empty to read these from the parent process environment instead.
+  # BROWSERBASE_PROJECT_ID, STEEL_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY,
+  # ...). Leave empty to read these from the parent process environment instead.
   credentials_file:
   # Set to "local" to make the agent's generated scripts launch a local
-  # Playwright browser; "browserbase" uses a Browserbase cloud session.
+  # Playwright browser; "browserbase" uses a Browserbase cloud session and
+  # "steel" uses a Steel cloud session.
   browser_mode: local
   task_metadata_filename: task.json
   final_script_name: final_script.py
@@ -115,9 +117,36 @@ agent:
 
     ## Browser Mode
 
-    The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase` or `local`).
+    The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase`, `steel`, or `local`).
     - When `BROWSER_MODE=browserbase` (default): create a Browserbase cloud session via the
-      `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` env vars and connect over CDP.
+      `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` env vars and connect over CDP, e.g.:
+      ```python
+      async with httpx.AsyncClient(timeout=30) as client:
+        resp = await client.post(
+          "https://api.browserbase.com/v1/sessions",
+          headers={"x-bb-api-key": os.environ["BROWSERBASE_API_KEY"]},
+          json={"projectId": os.environ["BROWSERBASE_PROJECT_ID"]},
+        )
+        resp.raise_for_status()
+        session = resp.json()
+      browser = await playwright.chromium.connect_over_cdp(session["connectUrl"])
+      ```
+    - When `BROWSER_MODE=steel`: create a Steel cloud session via the `STEEL_API_KEY`
+      env var (no project id needed) and connect over CDP to a websocket URL built from the API key and the returned session `id`, e.g.:
+      ```python
+      key = os.environ["STEEL_API_KEY"]
+      async with httpx.AsyncClient(timeout=30) as client:
+        resp = await client.post(
+          "https://api.steel.dev/v1/sessions",
+          headers={"Steel-Api-Key": key},
+          json={},  # optional: {"solveCaptcha": True, "useProxy": True}
+        )
+        resp.raise_for_status()
+        session = resp.json()
+      browser = await playwright.chromium.connect_over_cdp(
+        f"wss://connect.steel.dev?apiKey={key}&sessionId={session['id']}"
+      )
+      ```
     - When `BROWSER_MODE=local`: launch a local Playwright Chromium browser
       (`playwright.chromium.launch(...)`) instead. No external credentials required.
 
@@ -337,7 +366,7 @@ agent:
     - The required final artifact is `{{ final_script_path }}`.
     - Create `final_runs/run_<id>/` folders for every clean execution of the final script. Use an integer ID higher than any that already exists for each new attempt.
     - Store each run's `final_script.py`, `final_script_log.txt`, and final verification screenshots **only** inside that run folder.
-    - The browser mode is `{{ browser_mode }}`. Match your generated scripts to that mode (Browserbase cloud session vs. local Playwright launch).
+    - The browser mode is `{{ browser_mode }}`. Match your generated scripts to that mode (Browserbase or Steel cloud session vs. local Playwright launch).
 
     ## Web Task Rules
 
diff --git a/src/webwright/config/crafted_cli.yaml b/src/webwright/config/crafted_cli.yaml
index b67560b..e63408c 100644
--- a/src/webwright/config/crafted_cli.yaml
+++ b/src/webwright/config/crafted_cli.yaml
@@ -33,7 +33,7 @@ agent:
     - Put exactly one shell command in the `bash_command` string. Never emit raw Python or shell outside that field. Use heredocs (`python - <<'PY' ... PY`) to run Python inline when needed.
     - Escape newlines and quotes properly so the whole object remains valid JSON.
     - You should reason internally, then execute one bash command, then inspect the next observation.
-    - There is NO persistent browser state. Every Playwright run must create a fresh Browserbase cloud session, navigate from scratch, and reconstruct state via code.
+    - There is NO persistent browser state. Every Playwright run must create a fresh cloud browser session matching `BROWSER_MODE` (`browserbase` or `steel`), navigate from scratch, and reconstruct state via code.
     - Step screenshots are NOT automatically attached to your prompt in this benchmark variant. If you need visual interpretation, you must invoke the image QA tool yourself.
     - Set `"done": true` only when the task goal is complete and `final_script.py` is the final artifact.
     - NEVER set `"done": true` in the same response as a non-empty `bash_command`. Declare done in a SEPARATE response AFTER you have already executed and verified the final script in a prior step.
@@ -69,7 +69,7 @@ agent:
        `help=` (copied from the docstring), and a sensible default equal to the
        concrete task value so that running `python final_script.py` with no
        arguments reproduces the original task.
-    5. The CLI must still perform the full end-to-end run (Browserbase session,
+    5. The CLI must still perform the full end-to-end run (cloud browser session,
        screenshots, `final_script_log.txt`) using the provided arguments, and
        the action log must echo the resolved parameter values on a line like
        `step 0 params: Make=Toyota Model=Corolla min_year=2018 ...` so the judge
@@ -94,8 +94,18 @@ agent:
     SCREENSHOTS = WORKSPACE / "screenshots"
     SCREENSHOTS.mkdir(parents=True, exist_ok=True)
 
-    async def create_browserbase_session():
+    async def create_cloud_browser_cdp_url():
+      mode = os.environ.get("BROWSER_MODE", "browserbase")
       async with httpx.AsyncClient(timeout=30) as client:
+        if mode == "steel":
+          key = os.environ["STEEL_API_KEY"]
+          response = await client.post(
+            "https://api.steel.dev/v1/sessions",
+            headers={"Steel-Api-Key": key},
+            json={},
+          )
+          response.raise_for_status()
+          return f"wss://connect.steel.dev?apiKey={key}&sessionId={response.json()['id']}"
         response = await client.post(
           "https://api.browserbase.com/v1/sessions",
           headers={
@@ -110,12 +120,12 @@ agent:
           },
         )
         response.raise_for_status()
-        return response.json()
+        return response.json()["connectUrl"]
 
     async def main():
-      session = await create_browserbase_session()
+      cdp_url = await create_cloud_browser_cdp_url()
       async with async_playwright() as playwright:
-        browser = await playwright.chromium.connect_over_cdp(session["connectUrl"])
+        browser = await playwright.chromium.connect_over_cdp(cdp_url)
         context = browser.contexts[0] if browser.contexts else await browser.new_context()
         page = context.pages[0] if context.pages else await context.new_page()
         page.set_viewport_size({"width": 1280, "height": 1800}) # use 1280x1800 viewport for better desktop site rendering and more visible content in screenshots
@@ -320,7 +330,7 @@ agent:
     - The required final artifact is `{{ final_script_path }}`.
     - Create `final_runs/run_<id>/` folders for every clean execution of the final script. Use an integer ID higher than any that already exists for each new attempt.
     - Store each run's `final_script.py`, `final_script_log.txt`, and final verification screenshots **only** inside that run folder.
-    - Always use Browserbase cloud sessions.
+    - Always use a cloud browser session matching `BROWSER_MODE` (`browserbase` or `steel`), never a local launch.
 
     ## Web Task Rules
 
diff --git a/src/webwright/config/task_showcase.yaml b/src/webwright/config/task_showcase.yaml
index e3e1d02..a1393ef 100644
--- a/src/webwright/config/task_showcase.yaml
+++ b/src/webwright/config/task_showcase.yaml
@@ -53,11 +53,12 @@ environment:
   command_timeout_seconds: 240
   shell: /bin/bash
   # Path to a shell file that exports credentials (BROWSERBASE_API_KEY,
-  # BROWSERBASE_PROJECT_ID, ANTHROPIC_API_KEY, OPENAI_API_KEY, ...). Leave
-  # empty to read these from the parent process environment instead.
+  # BROWSERBASE_PROJECT_ID, STEEL_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY,
+  # ...). Leave empty to read these from the parent process environment instead.
   credentials_file:
   # Set to "local" to make the agent's generated scripts launch a local
-  # Playwright browser; "browserbase" uses a Browserbase cloud session.
+  # Playwright browser; "browserbase" uses a Browserbase cloud session and
+  # "steel" uses a Steel cloud session.
   browser_mode: local
   task_metadata_filename: task.json
   final_script_name: final_script.py
@@ -111,9 +112,12 @@ agent:
 
     ## Browser Mode
 
-    The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase` or `local`).
+    The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase`, `steel`, or `local`).
     - When `BROWSER_MODE=browserbase` (default): create a Browserbase cloud session via the
       `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` env vars and connect over CDP.
+    - When `BROWSER_MODE=steel`: create a Steel cloud session via
+      `POST https://api.steel.dev/v1/sessions` (header `Steel-Api-Key: $STEEL_API_KEY`), then connect over CDP to
+      `wss://connect.steel.dev?apiKey=$STEEL_API_KEY&sessionId=<id>`, where `<id>` is the `id` field of the JSON response.
     - When `BROWSER_MODE=local`: launch a local Playwright Chromium browser
       (`playwright.chromium.launch(...)`) instead. No external credentials required.
 
@@ -435,7 +439,7 @@ agent:
       - `{{ workspace_dir }}/task_showcase/tasks/<short_id>/task.json`
       - `{{ workspace_dir }}/task_showcase/tasks/<short_id>/report.json`
     - Use `{{ task_id }}` as the preferred `<short_id>` when it is present and already URL-safe; otherwise derive a lowercase slug from the task title.
-    - The browser mode is `{{ browser_mode }}`. Match generated scripts to that mode (Browserbase cloud session vs. local Playwright launch).
+    - The browser mode is `{{ browser_mode }}`. Match generated scripts to that mode (Browserbase or Steel cloud session vs. local Playwright launch).
 
     ## Web Task Rules
 
diff --git a/src/webwright/environments/local_workspace.py b/src/webwright/environments/local_workspace.py
index 04d3055..cc04cb6 100644
--- a/src/webwright/environments/local_workspace.py
+++ b/src/webwright/environments/local_workspace.py
@@ -17,12 +17,14 @@ class LocalWorkspaceEnvironmentConfig(BaseModel):
     """Shell-based workspace environment.
 
     The agent drives a real browser through bash commands it generates inside this
-    workspace. Two browser modes are exposed to those generated scripts via
+    workspace. Three browser modes are exposed to those generated scripts via
     environment variables:
 
     * ``browser_mode = "browserbase"`` (default): the agent's scripts should
       create a Browserbase cloud session. ``BROWSERBASE_API_KEY`` and
       ``BROWSERBASE_PROJECT_ID`` are forwarded if present.
+    * ``browser_mode = "steel"``: the agent's scripts should create a Steel cloud
+      session. ``STEEL_API_KEY`` is forwarded if present.
     * ``browser_mode = "local"``: the agent's scripts should launch a local
       Playwright browser (``playwright.chromium.launch(...)``).
 
@@ -36,7 +38,7 @@ class LocalWorkspaceEnvironmentConfig(BaseModel):
     shell: str = "/bin/bash"
     env: dict[str, str] = Field(default_factory=dict)
     credentials_file: Path | None = None
-    browser_mode: str = "browserbase"  # "browserbase" or "local"
+    browser_mode: str = "browserbase"  # "browserbase", "steel", or "local"
     task_metadata_filename: str = "task.json"
     final_script_name: str = "final_script.py"
     output_truncation_chars: int = 12000
@@ -165,9 +167,9 @@ def prepare(self, **kwargs) -> None:
         self._task_metadata_path().write_text(json.dumps(kwargs, indent=2), encoding="utf-8")
 
     def _browser_env(self) -> dict[str, str]:
-        """Forward Browserbase / browser-mode hints to the subprocess."""
+        """Forward cloud-browser / browser-mode hints to the subprocess."""
         env: dict[str, str] = {"BROWSER_MODE": str(self.config.browser_mode or "browserbase")}
-        for var in ("BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID"):
+        for var in ("BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID", "STEEL_API_KEY"):
             value = self._credential_env.get(var) or os.environ.get(var)
             if value:
                 env[var] = value
diff --git a/tests/unit/test_browser_env.py b/tests/unit/test_browser_env.py
new file mode 100644
index 0000000..ff9b2b6
--- /dev/null
+++ b/tests/unit/test_browser_env.py
@@ -0,0 +1,43 @@
+# ABOUTME: Tests that LocalWorkspaceEnvironment._browser_env forwards the right
+# ABOUTME: BROWSER_MODE and cloud-browser credentials (Browserbase, Steel) to subprocesses.
+
+from webwright.environments.local_workspace import LocalWorkspaceEnvironment
+
+
+def test_browser_env_forwards_steel_mode_and_key(monkeypatch) -> None:  # steel mode: both BROWSER_MODE and STEEL_API_KEY must appear in the result
+    monkeypatch.setenv("STEEL_API_KEY", "sk-steel-test")  # no credentials_file is passed, so the key can only come from the process environment
+    env = LocalWorkspaceEnvironment(browser_mode="steel")
+
+    result = env._browser_env()
+
+    assert result["BROWSER_MODE"] == "steel"
+    assert result["STEEL_API_KEY"] == "sk-steel-test"
+
+
+def test_browser_env_omits_steel_key_when_unset(monkeypatch) -> None:  # an absent key must not show up as an empty entry
+    monkeypatch.delenv("STEEL_API_KEY", raising=False)  # raising=False: do not fail when the host never had the variable set
+    env = LocalWorkspaceEnvironment(browser_mode="steel")
+
+    result = env._browser_env()
+
+    assert "STEEL_API_KEY" not in result  # _browser_env copies a variable only when its value is truthy
+
+
+def test_browser_env_forwards_browserbase_credentials(monkeypatch) -> None:  # browserbase mode: API key and project id are both forwarded
+    monkeypatch.setenv("BROWSERBASE_API_KEY", "bb-key")
+    monkeypatch.setenv("BROWSERBASE_PROJECT_ID", "bb-project")
+    env = LocalWorkspaceEnvironment(browser_mode="browserbase")
+
+    result = env._browser_env()
+
+    assert result["BROWSER_MODE"] == "browserbase"
+    assert result["BROWSERBASE_API_KEY"] == "bb-key"  # values come from the patched process environment above
+    assert result["BROWSERBASE_PROJECT_ID"] == "bb-project"
+
+
+def test_browser_env_defaults_mode_to_browserbase(monkeypatch) -> None:  # NOTE(review): the monkeypatch fixture is requested but not used in this test
+    env = LocalWorkspaceEnvironment(browser_mode="")  # empty string is falsy, which triggers the `or "browserbase"` fallback in _browser_env
+
+    result = env._browser_env()
+
+    assert result["BROWSER_MODE"] == "browserbase"