From ca140f2be802381c054ed845155780521229b2ae Mon Sep 17 00:00:00 2001 From: nkkko Date: Fri, 29 May 2026 11:32:18 +0200 Subject: [PATCH] feat: add Steel cloud browser support Add "steel" as a third browser_mode alongside "browserbase" and "local". The harness forwards STEEL_API_KEY to generated scripts via _browser_env, and the prompt templates teach the agent to create a Steel cloud session (POST https://api.steel.dev/v1/sessions with the Steel-Api-Key header) and connect Playwright over CDP to wss://connect.steel.dev. Steel needs no project id, only STEEL_API_KEY. Session creation uses raw httpx (already a dependency); no new package required. - local_workspace.py: forward STEEL_API_KEY; document the new mode - base.yaml / task_showcase.yaml: Steel branch in the Browser Mode section - crafted_cli.yaml: make the cloud-session helper BROWSER_MODE-aware - README: document browser_mode and required env vars - tests: cover _browser_env credential forwarding for steel/browserbase Verified end-to-end against a live Steel session: create -> CDP connect -> navigate -> screenshot -> release. --- README.md | 12 +++++- src/webwright/config/base.yaml | 41 +++++++++++++++--- src/webwright/config/crafted_cli.yaml | 24 ++++++++--- src/webwright/config/task_showcase.yaml | 14 +++--- src/webwright/environments/local_workspace.py | 10 +++-- tests/unit/test_browser_env.py | 43 +++++++++++++++++++ 6 files changed, 121 insertions(+), 23 deletions(-) create mode 100644 tests/unit/test_browser_env.py diff --git a/README.md b/README.md index 22bd91e..6876bc7 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,17 @@ playwright install chromium Export credentials for the configured backend (for example, `OPENAI_API_KEY` with `model_openai.yaml` or `ANTHROPIC_API_KEY` with `model_claude.yaml`). The `image_qa` and `self_reflection` tools use the same configured model by default, -so an Anthropic run does not require an OpenAI key. Then: +so an Anthropic run does not require an OpenAI key. 
+ +The browser backend is selected by `environment.browser_mode` (default `local`): + +| `browser_mode` | What the agent's scripts do | Required env | +|----------------|-----------------------------|--------------| +| `local` | Launch a local Playwright Chromium | — | +| `browserbase` | Create a [Browserbase](https://browserbase.com) cloud session over CDP | `BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID` | +| `steel` | Create a [Steel](https://steel.dev) cloud session over CDP | `STEEL_API_KEY` | + +Then: ```bash python -m webwright.run.cli \ diff --git a/src/webwright/config/base.yaml b/src/webwright/config/base.yaml index e09320e..432e818 100644 --- a/src/webwright/config/base.yaml +++ b/src/webwright/config/base.yaml @@ -18,6 +18,7 @@ # - OPENAI_API_KEY (only when the configured agent or tool model_class is openai) # - ANTHROPIC_API_KEY (only when stacking model_claude.yaml) # - BROWSERBASE_API_KEY + BROWSERBASE_PROJECT_ID (only when browser_mode=browserbase) +# - STEEL_API_KEY (only when browser_mode=steel) model: # model_class / model_name / endpoint come from the model modifier yaml. @@ -57,11 +58,12 @@ environment: command_timeout_seconds: 240 shell: /bin/bash # Path to a shell file that exports credentials (BROWSERBASE_API_KEY, - # BROWSERBASE_PROJECT_ID, ANTHROPIC_API_KEY, OPENAI_API_KEY, ...). Leave - # empty to read these from the parent process environment instead. + # BROWSERBASE_PROJECT_ID, STEEL_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY, + # ...). Leave empty to read these from the parent process environment instead. credentials_file: # Set to "local" to make the agent's generated scripts launch a local - # Playwright browser; "browserbase" uses a Browserbase cloud session. + # Playwright browser; "browserbase" uses a Browserbase cloud session and + # "steel" uses a Steel cloud session. 
browser_mode: local task_metadata_filename: task.json final_script_name: final_script.py @@ -115,9 +117,36 @@ agent: ## Browser Mode - The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase` or `local`). + The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase`, `steel`, or `local`). - When `BROWSER_MODE=browserbase` (default): create a Browserbase cloud session via the - `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` env vars and connect over CDP. + `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` env vars and connect over CDP, e.g.: + ```python + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.post( + "https://api.browserbase.com/v1/sessions", + headers={"x-bb-api-key": os.environ["BROWSERBASE_API_KEY"]}, + json={"projectId": os.environ["BROWSERBASE_PROJECT_ID"]}, + ) + resp.raise_for_status() + session = resp.json() + browser = await playwright.chromium.connect_over_cdp(session["connectUrl"]) + ``` + - When `BROWSER_MODE=steel`: create a Steel cloud session via the `STEEL_API_KEY` + env var (no project id) and connect over CDP to a constructed websocket URL, e.g.: + ```python + key = os.environ["STEEL_API_KEY"] + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.post( + "https://api.steel.dev/v1/sessions", + headers={"Steel-Api-Key": key}, + json={}, # optional: {"solveCaptcha": True, "useProxy": True} + ) + resp.raise_for_status() + session = resp.json() + browser = await playwright.chromium.connect_over_cdp( + f"wss://connect.steel.dev?apiKey={key}&sessionId={session['id']}" + ) + ``` - When `BROWSER_MODE=local`: launch a local Playwright Chromium browser (`playwright.chromium.launch(...)`) instead. No external credentials required. @@ -337,7 +366,7 @@ agent: - The required final artifact is `{{ final_script_path }}`. - Create `final_runs/run_<id>/` folders for every clean execution of the final script. Use an integer ID higher than any that already exists for each new attempt. 
- Store each run's `final_script.py`, `final_script_log.txt`, and final verification screenshots **only** inside that run folder. - - The browser mode is `{{ browser_mode }}`. Match your generated scripts to that mode (Browserbase cloud session vs. local Playwright launch). + - The browser mode is `{{ browser_mode }}`. Match your generated scripts to that mode (Browserbase or Steel cloud session vs. local Playwright launch). ## Web Task Rules diff --git a/src/webwright/config/crafted_cli.yaml b/src/webwright/config/crafted_cli.yaml index b67560b..e63408c 100644 --- a/src/webwright/config/crafted_cli.yaml +++ b/src/webwright/config/crafted_cli.yaml @@ -33,7 +33,7 @@ agent: - Put exactly one shell command in the `bash_command` string. Never emit raw Python or shell outside that field. Use heredocs (`python - <<'PY' ... PY`) to run Python inline when needed. - Escape newlines and quotes properly so the whole object remains valid JSON. - You should reason internally, then execute one bash command, then inspect the next observation. - - There is NO persistent browser state. Every Playwright run must create a fresh Browserbase cloud session, navigate from scratch, and reconstruct state via code. + - There is NO persistent browser state. Every Playwright run must create a fresh cloud browser session matching `BROWSER_MODE` (`browserbase` or `steel`), navigate from scratch, and reconstruct state via code. - Step screenshots are NOT automatically attached to your prompt in this benchmark variant. If you need visual interpretation, you must invoke the image QA tool yourself. - Set `"done": true` only when the task goal is complete and `final_script.py` is the final artifact. - NEVER set `"done": true` in the same response as a non-empty `bash_command`. Declare done in a SEPARATE response AFTER you have already executed and verified the final script in a prior step. 
@@ -69,7 +69,7 @@ agent: `help=` (copied from the docstring), and a sensible default equal to the concrete task value so that running `python final_script.py` with no arguments reproduces the original task. - 5. The CLI must still perform the full end-to-end run (Browserbase session, + 5. The CLI must still perform the full end-to-end run (cloud browser session, screenshots, `final_script_log.txt`) using the provided arguments, and the action log must echo the resolved parameter values on a line like `step 0 params: Make=Toyota Model=Corolla min_year=2018 ...` so the judge @@ -94,8 +94,18 @@ agent: SCREENSHOTS = WORKSPACE / "screenshots" SCREENSHOTS.mkdir(parents=True, exist_ok=True) - async def create_browserbase_session(): + async def create_cloud_browser_cdp_url(): + mode = os.environ.get("BROWSER_MODE", "browserbase") async with httpx.AsyncClient(timeout=30) as client: + if mode == "steel": + key = os.environ["STEEL_API_KEY"] + response = await client.post( + "https://api.steel.dev/v1/sessions", + headers={"Steel-Api-Key": key}, + json={}, + ) + response.raise_for_status() + return f"wss://connect.steel.dev?apiKey={key}&sessionId={response.json()['id']}" response = await client.post( "https://api.browserbase.com/v1/sessions", headers={ @@ -110,12 +120,12 @@ agent: }, ) response.raise_for_status() - return response.json() + return response.json()["connectUrl"] async def main(): - session = await create_browserbase_session() + cdp_url = await create_cloud_browser_cdp_url() async with async_playwright() as playwright: - browser = await playwright.chromium.connect_over_cdp(session["connectUrl"]) + browser = await playwright.chromium.connect_over_cdp(cdp_url) context = browser.contexts[0] if browser.contexts else await browser.new_context() page = context.pages[0] if context.pages else await context.new_page() page.set_viewport_size({"width": 1280, "height": 1800}) # use 1280x1800 viewport for better desktop site rendering and more visible content in screenshots @@ 
-320,7 +330,7 @@ agent: - The required final artifact is `{{ final_script_path }}`. - Create `final_runs/run_<id>/` folders for every clean execution of the final script. Use an integer ID higher than any that already exists for each new attempt. - Store each run's `final_script.py`, `final_script_log.txt`, and final verification screenshots **only** inside that run folder. - - Always use Browserbase cloud sessions. + - Always use a cloud browser session matching `BROWSER_MODE` (`browserbase` or `steel`), never a local launch. ## Web Task Rules diff --git a/src/webwright/config/task_showcase.yaml b/src/webwright/config/task_showcase.yaml index e3e1d02..a1393ef 100644 --- a/src/webwright/config/task_showcase.yaml +++ b/src/webwright/config/task_showcase.yaml @@ -53,11 +53,12 @@ environment: command_timeout_seconds: 240 shell: /bin/bash # Path to a shell file that exports credentials (BROWSERBASE_API_KEY, - # BROWSERBASE_PROJECT_ID, ANTHROPIC_API_KEY, OPENAI_API_KEY, ...). Leave - # empty to read these from the parent process environment instead. + # BROWSERBASE_PROJECT_ID, STEEL_API_KEY, ANTHROPIC_API_KEY, OPENAI_API_KEY, + # ...). Leave empty to read these from the parent process environment instead. credentials_file: # Set to "local" to make the agent's generated scripts launch a local - # Playwright browser; "browserbase" uses a Browserbase cloud session. + # Playwright browser; "browserbase" uses a Browserbase cloud session and + # "steel" uses a Steel cloud session. browser_mode: local task_metadata_filename: task.json final_script_name: final_script.py @@ -111,9 +112,12 @@ agent: ## Browser Mode - The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase` or `local`). + The harness exposes `BROWSER_MODE` to your scripts (value: `browserbase`, `steel`, or `local`). - When `BROWSER_MODE=browserbase` (default): create a Browserbase cloud session via the `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` env vars and connect over CDP. 
+ - When `BROWSER_MODE=steel`: create a Steel cloud session via POST + https://api.steel.dev/v1/sessions (header `Steel-Api-Key: $STEEL_API_KEY`), then + connect over CDP to `wss://connect.steel.dev?apiKey=$STEEL_API_KEY&sessionId=<session_id>`. - When `BROWSER_MODE=local`: launch a local Playwright Chromium browser (`playwright.chromium.launch(...)`) instead. No external credentials required. @@ -435,7 +439,7 @@ agent: - `{{ workspace_dir }}/task_showcase/tasks/<slug>/task.json` - `{{ workspace_dir }}/task_showcase/tasks/<slug>/report.json` - Use `{{ task_id }}` as the preferred `<slug>` when it is present and already URL-safe; otherwise derive a lowercase slug from the task title. - - The browser mode is `{{ browser_mode }}`. Match generated scripts to that mode (Browserbase cloud session vs. local Playwright launch). + - The browser mode is `{{ browser_mode }}`. Match generated scripts to that mode (Browserbase or Steel cloud session vs. local Playwright launch). ## Web Task Rules diff --git a/src/webwright/environments/local_workspace.py b/src/webwright/environments/local_workspace.py index 04d3055..cc04cb6 100644 --- a/src/webwright/environments/local_workspace.py +++ b/src/webwright/environments/local_workspace.py @@ -17,12 +17,14 @@ class LocalWorkspaceEnvironmentConfig(BaseModel): """Shell-based workspace environment. The agent drives a real browser through bash commands it generates inside this - workspace. Two browser modes are exposed to those generated scripts via + workspace. Three browser modes are exposed to those generated scripts via environment variables: * ``browser_mode = "browserbase"`` (default): the agent's scripts should create a Browserbase cloud session. ``BROWSERBASE_API_KEY`` and ``BROWSERBASE_PROJECT_ID`` are forwarded if present. + * ``browser_mode = "steel"``: the agent's scripts should create a Steel cloud + session. ``STEEL_API_KEY`` is forwarded if present. 
* ``browser_mode = "local"``: the agent's scripts should launch a local Playwright browser (``playwright.chromium.launch(...)``). @@ -36,7 +38,7 @@ class LocalWorkspaceEnvironmentConfig(BaseModel): shell: str = "/bin/bash" env: dict[str, str] = Field(default_factory=dict) credentials_file: Path | None = None - browser_mode: str = "browserbase" # "browserbase" or "local" + browser_mode: str = "browserbase" # "browserbase", "steel", or "local" task_metadata_filename: str = "task.json" final_script_name: str = "final_script.py" output_truncation_chars: int = 12000 @@ -165,9 +167,9 @@ def prepare(self, **kwargs) -> None: self._task_metadata_path().write_text(json.dumps(kwargs, indent=2), encoding="utf-8") def _browser_env(self) -> dict[str, str]: - """Forward Browserbase / browser-mode hints to the subprocess.""" + """Forward cloud-browser / browser-mode hints to the subprocess.""" env: dict[str, str] = {"BROWSER_MODE": str(self.config.browser_mode or "browserbase")} - for var in ("BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID"): + for var in ("BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID", "STEEL_API_KEY"): value = self._credential_env.get(var) or os.environ.get(var) if value: env[var] = value diff --git a/tests/unit/test_browser_env.py b/tests/unit/test_browser_env.py new file mode 100644 index 0000000..ff9b2b6 --- /dev/null +++ b/tests/unit/test_browser_env.py @@ -0,0 +1,43 @@ +# ABOUTME: Tests that LocalWorkspaceEnvironment._browser_env forwards the right +# ABOUTME: BROWSER_MODE and cloud-browser credentials (Browserbase, Steel) to subprocesses. 
+ +from webwright.environments.local_workspace import LocalWorkspaceEnvironment + + +def test_browser_env_forwards_steel_mode_and_key(monkeypatch) -> None: + monkeypatch.setenv("STEEL_API_KEY", "sk-steel-test") + env = LocalWorkspaceEnvironment(browser_mode="steel") + + result = env._browser_env() + + assert result["BROWSER_MODE"] == "steel" + assert result["STEEL_API_KEY"] == "sk-steel-test" + + +def test_browser_env_omits_steel_key_when_unset(monkeypatch) -> None: + monkeypatch.delenv("STEEL_API_KEY", raising=False) + env = LocalWorkspaceEnvironment(browser_mode="steel") + + result = env._browser_env() + + assert "STEEL_API_KEY" not in result + + +def test_browser_env_forwards_browserbase_credentials(monkeypatch) -> None: + monkeypatch.setenv("BROWSERBASE_API_KEY", "bb-key") + monkeypatch.setenv("BROWSERBASE_PROJECT_ID", "bb-project") + env = LocalWorkspaceEnvironment(browser_mode="browserbase") + + result = env._browser_env() + + assert result["BROWSER_MODE"] == "browserbase" + assert result["BROWSERBASE_API_KEY"] == "bb-key" + assert result["BROWSERBASE_PROJECT_ID"] == "bb-project" + + +def test_browser_env_defaults_mode_to_browserbase(monkeypatch) -> None: + env = LocalWorkspaceEnvironment(browser_mode="") + + result = env._browser_env() + + assert result["BROWSER_MODE"] == "browserbase"