From 06ba87dc901a0ac0ded8649b1f80307b5ff55710 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Tue, 5 May 2026 15:55:34 -0500 Subject: [PATCH 01/26] init commit --- docs/sandbox/clients.md | 3 + examples/sandbox/extensions/README.md | 30 + .../sandbox/extensions/tensorlake_runner.py | 289 +++++ pyproject.toml | 5 + src/agents/extensions/sandbox/__init__.py | 24 + .../extensions/sandbox/tensorlake/__init__.py | 17 + .../extensions/sandbox/tensorlake/sandbox.py | 983 ++++++++++++++++++ tests/extensions/sandbox/test_tensorlake.py | 647 ++++++++++++ uv.lock | 113 +- 9 files changed, 2099 insertions(+), 12 deletions(-) create mode 100644 examples/sandbox/extensions/tensorlake_runner.py create mode 100644 src/agents/extensions/sandbox/tensorlake/__init__.py create mode 100644 src/agents/extensions/sandbox/tensorlake/sandbox.py create mode 100644 tests/extensions/sandbox/test_tensorlake.py diff --git a/docs/sandbox/clients.md b/docs/sandbox/clients.md index bd21da63d3..5b5f0ac1b0 100644 --- a/docs/sandbox/clients.md +++ b/docs/sandbox/clients.md @@ -96,6 +96,7 @@ For provider-specific setup notes and links for the checked-in extension example | `E2BSandboxClient` | `openai-agents[e2b]` | [E2B runner](https://github.com/openai/openai-agents-python/blob/main/examples/sandbox/extensions/e2b_runner.py) | | `ModalSandboxClient` | `openai-agents[modal]` | [Modal runner](https://github.com/openai/openai-agents-python/blob/main/examples/sandbox/extensions/modal_runner.py) | | `RunloopSandboxClient` | `openai-agents[runloop]` | [Runloop runner](https://github.com/openai/openai-agents-python/blob/main/examples/sandbox/extensions/runloop/runner.py) | +| `TensorlakeSandboxClient` | `openai-agents[tensorlake]` | [Tensorlake runner](https://github.com/openai/openai-agents-python/blob/main/examples/sandbox/extensions/tensorlake_runner.py) | | `VercelSandboxClient` | `openai-agents[vercel]` | [Vercel 
runner](https://github.com/openai/openai-agents-python/blob/main/examples/sandbox/extensions/vercel_runner.py) | @@ -113,6 +114,7 @@ Hosted sandbox clients expose provider-specific mount strategies. Choose the bac | `DaytonaSandboxClient` | Supports rclone-backed cloud storage mounts with `DaytonaCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | | `E2BSandboxClient` | Supports rclone-backed cloud storage mounts with `E2BCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | | `RunloopSandboxClient` | Supports rclone-backed cloud storage mounts with `RunloopCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | +| `TensorlakeSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. Tensorlake's native sandbox checkpoint API is available via `workspace_persistence="snapshot"`. | | `VercelSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. | @@ -130,6 +132,7 @@ The table below summarizes which remote storage entries each backend can mount d | `DaytonaSandboxClient` | ✓ | ✓ | ✓ | ✓ | ✓ | - | | `E2BSandboxClient` | ✓ | ✓ | ✓ | ✓ | ✓ | - | | `RunloopSandboxClient` | ✓ | ✓ | ✓ | ✓ | ✓ | - | +| `TensorlakeSandboxClient` | - | - | - | - | - | - | | `VercelSandboxClient` | - | - | - | - | - | - | diff --git a/examples/sandbox/extensions/README.md b/examples/sandbox/extensions/README.md index 837d9dfa28..18d41fb886 100644 --- a/examples/sandbox/extensions/README.md +++ b/examples/sandbox/extensions/README.md @@ -243,6 +243,36 @@ export DAYTONA_API_KEY=... 
uv run python examples/sandbox/extensions/daytona/daytona_runner.py --stream ``` +## Tensorlake + +### Setup + +Install the repo extra: + +```bash +uv sync --extra tensorlake +``` + +Sign up at [cloud.tensorlake.ai](https://cloud.tensorlake.ai/) (or run `tl login`) +and export the required environment variables: + +```bash +export OPENAI_API_KEY=... +export TENSORLAKE_API_KEY=... +``` + +### Run + +```bash +uv run python examples/sandbox/extensions/tensorlake_runner.py --stream +``` + +Useful flags: + +- `--image NAME` to pin a specific Tensorlake registered image. +- `--timeout-secs 600` to set the sandbox lifetime in seconds. +- `--workspace-persistence snapshot` to verify the native checkpoint round-trip. + ## Runloop ### Setup diff --git a/examples/sandbox/extensions/tensorlake_runner.py b/examples/sandbox/extensions/tensorlake_runner.py new file mode 100644 index 0000000000..ebe93fcd10 --- /dev/null +++ b/examples/sandbox/extensions/tensorlake_runner.py @@ -0,0 +1,289 @@ +""" +Minimal Tensorlake-backed sandbox example for manual validation. + +This mirrors the other cloud extension examples: it creates a tiny workspace, +verifies stop/resume persistence, then asks a sandboxed agent to inspect the +workspace through one shell tool.
+""" + +from __future__ import annotations + +import argparse +import asyncio +import io +import os +import sys +import tempfile +from pathlib import Path +from typing import Literal, cast + +from openai.types.responses import ResponseTextDeltaEvent + +from agents import ModelSettings, Runner +from agents.models.openai_provider import OpenAIProvider +from agents.run import RunConfig +from agents.sandbox import LocalSnapshotSpec, Manifest, SandboxAgent, SandboxRunConfig +from agents.sandbox.session import BaseSandboxSession + +if __package__ is None or __package__ == "": + sys.path.insert(0, str(Path(__file__).resolve().parents[3])) + +from examples.sandbox.misc.example_support import text_manifest +from examples.sandbox.misc.workspace_shell import WorkspaceShellCapability + +try: + from agents.extensions.sandbox import ( + TensorlakeSandboxClient, + TensorlakeSandboxClientOptions, + ) +except Exception as exc: # pragma: no cover - import path depends on optional extras + raise SystemExit( + "Tensorlake sandbox examples require the optional repo extra.\n" + "Install it with: uv sync --extra tensorlake" + ) from exc + + +DEFAULT_QUESTION = "Summarize this cloud sandbox workspace in 2 sentences." +SNAPSHOT_CHECK_PATH = Path("snapshot-check.txt") +SNAPSHOT_CHECK_CONTENT = "tensorlake snapshot round-trip ok\n" +LIVE_RESUME_CHECK_PATH = Path("live-resume-check.txt") +LIVE_RESUME_CHECK_CONTENT = "tensorlake live resume ok\n" + + +def _build_manifest() -> Manifest: + return text_manifest( + { + "README.md": ( + "# Tensorlake Demo Workspace\n\n" + "This workspace exists to validate the Tensorlake sandbox backend manually.\n" + ), + "handoff.md": ( + "# Handoff\n\n" + "- Customer: Northwind Traders.\n" + "- Goal: validate Tensorlake sandbox exec and persistence flows.\n" + "- Current status: non-PTY backend slice is wired and under test.\n" + ), + "todo.md": ( + "# Todo\n\n" + "1. Inspect the workspace files.\n" + "2. 
def _require_env(name: str) -> None:
    """Abort the example unless the environment variable *name* holds a value.

    A value that is empty or consists only of whitespace is treated as unset, so a
    blank export (for example ``export TENSORLAKE_API_KEY=" "``) fails here with a
    clear message instead of later inside the SDK.

    Args:
        name: Name of the environment variable to check.

    Raises:
        SystemExit: If the variable is missing, empty, or whitespace-only.
    """
    if os.environ.get(name, "").strip():
        return
    raise SystemExit(f"{name} must be set before running this example.")
options=TensorlakeSandboxClientOptions( + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ), + ) + + try: + await sandbox.start() + await sandbox.write( + LIVE_RESUME_CHECK_PATH, + io.BytesIO(LIVE_RESUME_CHECK_CONTENT.encode("utf-8")), + ) + serialized = client.serialize_session_state(sandbox.state) + resumed_sandbox = await client.resume(client.deserialize_session_state(serialized)) + try: + restored_text = await _read_text(resumed_sandbox, LIVE_RESUME_CHECK_PATH) + if restored_text != LIVE_RESUME_CHECK_CONTENT: + raise RuntimeError( + "Running sandbox resume verification failed: " + f"expected {LIVE_RESUME_CHECK_CONTENT!r}, got {restored_text!r}" + ) + finally: + await resumed_sandbox.aclose() + finally: + await sandbox.shutdown() + + print(f"running sandbox resume ok ({workspace_persistence})") + + +async def main( + *, + model: str, + question: str, + image: str | None, + timeout_secs: int | None, + workspace_persistence: Literal["tar", "snapshot"], + stream: bool, +) -> None: + _require_env("OPENAI_API_KEY") + _require_env("TENSORLAKE_API_KEY") + + manifest = _build_manifest() + + await _verify_stop_resume( + manifest=manifest, + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ) + await _verify_resume_running_sandbox( + manifest=manifest, + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ) + + agent = SandboxAgent( + name="Tensorlake Sandbox Assistant", + model=model, + instructions=( + "Answer questions about the sandbox workspace. Inspect the files before answering " + "and keep the response concise. " + "Do not invent files or statuses that are not present in the workspace. Cite the " + "file names you inspected." 
+ ), + default_manifest=manifest, + capabilities=[WorkspaceShellCapability()], + model_settings=ModelSettings(tool_choice="required"), + ) + + client = TensorlakeSandboxClient() + sandbox = await client.create( + manifest=manifest, + options=TensorlakeSandboxClientOptions( + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ), + ) + + run_config = RunConfig( + model_provider=OpenAIProvider(), + sandbox=SandboxRunConfig(session=sandbox), + tracing_disabled=True, + workflow_name="Tensorlake sandbox example", + ) + + try: + async with sandbox: + if not stream: + result = await Runner.run(agent, question, run_config=run_config) + print(result.final_output) + return + + stream_result = Runner.run_streamed(agent, question, run_config=run_config) + saw_text_delta = False + async for event in stream_result.stream_events(): + if event.type == "raw_response_event" and isinstance( + event.data, ResponseTextDeltaEvent + ): + if not saw_text_delta: + print("assistant> ", end="", flush=True) + saw_text_delta = True + print(event.data.delta, end="", flush=True) + + if saw_text_delta: + print() + finally: + await client.delete(sandbox) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="gpt-5.5", help="Model name to use.") + parser.add_argument("--question", default=DEFAULT_QUESTION, help="Prompt to send to the agent.") + parser.add_argument( + "--image", + default=None, + help="Optional Tensorlake registered image name. 
Falls back to the SDK default.", + ) + parser.add_argument( + "--timeout-secs", + type=int, + default=300, + help="Optional Tensorlake sandbox lifetime in seconds.", + ) + parser.add_argument( + "--workspace-persistence", + choices=("tar", "snapshot"), + default="tar", + help="Workspace persistence mode to verify before the agent run.", + ) + parser.add_argument("--stream", action="store_true", default=False, help="Stream the response.") + args = parser.parse_args() + + asyncio.run( + main( + model=args.model, + question=args.question, + image=args.image, + timeout_secs=args.timeout_secs, + workspace_persistence=cast(Literal["tar", "snapshot"], args.workspace_persistence), + stream=args.stream, + ) + ) diff --git a/pyproject.toml b/pyproject.toml index 814e4dabe8..b6c92b6a41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ cloudflare = ["aiohttp>=3.12,<4"] e2b = ["e2b==2.20.0", "e2b-code-interpreter==2.4.1"] modal = ["modal==1.3.5"] runloop = ["runloop_api_client>=1.16.0,<2.0.0"] +tensorlake = ["tensorlake>=0.5.4; python_version<'3.14'"] vercel = ["vercel>=0.5.6,<0.6"] s3 = ["boto3>=1.34"] temporal = [ @@ -156,6 +157,10 @@ ignore_missing_imports = true module = ["runloop_api_client", "runloop_api_client.*"] ignore_missing_imports = true +[[tool.mypy.overrides]] +module = ["tensorlake", "tensorlake.*"] +ignore_missing_imports = true + [[tool.mypy.overrides]] module = ["blaxel", "blaxel.*"] ignore_missing_imports = true diff --git a/src/agents/extensions/sandbox/__init__.py b/src/agents/extensions/sandbox/__init__.py index d7b082ba1f..9d45cebf25 100644 --- a/src/agents/extensions/sandbox/__init__.py +++ b/src/agents/extensions/sandbox/__init__.py @@ -97,6 +97,19 @@ except Exception: # pragma: no cover _HAS_RUNLOOP = False +try: + from .tensorlake import ( + TensorlakeSandboxClient as TensorlakeSandboxClient, + TensorlakeSandboxClientOptions as TensorlakeSandboxClientOptions, + TensorlakeSandboxSession as TensorlakeSandboxSession, + 
TensorlakeSandboxSessionState as TensorlakeSandboxSessionState, + TensorlakeSandboxTimeouts as TensorlakeSandboxTimeouts, + ) + + _HAS_TENSORLAKE = True +except Exception: # pragma: no cover + _HAS_TENSORLAKE = False + try: from .vercel import ( VercelSandboxClient as VercelSandboxClient, @@ -177,6 +190,17 @@ ] ) +if _HAS_TENSORLAKE: + __all__.extend( + [ + "TensorlakeSandboxClient", + "TensorlakeSandboxClientOptions", + "TensorlakeSandboxSession", + "TensorlakeSandboxSessionState", + "TensorlakeSandboxTimeouts", + ] + ) + if _HAS_VERCEL: __all__.extend( [ diff --git a/src/agents/extensions/sandbox/tensorlake/__init__.py b/src/agents/extensions/sandbox/tensorlake/__init__.py new file mode 100644 index 0000000000..c75b0c6021 --- /dev/null +++ b/src/agents/extensions/sandbox/tensorlake/__init__.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from .sandbox import ( + TensorlakeSandboxClient, + TensorlakeSandboxClientOptions, + TensorlakeSandboxSession, + TensorlakeSandboxSessionState, + TensorlakeSandboxTimeouts, +) + +__all__ = [ + "TensorlakeSandboxClient", + "TensorlakeSandboxClientOptions", + "TensorlakeSandboxSession", + "TensorlakeSandboxSessionState", + "TensorlakeSandboxTimeouts", +] diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py new file mode 100644 index 0000000000..6cab3fe583 --- /dev/null +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -0,0 +1,983 @@ +""" +Tensorlake sandbox (https://tensorlake.ai) implementation. + +Set `TENSORLAKE_API_KEY` (or run `tl login`) to authenticate. + +This module provides a Tensorlake-backed sandbox client/session implementation backed by +`tensorlake.sandbox.Sandbox`. The Tensorlake Python SDK is synchronous, so blocking calls +are dispatched through `asyncio.to_thread` to integrate with the async session contract. 
+ +Note: The `tensorlake` dependency is optional (installed via the `tensorlake` extra), so +imports of the SDK happen lazily within this module. Users without the extra can still +import the package; they just cannot construct a `TensorlakeSandboxClient`. +""" + +from __future__ import annotations + +import asyncio +import io +import json +import logging +import uuid +from pathlib import Path +from typing import Any, Literal, cast +from urllib.parse import urlsplit + +from pydantic import BaseModel, Field + +from ....sandbox.errors import ( + ExecNonZeroError, + ExecTimeoutError, + ExecTransportError, + ExposedPortUnavailableError, + WorkspaceArchiveReadError, + WorkspaceArchiveWriteError, + WorkspaceReadNotFoundError, + WorkspaceStartError, + WorkspaceWriteTypeError, +) +from ....sandbox.manifest import Manifest +from ....sandbox.session import SandboxSession, SandboxSessionState +from ....sandbox.session.base_sandbox_session import BaseSandboxSession +from ....sandbox.session.dependencies import Dependencies +from ....sandbox.session.manager import Instrumentation +from ....sandbox.session.mount_lifecycle import with_ephemeral_mounts_removed +from ....sandbox.session.runtime_helpers import RESOLVE_WORKSPACE_PATH_HELPER, RuntimeHelperScript +from ....sandbox.session.sandbox_client import BaseSandboxClient, BaseSandboxClientOptions +from ....sandbox.snapshot import SnapshotBase, SnapshotSpec, resolve_snapshot +from ....sandbox.types import ExecResult, ExposedPortEndpoint, User +from ....sandbox.util.tar_utils import UnsafeTarMemberError, validate_tar_bytes +from ....sandbox.workspace_paths import posix_path_for_error, sandbox_path_str + +logger = logging.getLogger(__name__) + +WorkspacePersistenceMode = Literal["tar", "snapshot"] +CheckpointMode = Literal["filesystem", "memory"] + +_WORKSPACE_PERSISTENCE_TAR: WorkspacePersistenceMode = "tar" +_WORKSPACE_PERSISTENCE_SNAPSHOT: WorkspacePersistenceMode = "snapshot" + +# Magic prefix for Tensorlake checkpoint 
def _is_running_status(status: object) -> bool:
    """Report whether *status* denotes a running sandbox.

    The check reads ``status.value`` when that attribute exists (enum-style objects)
    and otherwise uses *status* itself, then compares case-insensitively against
    ``"running"``. Anything that does not resolve to a string is not running.
    """
    raw = getattr(status, "value", status)
    if not isinstance(raw, str):
        return False
    return raw.lower() == "running"
class TensorlakeSandboxClientOptions(BaseSandboxClientOptions):
    """Client options for the Tensorlake sandbox backend.

    Every option is declared twice: once as a model field and once as an ``__init__``
    parameter with the same default. The explicit ``__init__`` lets the options be
    passed positionally (all parameters before ``*``) and forwards each one to the
    base class by keyword; ``type`` is keyword-only.
    """

    # Discriminator value for this backend's options.
    type: Literal["tensorlake"] = "tensorlake"
    # Sandbox shape and lifetime. NOTE(review): presumably forwarded to the SDK's
    # sandbox-create call; the client code that consumes these is outside this view.
    image: str | None = None
    cpus: float | None = None
    memory_mb: int | None = None
    timeout_secs: int | None = None
    name: str | None = None
    secret_names: tuple[str, ...] = ()
    envs: dict[str, str] | None = None
    # Network policy.
    allow_internet_access: bool = True
    allow_out: tuple[str, ...] = ()
    deny_out: tuple[str, ...] = ()
    exposed_ports: tuple[int, ...] = ()
    allow_unauthenticated_port_access: bool = False
    pause_on_exit: bool = False
    # Workspace persistence: "tar" or "snapshot" (see WorkspacePersistenceMode).
    workspace_persistence: WorkspacePersistenceMode = _WORKSPACE_PERSISTENCE_TAR
    # Checkpoint flavour: "filesystem" or "memory" (see CheckpointMode).
    checkpoint_mode: CheckpointMode = "filesystem"
    checkpoint_timeout_s: float = 300.0
    # Accepts a timeouts model, a plain dict, or None.
    timeouts: TensorlakeSandboxTimeouts | dict[str, object] | None = None

    def __init__(
        self,
        image: str | None = None,
        cpus: float | None = None,
        memory_mb: int | None = None,
        timeout_secs: int | None = None,
        name: str | None = None,
        secret_names: tuple[str, ...] = (),
        envs: dict[str, str] | None = None,
        allow_internet_access: bool = True,
        allow_out: tuple[str, ...] = (),
        deny_out: tuple[str, ...] = (),
        exposed_ports: tuple[int, ...] = (),
        allow_unauthenticated_port_access: bool = False,
        pause_on_exit: bool = False,
        workspace_persistence: WorkspacePersistenceMode = _WORKSPACE_PERSISTENCE_TAR,
        checkpoint_mode: CheckpointMode = "filesystem",
        checkpoint_timeout_s: float = 300.0,
        timeouts: TensorlakeSandboxTimeouts | dict[str, object] | None = None,
        *,
        type: Literal["tensorlake"] = "tensorlake",
    ) -> None:
        # Forward everything by keyword so the base class receives named fields only.
        super().__init__(
            type=type,
            image=image,
            cpus=cpus,
            memory_mb=memory_mb,
            timeout_secs=timeout_secs,
            name=name,
            secret_names=secret_names,
            envs=envs,
            allow_internet_access=allow_internet_access,
            allow_out=allow_out,
            deny_out=deny_out,
            exposed_ports=exposed_ports,
            allow_unauthenticated_port_access=allow_unauthenticated_port_access,
            pause_on_exit=pause_on_exit,
            workspace_persistence=workspace_persistence,
            checkpoint_mode=checkpoint_mode,
            checkpoint_timeout_s=checkpoint_timeout_s,
            timeouts=timeouts,
        )
def _resolve_create_kwargs(
    *,
    image: str | None,
    cpus: float | None,
    memory_mb: int | None,
    timeout_secs: int | None,
    name: str | None,
    secret_names: tuple[str, ...],
    allow_internet_access: bool,
    allow_out: tuple[str, ...],
    deny_out: tuple[str, ...],
    snapshot_id: str | None,
) -> dict[str, object]:
    """Assemble the keyword arguments for `Sandbox.create(...)`.

    `allow_internet_access` is always present. Every other entry is added only when
    the caller supplied something, which leaves the SDK free to apply its own
    defaults. Environment variables are deliberately absent: Tensorlake does not
    accept them at sandbox-create time, so they travel with each `sandbox.run(...)`
    call instead.
    """

    create_args: dict[str, object] = {"allow_internet_access": allow_internet_access}

    # Scalar options: `None` means "not configured".
    scalar_options: tuple[tuple[str, object], ...] = (
        ("image", image),
        ("cpus", cpus),
        ("memory_mb", memory_mb),
        ("timeout_secs", timeout_secs),
        ("name", name),
    )
    create_args.update({key: value for key, value in scalar_options if value is not None})

    # Sequence options: an empty tuple means "not configured"; the SDK gets lists.
    sequence_options: tuple[tuple[str, tuple[str, ...]], ...] = (
        ("secret_names", secret_names),
        ("allow_out", allow_out),
        ("deny_out", deny_out),
    )
    create_args.update({key: list(items) for key, items in sequence_options if items})

    if snapshot_id is not None:
        create_args["snapshot_id"] = snapshot_id
    return create_args
self, + *, + state: TensorlakeSandboxSessionState, + sandbox: Any, + ) -> None: + self.state = state + self._sandbox = sandbox + self._envs_cache = None + + @classmethod + def from_state( + cls, + state: TensorlakeSandboxSessionState, + *, + sandbox: Any, + ) -> TensorlakeSandboxSession: + return cls(state=state, sandbox=sandbox) + + @property + def sandbox_id(self) -> str: + return self.state.sandbox_id + + def supports_pty(self) -> bool: + # WebSocket PTY API not yet wired through this integration. + return False + + async def _validate_path_access(self, path: Path | str, *, for_write: bool = False) -> Path: + return await self._validate_remote_path_access(path, for_write=for_write) + + def _runtime_helpers(self) -> tuple[RuntimeHelperScript, ...]: + return (RESOLVE_WORKSPACE_PATH_HELPER,) + + def _current_runtime_helper_cache_key(self) -> object | None: + return self.state.sandbox_id + + async def _resolved_envs(self) -> dict[str, str]: + # The manifest is treated as immutable for the lifetime of a session, so we resolve + # secret-store/env values once and reuse the merged dict across exec/file operations. 
def _coerce_exec_timeout(self, timeout_s: float | None) -> float:
    """Normalize a caller-supplied exec timeout to whole seconds, never below one.

    The visible exec path hands the result to the SDK through ``int(...)``, so a
    fractional value would be truncated: 0.5 would become 0 and 1.9 would become 1.
    Rounding up here keeps the effective timeout at least as long as requested.
    NOTE(review): assumes the SDK `run` timeout is expressed in integer seconds,
    as the ``int(...)`` casts at the call sites suggest - confirm against the SDK.

    Args:
        timeout_s: Requested timeout in seconds. ``None`` or positive infinity
            selects ``timeouts.exec_timeout_unbounded_s`` (the safety-net bound).
            Zero or negative values collapse to the one-second minimum.

    Returns:
        The timeout as a float holding a whole number of seconds, ``>= 1.0``.
    """
    import math  # local import keeps this fix self-contained

    if timeout_s is None or timeout_s == math.inf:
        requested = float(self.state.timeouts.exec_timeout_unbounded_s)
    else:
        requested = float(timeout_s)
    if requested <= 0:
        return 1.0
    return float(max(1, math.ceil(requested)))
async def _exec_internal(
    self,
    *command: str | Path,
    timeout: float | None = None,
) -> ExecResult:
    """Run *command* in the sandbox and return its captured output.

    The command runs with the manifest root as working directory and with the
    session's resolved environment. An empty command is a no-op that reports
    success without contacting the sandbox.

    Args:
        *command: Program followed by its arguments; each part is stringified.
        timeout: Timeout in seconds, or ``None`` for the configured safety-net bound.

    Returns:
        An `ExecResult` with UTF-8 encoded stdout/stderr and the exit code.

    Raises:
        ExecTimeoutError: If the failure looks like a timeout (exception type name
            contains "timeout" or its message contains "timed out").
        ExecTransportError: For any other failure while invoking the sandbox.
    """
    import math  # local import keeps this fix self-contained

    normalized = [str(part) for part in command]
    if not normalized:
        return ExecResult(stdout=b"", stderr=b"", exit_code=0)

    envs = await self._resolved_envs()
    cwd = sandbox_path_str(self.state.manifest.root)
    exec_timeout = self._coerce_exec_timeout(timeout)

    try:
        # Rely on the SDK's own `timeout`; an outer `asyncio.wait_for` would only cancel
        # the awaiter, not the underlying thread, so it would not free the runtime.
        result = await asyncio.to_thread(
            self._sandbox.run,
            normalized[0],
            normalized[1:],
            env=envs or None,
            working_dir=cwd,
            # Round up with a one-second floor: plain `int()` truncation would turn a
            # sub-second timeout into 0 and shorten every fractional one.
            timeout=max(1, math.ceil(exec_timeout)),
        )
    except Exception as exc:
        if "timeout" in type(exc).__name__.lower() or "timed out" in str(exc).lower():
            raise ExecTimeoutError(command=command, timeout_s=timeout, cause=exc) from exc
        raise ExecTransportError(
            command=command,
            context={"backend": "tensorlake", "sandbox_id": self.state.sandbox_id},
            cause=exc,
        ) from exc

    # Missing or `None` attributes on the SDK result fall back to empty output / exit 0.
    stdout_str = str(getattr(result, "stdout", "") or "")
    stderr_str = str(getattr(result, "stderr", "") or "")
    exit_code = int(getattr(result, "exit_code", 0) or 0)
    return ExecResult(
        stdout=stdout_str.encode("utf-8", errors="replace"),
        stderr=stderr_str.encode("utf-8", errors="replace"),
        exit_code=exit_code,
    )
async def read(self, path: Path, *, user: str | User | None = None) -> io.IOBase:
    """Fetch a file from the sandbox and return its contents as an in-memory stream.

    When *user* is given, `_check_read_with_exec` runs first. The path is then
    validated and read through the SDK's `read_file`. Text payloads are encoded as
    UTF-8, byte payloads are copied, and anything else is stringified.

    Raises:
        WorkspaceReadNotFoundError: On `FileNotFoundError` or an error whose
            `status_code` attribute is 404.
        WorkspaceArchiveReadError: On any other read failure.
    """
    if user is not None:
        await self._check_read_with_exec(path, user=user)

    target = await self._validate_path_access(path)

    try:
        raw = await asyncio.to_thread(self._sandbox.read_file, sandbox_path_str(target))
    except Exception as exc:
        not_found = isinstance(exc, FileNotFoundError) or (
            getattr(exc, "status_code", None) == 404
        )
        if not_found:
            raise WorkspaceReadNotFoundError(path=target, cause=exc) from exc
        raise WorkspaceArchiveReadError(path=target, cause=exc) from exc

    if isinstance(raw, bytes | bytearray):
        content = bytes(raw)
    elif isinstance(raw, str):
        content = raw.encode("utf-8")
    else:
        content = str(raw).encode("utf-8", errors="replace")
    return io.BytesIO(content)
bool = False, + user: str | User | None = None, + ) -> None: + if user is not None: + path = await self._check_mkdir_with_exec(path, parents=parents, user=user) + else: + path = await self._validate_path_access(path, for_write=True) + + if path == Path("/"): + return + + flag = "-p" if parents else "" + argv = [a for a in [flag, "--", sandbox_path_str(path)] if a] + try: + envs = await self._resolved_envs() + result = await asyncio.to_thread( + self._sandbox.run, + "mkdir", + argv, + env=envs or None, + timeout=int(self.state.timeouts.fast_op_s), + ) + except Exception as exc: + raise WorkspaceArchiveWriteError( + path=path, context={"reason": "mkdir_failed"}, cause=exc + ) from exc + + exit_code = int(getattr(result, "exit_code", 0) or 0) + if exit_code != 0: + raise WorkspaceArchiveWriteError( + path=path, + context={ + "reason": "mkdir_nonzero_exit", + "exit_code": exit_code, + "stderr": str(getattr(result, "stderr", "") or ""), + }, + ) + + async def persist_workspace(self) -> io.IOBase: + return await with_ephemeral_mounts_removed( + self, + self._persist_workspace_internal, + error_path=self._workspace_root_path(), + error_cls=WorkspaceArchiveReadError, + operation_error_context_key="snapshot_error_before_remount_corruption", + ) + + async def _persist_workspace_internal(self) -> io.IOBase: + if self.state.workspace_persistence == _WORKSPACE_PERSISTENCE_SNAPSHOT: + return await self._persist_workspace_via_checkpoint() + return await self._persist_workspace_via_tar() + + async def _persist_workspace_via_checkpoint(self) -> io.IOBase: + """Persist using Tensorlake's native sandbox checkpoint API. + + Falls back to tar when the backend declines or when path-level skips are required — + Tensorlake checkpoints capture the whole sandbox and have no path-level excludes. 
async def _persist_workspace_via_tar(self) -> io.IOBase:
    """Archive the workspace root with ``tar`` inside the sandbox and return its bytes.

    The archive is written to a per-session file under ``/tmp`` in the sandbox,
    read back with ``read_file`` and then removed. Removal happens in a
    ``finally`` block so a failed ``tar`` run or a failed read does not leave
    the temporary archive behind.

    Returns:
        An in-memory stream positioned at the start of the tar archive.

    Raises:
        WorkspaceArchiveReadError: If ``tar`` cannot be run, exits non-zero, or
            the archive cannot be read back.
    """
    root = self._workspace_root_path()
    error_root = posix_path_for_error(root)
    archive_path = f"/tmp/openai-agents-{self.state.session_id.hex}.tar"

    # Emit both `foo` and `./foo` exclude patterns since tar can produce members either way
    # depending on archive provenance.
    excludes: list[str] = []
    for rel in sorted(self._persist_workspace_skip_relpaths(), key=lambda p: p.as_posix()):
        rel_posix = rel.as_posix().lstrip("/")
        if not rel_posix or rel_posix == ".":
            continue
        excludes.append(f"--exclude={rel_posix}")
        excludes.append(f"--exclude=./{rel_posix}")
    tar_argv = ["cf", archive_path, *excludes, "-C", root.as_posix(), "."]

    try:
        try:
            envs = await self._resolved_envs()
            result = await asyncio.to_thread(
                self._sandbox.run,
                "tar",
                tar_argv,
                env=envs or None,
                timeout=int(self.state.timeouts.snapshot_tar_s),
            )
        except Exception as exc:
            raise WorkspaceArchiveReadError(path=error_root, cause=exc) from exc

        exit_code = int(getattr(result, "exit_code", 0) or 0)
        if exit_code != 0:
            raise WorkspaceArchiveReadError(
                path=error_root,
                cause=ExecNonZeroError(
                    ExecResult(
                        stdout=str(getattr(result, "stdout", "") or "").encode(
                            "utf-8", errors="replace"
                        ),
                        stderr=str(getattr(result, "stderr", "") or "").encode(
                            "utf-8", errors="replace"
                        ),
                        exit_code=exit_code,
                    ),
                    command=("tar", *tar_argv),
                    context={"backend": "tensorlake", "sandbox_id": self.state.sandbox_id},
                ),
            )

        try:
            payload = await asyncio.to_thread(self._sandbox.read_file, archive_path)
        except Exception as exc:
            raise WorkspaceArchiveReadError(path=error_root, cause=exc) from exc
    finally:
        # Best-effort cleanup on every path; a cleanup failure must not mask the
        # real outcome, so it is only logged.
        try:
            await asyncio.to_thread(
                self._sandbox.run,
                "rm",
                ["-f", "--", archive_path],
                timeout=int(self.state.timeouts.cleanup_s),
            )
        except Exception as exc:
            logger.debug(
                "Failed to remove temporary workspace archive in Tensorlake sandbox.",
                extra={"sandbox_id": self.state.sandbox_id, "archive_path": archive_path},
                exc_info=exc,
            )

    # Normalise whatever ``read_file`` returned into bytes.
    if isinstance(payload, str):
        archive_bytes = payload.encode("utf-8")
    elif isinstance(payload, bytes | bytearray):
        archive_bytes = bytes(payload)
    else:
        archive_bytes = str(payload).encode("utf-8", errors="replace")

    return io.BytesIO(archive_bytes)
async def _restore_from_checkpoint(self, snapshot_id: str) -> None:
    """Replace the current sandbox with one created from ``snapshot_id``.

    The current sandbox is terminated (best effort), then ``Sandbox.create`` is
    called with the creation settings stored on the session state plus
    ``snapshot_id``. On success the session switches to the new sandbox,
    records its id when one is reported, re-applies exposed ports and marks the
    workspace root as ready.

    Args:
        snapshot_id: Identifier of the Tensorlake checkpoint to restore from.

    Raises:
        WorkspaceArchiveWriteError: If creating the replacement sandbox fails
            (``reason="tensorlake_checkpoint_restore_failed"``).
    """
    error_root = posix_path_for_error(self._workspace_root_path())
    Sandbox, _, _, _ = _import_tensorlake_sandbox()

    try:
        await asyncio.to_thread(self._sandbox.terminate)
    except Exception as exc:
        # Still best effort: the restore continues, but the failure is logged
        # instead of being dropped silently.
        logger.warning(
            "Failed to terminate Tensorlake sandbox before checkpoint restore.",
            extra={"sandbox_id": self.state.sandbox_id, "snapshot_id": snapshot_id},
            exc_info=exc,
        )

    kwargs = _resolve_create_kwargs(
        image=self.state.image,
        cpus=self.state.cpus,
        memory_mb=self.state.memory_mb,
        timeout_secs=self.state.timeout_secs,
        name=self.state.name,
        secret_names=self.state.secret_names,
        allow_internet_access=self.state.allow_internet_access,
        allow_out=self.state.allow_out,
        deny_out=self.state.deny_out,
        snapshot_id=snapshot_id,
    )

    try:
        sandbox = await asyncio.to_thread(Sandbox.create, **kwargs)
    except Exception as exc:
        raise WorkspaceArchiveWriteError(
            path=error_root,
            context={
                "reason": "tensorlake_checkpoint_restore_failed",
                "snapshot_id": snapshot_id,
            },
            cause=exc,
        ) from exc

    self._sandbox = sandbox
    new_id = getattr(sandbox, "sandbox_id", None)
    if isinstance(new_id, str) and new_id:
        self.state.sandbox_id = new_id
    else:
        # The stored id keeps pointing at the sandbox that was just terminated;
        # make that visible rather than leaving it unnoticed.
        logger.warning(
            "Tensorlake sandbox restored from checkpoint reported no usable sandbox_id; "
            "session state keeps the previous id.",
            extra={"sandbox_id": self.state.sandbox_id, "snapshot_id": snapshot_id},
        )
    await self._maybe_apply_exposed_ports()
    self.state.workspace_root_ready = True
async def _maybe_apply_exposed_ports(self) -> None:
    """Push the session's exposed ports to the sandbox, if any are configured.

    Calls ``sandbox.update`` in a worker thread with the ports from session
    state and the ``allow_unauthenticated_port_access`` setting. A failure is
    logged as a warning and not raised.
    """
    requested_ports = list(self.state.exposed_ports)
    if requested_ports:
        allow_unauthenticated = self.state.allow_unauthenticated_port_access
        try:
            await asyncio.to_thread(
                self._sandbox.update,
                exposed_ports=requested_ports,
                allow_unauthenticated_access=allow_unauthenticated,
            )
        except Exception as exc:
            log_extra = {"sandbox_id": self.state.sandbox_id, "ports": requested_ports}
            logger.warning(
                "Failed to apply exposed_ports on Tensorlake sandbox.",
                extra=log_extra,
                exc_info=exc,
            )
def __init__(
    self,
    *,
    instrumentation: Instrumentation | None = None,
    dependencies: Dependencies | None = None,
) -> None:
    """Initialise the client.

    Args:
        instrumentation: Stored on ``_instrumentation``; a new
            ``Instrumentation()`` is used when the argument is omitted or falsy.
        dependencies: Stored unchanged on ``_dependencies``.
    """
    super().__init__()
    self._dependencies = dependencies
    self._instrumentation = instrumentation if instrumentation else Instrumentation()
async def resume(
    self,
    state: SandboxSessionState,
) -> SandboxSession:
    """Reattach to the sandbox recorded in ``state``, or create a fresh one.

    The client first tries ``Sandbox.connect(state.sandbox_id)``. When
    ``pause_on_exit`` is set it then calls ``sandbox.resume()``. If that call
    succeeds the sandbox is treated as reconnected. If it fails, or when
    ``pause_on_exit`` is not set, the sandbox only counts as reconnected when
    its ``status`` is a running status. Previously a failed ``resume()`` was
    ignored and the sandbox was still reported as reconnected.

    When no usable sandbox is found, a new one is created from the settings in
    ``state``, ``state.sandbox_id`` is updated when the new sandbox reports an
    id, ``workspace_root_ready`` is reset, and exposed ports are re-applied.

    Args:
        state: Previously serialised session state; must be a
            ``TensorlakeSandboxSessionState``.

    Returns:
        The wrapped session bound to the reconnected or newly created sandbox.

    Raises:
        TypeError: If ``state`` is not a ``TensorlakeSandboxSessionState``.
    """
    if not isinstance(state, TensorlakeSandboxSessionState):
        raise TypeError(
            "TensorlakeSandboxClient.resume expects a TensorlakeSandboxSessionState"
        )

    Sandbox, _, _, _ = _import_tensorlake_sandbox()

    sandbox: Any = None
    reconnected = False
    try:
        sandbox = await asyncio.to_thread(Sandbox.connect, state.sandbox_id)
        resumed = False
        if state.pause_on_exit:
            # Paused sandboxes are valid reconnect targets only when pause-on-exit was
            # configured; turn them back on before use.
            try:
                await asyncio.to_thread(sandbox.resume)
                resumed = True
            except Exception as exc:
                logger.debug(
                    "Tensorlake sandbox resume call failed; checking status instead.",
                    extra={"sandbox_id": state.sandbox_id},
                    exc_info=exc,
                )
        if resumed:
            reconnected = True
        else:
            # Without a successful resume(), only a sandbox that reports a
            # running status is reused.
            status = await asyncio.to_thread(getattr, sandbox, "status")
            if _is_running_status(status):
                reconnected = True
            else:
                sandbox = None
    except Exception as exc:
        logger.debug(
            "Could not reconnect to Tensorlake sandbox; a new one will be created.",
            extra={"sandbox_id": state.sandbox_id},
            exc_info=exc,
        )
        sandbox = None

    if sandbox is None:
        kwargs = _resolve_create_kwargs(
            image=state.image,
            cpus=state.cpus,
            memory_mb=state.memory_mb,
            timeout_secs=state.timeout_secs,
            name=state.name,
            secret_names=state.secret_names,
            allow_internet_access=state.allow_internet_access,
            allow_out=state.allow_out,
            deny_out=state.deny_out,
            snapshot_id=None,
        )
        sandbox = await asyncio.to_thread(Sandbox.create, **kwargs)
        new_id = getattr(sandbox, "sandbox_id", None)
        if isinstance(new_id, str) and new_id:
            state.sandbox_id = new_id
        state.workspace_root_ready = False

    inner = TensorlakeSandboxSession.from_state(state, sandbox=sandbox)
    inner._set_start_state_preserved(reconnected, system=reconnected)
    if not reconnected:
        await inner._maybe_apply_exposed_ports()
    return self._wrap_session(inner, instrumentation=self._instrumentation)
File +from agents.sandbox.snapshot import NoopSnapshot +from tests._fake_workspace_paths import resolve_fake_workspace_path + + +class _FakeCommandResult: + def __init__(self, *, stdout: str = "", stderr: str = "", exit_code: int = 0) -> None: + self.stdout = stdout + self.stderr = stderr + self.exit_code = exit_code + + +class _FakeSnapshotInfo: + def __init__(self, snapshot_id: str) -> None: + self.snapshot_id = snapshot_id + + +class _FakeCheckpointType: + FILESYSTEM = "filesystem" + MEMORY = "memory" + + +class _FakeSandboxStatus: + class _Member: + def __init__(self, value: str) -> None: + self.value = value + + PENDING = _Member("pending") + RUNNING = _Member("running") + SUSPENDED = _Member("suspended") + TERMINATED = _Member("terminated") + + +class _FakeRemoteAPIError(Exception): + def __init__(self, status_code: int, message: str = "") -> None: + super().__init__(f"API error (status {status_code}): {message}") + self.status_code = status_code + self.message = message + + +class _FakeSandbox: + """Synchronous fake mirroring the Tensorlake `Sandbox` surface used by the integration.""" + + create_calls: list[dict[str, object]] = [] + connect_calls: list[dict[str, object]] = [] + sandboxes: dict[str, _FakeSandbox] = {} + snapshots: dict[str, dict[str, bytes]] = {} + next_sandbox_index: int = 0 + create_failures: list[BaseException] = [] + connect_failures: dict[str, BaseException] = {} + + def __init__( + self, + *, + sandbox_id: str, + name: str | None = None, + status: str = "running", + files: dict[str, bytes] | None = None, + ) -> None: + self.sandbox_id = sandbox_id + self.name = name + self.status = status + self.files: dict[str, bytes] = dict(files or {}) + self.run_calls: list[dict[str, object]] = [] + self.update_calls: list[dict[str, object]] = [] + self.terminated = False + self.suspended = False + self.resumed = False + self.next_run_result: _FakeCommandResult | None = None + self.symlinks: dict[str, str] = {} + + @classmethod + def reset(cls) -> 
def run(
    self,
    command: str,
    args: list[str] | None = None,
    env: dict[str, str] | None = None,
    working_dir: str | None = None,
    timeout: float | None = None,
) -> _FakeCommandResult:
    """Record the call and emulate the commands the integration issues.

    Order of handling:
      1. Path-resolution commands recognised by ``resolve_fake_workspace_path``.
      2. A one-shot canned result set through ``next_run_result``.
      3. ``tar cf`` / ``tar xf`` against the in-memory ``files`` mapping.
      4. ``rm``, which drops the named entries from ``files``.
      5. Everything else (including ``mkdir`` and ``test -d``) succeeds with
         empty output.

    ``env`` and ``timeout`` are accepted for signature compatibility and ignored.
    """
    _ = (env, timeout)
    args = args or []
    self.run_calls.append(
        {
            "command": command,
            "args": list(args),
            "working_dir": working_dir,
        }
    )

    resolved = resolve_fake_workspace_path(
        (command, *args), symlinks=self.symlinks, home_dir="/workspace"
    )
    if resolved is not None:
        return _FakeCommandResult(
            exit_code=resolved.exit_code,
            stdout=resolved.stdout,
            stderr=resolved.stderr,
        )

    if self.next_run_result is not None:
        # The canned result is consumed by the first call that reaches here.
        result, self.next_run_result = self.next_run_result, None
        return result

    if command == "tar" and args and args[0] == "cf":
        archive_path = args[1]
        assert "-C" in args
        tar_root = args[args.index("-C") + 1]
        root_prefix = tar_root.rstrip("/") + "/"
        include_dot = args[-1] == "."
        # Only the ``./``-prefixed spelling of each exclude is parsed here.
        exclusions = {
            arg.removeprefix("--exclude=./") for arg in args if arg.startswith("--exclude=./")
        }
        buffer = io.BytesIO()
        with tarfile.open(fileobj=buffer, mode="w") as tar:
            for path, content in sorted(self.files.items()):
                if not path.startswith(root_prefix):
                    continue
                rel_path = path[len(root_prefix) :]
                if any(rel_path == ex or rel_path.startswith(f"{ex}/") for ex in exclusions):
                    continue
                info = tarfile.TarInfo(name=rel_path if include_dot else path)
                info.size = len(content)
                tar.addfile(info, io.BytesIO(content))
        self.files[archive_path] = buffer.getvalue()
        return _FakeCommandResult()

    if command == "tar" and args and args[0] == "xf":
        archive_path = args[1]
        destination = args[args.index("-C") + 1]
        raw = self.files[archive_path]
        with tarfile.open(fileobj=io.BytesIO(raw), mode="r") as tar:
            for member in tar.getmembers():
                if not member.isfile():
                    continue
                extracted = tar.extractfile(member)
                assert extracted is not None
                self.files[f"{destination.rstrip('/')}/{member.name}"] = extracted.read()
        return _FakeCommandResult()

    if command == "rm":
        for arg in args:
            if arg in {"-f", "--"}:
                continue
            self.files.pop(arg, None)
        return _FakeCommandResult()

    # ``mkdir``, ``test -d`` and any other command: success with empty output.
    return _FakeCommandResult()
def _load_tensorlake_module(monkeypatch: pytest.MonkeyPatch) -> Any:
    """Import the Tensorlake backend module against a fake ``tensorlake`` package.

    Resets the fake sandbox registry, installs stub ``tensorlake`` and
    ``tensorlake.sandbox`` modules into ``sys.modules`` through ``monkeypatch``,
    drops any cached copy of the backend modules and imports the backend fresh.
    """
    _FakeSandbox.reset()

    root_stub = types.ModuleType("tensorlake")
    sandbox_stub = cast(Any, types.ModuleType("tensorlake.sandbox"))
    stub_attributes = {
        "Sandbox": _FakeSandbox,
        "CheckpointType": _FakeCheckpointType,
        "SandboxStatus": _FakeSandboxStatus,
        "RemoteAPIError": _FakeRemoteAPIError,
    }
    for attr_name, attr_value in stub_attributes.items():
        setattr(sandbox_stub, attr_name, attr_value)

    monkeypatch.setitem(sys.modules, "tensorlake", root_stub)
    monkeypatch.setitem(sys.modules, "tensorlake.sandbox", sandbox_stub)

    # Force a re-import so the backend binds to the stubs installed above.
    for cached_name in (
        "agents.extensions.sandbox.tensorlake.sandbox",
        "agents.extensions.sandbox.tensorlake",
    ):
        sys.modules.pop(cached_name, None)

    return importlib.import_module("agents.extensions.sandbox.tensorlake.sandbox")
def test_tensorlake_supports_pty_is_disabled(monkeypatch: pytest.MonkeyPatch) -> None:
    """A Tensorlake session reports that PTY is not supported."""
    module = _load_tensorlake_module(monkeypatch)
    sandbox_id = "sandbox-no-pty"
    session_state = module.TensorlakeSandboxSessionState(
        session_id=uuid.UUID("00000000-0000-0000-0000-000000000001"),
        manifest=Manifest(),
        snapshot=NoopSnapshot(id="snap"),
        sandbox_id=sandbox_id,
    )
    session = module.TensorlakeSandboxSession.from_state(
        session_state, sandbox=_FakeSandbox(sandbox_id=sandbox_id)
    )

    pty_supported = session.supports_pty()
    assert pty_supported is False
@pytest.mark.asyncio
async def test_exec_read_write_and_mkdir(monkeypatch: pytest.MonkeyPatch) -> None:
    """Write and read a file, create a directory and exec through the fake sandbox."""
    module = _load_tensorlake_module(monkeypatch)
    sandbox_id = "sandbox-rw"
    session_state = module.TensorlakeSandboxSessionState(
        session_id=uuid.UUID("00000000-0000-0000-0000-000000000002"),
        manifest=Manifest(),
        snapshot=NoopSnapshot(id="snap"),
        sandbox_id=sandbox_id,
    )
    fake = _FakeSandbox(sandbox_id=sandbox_id)
    session = module.TensorlakeSandboxSession.from_state(session_state, sandbox=fake)

    # File round trip.
    notes_path = Path("notes.txt")
    await session.write(notes_path, io.BytesIO(b"hello"))
    read_back = await session.read(notes_path)
    assert read_back.read() == b"hello"

    # mkdir is forwarded as ``mkdir -p -- <absolute path>``.
    await session.mkdir(Path("subdir"), parents=True)
    mkdir_calls = [call for call in fake.run_calls if call["command"] == "mkdir"]
    assert mkdir_calls and mkdir_calls[-1]["args"] == ["-p", "--", "/workspace/subdir"]

    # exec surfaces the canned command result as bytes.
    fake.next_run_result = _FakeCommandResult(stdout="hi\n", exit_code=0)
    exec_result = await session.exec("printf", "hi", shell=False)
    assert exec_result.ok()
    assert exec_result.stdout == b"hi\n"
@pytest.mark.asyncio
async def test_exposed_port_resolution_uses_sandbox_id(monkeypatch: pytest.MonkeyPatch) -> None:
    """Without a sandbox name, the endpoint host is built from the sandbox id."""
    module = _load_tensorlake_module(monkeypatch)
    sandbox_id = "sandbox-ports"
    port = 3000
    session_state = module.TensorlakeSandboxSessionState(
        session_id=uuid.UUID("00000000-0000-0000-0000-000000000004"),
        manifest=Manifest(),
        snapshot=NoopSnapshot(id="snap"),
        sandbox_id=sandbox_id,
        exposed_ports=(port,),
    )
    session = module.TensorlakeSandboxSession.from_state(
        session_state, sandbox=_FakeSandbox(sandbox_id=sandbox_id)
    )

    endpoint = await session.resolve_exposed_port(port)

    assert endpoint.host == "3000-sandbox-ports.sandbox.tensorlake.ai"
    assert endpoint.port == 443
    assert endpoint.tls is True
@pytest.mark.asyncio
async def test_shutdown_suspends_when_pause_on_exit(monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``pause_on_exit=True`` shutdown suspends the sandbox and does not terminate it."""
    module = _load_tensorlake_module(monkeypatch)
    sandbox_id = "sandbox-pause"
    session_state = module.TensorlakeSandboxSessionState(
        session_id=uuid.UUID("00000000-0000-0000-0000-000000000007"),
        manifest=Manifest(),
        snapshot=NoopSnapshot(id="snap"),
        sandbox_id=sandbox_id,
        pause_on_exit=True,
    )
    fake = _FakeSandbox(sandbox_id=sandbox_id)
    session = module.TensorlakeSandboxSession.from_state(session_state, sandbox=fake)

    await session.shutdown()

    assert fake.suspended is True
    assert fake.terminated is False
@pytest.mark.asyncio
async def test_hydrate_workspace_via_checkpoint_replaces_sandbox(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Hydrating from a snapshot reference swaps in a sandbox created from that snapshot."""
    module = _load_tensorlake_module(monkeypatch)

    # Register a snapshot by checkpointing a fake sandbox directly.
    source = _FakeSandbox(sandbox_id="sandbox-source")
    source.files["/workspace/from-snapshot.txt"] = b"snap-data"
    snapshot_info = source.checkpoint(checkpoint_type=_FakeCheckpointType.FILESYSTEM)

    original_id = "sandbox-pre-restore"
    state = module.TensorlakeSandboxSessionState(
        session_id=uuid.UUID("00000000-0000-0000-0000-00000000000b"),
        manifest=Manifest(),
        snapshot=NoopSnapshot(id="snap"),
        sandbox_id=original_id,
        workspace_persistence="snapshot",
    )
    pre_restore = _FakeSandbox(sandbox_id=original_id)
    session = module.TensorlakeSandboxSession.from_state(state, sandbox=pre_restore)

    snapshot_ref = module._encode_tensorlake_snapshot_ref(snapshot_id=snapshot_info.snapshot_id)
    await session.hydrate_workspace(io.BytesIO(snapshot_ref))

    # The old sandbox is gone and state points at the restored one.
    assert pre_restore.terminated is True
    assert state.sandbox_id != "sandbox-pre-restore"
    restored = _FakeSandbox.sandboxes[state.sandbox_id]
    assert restored.files["/workspace/from-snapshot.txt"] == b"snap-data"
session_id=uuid.UUID("00000000-0000-0000-0000-00000000000d"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-missing", + ) + + client = module.TensorlakeSandboxClient() + session = await client.resume(state) + + assert _FakeSandbox.connect_calls and _FakeSandbox.connect_calls[0]["sandbox_id"] == ( + "sandbox-missing" + ) + assert len(_FakeSandbox.create_calls) == 1 + new_id = session._inner.state.sandbox_id + assert new_id.startswith("tensorlake-sandbox-") + assert state.workspace_root_ready is False + + +def test_serialize_session_state_round_trips(monkeypatch: pytest.MonkeyPatch) -> None: + module = _load_tensorlake_module(monkeypatch) + + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-00000000000e"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-serialize", + image="custom", + cpus=4.0, + memory_mb=4096, + timeout_secs=120, + name="serialize", + allow_internet_access=False, + allow_out=("10.0.0.0/8",), + deny_out=("example.com",), + workspace_persistence="snapshot", + checkpoint_mode="memory", + ) + client = module.TensorlakeSandboxClient() + payload = client.serialize_session_state(state) + restored = client.deserialize_session_state(payload) + + assert isinstance(restored, module.TensorlakeSandboxSessionState) + assert restored.image == "custom" + assert restored.cpus == 4.0 + assert restored.workspace_persistence == "snapshot" + assert restored.checkpoint_mode == "memory" diff --git a/uv.lock b/uv.lock index 8c8f90f8ef..7117be8b29 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-25T00:26:21.546536088Z" +exclude-newer = "2026-04-28T20:23:13.622945Z" exclude-newer-span = "P7D" [[package]] @@ -732,8 +732,10 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "grpcio" }, - { name = "grpcio-status" }, - { name = "protobuf" }, + { name = 
"grpcio-status", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "grpcio-status", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, { name = "python-dateutil" }, { name = "typing-extensions" }, ] @@ -897,7 +899,8 @@ dependencies = [ { name = "httpcore" }, { name = "httpx" }, { name = "packaging" }, - { name = "protobuf" }, + { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, { name = "python-dateutil" }, { name = "rich" }, { name = "typing-extensions" }, @@ -1201,7 +1204,8 @@ name = "googleapis-common-protos" version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "protobuf" }, + { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/39/24/33db22342cf4a2ea27c9955e6713140fedd51e8b141b5ce5260897020f1a/googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257", size = 145903, upload-time = "2025-04-14T10:17:02.924Z" } wheels = [ @@ -1352,16 +1356,38 @@ wheels = [ name = "grpcio-status" version = "1.67.1" source = { registry = "https://pypi.org/simple" } 
+resolution-markers = [ + "python_full_version >= '3.14'", +] dependencies = [ - { name = "googleapis-common-protos" }, - { name = "grpcio" }, - { name = "protobuf" }, + { name = "googleapis-common-protos", marker = "python_full_version >= '3.14'" }, + { name = "grpcio", marker = "python_full_version >= '3.14'" }, + { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/be/c7/fe0e79a80ac6346e0c6c0a24e9e3cbc3ae1c2a009acffb59eab484a6f69b/grpcio_status-1.67.1.tar.gz", hash = "sha256:2bf38395e028ceeecfd8866b081f61628114b384da7d51ae064ddc8d766a5d11", size = 13673, upload-time = "2024-10-29T06:30:21.787Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/05/18/56999a1da3577d8ccc8698a575d6638e15fe25650cc88b2ce0a087f180b9/grpcio_status-1.67.1-py3-none-any.whl", hash = "sha256:16e6c085950bdacac97c779e6a502ea671232385e6e37f258884d6883392c2bd", size = 14427, upload-time = "2024-10-29T06:27:38.228Z" }, ] +[[package]] +name = "grpcio-status" +version = "1.76.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and python_full_version < '3.14'", + "python_full_version == '3.11.*'", + "python_full_version < '3.11'", +] +dependencies = [ + { name = "googleapis-common-protos", marker = "python_full_version < '3.14'" }, + { name = "grpcio", marker = "python_full_version < '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3f/46/e9f19d5be65e8423f886813a2a9d0056ba94757b0c5007aa59aed1a961fa/grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd", size = 13679, upload-time = "2025-10-21T16:28:52.545Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/8c/cc/27ba60ad5a5f2067963e6a858743500df408eb5855e98be778eaef8c9b02/grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18", size = 14425, upload-time = "2025-10-21T16:28:40.853Z" }, +] + [[package]] name = "grpclib" version = "0.4.9" @@ -1449,6 +1475,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +http2 = [ + { name = "h2", marker = "python_full_version < '3.14'" }, +] + [[package]] name = "httpx-sse" version = "0.4.1" @@ -1981,7 +2012,8 @@ dependencies = [ { name = "certifi" }, { name = "click" }, { name = "grpclib" }, - { name = "protobuf" }, + { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, { name = "rich" }, { name = "synchronicity" }, { name = "toml" }, @@ -2502,6 +2534,9 @@ temporal = [ { name = "temporalio" }, { name = "textual" }, ] +tensorlake = [ + { name = "tensorlake", marker = "python_full_version < '3.14'" }, +] vercel = [ { name = "vercel" }, ] @@ -2577,6 +2612,7 @@ requires-dist = [ { name = "runloop-api-client", marker = "extra == 'runloop'", specifier = ">=1.16.0,<2.0.0" }, { name = "sqlalchemy", marker = "extra == 'sqlalchemy'", specifier = ">=2.0" }, { name = "temporalio", marker = "extra == 'temporal'", specifier = "==1.26.0" }, + { name = "tensorlake", marker = "python_full_version < '3.14' and extra == 'tensorlake'", specifier = ">=0.5.4" }, { name = "textual", marker = "extra == 'temporal'", specifier = ">=8.2.3,<8.3" }, { name = 
"types-requests", specifier = ">=2.0,<3" }, { name = "typing-extensions", specifier = ">=4.12.2,<5" }, @@ -2585,7 +2621,7 @@ requires-dist = [ { name = "websockets", marker = "extra == 'realtime'", specifier = ">=15.0,<17" }, { name = "websockets", marker = "extra == 'voice'", specifier = ">=15.0,<17" }, ] -provides-extras = ["voice", "viz", "litellm", "any-llm", "realtime", "sqlalchemy", "encrypt", "redis", "dapr", "mongodb", "docker", "blaxel", "daytona", "cloudflare", "e2b", "modal", "runloop", "vercel", "s3", "temporal"] +provides-extras = ["voice", "viz", "litellm", "any-llm", "realtime", "sqlalchemy", "encrypt", "redis", "dapr", "mongodb", "docker", "blaxel", "daytona", "cloudflare", "e2b", "modal", "runloop", "tensorlake", "vercel", "s3", "temporal"] [package.metadata.requires-dev] dev = [ @@ -2713,7 +2749,8 @@ name = "opentelemetry-proto" version = "1.40.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "protobuf" }, + { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" } wheels = [ @@ -2912,6 +2949,9 @@ wheels = [ name = "protobuf" version = "5.29.5" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14'", +] sdist = { url = "https://files.pythonhosted.org/packages/43/29/d09e70352e4e88c9c7a198d5645d7277811448d76c23b00345670f7c8a38/protobuf-5.29.5.tar.gz", hash = "sha256:bc1463bafd4b0929216c35f437a8e28731a2b7fe3d98bb77a600efced5a15c84", size = 425226, upload-time = 
"2025-05-28T23:51:59.82Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/5f/11/6e40e9fc5bba02988a214c07cf324595789ca7820160bfd1f8be96e48539/protobuf-5.29.5-cp310-abi3-win32.whl", hash = "sha256:3f1c6468a2cfd102ff4703976138844f78ebd1fb45f49011afc5139e9e283079", size = 422963, upload-time = "2025-05-28T23:51:41.204Z" }, @@ -2922,6 +2962,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/cc/7e77861000a0691aeea8f4566e5d3aa716f2b1dece4a24439437e41d3d25/protobuf-5.29.5-py3-none-any.whl", hash = "sha256:6cf42630262c59b2d8de33954443d94b746c952b01434fc58a417fdbd2e84bd5", size = 172823, upload-time = "2025-05-28T23:51:58.157Z" }, ] +[[package]] +name = "protobuf" +version = "6.33.6" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and python_full_version < '3.14'", + "python_full_version == '3.11.*'", + "python_full_version < '3.11'", +] +sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" }, + { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" }, + { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" }, + { url = "https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" }, + { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" }, + { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" }, +] + [[package]] name = "pycparser" version = "2.22" @@ -3953,7 +4013,8 @@ version = "1.26.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "nexus-rpc" }, - { name = "protobuf" }, + { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.14'" }, { name = "python-dateutil", marker = "python_full_version < '3.11'" }, { name = "types-protobuf" }, { name = "typing-extensions" }, @@ -3967,6 +4028,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fd/9b/c50840a26af3587c0c8d9af04d9976743e22496996dc1a377efc75dcd316/temporalio-1.26.0-cp310-abi3-win_amd64.whl", hash = "sha256:1c4a0d82f0a3796cbf78864c799f8dca0b94cdaec68e7b8b224c859005686ec4", size = 14525849, upload-time = "2026-04-15T23:42:57.589Z" }, ] +[[package]] +name = "tensorlake" +version = "0.5.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio", marker = "python_full_version < '3.14'" }, + { name = "httpx", extra = ["http2"], marker = "python_full_version < '3.14'" }, + { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "pydantic", marker = "python_full_version < '3.14'" }, + { name = "websocket-client", marker = "python_full_version < '3.14'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ea/1f/e613ce553e1cf8cdf0747ff99f884b1da685ca1dce88536938b6e5585459/tensorlake-0.5.4.tar.gz", hash = "sha256:f6f175517f38ccde71095a0e993a8a43ef721baa96a342f3166c8dd44f96ecf3", size = 2269787, upload-time = "2026-04-28T19:48:58.853Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/1b/ecd1eecd7b51ef593082fbefeba9fe7829fca0a5f9790e427f6a6b43b9d5/tensorlake-0.5.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:14a13c088a42ec454911abddd9209e10df11a7f111304988852057fef26628ab", size = 13176447, upload-time = "2026-04-28T19:48:46.151Z" }, + { url = "https://files.pythonhosted.org/packages/83/00/d0988dea31606bf0bcd7370ed41fc7c6ecea48e6ec445981dfa7ab56d462/tensorlake-0.5.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0348ff2e140821dd8cb47a084788b2162ddf1e34a77423f72f8d89a5d2499777", size = 13685134, upload-time = 
"2026-04-28T19:48:50.266Z" }, + { url = "https://files.pythonhosted.org/packages/2c/1d/cbeaf665406df16a58bba416e778b1f6904fb786e7b9d1c47cb6b4adcad8/tensorlake-0.5.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f61d13b595083c05214fda1a2c5431a93a6ba4284c653d4b8cee1b9557eea5a8", size = 14145054, upload-time = "2026-04-28T19:48:53.181Z" }, + { url = "https://files.pythonhosted.org/packages/b3/a9/88147b4f01904439a100b22989d5b808ae471fbaf0142cce23d5bf14e452/tensorlake-0.5.4-py3-none-win_amd64.whl", hash = "sha256:57512077aaccc2d62f4758127f36a81809c26aa6f0646ae2b905da47e7f5e7df", size = 14714070, upload-time = "2026-04-28T19:48:56.051Z" }, +] + [[package]] name = "testcontainers" version = "4.12.0" @@ -4472,6 +4552,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/d8/0d1d2e9d3fabcf5d6840362adcf05f8cf3cd06a73358140c3a97189238ae/wcmatch-10.1-py3-none-any.whl", hash = "sha256:5848ace7dbb0476e5e55ab63c6bbd529745089343427caa5537f230cc01beb8a", size = 39854, upload-time = "2025-06-22T19:14:00.978Z" }, ] +[[package]] +name = "websocket-client" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/41/aa4bf9664e4cda14c3b39865b12251e8e7d239f4cd0e3cc1b6c2ccde25c1/websocket_client-1.9.0.tar.gz", hash = "sha256:9e813624b6eb619999a97dc7958469217c3176312b3a16a4bd1bc7e08a46ec98", size = 70576, upload-time = "2025-10-07T21:16:36.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/34/db/b10e48aa8fff7407e67470363eac595018441cf32d5e1001567a7aeba5d2/websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef", size = 82616, upload-time = "2025-10-07T21:16:34.951Z" }, +] + [[package]] name = "websockets" version = "15.0.1" From f91fabe99fe718ed862f79c8c5357493b0e8779e Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Tue, 5 May 2026 23:53:39 -0500 Subject: [PATCH 02/26] allow resource 
allocation --- docs/sandbox/clients.md | 2 +- src/agents/extensions/sandbox/tensorlake/sandbox.py | 11 +++++++++++ tests/extensions/sandbox/test_tensorlake.py | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/sandbox/clients.md b/docs/sandbox/clients.md index 5b5f0ac1b0..266ef4051d 100644 --- a/docs/sandbox/clients.md +++ b/docs/sandbox/clients.md @@ -114,7 +114,7 @@ Hosted sandbox clients expose provider-specific mount strategies. Choose the bac | `DaytonaSandboxClient` | Supports rclone-backed cloud storage mounts with `DaytonaCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | | `E2BSandboxClient` | Supports rclone-backed cloud storage mounts with `E2BCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | | `RunloopSandboxClient` | Supports rclone-backed cloud storage mounts with `RunloopCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | -| `TensorlakeSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. Tensorlake's native sandbox checkpoint API is available via `workspace_persistence="snapshot"`. | +| `TensorlakeSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. Tensorlake's native sandbox checkpoint API is available via `workspace_persistence="snapshot"`; prefer this over external bucket mounts for between-run persistence. | | `VercelSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. 
| diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 6cab3fe583..5b8359d5c2 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -142,6 +142,7 @@ class TensorlakeSandboxClientOptions(BaseSandboxClientOptions): checkpoint_mode: CheckpointMode = "filesystem" checkpoint_timeout_s: float = 300.0 timeouts: TensorlakeSandboxTimeouts | dict[str, object] | None = None + disk_mb: int | None = None def __init__( self, @@ -162,6 +163,7 @@ def __init__( checkpoint_mode: CheckpointMode = "filesystem", checkpoint_timeout_s: float = 300.0, timeouts: TensorlakeSandboxTimeouts | dict[str, object] | None = None, + disk_mb: int | None = None, *, type: Literal["tensorlake"] = "tensorlake", ) -> None: @@ -184,6 +186,7 @@ def __init__( checkpoint_mode=checkpoint_mode, checkpoint_timeout_s=checkpoint_timeout_s, timeouts=timeouts, + disk_mb=disk_mb, ) @@ -208,6 +211,7 @@ class TensorlakeSandboxSessionState(SandboxSessionState): checkpoint_mode: CheckpointMode = "filesystem" checkpoint_timeout_s: float = 300.0 timeouts: TensorlakeSandboxTimeouts = Field(default_factory=TensorlakeSandboxTimeouts) + disk_mb: int | None = None def _resolve_create_kwargs( @@ -215,6 +219,7 @@ def _resolve_create_kwargs( image: str | None, cpus: float | None, memory_mb: int | None, + disk_mb: int | None, timeout_secs: int | None, name: str | None, secret_names: tuple[str, ...], @@ -237,6 +242,8 @@ def _resolve_create_kwargs( kwargs["cpus"] = cpus if memory_mb is not None: kwargs["memory_mb"] = memory_mb + if disk_mb is not None: + kwargs["disk_mb"] = disk_mb if timeout_secs is not None: kwargs["timeout_secs"] = timeout_secs if name is not None: @@ -709,6 +716,7 @@ async def _restore_from_checkpoint(self, snapshot_id: str) -> None: image=self.state.image, cpus=self.state.cpus, memory_mb=self.state.memory_mb, + disk_mb=self.state.disk_mb, 
timeout_secs=self.state.timeout_secs, name=self.state.name, secret_names=self.state.secret_names, @@ -862,6 +870,7 @@ async def create( image=options.image, cpus=options.cpus, memory_mb=options.memory_mb, + disk_mb=options.disk_mb, timeout_secs=options.timeout_secs, name=options.name, secret_names=options.secret_names, @@ -889,6 +898,7 @@ async def create( image=options.image, cpus=options.cpus, memory_mb=options.memory_mb, + disk_mb=options.disk_mb, timeout_secs=options.timeout_secs, secret_names=options.secret_names, base_envs=base_envs, @@ -950,6 +960,7 @@ async def resume( image=state.image, cpus=state.cpus, memory_mb=state.memory_mb, + disk_mb=state.disk_mb, timeout_secs=state.timeout_secs, name=state.name, secret_names=state.secret_names, diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index dbd742292c..5af502f27c 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -316,6 +316,7 @@ async def test_create_passes_options_and_drops_unset_fields( image="my-image", cpus=2.0, memory_mb=2048, + disk_mb=20480, timeout_secs=600, name="demo", secret_names=("OPENAI_KEY",), @@ -332,6 +333,7 @@ async def test_create_passes_options_and_drops_unset_fields( "image": "my-image", "cpus": 2.0, "memory_mb": 2048, + "disk_mb": 20480, "timeout_secs": 600, "name": "demo", "secret_names": ["OPENAI_KEY"], @@ -628,6 +630,7 @@ def test_serialize_session_state_round_trips(monkeypatch: pytest.MonkeyPatch) -> image="custom", cpus=4.0, memory_mb=4096, + disk_mb=20480, timeout_secs=120, name="serialize", allow_internet_access=False, @@ -643,5 +646,6 @@ def test_serialize_session_state_round_trips(monkeypatch: pytest.MonkeyPatch) -> assert isinstance(restored, module.TensorlakeSandboxSessionState) assert restored.image == "custom" assert restored.cpus == 4.0 + assert restored.disk_mb == 20480 assert restored.workspace_persistence == "snapshot" assert restored.checkpoint_mode == 
"memory" From 7d2aa8d9e1d8a510c48f540bce200571399f0d2a Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Wed, 6 May 2026 00:03:02 -0500 Subject: [PATCH 03/26] remove upper bound --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b6c92b6a41..e428a731a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ cloudflare = ["aiohttp>=3.12,<4"] e2b = ["e2b==2.20.0", "e2b-code-interpreter==2.4.1"] modal = ["modal==1.3.5"] runloop = ["runloop_api_client>=1.16.0,<2.0.0"] -tensorlake = ["tensorlake>=0.5.4; python_version<'3.14'"] +tensorlake = ["tensorlake>=0.5.4"] vercel = ["vercel>=0.5.6,<0.6"] s3 = ["boto3>=1.34"] temporal = [ From 1447436a1f2ddb2df6242691645baaddf42e88fb Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Wed, 6 May 2026 00:19:34 -0500 Subject: [PATCH 04/26] fix traceId issue --- .../extensions/sandbox/tensorlake/sandbox.py | 32 ++++--- tests/extensions/sandbox/test_tensorlake.py | 19 +++- uv.lock | 89 ++++--------------- 3 files changed, 55 insertions(+), 85 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 5b8359d5c2..f1b0b27132 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -91,6 +91,23 @@ def _is_running_status(status: object) -> bool: return isinstance(value, str) and value.lower() == "running" +def _coerce_sandbox_payload_to_bytes(payload: Any) -> bytes: + """Coerce a Tensorlake `Sandbox.read_file` payload into raw bytes. + + The SDK wraps results in a `Traced[T]` object that carries the raw value on `.value` + and a W3C trace id on `.trace_id`; we detect it via `trace_id` so a `Traced[None]` + still unwraps correctly. Older or fake clients that already return `bytes`/`str` pass + through unchanged. 
+ """ + if hasattr(payload, "trace_id") and not isinstance(payload, bytes | bytearray | str): + payload = payload.value + if isinstance(payload, str): + return payload.encode("utf-8") + if isinstance(payload, bytes | bytearray): + return bytes(payload) + return str(payload).encode("utf-8", errors="replace") + + def _encode_tensorlake_snapshot_ref(*, snapshot_id: str) -> bytes: body = json.dumps({"snapshot_id": snapshot_id}, separators=(",", ":"), sort_keys=True).encode( "utf-8" @@ -464,13 +481,7 @@ async def read(self, path: Path, *, user: str | User | None = None) -> io.IOBase raise WorkspaceReadNotFoundError(path=normalized_path, cause=exc) from exc raise WorkspaceArchiveReadError(path=normalized_path, cause=exc) from exc - if isinstance(payload, str): - data = payload.encode("utf-8") - elif isinstance(payload, bytes | bytearray): - data = bytes(payload) - else: - data = str(payload).encode("utf-8", errors="replace") - return io.BytesIO(data) + return io.BytesIO(_coerce_sandbox_payload_to_bytes(payload)) async def write( self, @@ -658,12 +669,7 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: except Exception as exc: raise WorkspaceArchiveReadError(path=error_root, cause=exc) from exc - if isinstance(payload, str): - archive_bytes = payload.encode("utf-8") - elif isinstance(payload, bytes | bytearray): - archive_bytes = bytes(payload) - else: - archive_bytes = str(payload).encode("utf-8", errors="replace") + archive_bytes = _coerce_sandbox_payload_to_bytes(payload) try: await asyncio.to_thread( diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 5af502f27c..201e0add49 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -52,6 +52,21 @@ def __init__(self, status_code: int, message: str = "") -> None: self.message = message +class _FakeTraced: + """Mimics the Tensorlake SDK `Traced[T]` wrapper returned by `read_file`.""" + + def 
__init__(self, value: Any) -> None: + self.trace_id = "trace-fake" + self._value = value + + @property + def value(self) -> Any: + return self._value + + def __getattr__(self, name: str) -> Any: + return getattr(object.__getattribute__(self, "_value"), name) + + class _FakeSandbox: """Synchronous fake mirroring the Tensorlake `Sandbox` surface used by the integration.""" @@ -208,10 +223,10 @@ def run( _ = cwd return _FakeCommandResult() - def read_file(self, path: str) -> bytes: + def read_file(self, path: str) -> _FakeTraced: if path not in self.files: raise _FakeRemoteAPIError(404, f"file not found: {path}") - return self.files[path] + return _FakeTraced(self.files[path]) def write_file(self, path: str, content: bytes) -> None: self.files[path] = bytes(content) diff --git a/uv.lock b/uv.lock index 7117be8b29..18c3a6cb6d 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-28T20:23:13.622945Z" +exclude-newer = "2026-04-29T05:04:56.465347Z" exclude-newer-span = "P7D" [[package]] @@ -732,10 +732,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "grpcio" }, - { name = "grpcio-status", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, - { name = "grpcio-status", version = "1.76.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, - { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, - { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "grpcio-status" }, + { name = "protobuf" }, { name = "python-dateutil" }, { name = "typing-extensions" }, ] @@ -899,8 +897,7 @@ dependencies = [ { name = "httpcore" }, { name = "httpx" }, { name = "packaging" }, - { name = "protobuf", version = 
"5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, - { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "protobuf" }, { name = "python-dateutil" }, { name = "rich" }, { name = "typing-extensions" }, @@ -1204,8 +1201,7 @@ name = "googleapis-common-protos" version = "1.70.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, - { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/39/24/33db22342cf4a2ea27c9955e6713140fedd51e8b141b5ce5260897020f1a/googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257", size = 145903, upload-time = "2025-04-14T10:17:02.924Z" } wheels = [ @@ -1352,36 +1348,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" }, ] -[[package]] -name = "grpcio-status" -version = "1.67.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", -] -dependencies = [ - { name = "googleapis-common-protos", marker = "python_full_version >= '3.14'" }, - { name = "grpcio", marker = "python_full_version >= '3.14'" }, - { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/be/c7/fe0e79a80ac6346e0c6c0a24e9e3cbc3ae1c2a009acffb59eab484a6f69b/grpcio_status-1.67.1.tar.gz", hash = "sha256:2bf38395e028ceeecfd8866b081f61628114b384da7d51ae064ddc8d766a5d11", size = 13673, upload-time = "2024-10-29T06:30:21.787Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/05/18/56999a1da3577d8ccc8698a575d6638e15fe25650cc88b2ce0a087f180b9/grpcio_status-1.67.1-py3-none-any.whl", hash = "sha256:16e6c085950bdacac97c779e6a502ea671232385e6e37f258884d6883392c2bd", size = 14427, upload-time = "2024-10-29T06:27:38.228Z" }, -] - [[package]] name = "grpcio-status" version = "1.76.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and python_full_version < '3.14'", - "python_full_version == '3.11.*'", - "python_full_version < '3.11'", -] dependencies = [ - { name = "googleapis-common-protos", marker = "python_full_version < '3.14'" }, - { name = "grpcio", marker = "python_full_version < '3.14'" }, - { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3f/46/e9f19d5be65e8423f886813a2a9d0056ba94757b0c5007aa59aed1a961fa/grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd", size = 13679, upload-time = "2025-10-21T16:28:52.545Z" } wheels = [ @@ -1477,7 +1451,7 @@ wheels = [ [package.optional-dependencies] http2 = [ - { name = "h2", marker = "python_full_version < '3.14'" }, + { name = "h2" }, ] [[package]] @@ -2012,8 +1986,7 @@ dependencies = [ { name = "certifi" }, { name = "click" }, { name = "grpclib" }, - { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, - { name = "protobuf", version = 
"6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "protobuf" }, { name = "rich" }, { name = "synchronicity" }, { name = "toml" }, @@ -2535,7 +2508,7 @@ temporal = [ { name = "textual" }, ] tensorlake = [ - { name = "tensorlake", marker = "python_full_version < '3.14'" }, + { name = "tensorlake" }, ] vercel = [ { name = "vercel" }, @@ -2612,7 +2585,7 @@ requires-dist = [ { name = "runloop-api-client", marker = "extra == 'runloop'", specifier = ">=1.16.0,<2.0.0" }, { name = "sqlalchemy", marker = "extra == 'sqlalchemy'", specifier = ">=2.0" }, { name = "temporalio", marker = "extra == 'temporal'", specifier = "==1.26.0" }, - { name = "tensorlake", marker = "python_full_version < '3.14' and extra == 'tensorlake'", specifier = ">=0.5.4" }, + { name = "tensorlake", marker = "extra == 'tensorlake'", specifier = ">=0.5.4" }, { name = "textual", marker = "extra == 'temporal'", specifier = ">=8.2.3,<8.3" }, { name = "types-requests", specifier = ">=2.0,<3" }, { name = "typing-extensions", specifier = ">=4.12.2,<5" }, @@ -2749,8 +2722,7 @@ name = "opentelemetry-proto" version = "1.40.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, - { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" } wheels = [ @@ -2945,32 +2917,10 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] -[[package]] -name = "protobuf" -version = "5.29.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", -] -sdist = { url = "https://files.pythonhosted.org/packages/43/29/d09e70352e4e88c9c7a198d5645d7277811448d76c23b00345670f7c8a38/protobuf-5.29.5.tar.gz", hash = "sha256:bc1463bafd4b0929216c35f437a8e28731a2b7fe3d98bb77a600efced5a15c84", size = 425226, upload-time = "2025-05-28T23:51:59.82Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/11/6e40e9fc5bba02988a214c07cf324595789ca7820160bfd1f8be96e48539/protobuf-5.29.5-cp310-abi3-win32.whl", hash = "sha256:3f1c6468a2cfd102ff4703976138844f78ebd1fb45f49011afc5139e9e283079", size = 422963, upload-time = "2025-05-28T23:51:41.204Z" }, - { url = "https://files.pythonhosted.org/packages/81/7f/73cefb093e1a2a7c3ffd839e6f9fcafb7a427d300c7f8aef9c64405d8ac6/protobuf-5.29.5-cp310-abi3-win_amd64.whl", hash = "sha256:3f76e3a3675b4a4d867b52e4a5f5b78a2ef9565549d4037e06cf7b0942b1d3fc", size = 434818, upload-time = "2025-05-28T23:51:44.297Z" }, - { url = "https://files.pythonhosted.org/packages/dd/73/10e1661c21f139f2c6ad9b23040ff36fee624310dc28fba20d33fdae124c/protobuf-5.29.5-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e38c5add5a311f2a6eb0340716ef9b039c1dfa428b28f25a7838ac329204a671", size = 418091, upload-time = "2025-05-28T23:51:45.907Z" }, - { url = "https://files.pythonhosted.org/packages/6c/04/98f6f8cf5b07ab1294c13f34b4e69b3722bb609c5b701d6c169828f9f8aa/protobuf-5.29.5-cp38-abi3-manylinux2014_aarch64.whl", hash = "sha256:fa18533a299d7ab6c55a238bf8629311439995f2e7eca5caaff08663606e9015", size = 319824, upload-time = "2025-05-28T23:51:47.545Z" }, - { url = 
"https://files.pythonhosted.org/packages/85/e4/07c80521879c2d15f321465ac24c70efe2381378c00bf5e56a0f4fbac8cd/protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl", hash = "sha256:63848923da3325e1bf7e9003d680ce6e14b07e55d0473253a690c3a8b8fd6e61", size = 319942, upload-time = "2025-05-28T23:51:49.11Z" }, - { url = "https://files.pythonhosted.org/packages/7e/cc/7e77861000a0691aeea8f4566e5d3aa716f2b1dece4a24439437e41d3d25/protobuf-5.29.5-py3-none-any.whl", hash = "sha256:6cf42630262c59b2d8de33954443d94b746c952b01434fc58a417fdbd2e84bd5", size = 172823, upload-time = "2025-05-28T23:51:58.157Z" }, -] - [[package]] name = "protobuf" version = "6.33.6" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and python_full_version < '3.14'", - "python_full_version == '3.11.*'", - "python_full_version < '3.11'", -] sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" }, @@ -4013,8 +3963,7 @@ version = "1.26.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "nexus-rpc" }, - { name = "protobuf", version = "5.29.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, - { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, + { name = "protobuf" }, { name = "python-dateutil", marker = "python_full_version < '3.11'" }, { name = "types-protobuf" }, { name 
= "typing-extensions" }, @@ -4033,11 +3982,11 @@ name = "tensorlake" version = "0.5.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "grpcio", marker = "python_full_version < '3.14'" }, - { name = "httpx", extra = ["http2"], marker = "python_full_version < '3.14'" }, - { name = "protobuf", version = "6.33.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, - { name = "pydantic", marker = "python_full_version < '3.14'" }, - { name = "websocket-client", marker = "python_full_version < '3.14'" }, + { name = "grpcio" }, + { name = "httpx", extra = ["http2"] }, + { name = "protobuf" }, + { name = "pydantic" }, + { name = "websocket-client" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ea/1f/e613ce553e1cf8cdf0747ff99f884b1da685ca1dce88536938b6e5585459/tensorlake-0.5.4.tar.gz", hash = "sha256:f6f175517f38ccde71095a0e993a8a43ef721baa96a342f3166c8dd44f96ecf3", size = 2269787, upload-time = "2026-04-28T19:48:58.853Z" } wheels = [ From e640273b25e38ef30ebbd5bcc040e8e3b1f960ca Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Wed, 6 May 2026 00:28:01 -0500 Subject: [PATCH 05/26] update --- .../extensions/sandbox/tensorlake/sandbox.py | 18 ++++------ tests/extensions/sandbox/test_tensorlake.py | 35 +++++++++++++++++++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index f1b0b27132..04fb2dd8ed 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -945,19 +945,15 @@ async def resume( try: sandbox = await asyncio.to_thread(Sandbox.connect, state.sandbox_id) if state.pause_on_exit: - # Paused sandboxes are valid reconnect targets only when pause-on-exit was - # configured; turn them back on before use. 
- try: - await asyncio.to_thread(sandbox.resume) - except Exception: - pass + # `connect` returns a handle even for a paused/expired sandbox; `resume` is + # what actually transitions it to running. Failures must fall through so the + # outer handler recreates rather than marking a dead backend as preserved. + await asyncio.to_thread(sandbox.resume) + status = await asyncio.to_thread(getattr, sandbox, "status") + if _is_running_status(status): reconnected = True else: - status = await asyncio.to_thread(getattr, sandbox, "status") - if _is_running_status(status): - reconnected = True - else: - sandbox = None + sandbox = None except Exception: sandbox = None diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 201e0add49..aae149a09f 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -95,6 +95,7 @@ def __init__( self.terminated = False self.suspended = False self.resumed = False + self.resume_failure: BaseException | None = None self.next_run_result: _FakeCommandResult | None = None self.symlinks: dict[str, str] = {} @@ -244,6 +245,8 @@ def suspend( def resume(self, wait: bool = True, timeout: float = 300.0, poll_interval: float = 1.0) -> None: _ = (wait, timeout, poll_interval) + if self.resume_failure is not None: + raise self.resume_failure self.resumed = True self.status = "running" @@ -634,6 +637,38 @@ async def test_resume_creates_fresh_when_reconnect_fails( assert state.workspace_root_ready is False +@pytest.mark.asyncio +async def test_resume_creates_fresh_when_paused_resume_fails( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A failed `resume()` must not be reported as a preserved running sandbox.""" + + module = _load_tensorlake_module(monkeypatch) + + existing = _FakeSandbox(sandbox_id="sandbox-paused", status="suspended") + existing.resume_failure = RuntimeError("sandbox expired") + _FakeSandbox.sandboxes["sandbox-paused"] = existing + + 
state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-00000000000f"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-paused", + pause_on_exit=True, + ) + + client = module.TensorlakeSandboxClient() + session = await client.resume(state) + + assert len(_FakeSandbox.create_calls) == 1 + new_id = session._inner.state.sandbox_id + assert new_id != "sandbox-paused" + assert new_id.startswith("tensorlake-sandbox-") + assert state.workspace_root_ready is False + assert session._inner._workspace_state_preserved_on_start() is False + assert session._inner._system_state_preserved_on_start() is False + + def test_serialize_session_state_round_trips(monkeypatch: pytest.MonkeyPatch) -> None: module = _load_tensorlake_module(monkeypatch) From b3048c70cf450ffaa22035fa39cbe03855b3603c Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Wed, 6 May 2026 21:08:33 -0500 Subject: [PATCH 06/26] switch to native async api --- .../extensions/sandbox/tensorlake/sandbox.py | 172 ++++++++++-------- tests/extensions/sandbox/test_tensorlake.py | 172 ++++++++++++++++-- 2 files changed, 250 insertions(+), 94 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 04fb2dd8ed..ec8aee3273 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -4,8 +4,7 @@ Set `TENSORLAKE_API_KEY` (or run `tl login`) to authenticate. This module provides a Tensorlake-backed sandbox client/session implementation backed by -`tensorlake.sandbox.Sandbox`. The Tensorlake Python SDK is synchronous, so blocking calls -are dispatched through `asyncio.to_thread` to integrate with the async session contract. +`tensorlake.sandbox.AsyncSandbox`. Note: The `tensorlake` dependency is optional (installed via the `tensorlake` extra), so imports of the SDK happen lazily within this module. 
Users without the extra can still @@ -29,7 +28,6 @@ ExecNonZeroError, ExecTimeoutError, ExecTransportError, - ExposedPortUnavailableError, WorkspaceArchiveReadError, WorkspaceArchiveWriteError, WorkspaceReadNotFoundError, @@ -62,19 +60,23 @@ _DEFAULT_EXPOSED_PORT_HOST_TEMPLATE = "{port}-{sandbox}.sandbox.tensorlake.ai" +# Hostnames that indicate a local proxy where port-prefixed subdomain routing does not apply +# (the SDK uses a `Host` header instead). +_LOOPBACK_HOSTS = frozenset({"localhost", "127.0.0.1", "::1"}) + def _import_tensorlake_sandbox() -> tuple[Any, Any, Any, Any]: """Lazily import the Tensorlake SDK symbols this integration needs.""" try: from tensorlake.sandbox import ( + AsyncSandbox, CheckpointType, RemoteAPIError, - Sandbox, SandboxStatus, ) - return Sandbox, CheckpointType, SandboxStatus, RemoteAPIError + return AsyncSandbox, CheckpointType, SandboxStatus, RemoteAPIError except ImportError as exc: # pragma: no cover - exercised via unit tests with fakes raise ImportError( "TensorlakeSandboxClient requires the optional `tensorlake` dependency.\n" @@ -92,7 +94,7 @@ def _is_running_status(status: object) -> bool: def _coerce_sandbox_payload_to_bytes(payload: Any) -> bytes: - """Coerce a Tensorlake `Sandbox.read_file` payload into raw bytes. + """Coerce a Tensorlake `AsyncSandbox.read_file` payload into raw bytes. The SDK wraps results in a `Traced[T]` object that carries the raw value on `.value` and a W3C trace id on `.trace_id`; we detect it via `trace_id` so a `Traced[None]` @@ -245,7 +247,7 @@ def _resolve_create_kwargs( deny_out: tuple[str, ...], snapshot_id: str | None, ) -> dict[str, object]: - """Build the kwargs accepted by `Sandbox.create(...)`. + """Build the kwargs accepted by `AsyncSandbox.create(...)`. Only includes optional fields when they are set so the SDK can apply its own defaults. 
Tensorlake does not accept environment variables at sandbox-create time; envs are passed @@ -288,6 +290,9 @@ class TensorlakeSandboxSession(BaseSandboxSession): state: TensorlakeSandboxSessionState _sandbox: Any _envs_cache: dict[str, str] | None + _cached_proxy_hostname: str | None + _proxy_hostname_resolved: bool + _backend_terminated: bool def __init__( self, @@ -298,6 +303,9 @@ def __init__( self.state = state self._sandbox = sandbox self._envs_cache = None + self._cached_proxy_hostname = None + self._proxy_hostname_resolved = False + self._backend_terminated = False @classmethod def from_state( @@ -344,8 +352,7 @@ async def _prepare_backend_workspace(self) -> None: root = sandbox_path_str(self.state.manifest.root) try: envs = await self._resolved_envs() - result = await asyncio.to_thread( - self._sandbox.run, + result = await self._sandbox.run( "mkdir", ["-p", "--", root], env=envs or None, @@ -371,9 +378,10 @@ async def _shutdown_backend(self) -> None: return try: if self.state.pause_on_exit: - await asyncio.to_thread(sandbox.suspend) + await sandbox.suspend() else: - await asyncio.to_thread(sandbox.terminate) + await sandbox.terminate() + self._backend_terminated = True except Exception as exc: if self.state.pause_on_exit: logger.warning( @@ -382,7 +390,8 @@ async def _shutdown_backend(self) -> None: exc_info=exc, ) try: - await asyncio.to_thread(sandbox.terminate) + await sandbox.terminate() + self._backend_terminated = True except Exception as term_exc: logger.warning( "Failed to terminate Tensorlake sandbox after suspend fallback failure.", @@ -401,7 +410,7 @@ async def running(self) -> bool: if sandbox is None: return False try: - status = await asyncio.to_thread(getattr, sandbox, "status") + status = await sandbox.status() except Exception: return False return _is_running_status(status) @@ -420,10 +429,10 @@ async def _exec_internal( exec_timeout = self._coerce_exec_timeout(timeout) try: - # Rely on the SDK's own `timeout`; an outer `asyncio.wait_for` 
would only cancel - # the awaiter, not the underlying thread, so it would not free the runtime. - result = await asyncio.to_thread( - self._sandbox.run, + # Rely on the SDK's own `timeout` so the backend tears down the running + # process; an outer `asyncio.wait_for` only cancels the local awaiter and + # would leave the sandbox-side command running until the next tick. + result = await self._sandbox.run( normalized[0], normalized[1:], env=envs or None, @@ -449,20 +458,33 @@ async def _exec_internal( ) async def _resolve_exposed_port(self, port: int) -> ExposedPortEndpoint: - host = _DEFAULT_EXPOSED_PORT_HOST_TEMPLATE.format( - port=port, sandbox=self.state.name or self.state.sandbox_id - ) - # The Tensorlake SDK does not currently expose a discovery method for inbound URLs, - # so we rely on the documented host template. - parsed = urlsplit(f"https://{host}") - if not parsed.hostname: - raise ExposedPortUnavailableError( - port=port, - exposed_ports=self.state.exposed_ports, - reason="backend_unavailable", - context={"backend": "tensorlake", "host": host}, + # Prefer the backend's per-sandbox URL so non-default `TENSORLAKE_SANDBOX_PROXY_URL` + # deployments (e.g. tensorlake.dev) resolve correctly; fall back to the public template. 
+ proxy_hostname = await self._get_proxy_hostname() + if proxy_hostname: + host = f"{port}-{proxy_hostname}" + else: + host = _DEFAULT_EXPOSED_PORT_HOST_TEMPLATE.format( + port=port, sandbox=self.state.name or self.state.sandbox_id ) - return ExposedPortEndpoint(host=parsed.hostname, port=443, tls=True) + return ExposedPortEndpoint(host=host, port=443, tls=True) + + async def _get_proxy_hostname(self) -> str | None: + if self._proxy_hostname_resolved: + return self._cached_proxy_hostname + try: + info = await self._sandbox.info() + except Exception: + info = None + sandbox_url = getattr(info, "sandbox_url", None) if info is not None else None + hostname: str | None = None + if isinstance(sandbox_url, str) and sandbox_url: + parsed = urlsplit(sandbox_url).hostname + if parsed and parsed not in _LOOPBACK_HOSTS: + hostname = parsed + self._cached_proxy_hostname = hostname + self._proxy_hostname_resolved = True + return hostname async def read(self, path: Path, *, user: str | User | None = None) -> io.IOBase: if user is not None: @@ -471,9 +493,7 @@ async def read(self, path: Path, *, user: str | User | None = None) -> io.IOBase normalized_path = await self._validate_path_access(path) try: - payload = await asyncio.to_thread( - self._sandbox.read_file, sandbox_path_str(normalized_path) - ) + payload = await self._sandbox.read_file(sandbox_path_str(normalized_path)) except FileNotFoundError as exc: raise WorkspaceReadNotFoundError(path=normalized_path, cause=exc) from exc except Exception as exc: @@ -502,9 +522,7 @@ async def write( raise WorkspaceWriteTypeError(path=normalized_path, actual_type=type(payload).__name__) try: - await asyncio.to_thread( - self._sandbox.write_file, sandbox_path_str(normalized_path), bytes(payload) - ) + await self._sandbox.write_file(sandbox_path_str(normalized_path), bytes(payload)) except Exception as exc: raise WorkspaceArchiveWriteError(path=normalized_path, cause=exc) from exc @@ -527,8 +545,7 @@ async def mkdir( argv = [a for a in 
[flag, "--", sandbox_path_str(path)] if a] try: envs = await self._resolved_envs() - result = await asyncio.to_thread( - self._sandbox.run, + result = await self._sandbox.run( "mkdir", argv, env=envs or None, @@ -593,8 +610,7 @@ async def _persist_workspace_via_checkpoint(self) -> io.IOBase: try: snapshot = await asyncio.wait_for( - asyncio.to_thread( - self._sandbox.checkpoint, + self._sandbox.checkpoint( checkpoint_type=checkpoint_type, timeout=int(self.state.checkpoint_timeout_s), ), @@ -635,8 +651,7 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: try: envs = await self._resolved_envs() - result = await asyncio.to_thread( - self._sandbox.run, + result = await self._sandbox.run( "tar", tar_argv, env=envs or None, @@ -665,21 +680,13 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: ) try: - payload = await asyncio.to_thread(self._sandbox.read_file, archive_path) + payload = await self._sandbox.read_file(archive_path) except Exception as exc: raise WorkspaceArchiveReadError(path=error_root, cause=exc) from exc archive_bytes = _coerce_sandbox_payload_to_bytes(payload) - try: - await asyncio.to_thread( - self._sandbox.run, - "rm", - ["-f", "--", archive_path], - timeout=int(self.state.timeouts.cleanup_s), - ) - except Exception: - pass + await self._remove_tmp_archive(archive_path) return io.BytesIO(archive_bytes) @@ -711,10 +718,10 @@ async def _hydrate_workspace_internal(self, raw: bytes) -> None: async def _restore_from_checkpoint(self, snapshot_id: str) -> None: root = self._workspace_root_path() error_root = posix_path_for_error(root) - Sandbox, _, _, _ = _import_tensorlake_sandbox() + AsyncSandbox, _, _, _ = _import_tensorlake_sandbox() try: - await asyncio.to_thread(self._sandbox.terminate) + await self._sandbox.terminate() except Exception: pass @@ -733,7 +740,7 @@ async def _restore_from_checkpoint(self, snapshot_id: str) -> None: ) try: - sandbox = await asyncio.to_thread(Sandbox.create, **kwargs) + sandbox = await 
AsyncSandbox.create(**kwargs) except Exception as exc: raise WorkspaceArchiveWriteError( path=error_root, @@ -772,10 +779,9 @@ async def _hydrate_workspace_via_tar(self, raw: bytes) -> None: try: await self._prepare_backend_workspace() - await asyncio.to_thread(self._sandbox.write_file, archive_path, raw) + await self._sandbox.write_file(archive_path, raw) envs = await self._resolved_envs() - result = await asyncio.to_thread( - self._sandbox.run, + result = await self._sandbox.run( "tar", ["xf", archive_path, "-C", root.as_posix()], env=envs or None, @@ -786,15 +792,7 @@ async def _hydrate_workspace_via_tar(self, raw: bytes) -> None: except Exception as exc: raise WorkspaceArchiveWriteError(path=error_root, cause=exc) from exc finally: - try: - await asyncio.to_thread( - self._sandbox.run, - "rm", - ["-f", "--", archive_path], - timeout=int(self.state.timeouts.cleanup_s), - ) - except Exception: - pass + await self._remove_tmp_archive(archive_path) exit_code = int(getattr(result, "exit_code", 0) or 0) if exit_code != 0: @@ -808,13 +806,23 @@ async def _hydrate_workspace_via_tar(self, raw: bytes) -> None: ) self.state.workspace_root_ready = True + async def _remove_tmp_archive(self, archive_path: str) -> None: + """Best-effort cleanup of a `/tmp` tar archive used for workspace persistence.""" + try: + await self._sandbox.run( + "rm", + ["-f", "--", archive_path], + timeout=int(self.state.timeouts.cleanup_s), + ) + except Exception: + pass + async def _maybe_apply_exposed_ports(self) -> None: ports = list(self.state.exposed_ports) if not ports: return try: - await asyncio.to_thread( - self._sandbox.update, + await self._sandbox.update( exposed_ports=ports, allow_unauthenticated_access=self.state.allow_unauthenticated_port_access, ) @@ -868,7 +876,7 @@ async def create( f"{_WORKSPACE_PERSISTENCE_TAR!r} or {_WORKSPACE_PERSISTENCE_SNAPSHOT!r}" ) - Sandbox, _, _, _ = _import_tensorlake_sandbox() + AsyncSandbox, _, _, _ = _import_tensorlake_sandbox() base_envs = 
dict(options.envs or {}) @@ -886,11 +894,11 @@ async def create( snapshot_id=None, ) - sandbox = await asyncio.to_thread(Sandbox.create, **kwargs) + sandbox = await AsyncSandbox.create(**kwargs) sandbox_id = getattr(sandbox, "sandbox_id", None) if not isinstance(sandbox_id, str) or not sandbox_id: raise RuntimeError( - "Tensorlake `Sandbox.create` did not return a sandbox with a `sandbox_id`." + "Tensorlake `AsyncSandbox.create` did not return a sandbox with a `sandbox_id`." ) session_id = uuid.uuid4() @@ -927,6 +935,16 @@ async def delete(self, session: SandboxSession) -> SandboxSession: inner = session._inner if not isinstance(inner, TensorlakeSandboxSession): raise TypeError("TensorlakeSandboxClient.delete expects a TensorlakeSandboxSession") + # `delete` runs after `shutdown()` in the manager; only terminate when shutdown didn't + # already (e.g. `pause_on_exit=True` suspended instead) so we don't double-call the + # backend, while still freeing remote resources on direct `client.delete(...)` use. + if inner._backend_terminated or inner._sandbox is None: + return session + try: + await inner._sandbox.terminate() + inner._backend_terminated = True + except Exception: + pass return session async def resume( @@ -938,18 +956,18 @@ async def resume( "TensorlakeSandboxClient.resume expects a TensorlakeSandboxSessionState" ) - Sandbox, _, _, _ = _import_tensorlake_sandbox() + AsyncSandbox, _, _, _ = _import_tensorlake_sandbox() sandbox: Any = None reconnected = False try: - sandbox = await asyncio.to_thread(Sandbox.connect, state.sandbox_id) + sandbox = await AsyncSandbox.connect(state.sandbox_id) if state.pause_on_exit: # `connect` returns a handle even for a paused/expired sandbox; `resume` is # what actually transitions it to running. Failures must fall through so the # outer handler recreates rather than marking a dead backend as preserved. 
- await asyncio.to_thread(sandbox.resume) - status = await asyncio.to_thread(getattr, sandbox, "status") + await sandbox.resume() + status = await sandbox.status() if _is_running_status(status): reconnected = True else: @@ -971,7 +989,7 @@ async def resume( deny_out=state.deny_out, snapshot_id=None, ) - sandbox = await asyncio.to_thread(Sandbox.create, **kwargs) + sandbox = await AsyncSandbox.create(**kwargs) new_id = getattr(sandbox, "sandbox_id", None) if isinstance(new_id, str) and new_id: state.sandbox_id = new_id diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index aae149a09f..f9726f932f 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -52,6 +52,11 @@ def __init__(self, status_code: int, message: str = "") -> None: self.message = message +class _FakeSandboxInfo: + def __init__(self, *, sandbox_url: str | None = None) -> None: + self.sandbox_url = sandbox_url + + class _FakeTraced: """Mimics the Tensorlake SDK `Traced[T]` wrapper returned by `read_file`.""" @@ -68,7 +73,7 @@ def __getattr__(self, name: str) -> Any: class _FakeSandbox: - """Synchronous fake mirroring the Tensorlake `Sandbox` surface used by the integration.""" + """Async fake mirroring the Tensorlake `AsyncSandbox` surface used by the integration.""" create_calls: list[dict[str, object]] = [] connect_calls: list[dict[str, object]] = [] @@ -85,19 +90,23 @@ def __init__( name: str | None = None, status: str = "running", files: dict[str, bytes] | None = None, + sandbox_url: str | None = None, ) -> None: self.sandbox_id = sandbox_id self.name = name - self.status = status + self._status = _FakeSandboxStatus._Member(status) self.files: dict[str, bytes] = dict(files or {}) self.run_calls: list[dict[str, object]] = [] self.update_calls: list[dict[str, object]] = [] self.terminated = False + self.terminate_count = 0 self.suspended = False self.resumed = False self.resume_failure: 
BaseException | None = None self.next_run_result: _FakeCommandResult | None = None self.symlinks: dict[str, str] = {} + self.sandbox_url = sandbox_url + self.info_calls = 0 @classmethod def reset(cls) -> None: @@ -110,7 +119,7 @@ def reset(cls) -> None: cls.connect_failures = {} @classmethod - def create(cls, **kwargs: object) -> _FakeSandbox: + async def create(cls, **kwargs: object) -> _FakeSandbox: cls.create_calls.append(dict(kwargs)) if cls.create_failures: raise cls.create_failures.pop(0) @@ -129,7 +138,7 @@ def create(cls, **kwargs: object) -> _FakeSandbox: return sandbox @classmethod - def connect(cls, sandbox_id: str, **kwargs: object) -> _FakeSandbox: + async def connect(cls, sandbox_id: str, **kwargs: object) -> _FakeSandbox: cls.connect_calls.append({"sandbox_id": sandbox_id, **kwargs}) if sandbox_id in cls.connect_failures: raise cls.connect_failures[sandbox_id] @@ -138,7 +147,14 @@ def connect(cls, sandbox_id: str, **kwargs: object) -> _FakeSandbox: raise RuntimeError(f"sandbox {sandbox_id} not found") return sandbox - def run( + async def status(self) -> Any: + return self._status + + async def info(self) -> _FakeSandboxInfo: + self.info_calls += 1 + return _FakeSandboxInfo(sandbox_url=self.sandbox_url) + + async def run( self, command: str, args: list[str] | None = None, @@ -224,33 +240,36 @@ def run( _ = cwd return _FakeCommandResult() - def read_file(self, path: str) -> _FakeTraced: + async def read_file(self, path: str) -> _FakeTraced: if path not in self.files: raise _FakeRemoteAPIError(404, f"file not found: {path}") return _FakeTraced(self.files[path]) - def write_file(self, path: str, content: bytes) -> None: + async def write_file(self, path: str, content: bytes) -> None: self.files[path] = bytes(content) - def terminate(self) -> None: + async def terminate(self) -> None: self.terminated = True - self.status = "terminated" + self.terminate_count += 1 + self._status = _FakeSandboxStatus.TERMINATED - def suspend( + async def suspend( self, 
wait: bool = True, timeout: float = 300.0, poll_interval: float = 1.0 ) -> None: _ = (wait, timeout, poll_interval) self.suspended = True - self.status = "suspended" + self._status = _FakeSandboxStatus.SUSPENDED - def resume(self, wait: bool = True, timeout: float = 300.0, poll_interval: float = 1.0) -> None: + async def resume( + self, wait: bool = True, timeout: float = 300.0, poll_interval: float = 1.0 + ) -> None: _ = (wait, timeout, poll_interval) if self.resume_failure is not None: raise self.resume_failure self.resumed = True - self.status = "running" + self._status = _FakeSandboxStatus.RUNNING - def update( + async def update( self, name: str | None = None, *, @@ -265,7 +284,7 @@ def update( } ) - def checkpoint( + async def checkpoint( self, wait: bool = True, timeout: float = 300.0, @@ -288,7 +307,7 @@ def _load_tensorlake_module(monkeypatch: pytest.MonkeyPatch) -> Any: fake_pkg = types.ModuleType("tensorlake") fake_sandbox_pkg = cast(Any, types.ModuleType("tensorlake.sandbox")) - fake_sandbox_pkg.Sandbox = _FakeSandbox + fake_sandbox_pkg.AsyncSandbox = _FakeSandbox fake_sandbox_pkg.CheckpointType = _FakeCheckpointType fake_sandbox_pkg.SandboxStatus = _FakeSandboxStatus fake_sandbox_pkg.RemoteAPIError = _FakeRemoteAPIError @@ -467,6 +486,125 @@ async def test_exposed_port_resolution_uses_named_sandbox_when_set( assert endpoint.host == "8080-demo.sandbox.tensorlake.ai" +@pytest.mark.asyncio +async def test_exposed_port_resolution_uses_backend_sandbox_url( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = _load_tensorlake_module(monkeypatch) + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-00000000000a"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-dev", + name="dev-env", + exposed_ports=(8080,), + ) + fake = _FakeSandbox( + sandbox_id="sandbox-dev", + name="dev-env", + sandbox_url="https://dev-env.sandbox.tensorlake.dev", + ) + session = 
module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + endpoint = await session.resolve_exposed_port(8080) + assert endpoint.host == "8080-dev-env.sandbox.tensorlake.dev" + + +@pytest.mark.asyncio +async def test_exposed_port_resolution_caches_proxy_hostname( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = _load_tensorlake_module(monkeypatch) + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-00000000000c"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-cache", + exposed_ports=(8080, 9090), + ) + fake = _FakeSandbox( + sandbox_id="sandbox-cache", + sandbox_url="https://sandbox-cache.sandbox.tensorlake.ai", + ) + session = module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + await session.resolve_exposed_port(8080) + await session.resolve_exposed_port(9090) + + assert fake.info_calls == 1 + + +@pytest.mark.asyncio +async def test_delete_terminates_remote_sandbox(monkeypatch: pytest.MonkeyPatch) -> None: + module = _load_tensorlake_module(monkeypatch) + client = module.TensorlakeSandboxClient() + session = await client.create( + manifest=Manifest(), + options=module.TensorlakeSandboxClientOptions(), + ) + fake = session._inner._sandbox + assert fake.terminated is False + + await client.delete(session) + + assert fake.terminate_count == 1 + + +@pytest.mark.asyncio +async def test_delete_terminates_even_when_pause_on_exit( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = _load_tensorlake_module(monkeypatch) + client = module.TensorlakeSandboxClient() + session = await client.create( + manifest=Manifest(), + options=module.TensorlakeSandboxClientOptions(pause_on_exit=True), + ) + fake = session._inner._sandbox + + await client.delete(session) + + assert fake.terminate_count == 1 + assert fake.suspended is False + + +@pytest.mark.asyncio +async def test_shutdown_then_delete_does_not_double_terminate( + monkeypatch: pytest.MonkeyPatch, +) -> None: 
+ module = _load_tensorlake_module(monkeypatch) + client = module.TensorlakeSandboxClient() + session = await client.create( + manifest=Manifest(), + options=module.TensorlakeSandboxClientOptions(), + ) + fake = session._inner._sandbox + + await session.shutdown() + await client.delete(session) + + assert fake.terminate_count == 1 + + +@pytest.mark.asyncio +async def test_shutdown_pause_then_delete_terminates_once( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = _load_tensorlake_module(monkeypatch) + client = module.TensorlakeSandboxClient() + session = await client.create( + manifest=Manifest(), + options=module.TensorlakeSandboxClientOptions(pause_on_exit=True), + ) + fake = session._inner._sandbox + + await session.shutdown() + await client.delete(session) + + assert fake.suspended is True + assert fake.terminate_count == 1 + + @pytest.mark.asyncio async def test_shutdown_terminates_by_default(monkeypatch: pytest.MonkeyPatch) -> None: module = _load_tensorlake_module(monkeypatch) @@ -569,7 +707,7 @@ async def test_hydrate_workspace_via_checkpoint_replaces_sandbox( # First, take a checkpoint via the tar-style helper so it is registered. 
initial = _FakeSandbox(sandbox_id="sandbox-source") initial.files["/workspace/from-snapshot.txt"] = b"snap-data" - snap = initial.checkpoint(checkpoint_type=_FakeCheckpointType.FILESYSTEM) + snap = await initial.checkpoint(checkpoint_type=_FakeCheckpointType.FILESYSTEM) state = module.TensorlakeSandboxSessionState( session_id=uuid.UUID("00000000-0000-0000-0000-00000000000b"), From 7bf5192c53c5eab3b514c287c19032aa9562973e Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Wed, 6 May 2026 21:27:32 -0500 Subject: [PATCH 07/26] update sdk to 0.5.8 and above --- pyproject.toml | 2 +- tests/extensions/sandbox/test_tensorlake.py | 24 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e428a731a9..d80da27d31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ cloudflare = ["aiohttp>=3.12,<4"] e2b = ["e2b==2.20.0", "e2b-code-interpreter==2.4.1"] modal = ["modal==1.3.5"] runloop = ["runloop_api_client>=1.16.0,<2.0.0"] -tensorlake = ["tensorlake>=0.5.4"] +tensorlake = ["tensorlake>=0.5.8"] vercel = ["vercel>=0.5.6,<0.6"] s3 = ["boto3>=1.34"] temporal = [ diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index f9726f932f..df4ba51bc2 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -673,6 +673,30 @@ async def test_persist_workspace_via_tar_round_trips_manifest( assert restored.read() == b"payload" +@pytest.mark.asyncio +async def test_persist_workspace_via_tar_nonzero_raises_archive_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = _load_tensorlake_module(monkeypatch) + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-000000000010"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-tar-failure", + ) + fake = _FakeSandbox(sandbox_id="sandbox-tar-failure") + session = 
module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + from agents.sandbox.errors import WorkspaceArchiveReadError + + fake.next_run_result = _FakeCommandResult(stderr="tar failed", exit_code=2) + + with pytest.raises(WorkspaceArchiveReadError) as exc_info: + await session.persist_workspace() + + assert "tar failed" in str(exc_info.value.__cause__) + + @pytest.mark.asyncio async def test_persist_workspace_via_checkpoint_returns_snapshot_ref( monkeypatch: pytest.MonkeyPatch, From e361e1f7261d7a9847993f25a671fa689842ad3e Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Thu, 14 May 2026 10:03:04 -0500 Subject: [PATCH 08/26] fix: prevent sub-second exec timeouts truncating to 0 and tighten tar symlink validation Use math.ceil with a minimum of 1 second when forwarding exec timeouts to the Tensorlake SDK so values like 0.5 or 0.001 no longer become 0. Also pass allow_external_symlink_targets=False when validating hydrated tar archives to reject external symlink targets. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agents/extensions/sandbox/tensorlake/sandbox.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index ec8aee3273..3b9dd4ad18 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -17,6 +17,7 @@ import io import json import logging +import math import uuid from pathlib import Path from typing import Any, Literal, cast @@ -437,7 +438,7 @@ async def _exec_internal( normalized[1:], env=envs or None, working_dir=cwd, - timeout=int(exec_timeout), + timeout=max(1, math.ceil(exec_timeout)), ) except Exception as exc: if "timeout" in type(exc).__name__.lower() or "timed out" in str(exc).lower(): @@ -763,7 +764,7 @@ async def _hydrate_workspace_via_tar(self, raw: bytes) -> None: error_root = posix_path_for_error(root) try: - validate_tar_bytes(raw) + 
validate_tar_bytes(raw, allow_external_symlink_targets=False) except UnsafeTarMemberError as exc: raise WorkspaceArchiveWriteError( path=error_root, From b716d5ba91ee87dd60e46b90bf78901989c08362 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 16:59:35 -0500 Subject: [PATCH 09/26] add fast path to resume a sandbox --- .../extensions/sandbox/tensorlake/sandbox.py | 43 +++++++++++++++++-- tests/extensions/sandbox/test_tensorlake.py | 43 ++++++++++++++++++- 2 files changed, 82 insertions(+), 4 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 3b9dd4ad18..ee46a3a3e9 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -19,6 +19,7 @@ import logging import math import uuid +from contextlib import suppress from pathlib import Path from typing import Any, Literal, cast from urllib.parse import urlsplit @@ -130,6 +131,34 @@ def _decode_tensorlake_snapshot_ref(raw: bytes) -> str | None: return snapshot_id if isinstance(snapshot_id, str) and snapshot_id else None +async def _restore_tensorlake_snapshot_reference_id(snapshot: SnapshotBase) -> str | None: + """Best-effort extraction of the Tensorlake snapshot id from a persisted snapshot. + + Returns ``None`` when the persisted payload is not a Tensorlake checkpoint reference + or the snapshot store cannot be reached. `client.resume()` runs before session + dependencies are wired, so e.g. `RemoteSnapshot` would raise; callers fall back to + the slower `hydrate_workspace` path in those cases. 
+ """ + + try: + if not await snapshot.restorable(): + return None + restored = await snapshot.restore() + try: + raw = restored.read() + finally: + with suppress(Exception): + restored.close() + except Exception: + return None + + if isinstance(raw, str): + raw = raw.encode("utf-8") + if not isinstance(raw, bytes | bytearray): + return None + return _decode_tensorlake_snapshot_ref(bytes(raw)) + + class TensorlakeSandboxTimeouts(BaseModel): """Timeout configuration for Tensorlake operations.""" @@ -976,7 +1005,14 @@ async def resume( except Exception: sandbox = None + recreate_snapshot_id: str | None = None if sandbox is None: + if state.workspace_persistence == _WORKSPACE_PERSISTENCE_SNAPSHOT: + # Skip the throwaway empty sandbox that `hydrate_workspace` would otherwise + # terminate and replace from the same snapshot. + recreate_snapshot_id = await _restore_tensorlake_snapshot_reference_id( + state.snapshot + ) kwargs = _resolve_create_kwargs( image=state.image, cpus=state.cpus, @@ -988,16 +1024,17 @@ async def resume( allow_internet_access=state.allow_internet_access, allow_out=state.allow_out, deny_out=state.deny_out, - snapshot_id=None, + snapshot_id=recreate_snapshot_id, ) sandbox = await AsyncSandbox.create(**kwargs) new_id = getattr(sandbox, "sandbox_id", None) if isinstance(new_id, str) and new_id: state.sandbox_id = new_id - state.workspace_root_ready = False + state.workspace_root_ready = recreate_snapshot_id is not None inner = TensorlakeSandboxSession.from_state(state, sandbox=sandbox) - inner._set_start_state_preserved(reconnected, system=reconnected) + preserved = reconnected or recreate_snapshot_id is not None + inner._set_start_state_preserved(preserved, system=preserved) if not reconnected: await inner._maybe_apply_exposed_ports() return self._wrap_session(inner, instrumentation=self._instrumentation) diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index df4ba51bc2..1e1b00c1ce 100644 --- 
a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -13,7 +13,7 @@ from agents.sandbox import Manifest from agents.sandbox.entries import File -from agents.sandbox.snapshot import NoopSnapshot +from agents.sandbox.snapshot import LocalSnapshot, NoopSnapshot from tests._fake_workspace_paths import resolve_fake_workspace_path @@ -831,6 +831,47 @@ async def test_resume_creates_fresh_when_paused_resume_fails( assert session._inner._system_state_preserved_on_start() is False +@pytest.mark.asyncio +async def test_resume_recreates_directly_from_snapshot_when_reconnect_fails( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + """Snapshot-mode resume must skip the throwaway empty-sandbox create when reconnect fails.""" + + module = _load_tensorlake_module(monkeypatch) + + snapshot = LocalSnapshot(id="snap", base_path=tmp_path) + payload = module._encode_tensorlake_snapshot_ref(snapshot_id="snap-stored") + await snapshot.persist(io.BytesIO(payload)) + _FakeSandbox.snapshots["snap-stored"] = {"/workspace/from-snapshot.txt": b"snap-data"} + + existing = _FakeSandbox(sandbox_id="sandbox-paused", status="suspended") + existing.resume_failure = RuntimeError("sandbox expired") + _FakeSandbox.sandboxes["sandbox-paused"] = existing + + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-000000000010"), + manifest=Manifest(), + snapshot=snapshot, + sandbox_id="sandbox-paused", + pause_on_exit=True, + workspace_persistence="snapshot", + ) + + client = module.TensorlakeSandboxClient() + session = await client.resume(state) + + assert len(_FakeSandbox.create_calls) == 1 + assert _FakeSandbox.create_calls[0].get("snapshot_id") == "snap-stored" + new_id = session._inner.state.sandbox_id + assert new_id != "sandbox-paused" + new_sandbox = _FakeSandbox.sandboxes[new_id] + assert new_sandbox.files["/workspace/from-snapshot.txt"] == b"snap-data" + assert state.workspace_root_ready is True 
+ assert session._inner._workspace_state_preserved_on_start() is True + assert session._inner._system_state_preserved_on_start() is True + + def test_serialize_session_state_round_trips(monkeypatch: pytest.MonkeyPatch) -> None: module = _load_tensorlake_module(monkeypatch) From a2799b4feedaee57d5810a9aaa5de05a981b77f5 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 22:03:16 -0500 Subject: [PATCH 10/26] fix termination --- src/agents/extensions/sandbox/tensorlake/sandbox.py | 3 +++ tests/extensions/sandbox/test_tensorlake.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index ee46a3a3e9..281a3b2ad3 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -782,6 +782,9 @@ async def _restore_from_checkpoint(self, snapshot_id: str) -> None: ) from exc self._sandbox = sandbox + # `_backend_terminated` tracks the current `self._sandbox` handle; rebinding must clear it + # so `delete()` does not short-circuit on a live sandbox. + self._backend_terminated = False new_id = getattr(sandbox, "sandbox_id", None) if isinstance(new_id, str) and new_id: self.state.sandbox_id = new_id diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 1e1b00c1ce..3f05634d36 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -750,6 +750,12 @@ async def test_hydrate_workspace_via_checkpoint_replaces_sandbox( assert state.sandbox_id != "sandbox-pre-restore" new_sandbox = _FakeSandbox.sandboxes[state.sandbox_id] assert new_sandbox.files["/workspace/from-snapshot.txt"] == b"snap-data" + assert session._backend_terminated is False + # Regression: `delete()` must still terminate the live post-restore sandbox. 
+ client = module.TensorlakeSandboxClient() + wrapped = client._wrap_session(session, instrumentation=None) + await client.delete(wrapped) + assert new_sandbox.terminated is True @pytest.mark.asyncio From fcc943ec9a7614b46a773a18c1038b6e1a8b98c2 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 22:20:30 -0500 Subject: [PATCH 11/26] fix --- .../extensions/sandbox/tensorlake/sandbox.py | 14 +++++--- tests/extensions/sandbox/test_tensorlake.py | 33 +++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 281a3b2ad3..89fc7aae38 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -1001,12 +1001,16 @@ async def resume( # outer handler recreates rather than marking a dead backend as preserved. await sandbox.resume() status = await sandbox.status() - if _is_running_status(status): - reconnected = True - else: - sandbox = None + if not _is_running_status(status): + raise RuntimeError("tensorlake sandbox is not running") + reconnected = True except Exception: - sandbox = None + # `AsyncSandbox.connect` opens a proxy rust client; close the abandoned handle + # so that resource is released instead of leaked when we fall through to recreate. 
+ if sandbox is not None: + with suppress(Exception): + await sandbox.close() + sandbox = None recreate_snapshot_id: str | None = None if sandbox is None: diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 3f05634d36..5529cd44df 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -102,6 +102,8 @@ def __init__( self.terminate_count = 0 self.suspended = False self.resumed = False + self.closed = False + self.close_count = 0 self.resume_failure: BaseException | None = None self.next_run_result: _FakeCommandResult | None = None self.symlinks: dict[str, str] = {} @@ -253,6 +255,10 @@ async def terminate(self) -> None: self.terminate_count += 1 self._status = _FakeSandboxStatus.TERMINATED + async def close(self) -> None: + self.closed = True + self.close_count += 1 + async def suspend( self, wait: bool = True, timeout: float = 300.0, poll_interval: float = 1.0 ) -> None: @@ -835,6 +841,33 @@ async def test_resume_creates_fresh_when_paused_resume_fails( assert state.workspace_root_ready is False assert session._inner._workspace_state_preserved_on_start() is False assert session._inner._system_state_preserved_on_start() is False + # Regression: the abandoned connect-handle must be closed so the proxy rust client + # is released instead of leaked. 
+ assert existing.close_count == 1 + + +@pytest.mark.asyncio +async def test_resume_closes_abandoned_handle_when_status_not_running( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = _load_tensorlake_module(monkeypatch) + + existing = _FakeSandbox(sandbox_id="sandbox-dead", status="terminated") + _FakeSandbox.sandboxes["sandbox-dead"] = existing + + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-000000000011"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-dead", + ) + + client = module.TensorlakeSandboxClient() + session = await client.resume(state) + + assert len(_FakeSandbox.create_calls) == 1 + assert existing.close_count == 1 + assert session._inner._workspace_state_preserved_on_start() is False @pytest.mark.asyncio From 9e942286199854bcb9167d05f68ff50bf312025e Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 22:51:24 -0500 Subject: [PATCH 12/26] use terminate() --- pyproject.toml | 2 +- .../extensions/sandbox/tensorlake/sandbox.py | 99 ++++++++++++++++++- tests/extensions/sandbox/test_tensorlake.py | 89 ++++++++++++++++- 3 files changed, 180 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2b81209222..1e6f6e2b7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ cloudflare = ["aiohttp>=3.12,<4"] e2b = ["e2b==2.20.0", "e2b-code-interpreter==2.4.1"] modal = ["modal==1.3.5"] runloop = ["runloop_api_client>=1.16.0,<2.0.0"] -tensorlake = ["tensorlake>=0.5.8"] +tensorlake = ["tensorlake>=0.5.14"] vercel = ["vercel>=0.5.6,<0.6"] s3 = ["boto3>=1.34"] temporal = [ diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 89fc7aae38..9ff3eda364 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -192,6 +192,12 @@ class TensorlakeSandboxClientOptions(BaseSandboxClientOptions): 
checkpoint_timeout_s: float = 300.0 timeouts: TensorlakeSandboxTimeouts | dict[str, object] | None = None disk_mb: int | None = None + pool_id: str | None = None + entrypoint: tuple[str, ...] = () + startup_timeout: float | None = None + proxy_url: str | None = None + api_url: str | None = None + namespace: str | None = None def __init__( self, @@ -213,6 +219,12 @@ def __init__( checkpoint_timeout_s: float = 300.0, timeouts: TensorlakeSandboxTimeouts | dict[str, object] | None = None, disk_mb: int | None = None, + pool_id: str | None = None, + entrypoint: tuple[str, ...] = (), + startup_timeout: float | None = None, + proxy_url: str | None = None, + api_url: str | None = None, + namespace: str | None = None, *, type: Literal["tensorlake"] = "tensorlake", ) -> None: @@ -236,6 +248,12 @@ def __init__( checkpoint_timeout_s=checkpoint_timeout_s, timeouts=timeouts, disk_mb=disk_mb, + pool_id=pool_id, + entrypoint=entrypoint, + startup_timeout=startup_timeout, + proxy_url=proxy_url, + api_url=api_url, + namespace=namespace, ) @@ -261,6 +279,12 @@ class TensorlakeSandboxSessionState(SandboxSessionState): checkpoint_timeout_s: float = 300.0 timeouts: TensorlakeSandboxTimeouts = Field(default_factory=TensorlakeSandboxTimeouts) disk_mb: int | None = None + pool_id: str | None = None + entrypoint: tuple[str, ...] = () + startup_timeout: float | None = None + proxy_url: str | None = None + api_url: str | None = None + namespace: str | None = None def _resolve_create_kwargs( @@ -276,6 +300,12 @@ def _resolve_create_kwargs( allow_out: tuple[str, ...], deny_out: tuple[str, ...], snapshot_id: str | None, + pool_id: str | None, + entrypoint: tuple[str, ...], + startup_timeout: float | None, + proxy_url: str | None, + api_url: str | None, + namespace: str | None, ) -> dict[str, object]: """Build the kwargs accepted by `AsyncSandbox.create(...)`. 
@@ -305,6 +335,36 @@ def _resolve_create_kwargs( kwargs["deny_out"] = list(deny_out) if snapshot_id is not None: kwargs["snapshot_id"] = snapshot_id + if pool_id is not None: + kwargs["pool_id"] = pool_id + if entrypoint: + kwargs["entrypoint"] = list(entrypoint) + if startup_timeout is not None: + kwargs["startup_timeout"] = startup_timeout + if proxy_url is not None: + kwargs["proxy_url"] = proxy_url + if api_url is not None: + kwargs["api_url"] = api_url + if namespace is not None: + kwargs["namespace"] = namespace + return kwargs + + +def _resolve_connect_kwargs( + *, + proxy_url: str | None, + api_url: str | None, + namespace: str | None, +) -> dict[str, object]: + """Build the kwargs accepted by `AsyncSandbox.connect(sandbox_id, ...)`.""" + + kwargs: dict[str, object] = {} + if proxy_url is not None: + kwargs["proxy_url"] = proxy_url + if api_url is not None: + kwargs["api_url"] = api_url + if namespace is not None: + kwargs["namespace"] = namespace return kwargs @@ -767,6 +827,12 @@ async def _restore_from_checkpoint(self, snapshot_id: str) -> None: allow_out=self.state.allow_out, deny_out=self.state.deny_out, snapshot_id=snapshot_id, + pool_id=self.state.pool_id, + entrypoint=self.state.entrypoint, + startup_timeout=self.state.startup_timeout, + proxy_url=self.state.proxy_url, + api_url=self.state.api_url, + namespace=self.state.namespace, ) try: @@ -925,6 +991,12 @@ async def create( allow_out=options.allow_out, deny_out=options.deny_out, snapshot_id=None, + pool_id=options.pool_id, + entrypoint=options.entrypoint, + startup_timeout=options.startup_timeout, + proxy_url=options.proxy_url, + api_url=options.api_url, + namespace=options.namespace, ) sandbox = await AsyncSandbox.create(**kwargs) @@ -959,6 +1031,12 @@ async def create( checkpoint_timeout_s=options.checkpoint_timeout_s, timeouts=timeouts, exposed_ports=options.exposed_ports, + pool_id=options.pool_id, + entrypoint=options.entrypoint, + startup_timeout=options.startup_timeout, + 
proxy_url=options.proxy_url, + api_url=options.api_url, + namespace=options.namespace, ) inner = TensorlakeSandboxSession.from_state(state, sandbox=sandbox) await inner._maybe_apply_exposed_ports() @@ -991,10 +1069,16 @@ async def resume( AsyncSandbox, _, _, _ = _import_tensorlake_sandbox() + connect_kwargs = _resolve_connect_kwargs( + proxy_url=state.proxy_url, + api_url=state.api_url, + namespace=state.namespace, + ) + sandbox: Any = None reconnected = False try: - sandbox = await AsyncSandbox.connect(state.sandbox_id) + sandbox = await AsyncSandbox.connect(state.sandbox_id, **connect_kwargs) if state.pause_on_exit: # `connect` returns a handle even for a paused/expired sandbox; `resume` is # what actually transitions it to running. Failures must fall through so the @@ -1005,11 +1089,12 @@ async def resume( raise RuntimeError("tensorlake sandbox is not running") reconnected = True except Exception: - # `AsyncSandbox.connect` opens a proxy rust client; close the abandoned handle - # so that resource is released instead of leaked when we fall through to recreate. + # We're about to recreate from scratch, so terminate the abandoned remote sandbox + # instead of just closing the local handle — otherwise it keeps running on the + # backend until its own timeout expires. 
if sandbox is not None: with suppress(Exception): - await sandbox.close() + await sandbox.terminate() sandbox = None recreate_snapshot_id: str | None = None @@ -1032,6 +1117,12 @@ async def resume( allow_out=state.allow_out, deny_out=state.deny_out, snapshot_id=recreate_snapshot_id, + pool_id=state.pool_id, + entrypoint=state.entrypoint, + startup_timeout=state.startup_timeout, + proxy_url=state.proxy_url, + api_url=state.api_url, + namespace=state.namespace, ) sandbox = await AsyncSandbox.create(**kwargs) new_id = getattr(sandbox, "sandbox_id", None) diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 5529cd44df..79fd45766e 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -255,7 +255,7 @@ async def terminate(self) -> None: self.terminate_count += 1 self._status = _FakeSandboxStatus.TERMINATED - async def close(self) -> None: + def close(self) -> None: self.closed = True self.close_count += 1 @@ -397,6 +397,85 @@ async def test_create_passes_options_and_drops_unset_fields( ] +@pytest.mark.asyncio +async def test_create_passes_pool_and_routing_options(monkeypatch: pytest.MonkeyPatch) -> None: + module = _load_tensorlake_module(monkeypatch) + client = module.TensorlakeSandboxClient() + session = await client.create( + manifest=Manifest(), + options=module.TensorlakeSandboxClientOptions( + pool_id="pool-warm", + entrypoint=("python", "-m", "app"), + startup_timeout=90.0, + proxy_url="https://proxy.tensorlake.dev", + api_url="https://api.tensorlake.dev", + namespace="tenant-a", + ), + ) + + assert _FakeSandbox.create_calls == [ + { + "allow_internet_access": True, + "pool_id": "pool-warm", + "entrypoint": ["python", "-m", "app"], + "startup_timeout": 90.0, + "proxy_url": "https://proxy.tensorlake.dev", + "api_url": "https://api.tensorlake.dev", + "namespace": "tenant-a", + } + ] + state = session._inner.state + assert state.pool_id == "pool-warm" + 
assert state.entrypoint == ("python", "-m", "app") + assert state.startup_timeout == 90.0 + assert state.proxy_url == "https://proxy.tensorlake.dev" + assert state.api_url == "https://api.tensorlake.dev" + assert state.namespace == "tenant-a" + + +@pytest.mark.asyncio +async def test_resume_forwards_routing_to_connect_and_recreate( + monkeypatch: pytest.MonkeyPatch, +) -> None: + module = _load_tensorlake_module(monkeypatch) + + existing = _FakeSandbox(sandbox_id="sandbox-dead", status="terminated") + _FakeSandbox.sandboxes["sandbox-dead"] = existing + + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-000000000020"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-dead", + pool_id="pool-warm", + entrypoint=("python", "-m", "app"), + startup_timeout=90.0, + proxy_url="https://proxy.tensorlake.dev", + api_url="https://api.tensorlake.dev", + namespace="tenant-a", + ) + + client = module.TensorlakeSandboxClient() + await client.resume(state) + + assert _FakeSandbox.connect_calls == [ + { + "sandbox_id": "sandbox-dead", + "proxy_url": "https://proxy.tensorlake.dev", + "api_url": "https://api.tensorlake.dev", + "namespace": "tenant-a", + } + ] + assert len(_FakeSandbox.create_calls) == 1 + create_kwargs = _FakeSandbox.create_calls[0] + assert create_kwargs["pool_id"] == "pool-warm" + assert create_kwargs["entrypoint"] == ["python", "-m", "app"] + assert create_kwargs["startup_timeout"] == 90.0 + assert create_kwargs["proxy_url"] == "https://proxy.tensorlake.dev" + assert create_kwargs["api_url"] == "https://api.tensorlake.dev" + assert create_kwargs["namespace"] == "tenant-a" + + @pytest.mark.asyncio async def test_create_omits_optional_kwargs_when_unset(monkeypatch: pytest.MonkeyPatch) -> None: module = _load_tensorlake_module(monkeypatch) @@ -841,9 +920,9 @@ async def test_resume_creates_fresh_when_paused_resume_fails( assert state.workspace_root_ready is False assert 
session._inner._workspace_state_preserved_on_start() is False assert session._inner._system_state_preserved_on_start() is False - # Regression: the abandoned connect-handle must be closed so the proxy rust client - # is released instead of leaked. - assert existing.close_count == 1 + # Regression: the abandoned remote sandbox must be terminated so it doesn't keep + # running on the backend until its timeout expires. + assert existing.terminate_count == 1 @pytest.mark.asyncio @@ -866,7 +945,7 @@ async def test_resume_closes_abandoned_handle_when_status_not_running( session = await client.resume(state) assert len(_FakeSandbox.create_calls) == 1 - assert existing.close_count == 1 + assert existing.terminate_count == 1 assert session._inner._workspace_state_preserved_on_start() is False From fd81269fd6d445fba7b0624d904fc4f5b8022923 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 22:58:11 -0500 Subject: [PATCH 13/26] fix --- src/agents/extensions/sandbox/tensorlake/sandbox.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 9ff3eda364..61c48e9441 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -333,9 +333,19 @@ def _resolve_create_kwargs( kwargs["allow_out"] = list(allow_out) if deny_out: kwargs["deny_out"] = list(deny_out) + # `AsyncSandbox.create` treats `pool_id` and `snapshot_id` as mutually exclusive: + # when both are set it claims from the pool and silently ignores `snapshot_id`. + # Snapshot restore must win here — falling back to a fresh pool sandbox would + # silently discard the persisted workspace state. 
if snapshot_id is not None: kwargs["snapshot_id"] = snapshot_id - if pool_id is not None: + if pool_id is not None: + logger.warning( + "Ignoring pool_id because snapshot_id is set; snapshot restore takes " + "precedence over pool claim.", + extra={"pool_id": pool_id, "snapshot_id": snapshot_id}, + ) + elif pool_id is not None: kwargs["pool_id"] = pool_id if entrypoint: kwargs["entrypoint"] = list(entrypoint) From 202342ddd4c2bf7321e651b7989c3a69cb171082 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 23:03:46 -0500 Subject: [PATCH 14/26] fix --- .../extensions/sandbox/tensorlake/sandbox.py | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 61c48e9441..b59a285c30 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -757,14 +757,9 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: env=envs or None, timeout=int(self.state.timeouts.snapshot_tar_s), ) - except Exception as exc: - raise WorkspaceArchiveReadError(path=error_root, cause=exc) from exc - - exit_code = int(getattr(result, "exit_code", 0) or 0) - if exit_code != 0: - raise WorkspaceArchiveReadError( - path=error_root, - cause=ExecNonZeroError( + exit_code = int(getattr(result, "exit_code", 0) or 0) + if exit_code != 0: + raise ExecNonZeroError( ExecResult( stdout=str(getattr(result, "stdout", "") or "").encode( "utf-8", errors="replace" @@ -776,17 +771,13 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: ), command=("tar", *tar_argv), context={"backend": "tensorlake", "sandbox_id": self.state.sandbox_id}, - ), - ) - - try: + ) payload = await self._sandbox.read_file(archive_path) + archive_bytes = _coerce_sandbox_payload_to_bytes(payload) except Exception as exc: raise WorkspaceArchiveReadError(path=error_root, cause=exc) from exc - - archive_bytes = 
_coerce_sandbox_payload_to_bytes(payload) - - await self._remove_tmp_archive(archive_path) + finally: + await self._remove_tmp_archive(archive_path) return io.BytesIO(archive_bytes) From e928d480129214a04871cd56e86c6021652dff03 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 23:07:57 -0500 Subject: [PATCH 15/26] fix --- tests/extensions/sandbox/test_tensorlake.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 79fd45766e..a233744998 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -102,8 +102,6 @@ def __init__( self.terminate_count = 0 self.suspended = False self.resumed = False - self.closed = False - self.close_count = 0 self.resume_failure: BaseException | None = None self.next_run_result: _FakeCommandResult | None = None self.symlinks: dict[str, str] = {} @@ -255,10 +253,6 @@ async def terminate(self) -> None: self.terminate_count += 1 self._status = _FakeSandboxStatus.TERMINATED - def close(self) -> None: - self.closed = True - self.close_count += 1 - async def suspend( self, wait: bool = True, timeout: float = 300.0, poll_interval: float = 1.0 ) -> None: From 9b6ec66bce77d04e27b878d50ba73b81263c10e9 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 23:25:18 -0500 Subject: [PATCH 16/26] add file-ops apis --- .../extensions/sandbox/tensorlake/sandbox.py | 31 ++++++++++++--- tests/extensions/sandbox/test_tensorlake.py | 38 +++++++++++++++---- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index b59a285c30..0ce314a6e6 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -167,7 +167,6 @@ class TensorlakeSandboxTimeouts(BaseModel): fast_op_s: float = Field(default=30, ge=1) 
file_upload_s: float = Field(default=300, ge=1) snapshot_tar_s: float = Field(default=300, ge=1) - cleanup_s: float = Field(default=30, ge=1) class TensorlakeSandboxClientOptions(BaseSandboxClientOptions): @@ -515,10 +514,24 @@ async def running(self) -> bool: return False return _is_running_status(status) + async def exec( + self, + *command: str | Path, + timeout: float | None = None, + shell: bool | list[str] = True, + user: str | User | None = None, + ) -> ExecResult: + # `AsyncSandbox.run` accepts `user=` natively, so skip the base class's + # `sudo -u --` wrap — sudo is frequently absent from minimal Tensorlake + # images and would fail the entire user-aware path (access checks + ops). + sanitized_command = self._prepare_exec_command(*command, shell=shell, user=None) + return await self._exec_internal(*sanitized_command, timeout=timeout, user=user) + async def _exec_internal( self, *command: str | Path, timeout: float | None = None, + user: str | User | None = None, ) -> ExecResult: normalized = [str(part) for part in command] if not normalized: @@ -528,6 +541,12 @@ async def _exec_internal( cwd = sandbox_path_str(self.state.manifest.root) exec_timeout = self._coerce_exec_timeout(timeout) + # `user` is conditionally injected so we don't override the SDK's default + # ("tl-user") with None. Keep the always-present kwargs visible on the call. 
+ extra: dict[str, Any] = {} + if user is not None: + extra["user"] = user.name if isinstance(user, User) else user + try: # Rely on the SDK's own `timeout` so the backend tears down the running # process; an outer `asyncio.wait_for` only cancels the local awaiter and @@ -538,6 +557,7 @@ async def _exec_internal( env=envs or None, working_dir=cwd, timeout=max(1, math.ceil(exec_timeout)), + **extra, ) except Exception as exc: if "timeout" in type(exc).__name__.lower() or "timed out" in str(exc).lower(): @@ -909,10 +929,11 @@ async def _hydrate_workspace_via_tar(self, raw: bytes) -> None: async def _remove_tmp_archive(self, archive_path: str) -> None: """Best-effort cleanup of a `/tmp` tar archive used for workspace persistence.""" try: - await self._sandbox.run( - "rm", - ["-f", "--", archive_path], - timeout=int(self.state.timeouts.cleanup_s), + # `delete_file` has no timeout knob; bound it so a hung daemon doesn't + # block the outer persist/hydrate flow indefinitely on a best-effort op. 
+ await asyncio.wait_for( + self._sandbox.delete_file(archive_path), + timeout=self.state.timeouts.fast_op_s, ) except Exception: pass diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index a233744998..0154707a56 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -161,6 +161,7 @@ async def run( env: dict[str, str] | None = None, working_dir: str | None = None, timeout: float | None = None, + user: str | None = None, ) -> _FakeCommandResult: _ = (env, timeout) args = args or [] @@ -169,6 +170,7 @@ async def run( "command": command, "args": list(args), "working_dir": working_dir, + "user": user, } ) @@ -227,13 +229,6 @@ async def run( self.files[f"{destination.rstrip('/')}/{member.name}"] = extracted.read() return _FakeCommandResult() - if command == "rm": - for arg in args: - if arg in {"-f", "--"}: - continue - self.files.pop(arg, None) - return _FakeCommandResult() - if command == "test" and args and args[0] == "-d": return _FakeCommandResult(exit_code=0) @@ -248,6 +243,9 @@ async def read_file(self, path: str) -> _FakeTraced: async def write_file(self, path: str, content: bytes) -> None: self.files[path] = bytes(content) + async def delete_file(self, path: str) -> None: + self.files.pop(path, None) + async def terminate(self) -> None: self.terminated = True self.terminate_count += 1 @@ -508,6 +506,32 @@ async def test_exec_read_write_and_mkdir(monkeypatch: pytest.MonkeyPatch) -> Non assert result.stdout == b"hi\n" +@pytest.mark.asyncio +async def test_exec_forwards_user_natively_without_sudo_wrap( + monkeypatch: pytest.MonkeyPatch, +) -> None: + # Minimal Tensorlake images often lack sudo; ensure user= is passed to + # AsyncSandbox.run(user=...) instead of being prepended as `sudo -u --`. 
+ module = _load_tensorlake_module(monkeypatch) + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-00000000beef"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-user", + ) + fake = _FakeSandbox(sandbox_id="sandbox-user") + session = module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + fake.next_run_result = _FakeCommandResult(stdout="ok\n", exit_code=0) + await session.exec("printf", "ok", shell=False, user="tl-user") + + last = fake.run_calls[-1] + assert last["command"] == "printf" + assert last["user"] == "tl-user" + args = cast(list[str], last["args"]) + assert "sudo" not in args + + @pytest.mark.asyncio async def test_read_missing_file_raises_not_found(monkeypatch: pytest.MonkeyPatch) -> None: module = _load_tensorlake_module(monkeypatch) From 12dca0431a5299f74306ed54f94a7346e4ca0843 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Sun, 17 May 2026 23:29:09 -0500 Subject: [PATCH 17/26] fixed --- src/agents/extensions/sandbox/tensorlake/sandbox.py | 4 ++++ tests/extensions/sandbox/test_tensorlake.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 0ce314a6e6..40b9c4ba3e 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -729,10 +729,14 @@ async def _persist_workspace_via_checkpoint(self) -> io.IOBase: checkpoint_type = _resolve_checkpoint_type(self.state.checkpoint_mode, CheckpointType) try: + # Wait for `completed` (uploaded to remote storage), not the SDK default + # `local_ready`. Restore goes through `AsyncSandbox.create(snapshot_id=...)`, + # which can land on a different host that has no view of a local-only snapshot. 
snapshot = await asyncio.wait_for( self._sandbox.checkpoint( checkpoint_type=checkpoint_type, timeout=int(self.state.checkpoint_timeout_s), + wait_until="completed", ), timeout=self.state.timeouts.snapshot_tar_s, ) diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 0154707a56..88d79fad55 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -107,6 +107,7 @@ def __init__( self.symlinks: dict[str, str] = {} self.sandbox_url = sandbox_url self.info_calls = 0 + self.last_checkpoint_wait_until: str | None = None @classmethod def reset(cls) -> None: @@ -288,8 +289,10 @@ async def checkpoint( timeout: float = 300.0, poll_interval: float = 1.0, checkpoint_type: str | None = None, + wait_until: str = "local_ready", ) -> _FakeSnapshotInfo: _ = (wait, timeout, poll_interval, checkpoint_type) + self.last_checkpoint_wait_until = wait_until snapshot_id = f"snap-{len(type(self).snapshots) + 1}" type(self).snapshots[snapshot_id] = dict(self.files) return _FakeSnapshotInfo(snapshot_id) @@ -823,6 +826,9 @@ async def test_persist_workspace_via_checkpoint_returns_snapshot_ref( snapshot_id = module._decode_tensorlake_snapshot_ref(raw) assert snapshot_id == "snap-1" assert _FakeSandbox.snapshots["snap-1"]["/workspace/notes.txt"] == b"snapshot-payload" + # Restore must succeed from any host, so persist waits for upload to remote storage + # rather than the SDK default `local_ready`. 
+ assert fake.last_checkpoint_wait_until == "completed" @pytest.mark.asyncio From 2589810fe9cfdb75334a7a7d841f7b70c929211f Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Mon, 18 May 2026 22:16:26 -0500 Subject: [PATCH 18/26] fixed --- .../extensions/sandbox/tensorlake/sandbox.py | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 40b9c4ba3e..336ce5da68 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -377,6 +377,19 @@ def _resolve_connect_kwargs( return kwargs +def _get_sandbox_id(sandbox: Any) -> str | None: + """Return sandbox_id from a sandbox object, or None if unavailable. + + Uses try/except rather than getattr because sandbox_id may be a @property that raises a + non-AttributeError SDK exception (e.g. SandboxError) when the id is not yet populated. + """ + try: + value = sandbox.sandbox_id + except Exception: + return None + return value if isinstance(value, str) and value else None + + def _resolve_checkpoint_type(mode: CheckpointMode, checkpoint_module: Any) -> Any: if mode == "memory": return checkpoint_module.MEMORY @@ -876,8 +889,12 @@ async def _restore_from_checkpoint(self, snapshot_id: str) -> None: # `_backend_terminated` tracks the current `self._sandbox` handle; rebinding must clear it # so `delete()` does not short-circuit on a live sandbox. self._backend_terminated = False - new_id = getattr(sandbox, "sandbox_id", None) - if isinstance(new_id, str) and new_id: + # The new sandbox has a different sandbox_url; clear the cache so the next + # _resolve_exposed_port() call fetches the updated hostname from the new backend. 
+ self._proxy_hostname_resolved = False + self._cached_proxy_hostname = None + new_id = _get_sandbox_id(sandbox) + if new_id is not None: self.state.sandbox_id = new_id await self._maybe_apply_exposed_ports() self.state.workspace_root_ready = True @@ -947,9 +964,12 @@ async def _maybe_apply_exposed_ports(self) -> None: if not ports: return try: - await self._sandbox.update( - exposed_ports=ports, - allow_unauthenticated_access=self.state.allow_unauthenticated_port_access, + await asyncio.wait_for( + self._sandbox.update( + exposed_ports=ports, + allow_unauthenticated_access=self.state.allow_unauthenticated_port_access, + ), + timeout=self.state.timeouts.fast_op_s, ) except Exception as exc: logger.warning( @@ -1026,8 +1046,8 @@ async def create( ) sandbox = await AsyncSandbox.create(**kwargs) - sandbox_id = getattr(sandbox, "sandbox_id", None) - if not isinstance(sandbox_id, str) or not sandbox_id: + sandbox_id = _get_sandbox_id(sandbox) + if not sandbox_id: raise RuntimeError( "Tensorlake `AsyncSandbox.create` did not return a sandbox with a `sandbox_id`." 
) @@ -1151,8 +1171,8 @@ async def resume( namespace=state.namespace, ) sandbox = await AsyncSandbox.create(**kwargs) - new_id = getattr(sandbox, "sandbox_id", None) - if isinstance(new_id, str) and new_id: + new_id = _get_sandbox_id(sandbox) + if new_id is not None: state.sandbox_id = new_id state.workspace_root_ready = recreate_snapshot_id is not None From 921c6a2d9a1ce735d4f752d5946048c2c2252e55 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Mon, 18 May 2026 23:04:55 -0500 Subject: [PATCH 19/26] fix --- .../extensions/sandbox/tensorlake/sandbox.py | 79 +++++++++++++++---- tests/extensions/sandbox/test_tensorlake.py | 71 +++++++++++++++++ 2 files changed, 133 insertions(+), 17 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 336ce5da68..a08227aca6 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -44,6 +44,7 @@ from ....sandbox.session.mount_lifecycle import with_ephemeral_mounts_removed from ....sandbox.session.runtime_helpers import RESOLVE_WORKSPACE_PATH_HELPER, RuntimeHelperScript from ....sandbox.session.sandbox_client import BaseSandboxClient, BaseSandboxClientOptions +from ....sandbox.session.tar_workspace import shell_tar_exclude_args from ....sandbox.snapshot import SnapshotBase, SnapshotSpec, resolve_snapshot from ....sandbox.types import ExecResult, ExposedPortEndpoint, User from ....sandbox.util.tar_utils import UnsafeTarMemberError, validate_tar_bytes @@ -67,7 +68,7 @@ _LOOPBACK_HOSTS = frozenset({"localhost", "127.0.0.1", "::1"}) -def _import_tensorlake_sandbox() -> tuple[Any, Any, Any, Any]: +def _import_tensorlake_sandbox() -> tuple[Any, Any, Any]: """Lazily import the Tensorlake SDK symbols this integration needs.""" try: @@ -75,10 +76,9 @@ def _import_tensorlake_sandbox() -> tuple[Any, Any, Any, Any]: AsyncSandbox, CheckpointType, RemoteAPIError, - SandboxStatus, ) - return 
AsyncSandbox, CheckpointType, SandboxStatus, RemoteAPIError + return AsyncSandbox, CheckpointType, RemoteAPIError except ImportError as exc: # pragma: no cover - exercised via unit tests with fakes raise ImportError( "TensorlakeSandboxClient requires the optional `tensorlake` dependency.\n" @@ -197,6 +197,9 @@ class TensorlakeSandboxClientOptions(BaseSandboxClientOptions): proxy_url: str | None = None api_url: str | None = None namespace: str | None = None + organization_id: str | None = None + project_id: str | None = None + routing_hint: str | None = None def __init__( self, @@ -224,6 +227,9 @@ def __init__( proxy_url: str | None = None, api_url: str | None = None, namespace: str | None = None, + organization_id: str | None = None, + project_id: str | None = None, + routing_hint: str | None = None, *, type: Literal["tensorlake"] = "tensorlake", ) -> None: @@ -253,6 +259,9 @@ def __init__( proxy_url=proxy_url, api_url=api_url, namespace=namespace, + organization_id=organization_id, + project_id=project_id, + routing_hint=routing_hint, ) @@ -284,6 +293,9 @@ class TensorlakeSandboxSessionState(SandboxSessionState): proxy_url: str | None = None api_url: str | None = None namespace: str | None = None + organization_id: str | None = None + project_id: str | None = None + routing_hint: str | None = None def _resolve_create_kwargs( @@ -305,12 +317,17 @@ def _resolve_create_kwargs( proxy_url: str | None, api_url: str | None, namespace: str | None, + organization_id: str | None, + project_id: str | None, ) -> dict[str, object]: """Build the kwargs accepted by `AsyncSandbox.create(...)`. Only includes optional fields when they are set so the SDK can apply its own defaults. Tensorlake does not accept environment variables at sandbox-create time; envs are passed on each `sandbox.run(...)` call instead. + + Note: `routing_hint` is accepted by `AsyncSandbox.connect` but not `AsyncSandbox.create`, + so it is intentionally absent here. 
""" kwargs: dict[str, object] = {"allow_internet_access": allow_internet_access} @@ -356,6 +373,10 @@ def _resolve_create_kwargs( kwargs["api_url"] = api_url if namespace is not None: kwargs["namespace"] = namespace + if organization_id is not None: + kwargs["organization_id"] = organization_id + if project_id is not None: + kwargs["project_id"] = project_id return kwargs @@ -364,6 +385,9 @@ def _resolve_connect_kwargs( proxy_url: str | None, api_url: str | None, namespace: str | None, + organization_id: str | None, + project_id: str | None, + routing_hint: str | None, ) -> dict[str, object]: """Build the kwargs accepted by `AsyncSandbox.connect(sandbox_id, ...)`.""" @@ -374,6 +398,12 @@ def _resolve_connect_kwargs( kwargs["api_url"] = api_url if namespace is not None: kwargs["namespace"] = namespace + if organization_id is not None: + kwargs["organization_id"] = organization_id + if project_id is not None: + kwargs["project_id"] = project_id + if routing_hint is not None: + kwargs["routing_hint"] = routing_hint return kwargs @@ -630,7 +660,8 @@ async def read(self, path: Path, *, user: str | User | None = None) -> io.IOBase except FileNotFoundError as exc: raise WorkspaceReadNotFoundError(path=normalized_path, cause=exc) from exc except Exception as exc: - if getattr(exc, "status_code", None) == 404: + _, _, RemoteAPIError = _import_tensorlake_sandbox() + if isinstance(exc, RemoteAPIError) and getattr(exc, "status_code", None) == 404: raise WorkspaceReadNotFoundError(path=normalized_path, cause=exc) from exc raise WorkspaceArchiveReadError(path=normalized_path, cause=exc) from exc @@ -738,7 +769,7 @@ async def _persist_workspace_via_checkpoint(self) -> io.IOBase: if skip - mount_skip_rel_paths: return await self._persist_workspace_via_tar() - _, CheckpointType, _, _ = _import_tensorlake_sandbox() + _, CheckpointType, _ = _import_tensorlake_sandbox() checkpoint_type = _resolve_checkpoint_type(self.state.checkpoint_mode, CheckpointType) try: @@ -775,15 +806,14 @@ 
async def _persist_workspace_via_tar(self) -> io.IOBase: root = self._workspace_root_path() error_root = posix_path_for_error(root) archive_path = f"/tmp/openai-agents-{self.state.session_id.hex}.tar" - # Emit both `foo` and `./foo` exclude patterns since tar can produce members either way - # depending on archive provenance. - excludes: list[str] = [] - for rel in sorted(self._persist_workspace_skip_relpaths(), key=lambda p: p.as_posix()): - rel_posix = rel.as_posix().lstrip("/") - if not rel_posix or rel_posix in {".", "/"}: - continue - excludes.append(f"--exclude={rel_posix}") - excludes.append(f"--exclude=./{rel_posix}") + skip = list(self._persist_workspace_skip_relpaths()) + # When the workspace root is /tmp (or /) the archive file falls inside the tree being + # archived; exclude it to prevent tar's "file is the archive" error. + try: + skip.append(Path(archive_path).relative_to(root)) + except ValueError: + pass # archive is outside the workspace root + excludes = shell_tar_exclude_args(skip) tar_argv = ["cf", archive_path, *excludes, "-C", root.as_posix(), "."] try: @@ -846,7 +876,7 @@ async def _hydrate_workspace_internal(self, raw: bytes) -> None: async def _restore_from_checkpoint(self, snapshot_id: str) -> None: root = self._workspace_root_path() error_root = posix_path_for_error(root) - AsyncSandbox, _, _, _ = _import_tensorlake_sandbox() + AsyncSandbox, _, _ = _import_tensorlake_sandbox() try: await self._sandbox.terminate() @@ -871,6 +901,8 @@ async def _restore_from_checkpoint(self, snapshot_id: str) -> None: proxy_url=self.state.proxy_url, api_url=self.state.api_url, namespace=self.state.namespace, + organization_id=self.state.organization_id, + project_id=self.state.project_id, ) try: @@ -898,6 +930,9 @@ async def _restore_from_checkpoint(self, snapshot_id: str) -> None: self.state.sandbox_id = new_id await self._maybe_apply_exposed_ports() self.state.workspace_root_ready = True + # The restored checkpoint carries full OS state (users, groups, 
system packages), so + # the base start flow must not re-run groupadd/useradd for accounts already present. + self._set_start_state_preserved(True, system=True) async def _hydrate_workspace_via_tar(self, raw: bytes) -> None: root = self._workspace_root_path() @@ -1021,7 +1056,7 @@ async def create( f"{_WORKSPACE_PERSISTENCE_TAR!r} or {_WORKSPACE_PERSISTENCE_SNAPSHOT!r}" ) - AsyncSandbox, _, _, _ = _import_tensorlake_sandbox() + AsyncSandbox, _, _ = _import_tensorlake_sandbox() base_envs = dict(options.envs or {}) @@ -1043,6 +1078,8 @@ async def create( proxy_url=options.proxy_url, api_url=options.api_url, namespace=options.namespace, + organization_id=options.organization_id, + project_id=options.project_id, ) sandbox = await AsyncSandbox.create(**kwargs) @@ -1083,6 +1120,9 @@ async def create( proxy_url=options.proxy_url, api_url=options.api_url, namespace=options.namespace, + organization_id=options.organization_id, + project_id=options.project_id, + routing_hint=options.routing_hint, ) inner = TensorlakeSandboxSession.from_state(state, sandbox=sandbox) await inner._maybe_apply_exposed_ports() @@ -1113,12 +1153,15 @@ async def resume( "TensorlakeSandboxClient.resume expects a TensorlakeSandboxSessionState" ) - AsyncSandbox, _, _, _ = _import_tensorlake_sandbox() + AsyncSandbox, _, _ = _import_tensorlake_sandbox() connect_kwargs = _resolve_connect_kwargs( proxy_url=state.proxy_url, api_url=state.api_url, namespace=state.namespace, + organization_id=state.organization_id, + project_id=state.project_id, + routing_hint=state.routing_hint, ) sandbox: Any = None @@ -1169,6 +1212,8 @@ async def resume( proxy_url=state.proxy_url, api_url=state.api_url, namespace=state.namespace, + organization_id=state.organization_id, + project_id=state.project_id, ) sandbox = await AsyncSandbox.create(**kwargs) new_id = _get_sandbox_id(sandbox) diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 88d79fad55..1844399c2a 100644 --- 
a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -779,6 +779,38 @@ async def test_persist_workspace_via_tar_round_trips_manifest( assert restored.read() == b"payload" +@pytest.mark.asyncio +async def test_persist_workspace_via_tar_excludes_archive_when_root_is_tmp( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When manifest.root is /tmp the tar archive lives inside the workspace tree. + + The archive file must be excluded from the tar command so GNU tar does not hit + its "file is the archive" error (exit code 1). + """ + module = _load_tensorlake_module(monkeypatch) + sid = uuid.UUID("00000000-0000-0000-0000-000000000040") + state = module.TensorlakeSandboxSessionState( + session_id=sid, + manifest=Manifest(root="/tmp", entries={"data.txt": File(content=b"val")}), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-tmp-root", + ) + fake = _FakeSandbox(sandbox_id="sandbox-tmp-root") + session = module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + await session.start() + await session.persist_workspace() + + tar_calls = [c for c in fake.run_calls if c["command"] == "tar"] + assert tar_calls, "expected at least one tar call" + last_tar_args = cast(list[str], tar_calls[-1]["args"]) + expected_archive_name = f"openai-agents-{sid.hex}.tar" + assert any(expected_archive_name in arg for arg in last_tar_args if "--exclude" in arg), ( + f"archive file {expected_archive_name!r} not excluded from tar args: {last_tar_args}" + ) + + @pytest.mark.asyncio async def test_persist_workspace_via_tar_nonzero_raises_archive_error( monkeypatch: pytest.MonkeyPatch, @@ -1044,3 +1076,42 @@ def test_serialize_session_state_round_trips(monkeypatch: pytest.MonkeyPatch) -> assert restored.disk_mb == 20480 assert restored.workspace_persistence == "snapshot" assert restored.checkpoint_mode == "memory" + + +@pytest.mark.asyncio +async def test_restore_from_checkpoint_marks_system_state_preserved( + monkeypatch: 
pytest.MonkeyPatch, +) -> None: + """After _restore_from_checkpoint, system state must be flagged as preserved. + + When resume() cannot read a RemoteSnapshot without dependencies it creates a fresh + empty sandbox and sets _start_system_state_preserved=False. hydrate_workspace() + later replaces that sandbox with a full Tensorlake checkpoint (which already contains + OS users and groups). The base start flow must not re-run groupadd/useradd against + accounts that are already present in the restored image. + """ + module = _load_tensorlake_module(monkeypatch) + + # Seed a snapshot so the checkpoint restore can find it. + initial = _FakeSandbox(sandbox_id="sandbox-snap-src") + initial.files["/workspace/data.txt"] = b"hello" + snap = await initial.checkpoint(checkpoint_type=_FakeCheckpointType.FILESYSTEM) + + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-000000000030"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-before-restore", + workspace_persistence="snapshot", + ) + fake = _FakeSandbox(sandbox_id="sandbox-before-restore") + session = module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + # Simulate the fresh-sandbox case: resume() could not read the snapshot and set + # preserved=False before handing the session to start(). 
+ session._set_start_state_preserved(False, system=False) + + payload = module._encode_tensorlake_snapshot_ref(snapshot_id=snap.snapshot_id) + await session.hydrate_workspace(io.BytesIO(payload)) + + assert session.should_provision_manifest_accounts_on_resume() is False From 2fa4ecca318e7b94798524b005c4223ee8013a05 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Mon, 18 May 2026 23:14:37 -0500 Subject: [PATCH 20/26] fix --- .../extensions/sandbox/tensorlake/sandbox.py | 8 +++ tests/extensions/sandbox/test_tensorlake.py | 60 +++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index a08227aca6..188a472d7f 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -514,6 +514,12 @@ async def _prepare_backend_workspace(self) -> None: }, ) + async def _after_start(self) -> None: + # Checkpoint restore replaces the sandbox and sandbox_id; reinstall runtime helpers only + # when the cache now points at a different backend. 
+ if self._runtime_helper_cache_key != self._current_runtime_helper_cache_key(): + await self._ensure_runtime_helpers() + async def _shutdown_backend(self) -> None: sandbox = self._sandbox if sandbox is None: @@ -548,6 +554,8 @@ async def _shutdown_backend(self) -> None: ) async def running(self) -> bool: + if not self.state.workspace_root_ready: + return False sandbox = self._sandbox if sandbox is None: return False diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index 1844399c2a..ccc3b64808 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -1115,3 +1115,63 @@ async def test_restore_from_checkpoint_marks_system_state_preserved( await session.hydrate_workspace(io.BytesIO(payload)) assert session.should_provision_manifest_accounts_on_resume() is False + + +@pytest.mark.asyncio +async def test_after_start_reinstalls_helpers_when_sandbox_id_changes( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """_after_start must reinstall runtime helpers when sandbox_id changed mid-start. + + checkpoint restore replaces the sandbox and sandbox_id during start(); the helper + cache key becomes stale. _after_start() detects the mismatch and re-runs + _ensure_runtime_helpers() so the new backend has the helpers installed. + """ + module = _load_tensorlake_module(monkeypatch) + + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-000000000031"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-new", + workspace_persistence="tar", + ) + fake = _FakeSandbox(sandbox_id="sandbox-new") + session = module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + # Simulate helpers installed on the pre-restore sandbox: cache key is stale. 
+ session._runtime_helper_cache_key = "sandbox-old" + session._runtime_helpers_installed = set() + + await session._after_start() + + assert session._runtime_helper_cache_key == "sandbox-new" + assert any(c["command"] == "sh" for c in fake.run_calls) + + +@pytest.mark.asyncio +async def test_running_returns_false_when_workspace_not_ready( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """running() must return False when the workspace has not been set up yet. + + A Tensorlake sandbox can be in RUNNING state (backend alive) while the workspace + hasn't been provisioned. Callers must not treat such a session as usable. + """ + module = _load_tensorlake_module(monkeypatch) + + state = module.TensorlakeSandboxSessionState( + session_id=uuid.UUID("00000000-0000-0000-0000-000000000032"), + manifest=Manifest(), + snapshot=NoopSnapshot(id="snap"), + sandbox_id="sandbox-not-ready", + workspace_persistence="tar", + ) + # Backend is running but workspace_root_ready is False (before start()). + fake = _FakeSandbox(sandbox_id="sandbox-not-ready", status="running") + session = module.TensorlakeSandboxSession.from_state(state, sandbox=fake) + + assert await session.running() is False + + state.workspace_root_ready = True + assert await session.running() is True From 92b9c34a4b651ebaa91361a8c4cac3bc0c9109f4 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Mon, 18 May 2026 23:31:39 -0500 Subject: [PATCH 21/26] fix --- .../extensions/sandbox/tensorlake/sandbox.py | 99 +++++++++++++------ tests/extensions/sandbox/test_tensorlake.py | 36 ++++++- 2 files changed, 100 insertions(+), 35 deletions(-) diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index 188a472d7f..f7aacd9f98 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -47,6 +47,11 @@ from ....sandbox.session.tar_workspace import shell_tar_exclude_args from ....sandbox.snapshot import 
SnapshotBase, SnapshotSpec, resolve_snapshot from ....sandbox.types import ExecResult, ExposedPortEndpoint, User +from ....sandbox.util.retry import ( + TRANSIENT_HTTP_STATUS_CODES, + exception_chain_has_status_code, + retry_async, +) from ....sandbox.util.tar_utils import UnsafeTarMemberError, validate_tar_bytes from ....sandbox.workspace_paths import posix_path_for_error, sandbox_path_str @@ -487,10 +492,17 @@ def _coerce_exec_timeout(self, timeout_s: float | None) -> float: if timeout_s is None: return float(self.state.timeouts.exec_timeout_unbounded_s) if timeout_s <= 0: - return 0.001 + # The SDK's `timeout` is an int seconds value; the call site clamps to a 1s + # floor via `max(1, math.ceil(...))`. Return 1.0 here (matching E2B) instead + # of a sub-second sentinel so the intent is obvious at the source. + return 1.0 return float(timeout_s) async def _prepare_backend_workspace(self) -> None: + # Skip the mkdir round-trip when the base start flow probed a reconnected + # sandbox and confirmed the workspace root already exists. + if self._workspace_state_preserved_on_start() and self._start_workspace_root_ready: + return root = sandbox_path_str(self.state.manifest.root) try: envs = await self._resolved_envs() @@ -655,6 +667,20 @@ async def _get_proxy_hostname(self) -> str | None: hostname = parsed self._cached_proxy_hostname = hostname self._proxy_hostname_resolved = True + if hostname is None and (self.state.proxy_url is not None or self.state.api_url is not None): + # Custom deployments cannot be reached via the public + # `-.sandbox.tensorlake.ai` template; warn once so callers + # know the fallback URL likely won't route to their backend. 
+ logger.warning( + "Could not resolve Tensorlake sandbox URL from info(); falling back to the " + "public exposed-port template, which will not route correctly for this " + "custom proxy_url/api_url deployment.", + extra={ + "sandbox_id": self.state.sandbox_id, + "proxy_url": self.state.proxy_url, + "api_url": self.state.api_url, + }, + ) return hostname async def read(self, path: Path, *, user: str | User | None = None) -> io.IOBase: @@ -784,13 +810,12 @@ async def _persist_workspace_via_checkpoint(self) -> io.IOBase: # Wait for `completed` (uploaded to remote storage), not the SDK default # `local_ready`. Restore goes through `AsyncSandbox.create(snapshot_id=...)`, # which can land on a different host that has no view of a local-only snapshot. - snapshot = await asyncio.wait_for( - self._sandbox.checkpoint( - checkpoint_type=checkpoint_type, - timeout=int(self.state.checkpoint_timeout_s), - wait_until="completed", - ), - timeout=self.state.timeouts.snapshot_tar_s, + # Rely on the SDK's own `timeout` so the backend tears down the operation; + # an outer `asyncio.wait_for` would only cancel the local awaiter. 
+ snapshot = await self._sandbox.checkpoint( + checkpoint_type=checkpoint_type, + timeout=int(self.state.checkpoint_timeout_s), + wait_until="completed", ) except Exception as exc: raise WorkspaceArchiveReadError( @@ -825,30 +850,7 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: tar_argv = ["cf", archive_path, *excludes, "-C", root.as_posix(), "."] try: - envs = await self._resolved_envs() - result = await self._sandbox.run( - "tar", - tar_argv, - env=envs or None, - timeout=int(self.state.timeouts.snapshot_tar_s), - ) - exit_code = int(getattr(result, "exit_code", 0) or 0) - if exit_code != 0: - raise ExecNonZeroError( - ExecResult( - stdout=str(getattr(result, "stdout", "") or "").encode( - "utf-8", errors="replace" - ), - stderr=str(getattr(result, "stderr", "") or "").encode( - "utf-8", errors="replace" - ), - exit_code=exit_code, - ), - command=("tar", *tar_argv), - context={"backend": "tensorlake", "sandbox_id": self.state.sandbox_id}, - ) - payload = await self._sandbox.read_file(archive_path) - archive_bytes = _coerce_sandbox_payload_to_bytes(payload) + archive_bytes = await self._run_persist_workspace_command(tar_argv, archive_path) except Exception as exc: raise WorkspaceArchiveReadError(path=error_root, cause=exc) from exc finally: @@ -856,6 +858,39 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: return io.BytesIO(archive_bytes) + @retry_async( + retry_if=lambda exc, *_args, **_kwargs: exception_chain_has_status_code( + exc, TRANSIENT_HTTP_STATUS_CODES + ) + ) + async def _run_persist_workspace_command( + self, tar_argv: list[str], archive_path: str + ) -> bytes: + envs = await self._resolved_envs() + result = await self._sandbox.run( + "tar", + tar_argv, + env=envs or None, + timeout=int(self.state.timeouts.snapshot_tar_s), + ) + exit_code = int(getattr(result, "exit_code", 0) or 0) + if exit_code != 0: + raise ExecNonZeroError( + ExecResult( + stdout=str(getattr(result, "stdout", "") or "").encode( + "utf-8", errors="replace" + 
), + stderr=str(getattr(result, "stderr", "") or "").encode( + "utf-8", errors="replace" + ), + exit_code=exit_code, + ), + command=("tar", *tar_argv), + context={"backend": "tensorlake", "sandbox_id": self.state.sandbox_id}, + ) + payload = await self._sandbox.read_file(archive_path) + return _coerce_sandbox_payload_to_bytes(payload) + async def hydrate_workspace(self, data: io.IOBase) -> None: raw = data.read() if isinstance(raw, str): diff --git a/tests/extensions/sandbox/test_tensorlake.py b/tests/extensions/sandbox/test_tensorlake.py index ccc3b64808..c0be3b616b 100644 --- a/tests/extensions/sandbox/test_tensorlake.py +++ b/tests/extensions/sandbox/test_tensorlake.py @@ -30,8 +30,14 @@ def __init__(self, snapshot_id: str) -> None: class _FakeCheckpointType: - FILESYSTEM = "filesystem" - MEMORY = "memory" + # Mirror the real `CheckpointType` str-Enum shape (members expose `.value`) so the + # integration's `_resolve_checkpoint_type(...).value` path is exercised by the fake. + class _Member: + def __init__(self, value: str) -> None: + self.value = value + + FILESYSTEM = _Member("filesystem") + MEMORY = _Member("memory") class _FakeSandboxStatus: @@ -288,7 +294,7 @@ async def checkpoint( wait: bool = True, timeout: float = 300.0, poll_interval: float = 1.0, - checkpoint_type: str | None = None, + checkpoint_type: Any = None, wait_until: str = "local_ready", ) -> _FakeSnapshotInfo: _ = (wait, timeout, poll_interval, checkpoint_type) @@ -342,6 +348,30 @@ def test_tensorlake_supports_pty_is_disabled(monkeypatch: pytest.MonkeyPatch) -> assert session.supports_pty() is False +def test_get_sandbox_id_handles_raising_property(monkeypatch: pytest.MonkeyPatch) -> None: + # The real `AsyncSandbox.sandbox_id` is a property that raises `SandboxError` before + # `info()` populates the cache. `_get_sandbox_id` must swallow that and return None + # instead of propagating an SDK exception out of the integration's create/restore paths. 
+ module = _load_tensorlake_module(monkeypatch) + + class _RaisingSandbox: + @property + def sandbox_id(self) -> str: + raise RuntimeError("sandbox_id is not yet known; call `await sandbox.info()` first.") + + assert module._get_sandbox_id(_RaisingSandbox()) is None + + class _EmptySandbox: + sandbox_id = "" + + assert module._get_sandbox_id(_EmptySandbox()) is None + + class _ReadySandbox: + sandbox_id = "sb-123" + + assert module._get_sandbox_id(_ReadySandbox()) == "sb-123" + + @pytest.mark.asyncio async def test_create_passes_options_and_drops_unset_fields( monkeypatch: pytest.MonkeyPatch, From 50f9e7ae46e53d1540ba54a7b25e6eb9c0e8843c Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Mon, 18 May 2026 23:35:57 -0500 Subject: [PATCH 22/26] tensorlake sdk version --- pyproject.toml | 2 +- uv.lock | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 33565ae32d..0e17c97dfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ cloudflare = ["aiohttp>=3.12,<4"] e2b = ["e2b==2.20.0", "e2b-code-interpreter==2.4.1"] modal = ["modal==1.3.5"] runloop = ["runloop_api_client>=1.16.0,<2.0.0"] -tensorlake = ["tensorlake>=0.5.14"] +tensorlake = ["tensorlake>=0.5.9"] vercel = ["vercel>=0.5.6,<0.6"] s3 = ["boto3>=1.34"] temporal = [ diff --git a/uv.lock b/uv.lock index 6dab565e6b..9086a5f4f5 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,8 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-05-18T23:59:59Z" +exclude-newer = "2026-05-12T04:34:53.788697Z" +exclude-newer-span = "P7D" [[package]] name = "aiofiles" @@ -2584,7 +2585,7 @@ requires-dist = [ { name = "runloop-api-client", marker = "extra == 'runloop'", specifier = ">=1.16.0,<2.0.0" }, { name = "sqlalchemy", marker = "extra == 'sqlalchemy'", specifier = ">=2.0" }, { name = "temporalio", marker = "extra == 'temporal'", specifier = "==1.26.0" }, - { name = "tensorlake", marker = "extra == 'tensorlake'", specifier = ">=0.5.14" }, + { 
name = "tensorlake", marker = "extra == 'tensorlake'", specifier = ">=0.5.9" }, { name = "textual", marker = "extra == 'temporal'", specifier = ">=8.2.3,<8.3" }, { name = "types-requests", specifier = ">=2.0,<3" }, { name = "typing-extensions", specifier = ">=4.12.2,<5" }, @@ -3978,7 +3979,7 @@ wheels = [ [[package]] name = "tensorlake" -version = "0.5.15" +version = "0.5.9" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio" }, @@ -3987,12 +3988,12 @@ dependencies = [ { name = "pydantic" }, { name = "websocket-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/51/7d1020097852f593fc628e36c4af371f5ffaf87480173d0c464bd1689a47/tensorlake-0.5.15.tar.gz", hash = "sha256:a71d834b6311d263dff88dca56757fb8638e40f5c0a91e31ea8f7f26eac4b58d", size = 2292355, upload-time = "2026-05-18T10:54:22.863Z" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/64/e28e99404d9811ca7ff860fcde40d7f456a05deab80bfedadeca7843d8fd/tensorlake-0.5.9.tar.gz", hash = "sha256:cca64ebc14944e68a9911b8e46459191056f93cb851aa099cececbcaa2eebde8", size = 2272311, upload-time = "2026-05-07T22:44:36.585Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/05/b7af41a64cc86288e1c550741ffe671b8e21dcdad82627568612f538b472/tensorlake-0.5.15-py3-none-macosx_11_0_arm64.whl", hash = "sha256:10716f3b0ba67c47f4e7e426df7cd7844aeedf638090a99b98d4116720e5a077", size = 15179635, upload-time = "2026-05-18T10:54:08.783Z" }, - { url = "https://files.pythonhosted.org/packages/d7/06/d4c2883a6ef2b602bfbd00ddaf80b216e935407cb91255dac74cb5408da0/tensorlake-0.5.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7dcfb61da41c70a14388e9eadeed0468a1213b4d7438b8ae37dd8b707901f03", size = 15725531, upload-time = "2026-05-18T10:54:12.205Z" }, - { url = 
"https://files.pythonhosted.org/packages/0d/a3/103726dab4c93cb6a3d8d8fa3e88293da6f5db88574656d2a18a4322e555/tensorlake-0.5.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1dd07addfe98dcf3345bf7ba7756f996486357c48977bc6ffc5b4431dc4b5dc2", size = 16347997, upload-time = "2026-05-18T10:54:15.825Z" }, - { url = "https://files.pythonhosted.org/packages/de/f5/bab1a48c6397b8c85f09af166f002ffbf4a22fd815db0d1ab6b0bbb6c96f/tensorlake-0.5.15-py3-none-win_amd64.whl", hash = "sha256:a4da65cad108fe51dc3218caed7b52d3e1bfa3873c570e88cfdb65578cf41cf7", size = 17253267, upload-time = "2026-05-18T10:54:19.979Z" }, + { url = "https://files.pythonhosted.org/packages/6a/2d/17d40ba2fe8cbe03f1afb5436d65d9bea75a73bfb6a0ad36b8ca37b348dc/tensorlake-0.5.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6979641b547e1c7415a1c2490f3526a576f5fb0970cdf61c71956e80c7f3de7d", size = 13473137, upload-time = "2026-05-07T22:44:24.342Z" }, + { url = "https://files.pythonhosted.org/packages/78/87/0d0c58ea2b600ae6ab3a28a084c607cf5abfc24b82569e0a427ecab95a2e/tensorlake-0.5.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75ccdc4ed6d77c460fcd600043855ee8cf786bdcb70f9b189743f06fdb55d3ce", size = 13972704, upload-time = "2026-05-07T22:44:27.899Z" }, + { url = "https://files.pythonhosted.org/packages/b8/a3/60af74d20a1ff1d15c99b974d5951b5a13ddc48f56803117e445a9dc46f0/tensorlake-0.5.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6bcc9f836da4613ae4b64f2c2d2f881572004863893fce935de6e930ceef6c0", size = 14449209, upload-time = "2026-05-07T22:44:30.94Z" }, + { url = "https://files.pythonhosted.org/packages/c6/f4/775271ed456be337079e4d652bebf04bc0ad04ad40d6b11d62f8de35bf94/tensorlake-0.5.9-py3-none-win_amd64.whl", hash = "sha256:a2e5ab7fd7ac034e1613d1dac7668d828c6f2ff86257c19edc778fe2de4e836c", size = 15197516, upload-time = "2026-05-07T22:44:34.046Z" }, ] [[package]] From 714c7466af9acab5eaaf6baf215bbf2b78a2b9e5 Mon Sep 17 00:00:00 
2001 From: shanshan wang Date: Tue, 19 May 2026 08:48:20 -0500 Subject: [PATCH 23/26] fix runner --- .../sandbox/extensions/tensorlake_runner.py | 75 +++++++++++-------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/examples/sandbox/extensions/tensorlake_runner.py b/examples/sandbox/extensions/tensorlake_runner.py index ebe93fcd10..4281421d48 100644 --- a/examples/sandbox/extensions/tensorlake_runner.py +++ b/examples/sandbox/extensions/tensorlake_runner.py @@ -23,12 +23,13 @@ from agents.models.openai_provider import OpenAIProvider from agents.run import RunConfig from agents.sandbox import LocalSnapshotSpec, Manifest, SandboxAgent, SandboxRunConfig +from agents.sandbox.entries import File +from agents.sandbox.errors import SandboxError from agents.sandbox.session import BaseSandboxSession if __package__ is None or __package__ == "": sys.path.insert(0, str(Path(__file__).resolve().parents[3])) -from examples.sandbox.misc.example_support import text_manifest from examples.sandbox.misc.workspace_shell import WorkspaceShellCapability try: @@ -51,24 +52,28 @@ def _build_manifest() -> Manifest: - return text_manifest( - { - "README.md": ( - "# Tensorlake Demo Workspace\n\n" - "This workspace exists to validate the Tensorlake sandbox backend manually.\n" - ), - "handoff.md": ( - "# Handoff\n\n" - "- Customer: Northwind Traders.\n" - "- Goal: validate Tensorlake sandbox exec and persistence flows.\n" - "- Current status: non-PTY backend slice is wired and under test.\n" - ), - "todo.md": ( - "# Todo\n\n" - "1. Inspect the workspace files.\n" - "2. Summarize the current status in two sentences.\n" - ), - } + # The Tensorlake default image runs as a non-root user, so the workspace root must live + # under a directory the sandbox user can write to (the default `/workspace` is not). 
+ files = { + "README.md": ( + "# Tensorlake Demo Workspace\n\n" + "This workspace exists to validate the Tensorlake sandbox backend manually.\n" + ), + "handoff.md": ( + "# Handoff\n\n" + "- Customer: Northwind Traders.\n" + "- Goal: validate Tensorlake sandbox exec and persistence flows.\n" + "- Current status: non-PTY backend slice is wired and under test.\n" + ), + "todo.md": ( + "# Todo\n\n" + "1. Inspect the workspace files.\n" + "2. Summarize the current status in two sentences.\n" + ), + } + return Manifest( + root="/tmp/workspace", + entries={path: File(content=contents.encode("utf-8")) for path, contents in files.items()}, ) @@ -185,18 +190,26 @@ async def main( manifest = _build_manifest() - await _verify_stop_resume( - manifest=manifest, - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ) - await _verify_resume_running_sandbox( - manifest=manifest, - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ) + try: + await _verify_stop_resume( + manifest=manifest, + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ) + await _verify_resume_running_sandbox( + manifest=manifest, + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ) + except SandboxError as exc: + print( + "SandboxError during sandbox verification: " + f"code={exc.error_code} op={exc.op} context={exc.context}", + file=sys.stderr, + ) + raise agent = SandboxAgent( name="Tensorlake Sandbox Assistant", From f48812f413d0b18bfa0fccd964f959a49b69aaf2 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Tue, 19 May 2026 08:55:14 -0500 Subject: [PATCH 24/26] cleanup example runner --- .../sandbox/extensions/tensorlake_runner.py | 105 +++++++----------- 1 file changed, 41 insertions(+), 64 deletions(-) diff --git a/examples/sandbox/extensions/tensorlake_runner.py b/examples/sandbox/extensions/tensorlake_runner.py index 4281421d48..96a070aca4 
100644 --- a/examples/sandbox/extensions/tensorlake_runner.py +++ b/examples/sandbox/extensions/tensorlake_runner.py @@ -20,11 +20,9 @@ from openai.types.responses import ResponseTextDeltaEvent from agents import ModelSettings, Runner -from agents.models.openai_provider import OpenAIProvider from agents.run import RunConfig from agents.sandbox import LocalSnapshotSpec, Manifest, SandboxAgent, SandboxRunConfig from agents.sandbox.entries import File -from agents.sandbox.errors import SandboxError from agents.sandbox.session import BaseSandboxSession if __package__ is None or __package__ == "": @@ -144,14 +142,12 @@ async def _verify_resume_running_sandbox( workspace_persistence: Literal["tar", "snapshot"], ) -> None: client = TensorlakeSandboxClient() - sandbox = await client.create( - manifest=manifest, - options=TensorlakeSandboxClientOptions( - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ), + options = TensorlakeSandboxClientOptions( + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, ) + sandbox = await client.create(manifest=manifest, options=options) try: await sandbox.start() @@ -190,26 +186,18 @@ async def main( manifest = _build_manifest() - try: - await _verify_stop_resume( - manifest=manifest, - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ) - await _verify_resume_running_sandbox( - manifest=manifest, - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ) - except SandboxError as exc: - print( - "SandboxError during sandbox verification: " - f"code={exc.error_code} op={exc.op} context={exc.context}", - file=sys.stderr, - ) - raise + await _verify_stop_resume( + manifest=manifest, + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ) + await _verify_resume_running_sandbox( + manifest=manifest, + image=image, + timeout_secs=timeout_secs, + 
workspace_persistence=workspace_persistence, + ) agent = SandboxAgent( name="Tensorlake Sandbox Assistant", @@ -225,45 +213,34 @@ async def main( model_settings=ModelSettings(tool_choice="required"), ) - client = TensorlakeSandboxClient() - sandbox = await client.create( - manifest=manifest, - options=TensorlakeSandboxClientOptions( - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ), - ) - run_config = RunConfig( - model_provider=OpenAIProvider(), - sandbox=SandboxRunConfig(session=sandbox), - tracing_disabled=True, + sandbox=SandboxRunConfig( + client=TensorlakeSandboxClient(), + options=TensorlakeSandboxClientOptions( + image=image, + timeout_secs=timeout_secs, + workspace_persistence=workspace_persistence, + ), + ), workflow_name="Tensorlake sandbox example", ) - try: - async with sandbox: - if not stream: - result = await Runner.run(agent, question, run_config=run_config) - print(result.final_output) - return - - stream_result = Runner.run_streamed(agent, question, run_config=run_config) - saw_text_delta = False - async for event in stream_result.stream_events(): - if event.type == "raw_response_event" and isinstance( - event.data, ResponseTextDeltaEvent - ): - if not saw_text_delta: - print("assistant> ", end="", flush=True) - saw_text_delta = True - print(event.data.delta, end="", flush=True) - - if saw_text_delta: - print() - finally: - await client.delete(sandbox) + if not stream: + result = await Runner.run(agent, question, run_config=run_config) + print(result.final_output) + return + + stream_result = Runner.run_streamed(agent, question, run_config=run_config) + saw_text_delta = False + async for event in stream_result.stream_events(): + if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent): + if not saw_text_delta: + print("assistant> ", end="", flush=True) + saw_text_delta = True + print(event.data.delta, end="", flush=True) + + if saw_text_delta: + print() if __name__ == 
"__main__": From e40309447a4bf33692bc3cb9839a322fd5bff1b4 Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Tue, 19 May 2026 09:28:00 -0500 Subject: [PATCH 25/26] Default Tensorlake manifest root to a tl-user-writable, snapshotted path --- docs/sandbox/clients.md | 2 +- .../sandbox/extensions/tensorlake_runner.py | 5 +- src/agents/extensions/sandbox/__init__.py | 2 + .../extensions/sandbox/tensorlake/__init__.py | 2 + .../extensions/sandbox/tensorlake/sandbox.py | 61 ++++++++++++++----- uv.lock | 2 +- 6 files changed, 54 insertions(+), 20 deletions(-) diff --git a/docs/sandbox/clients.md b/docs/sandbox/clients.md index 266ef4051d..be4794454b 100644 --- a/docs/sandbox/clients.md +++ b/docs/sandbox/clients.md @@ -114,7 +114,7 @@ Hosted sandbox clients expose provider-specific mount strategies. Choose the bac | `DaytonaSandboxClient` | Supports rclone-backed cloud storage mounts with `DaytonaCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | | `E2BSandboxClient` | Supports rclone-backed cloud storage mounts with `E2BCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | | `RunloopSandboxClient` | Supports rclone-backed cloud storage mounts with `RunloopCloudBucketMountStrategy`; use it with `S3Mount`, `GCSMount`, `R2Mount`, `AzureBlobMount`, and `BoxMount`. | -| `TensorlakeSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. Tensorlake's native sandbox checkpoint API is available via `workspace_persistence="snapshot"`; prefer this over external bucket mounts for between-run persistence. | +| `TensorlakeSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. 
The default manifest root is `DEFAULT_TENSORLAKE_WORKSPACE_ROOT` (`/home/tl-user/workspace`), which is writable by the default image's non-root user and persisted across FILESYSTEM checkpoints; override it only when targeting a custom image. Tensorlake's native sandbox checkpoint API is available via `workspace_persistence="snapshot"`; prefer this over external bucket mounts for between-run persistence. | | `VercelSandboxClient` | No hosted-specific mount strategy is currently exposed. Use manifest files, repos, or other workspace inputs instead. | diff --git a/examples/sandbox/extensions/tensorlake_runner.py b/examples/sandbox/extensions/tensorlake_runner.py index 96a070aca4..7ea05b6287 100644 --- a/examples/sandbox/extensions/tensorlake_runner.py +++ b/examples/sandbox/extensions/tensorlake_runner.py @@ -32,6 +32,7 @@ try: from agents.extensions.sandbox import ( + DEFAULT_TENSORLAKE_WORKSPACE_ROOT, TensorlakeSandboxClient, TensorlakeSandboxClientOptions, ) @@ -50,8 +51,6 @@ def _build_manifest() -> Manifest: - # The Tensorlake default image runs as a non-root user, so the workspace root must live - # under a directory the sandbox user can write to (the default `/workspace` is not). 
files = { "README.md": ( "# Tensorlake Demo Workspace\n\n" @@ -70,7 +69,7 @@ def _build_manifest() -> Manifest: ), } return Manifest( - root="/tmp/workspace", + root=DEFAULT_TENSORLAKE_WORKSPACE_ROOT, entries={path: File(content=contents.encode("utf-8")) for path, contents in files.items()}, ) diff --git a/src/agents/extensions/sandbox/__init__.py b/src/agents/extensions/sandbox/__init__.py index 9d45cebf25..fa2a66dab3 100644 --- a/src/agents/extensions/sandbox/__init__.py +++ b/src/agents/extensions/sandbox/__init__.py @@ -99,6 +99,7 @@ try: from .tensorlake import ( + DEFAULT_TENSORLAKE_WORKSPACE_ROOT as DEFAULT_TENSORLAKE_WORKSPACE_ROOT, TensorlakeSandboxClient as TensorlakeSandboxClient, TensorlakeSandboxClientOptions as TensorlakeSandboxClientOptions, TensorlakeSandboxSession as TensorlakeSandboxSession, @@ -193,6 +194,7 @@ if _HAS_TENSORLAKE: __all__.extend( [ + "DEFAULT_TENSORLAKE_WORKSPACE_ROOT", "TensorlakeSandboxClient", "TensorlakeSandboxClientOptions", "TensorlakeSandboxSession", diff --git a/src/agents/extensions/sandbox/tensorlake/__init__.py b/src/agents/extensions/sandbox/tensorlake/__init__.py index c75b0c6021..dea284e1bb 100644 --- a/src/agents/extensions/sandbox/tensorlake/__init__.py +++ b/src/agents/extensions/sandbox/tensorlake/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations from .sandbox import ( + DEFAULT_TENSORLAKE_WORKSPACE_ROOT, TensorlakeSandboxClient, TensorlakeSandboxClientOptions, TensorlakeSandboxSession, @@ -9,6 +10,7 @@ ) __all__ = [ + "DEFAULT_TENSORLAKE_WORKSPACE_ROOT", "TensorlakeSandboxClient", "TensorlakeSandboxClientOptions", "TensorlakeSandboxSession", diff --git a/src/agents/extensions/sandbox/tensorlake/sandbox.py b/src/agents/extensions/sandbox/tensorlake/sandbox.py index f7aacd9f98..d3e306ae00 100644 --- a/src/agents/extensions/sandbox/tensorlake/sandbox.py +++ b/src/agents/extensions/sandbox/tensorlake/sandbox.py @@ -14,12 +14,14 @@ from __future__ import annotations import asyncio +import inspect import 
io import json import logging import math import uuid from contextlib import suppress +from functools import lru_cache from pathlib import Path from typing import Any, Literal, cast from urllib.parse import urlsplit @@ -63,6 +65,12 @@ _WORKSPACE_PERSISTENCE_TAR: WorkspacePersistenceMode = "tar" _WORKSPACE_PERSISTENCE_SNAPSHOT: WorkspacePersistenceMode = "snapshot" +# Default manifest root for the Tensorlake provider. The default image runs as the +# non-root `tl-user`, so `/workspace` (the cross-provider default) is not writable; +# tmpfs paths like `/tmp/*` are writable but excluded from FILESYSTEM checkpoints. +# `/home/tl-user/workspace` is both `tl-user`-writable and persisted across snapshots. +DEFAULT_TENSORLAKE_WORKSPACE_ROOT = "/home/tl-user/workspace" + # Magic prefix for Tensorlake checkpoint references that are not tar bytes. _TENSORLAKE_SNAPSHOT_MAGIC = b"TENSORLAKE_SANDBOX_SNAPSHOT_V1\n" @@ -91,6 +99,22 @@ def _import_tensorlake_sandbox() -> tuple[Any, Any, Any]: ) from exc +@lru_cache(maxsize=1) +def _checkpoint_supports_wait_until() -> bool: + """Return True when `AsyncSandbox.checkpoint` accepts the `wait_until` kwarg. + + `wait_until` was added in tensorlake 0.5.15; on older SDKs `snapshot_and_wait` + already polls until status is `completed` or `failed`, so the kwarg is a no-op + semantically and can be omitted. + """ + + AsyncSandbox, _, _ = _import_tensorlake_sandbox() + try: + return "wait_until" in inspect.signature(AsyncSandbox.checkpoint).parameters + except (TypeError, ValueError): # pragma: no cover - defensive + return False + + def _is_running_status(status: object) -> bool: """Return True when the SDK `SandboxStatus` value is RUNNING. 
@@ -667,7 +691,9 @@ async def _get_proxy_hostname(self) -> str | None: hostname = parsed self._cached_proxy_hostname = hostname self._proxy_hostname_resolved = True - if hostname is None and (self.state.proxy_url is not None or self.state.api_url is not None): + if hostname is None and ( + self.state.proxy_url is not None or self.state.api_url is not None + ): # Custom deployments cannot be reached via the public # `-.sandbox.tensorlake.ai` template; warn once so callers # know the fallback URL likely won't route to their backend. @@ -806,17 +832,22 @@ async def _persist_workspace_via_checkpoint(self) -> io.IOBase: _, CheckpointType, _ = _import_tensorlake_sandbox() checkpoint_type = _resolve_checkpoint_type(self.state.checkpoint_mode, CheckpointType) + # Rely on the SDK's own `timeout` so the backend tears down the operation; + # an outer `asyncio.wait_for` would only cancel the local awaiter. + checkpoint_kwargs: dict[str, Any] = { + "checkpoint_type": checkpoint_type, + "timeout": int(self.state.checkpoint_timeout_s), + } + # `wait_until="completed"` blocks until the snapshot is uploaded to remote storage, + # not just the SDK default `local_ready`. Restore goes through + # `AsyncSandbox.create(snapshot_id=...)`, which can land on a different host with no + # view of a local-only snapshot. On older SDKs (<0.5.15) without `wait_until`, the + # default `snapshot_and_wait` already polls until status is `completed` or `failed`. + if _checkpoint_supports_wait_until(): + checkpoint_kwargs["wait_until"] = "completed" + try: - # Wait for `completed` (uploaded to remote storage), not the SDK default - # `local_ready`. Restore goes through `AsyncSandbox.create(snapshot_id=...)`, - # which can land on a different host that has no view of a local-only snapshot. - # Rely on the SDK's own `timeout` so the backend tears down the operation; - # an outer `asyncio.wait_for` would only cancel the local awaiter. 
- snapshot = await self._sandbox.checkpoint( - checkpoint_type=checkpoint_type, - timeout=int(self.state.checkpoint_timeout_s), - wait_until="completed", - ) + snapshot = await self._sandbox.checkpoint(**checkpoint_kwargs) except Exception as exc: raise WorkspaceArchiveReadError( path=error_root, @@ -863,9 +894,7 @@ async def _persist_workspace_via_tar(self) -> io.IOBase: exc, TRANSIENT_HTTP_STATUS_CODES ) ) - async def _run_persist_workspace_command( - self, tar_argv: list[str], archive_path: str - ) -> bytes: + async def _run_persist_workspace_command(self, tar_argv: list[str], archive_path: str) -> bytes: envs = await self._resolved_envs() result = await self._sandbox.run( "tar", @@ -1080,7 +1109,8 @@ async def create( manifest: Manifest | None = None, options: TensorlakeSandboxClientOptions, ) -> SandboxSession: - manifest = manifest or Manifest() + if manifest is None: + manifest = Manifest(root=DEFAULT_TENSORLAKE_WORKSPACE_ROOT) timeouts_in = options.timeouts if isinstance(timeouts_in, TensorlakeSandboxTimeouts): @@ -1276,6 +1306,7 @@ def deserialize_session_state(self, payload: dict[str, object]) -> SandboxSessio __all__ = [ + "DEFAULT_TENSORLAKE_WORKSPACE_ROOT", "TensorlakeSandboxClient", "TensorlakeSandboxClientOptions", "TensorlakeSandboxSession", diff --git a/uv.lock b/uv.lock index 9086a5f4f5..aa04ca7edb 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-05-12T04:34:53.788697Z" +exclude-newer = "2026-05-12T14:26:16.612606Z" exclude-newer-span = "P7D" [[package]] From 04d96a8458277bc62e09d113eb472414fffacdcd Mon Sep 17 00:00:00 2001 From: shanshan wang Date: Tue, 19 May 2026 10:42:26 -0500 Subject: [PATCH 26/26] update runner --- .../sandbox/extensions/tensorlake_runner.py | 114 +++++++++++------- 1 file changed, 72 insertions(+), 42 deletions(-) diff --git a/examples/sandbox/extensions/tensorlake_runner.py b/examples/sandbox/extensions/tensorlake_runner.py index 7ea05b6287..8079af975c 100644 --- 
a/examples/sandbox/extensions/tensorlake_runner.py +++ b/examples/sandbox/extensions/tensorlake_runner.py @@ -88,19 +88,23 @@ def _require_env(name: str) -> None: raise SystemExit(f"{name} must be set before running this example.") +def _parse_env_pair(raw: str) -> tuple[str, str]: + if "=" not in raw: + raise argparse.ArgumentTypeError( + f"--env value must be KEY=VAL (got {raw!r})." + ) + key, value = raw.split("=", 1) + if not key: + raise argparse.ArgumentTypeError(f"--env key must be non-empty (got {raw!r}).") + return key, value + + async def _verify_stop_resume( *, manifest: Manifest, - image: str | None, - timeout_secs: int | None, - workspace_persistence: Literal["tar", "snapshot"], + options: TensorlakeSandboxClientOptions, ) -> None: client = TensorlakeSandboxClient() - options = TensorlakeSandboxClientOptions( - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ) with tempfile.TemporaryDirectory(prefix="tensorlake-snapshot-example-") as snapshot_dir: sandbox = await client.create( manifest=manifest, @@ -124,28 +128,21 @@ async def _verify_stop_resume( restored_text = await _read_text(resumed_sandbox, SNAPSHOT_CHECK_PATH) if restored_text != SNAPSHOT_CHECK_CONTENT: raise RuntimeError( - f"Snapshot resume verification failed for {workspace_persistence!r}: " + f"Snapshot resume verification failed for {options.workspace_persistence!r}: " f"expected {SNAPSHOT_CHECK_CONTENT!r}, got {restored_text!r}" ) finally: await resumed_sandbox.aclose() - print(f"snapshot round-trip ok ({workspace_persistence})") + print(f"snapshot round-trip ok ({options.workspace_persistence})") async def _verify_resume_running_sandbox( *, manifest: Manifest, - image: str | None, - timeout_secs: int | None, - workspace_persistence: Literal["tar", "snapshot"], + options: TensorlakeSandboxClientOptions, ) -> None: client = TensorlakeSandboxClient() - options = TensorlakeSandboxClientOptions( - image=image, - timeout_secs=timeout_secs, - 
workspace_persistence=workspace_persistence, - ) sandbox = await client.create(manifest=manifest, options=options) try: @@ -168,16 +165,14 @@ async def _verify_resume_running_sandbox( finally: await sandbox.shutdown() - print(f"running sandbox resume ok ({workspace_persistence})") + print(f"running sandbox resume ok ({options.workspace_persistence})") async def main( *, model: str, question: str, - image: str | None, - timeout_secs: int | None, - workspace_persistence: Literal["tar", "snapshot"], + options: TensorlakeSandboxClientOptions, stream: bool, ) -> None: _require_env("OPENAI_API_KEY") @@ -185,18 +180,8 @@ async def main( manifest = _build_manifest() - await _verify_stop_resume( - manifest=manifest, - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ) - await _verify_resume_running_sandbox( - manifest=manifest, - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ) + await _verify_stop_resume(manifest=manifest, options=options) + await _verify_resume_running_sandbox(manifest=manifest, options=options) agent = SandboxAgent( name="Tensorlake Sandbox Assistant", @@ -215,11 +200,7 @@ async def main( run_config = RunConfig( sandbox=SandboxRunConfig( client=TensorlakeSandboxClient(), - options=TensorlakeSandboxClientOptions( - image=image, - timeout_secs=timeout_secs, - workspace_persistence=workspace_persistence, - ), + options=options, ), workflow_name="Tensorlake sandbox example", ) @@ -263,16 +244,65 @@ async def main( default="tar", help="Workspace persistence mode to verify before the agent run.", ) + parser.add_argument( + "--env", + action="append", + default=None, + type=_parse_env_pair, + metavar="KEY=VAL", + help="Environment variable to inject into the sandbox. Repeatable.", + ) + parser.add_argument( + "--secret", + action="append", + default=None, + metavar="NAME", + help="Tensorlake-managed secret name to inject into the sandbox. 
Repeatable.", + ) + parser.add_argument( + "--pause-on-exit", + action="store_true", + default=False, + help="Pause the sandbox on shutdown instead of terminating it.", + ) + parser.add_argument( + "--cpus", + type=float, + default=None, + help="Optional CPU allocation for the sandbox.", + ) + parser.add_argument( + "--memory-mb", + type=int, + default=None, + help="Optional memory allocation for the sandbox, in megabytes.", + ) + parser.add_argument( + "--disk-mb", + type=int, + default=None, + help="Optional disk allocation for the sandbox, in megabytes.", + ) parser.add_argument("--stream", action="store_true", default=False, help="Stream the response.") args = parser.parse_args() + options = TensorlakeSandboxClientOptions( + image=args.image, + timeout_secs=args.timeout_secs, + workspace_persistence=cast(Literal["tar", "snapshot"], args.workspace_persistence), + envs=dict(args.env) if args.env else None, + secret_names=tuple(args.secret or ()), + pause_on_exit=args.pause_on_exit, + cpus=args.cpus, + memory_mb=args.memory_mb, + disk_mb=args.disk_mb, + ) + asyncio.run( main( model=args.model, question=args.question, - image=args.image, - timeout_secs=args.timeout_secs, - workspace_persistence=cast(Literal["tar", "snapshot"], args.workspace_persistence), + options=options, stream=args.stream, ) )