54 changes: 54 additions & 0 deletions cuda_bindings/cuda/bindings/_test_helpers/mempool.py
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import sys

import pytest

from cuda.bindings import driver, runtime


def is_windows_mcdm_device(device=0):
    if sys.platform != "win32":
        return False
    import cuda.bindings.nvml as nvml

    device_id = int(getattr(device, "device_id", device))
    (err,) = driver.cuInit(0)
    if err != driver.CUresult.CUDA_SUCCESS:
        return False
    err, pci_bus_id = driver.cuDeviceGetPCIBusId(13, device_id)
    if err != driver.CUresult.CUDA_SUCCESS:
        return False
    pci_bus_id = pci_bus_id.split(b"\x00", 1)[0].decode("ascii")
    nvml.init_v2()
    try:
        handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
        current, _ = nvml.device_get_driver_model_v2(handle)
        return current == nvml.DriverModel.DRIVER_MCDM
    finally:
        nvml.shutdown()
Contributor:
Doesn't this assume that nvml was uninitialized on entry to this function? Would it break callers that initialized nvml?

Contributor Author:

I checked the NVML API contract directly instead of relying on memory. The short answer is that nvmlInit_v2() and nvmlShutdown() are reference-counted, so the balanced nvml.init_v2() / nvml.shutdown() pair in our helper should not break callers that had already initialized NVML. A minimal sketch of the ref-count behavior follows the list below.

The most relevant NVIDIA doc is the current NVML "Initialization and Cleanup" page: https://docs.nvidia.com/deploy/nvml-api/group__nvmlInitializationAndCleanup.html.

Cursor-generated supporting details:

  • The current NVML docs for nvmlInit_v2() say: "A reference count of the number of initializations is maintained. Shutdown only occurs when the reference count reaches zero."
  • The current NVML docs for nvmlShutdown() say: "This method should be called ... once for each call to nvmlInit_v2(). A reference count of the number of initializations is maintained. Shutdown only occurs when the reference count reaches zero."
  • The same current docs also say this applies "For all products." Separately, the NVML API reference lists Windows as a supported OS platform, so there is no indication that the ref-count behavior is Linux-only.
  • The archived R525 docs use the same ref-count language, which suggests this is not a recent or unstable contract.
  • Our cuda.bindings.nvml layer is a thin pass-through here: init_v2() calls nvmlInit_v2() directly and shutdown() calls nvmlShutdown() directly, so there is no extra Python-side lifecycle logic changing the semantics.
  • The generated binding text in cuda_bindings/cuda/bindings/nvml.pyx also reflects the same contract: ERROR_ALREADY_INITIALIZED is described as deprecated because "Multiple initializations are now allowed through ref counting."
  • The repo already encodes this assumption in cuda_bindings/tests/nvml/test_init.py, whose test_init_ref_count() explicitly exercises repeated init_v2() / shutdown() calls and checks that NVML remains initialized until the matching final shutdown. That test is skipped on Windows, so it is not direct Windows coverage, but it does show the intended interpretation inside this repo.
  • One unrelated wrinkle: the current docs say extra nvmlShutdown() calls beyond the init count are tolerated for backwards compatibility, while our local test expects UninitializedError on a naked shutdown(). That mismatch is worth keeping in mind, but it does not affect this helper because the helper uses a balanced init/shutdown pair.
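
To make the ref-count contract concrete, here is a minimal sketch (it assumes a machine where NVML loads; the outer pair stands in for a caller that already initialized NVML before invoking our helper):

import cuda.bindings.nvml as nvml

nvml.init_v2()  # caller's own initialization; ref count: 1
try:
    # The helper's balanced pair only bumps and then drops the count.
    nvml.init_v2()  # ref count: 2
    try:
        pass  # ... helper body: look up the handle, query the driver model ...
    finally:
        nvml.shutdown()  # ref count back to 1; NVML stays initialized
    # The caller's NVML handles remain valid here.
finally:
    nvml.shutdown()  # ref count reaches 0; NVML actually shuts down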



def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
    # Support calling as xfail_if_mempool_oom(err, device): a non-string second
    # positional argument is treated as the device, not the API name.
    if api_name is not None and not isinstance(api_name, str):
        device = api_name
        api_name = None

    is_oom = err_or_exc in (
        driver.CUresult.CUDA_ERROR_OUT_OF_MEMORY,
        runtime.cudaError_t.cudaErrorMemoryAllocation,
    ) or "CUDA_ERROR_OUT_OF_MEMORY" in str(err_or_exc)

    if not is_oom:
        return
    try:
        is_windows_mcdm = is_windows_mcdm_device(device)
    except Exception:
        # If MCDM detection fails, leave the primary test failure visible.
        return
    if not is_windows_mcdm:
        return

    api_context = f"{api_name} " if api_name else ""
    pytest.xfail(f"{api_context}could not reserve VA for mempool operations on Windows MCDM")
8 changes: 8 additions & 0 deletions cuda_bindings/tests/test_cuda.py
@@ -12,6 +12,7 @@
import cuda.bindings.driver as cuda
import cuda.bindings.runtime as cudart
from cuda.bindings import driver
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom


def driverVersionLessThan(target):
@@ -270,6 +271,7 @@ def test_cuda_memPool_attr():

    attr_list = [None] * 8
    err, pool = cuda.cuMemPoolCreate(poolProps)
    xfail_if_mempool_oom(err, "cuMemPoolCreate", poolProps.location.id)
Contributor:
I was (perhaps naively) expecting the xfail logic to appear as a decorator on the test itself, or, at worst, a context manager. I guess that's not practical?

Contributor Author:

Helper-based local skip/xfail logic is a widely used pattern in this repo, especially under cuda_core/tests. I also believe a local helper is the right tool for this specific issue, because the condition is only knowable after a specific CUDA API call returns a specific failure on a specific runtime configuration.

Cursor-generated supporting details:

  • Under cuda_core/tests, runtime gating is frequently factored into helpers and fixtures rather than only using decorators. Examples include skip_if_pinned_memory_unsupported() and skip_if_managed_memory_unsupported() in cuda_core/tests/conftest.py, plus local helpers like _skip_if_no_mempool() / _skip_if_no_managed_mempool() in cuda_core/tests/graph/test_graph_definition.py, similar _skip_if_no_mempool() helpers in several other graph/object-protocol modules, and fixture-style runtime gates like skip_if_no_tma in cuda_core/tests/test_tensor_map.py.
  • So I think it is fair to describe helper-based runtime skip/xfail logic as a commonly used pattern under cuda-python, with the strongest examples living in cuda_core/tests.
  • Decorators are most natural when the condition is static up front: platform, version, missing import, permanently absent feature, etc. Here the interesting condition is narrower: a particular mempool setup call fails with the known Windows MCDM OOM-like failure. A decorator would tend to mark the whole test based on environment rather than on the actually observed failure.
  • A context manager is more plausible than a decorator, but still not a great fit here because cuda_bindings/tests is largely return-code driven. The test gets an err back from the CUDA API and then decides what to do. In that style, a helper like xfail_if_mempool_oom(err, api_name, device) is more natural than building an exception-oriented context manager around a return-code check (see the sketch after this list).
  • The local helper also keeps the xfail narrowly scoped. Unaffected systems still pass normally, affected systems only xfail when the known bug actually reproduces, and once the underlying issue is fixed the test can begin passing immediately instead of remaining broadly pre-marked.
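
For comparison, here is a rough sketch of the context-manager alternative (hypothetical; xfail_mempool_oom_cm is an invented name, not part of this PR). It only helps when the failure arrives as an exception, which is exactly what return-code-driven tests do not give us:

import contextlib

from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom

@contextlib.contextmanager
def xfail_mempool_oom_cm(api_name=None, device=0):
    # Translate a raised OOM into an xfail; re-raise anything else untouched.
    try:
        yield
    except Exception as exc:
        xfail_if_mempool_oom(exc, api_name, device)
        raise

# Exception-style code could wrap the risky call:
#     with xfail_mempool_oom_cm("cuMemPoolCreate", device_id):
#         pool = exception_raising_wrapper(...)
# But cuda_bindings tests get (err, value) tuples back and never raise, so the
# explicit return-code check would still be needed either way.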

Contributor:

I agree this follows the existing pattern. I'd be interested in exploring options to diminish the reliance on these helpers.

At this particular line of code, errors are being checked manually, so a helper makes sense. More broadly, it would be better if the tests could be written directly and some other mechanism could translate failures into skips or xfails as needed. An aspiration; a rough sketch of one possible mechanism follows.
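
As a purely illustrative sketch of that aspiration (not something this PR implements, and it leans on pytest report internals), a conftest.py hookwrapper could let tests be written directly and reclassify the known failure afterwards:

import pytest

from cuda.bindings._test_helpers.mempool import is_windows_mcdm_device

@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    report = outcome.get_result()
    if report.when != "call" or not report.failed or call.excinfo is None:
        return
    # With assertion rewriting, a failed return-code assert usually embeds the
    # CUresult name in the exception message.
    if "CUDA_ERROR_OUT_OF_MEMORY" not in str(call.excinfo.value):
        return
    try:
        if not is_windows_mcdm_device():
            return
    except Exception:
        return  # detection failed; keep the original failure visible
    # pytest represents an xfail as a skipped outcome carrying `wasxfail`.
    report.outcome = "skipped"
    report.wasxfail = "known Windows MCDM mempool OOM"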

    assert err == cuda.CUresult.CUDA_SUCCESS

    for idx, attr in enumerate(
@@ -468,6 +470,12 @@ def test_cuda_graphMem_attr(device):
    params.bytesize = allocSize

    err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params)
    if err == cuda.CUresult.CUDA_ERROR_OUT_OF_MEMORY:
        (destroy_err,) = cuda.cuGraphDestroy(graph)
        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
        (destroy_err,) = cuda.cuStreamDestroy(stream)
        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
        xfail_if_mempool_oom(err, "cuGraphAddMemAllocNode", device)
    assert err == cuda.CUresult.CUDA_SUCCESS
    err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr)
    assert err == cuda.CUresult.CUDA_SUCCESS
2 changes: 2 additions & 0 deletions cuda_bindings/tests/test_cudart.py
@@ -11,6 +11,7 @@
import cuda.bindings.runtime as cudart
from cuda import pathfinder
from cuda.bindings import runtime
from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom


def isSuccess(err):
@@ -432,6 +433,7 @@ def test_cudart_MemPool_attr():

    attr_list = [None] * 8
    err, pool = cudart.cudaMemPoolCreate(poolProps)
    xfail_if_mempool_oom(err, "cudaMemPoolCreate", poolProps.location.id)
    assertSuccess(err)

    for idx, attr in enumerate(
49 changes: 47 additions & 2 deletions cuda_core/tests/conftest.py
@@ -27,7 +27,17 @@
    PinnedMemoryResourceOptions,
    _device,
)
from cuda.core._utils.cuda_utils import handle_return
from cuda.core._utils.cuda_utils import CUDAError, handle_return

try:
    from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
except ModuleNotFoundError:
    # Older cuda.bindings artifacts (for example 12.9.x backports) do not ship
    # this helper yet. In that case, keep the primary failure visible instead of
    # xfail-ing the known Windows MCDM mempool setup issue.
    def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
        return


# Import shared test helpers for tests across subprojects.
# PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
@@ -61,21 +71,56 @@ def skip_if_managed_memory_unsupported(device):
        pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
    try:
        ManagedMemoryResource()
    except CUDAError as e:
        xfail_if_mempool_oom(e, device)
        raise
    except RuntimeError as e:
        if "requires CUDA 13.0" in str(e):
            pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
        raise


def create_managed_memory_resource_or_skip(*args, **kwargs):
def create_managed_memory_resource_or_skip(*args, xfail_device=None, **kwargs):
    # Keep the established "skip" helper name for call-site readability, even though
    # Windows MCDM mempool OOM setup failures are xfailed instead of skipped.
    try:
        return ManagedMemoryResource(*args, **kwargs)
    except CUDAError as e:
        xfail_if_mempool_oom(e, _device_id_from_resource_options(xfail_device, args, kwargs))
        raise
    except RuntimeError as e:
        if "requires CUDA 13.0" in str(e):
            pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
        raise


def create_pinned_memory_resource_or_xfail(*args, xfail_device=None, **kwargs):
    try:
        return PinnedMemoryResource(*args, **kwargs)
    except CUDAError as e:
        xfail_if_mempool_oom(e, xfail_device)
        raise


def _device_id_from_resource_options(device, args, kwargs):
    # Resolve which device to probe for MCDM: an explicit device wins; otherwise
    # fall back to the resource options' preferred_location when it names a device.
    if device is not None:
        return device
    options = kwargs.get("options")
    if options is None and args:
        options = args[0]
    if options is None:
        return 0
    if isinstance(options, dict):
        preferred_location = options.get("preferred_location")
        preferred_location_type = options.get("preferred_location_type")
    else:
        preferred_location = getattr(options, "preferred_location", None)
        preferred_location_type = getattr(options, "preferred_location_type", None)
    if preferred_location_type in (None, "device") and isinstance(preferred_location, int) and preferred_location >= 0:
        return preferred_location
    return 0


@pytest.fixture(scope="session", autouse=True)
def session_setup():
# Always init CUDA.
10 changes: 8 additions & 2 deletions cuda_core/tests/test_managed_memory_warning.py
@@ -13,8 +13,10 @@
import pytest

import cuda.bindings
from conftest import xfail_if_mempool_oom
from cuda.core import Device, ManagedMemoryResource, ManagedMemoryResourceOptions
from cuda.core._memory._managed_memory_resource import reset_concurrent_access_warning
from cuda.core._utils.cuda_utils import CUDAError

_cuda_major = int(cuda.bindings.__version__.split(".")[0])

@@ -47,8 +49,12 @@ def device_without_concurrent_managed_access(init_cuda):
@requires_cuda_13
def test_default_pool_error_without_concurrent_access(device_without_concurrent_managed_access):
    """ManagedMemoryResource() raises RuntimeError when the default pool doesn't support managed."""
    with pytest.raises(RuntimeError, match="does not support managed allocations"):
        ManagedMemoryResource()
    try:
        with pytest.raises(RuntimeError, match="does not support managed allocations"):
            ManagedMemoryResource()
    except CUDAError as exc:
        xfail_if_mempool_oom(exc, device_without_concurrent_managed_access)
        raise


@requires_cuda_13
7 changes: 4 additions & 3 deletions cuda_core/tests/test_memory.py
@@ -22,6 +22,7 @@

from conftest import (
    create_managed_memory_resource_or_skip,
    create_pinned_memory_resource_or_xfail,
    skip_if_managed_memory_unsupported,
    skip_if_pinned_memory_unsupported,
)
@@ -639,7 +640,7 @@ def test_non_managed_resources_report_not_managed(mr_kind):
        mr = DeviceMemoryResource(device)
    else:
        skip_if_pinned_memory_unsupported(device)
        mr = PinnedMemoryResource()
        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
    assert mr.is_managed is False
    buf = mr.allocate(1024)
    assert buf.is_managed is False
@@ -684,7 +685,7 @@ def test_pinned_memory_resource_initialization(init_cuda):

    device.set_current()

    mr = PinnedMemoryResource()
    mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
    assert mr.is_device_accessible
    assert mr.is_host_accessible

@@ -1581,7 +1582,7 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
        pytest.skip("Device does not support mempool operations")
    elif MR is PinnedMemoryResource:
        skip_if_pinned_memory_unsupported(device)
        mr = MR()
        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
    elif MR is ManagedMemoryResource:
        skip_if_managed_memory_unsupported(device)
        mr = create_managed_memory_resource_or_skip(MROps(preferred_location=device.device_id))