diff --git a/cuda_bindings/cuda/bindings/_test_helpers/mempool.py b/cuda_bindings/cuda/bindings/_test_helpers/mempool.py
new file mode 100644
index 00000000000..deee79f1aff
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_test_helpers/mempool.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import sys
+
+import pytest
+
+from cuda.bindings import driver, runtime
+
+
+def is_windows_mcdm_device(device=0):
+    if sys.platform != "win32":
+        return False
+    import cuda.bindings.nvml as nvml
+
+    device_id = int(getattr(device, "device_id", device))
+    (err,) = driver.cuInit(0)
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return False
+    err, pci_bus_id = driver.cuDeviceGetPCIBusId(13, device_id)
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return False
+    pci_bus_id = pci_bus_id.split(b"\x00", 1)[0].decode("ascii")
+    nvml.init_v2()
+    try:
+        handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id)
+        current, _ = nvml.device_get_driver_model_v2(handle)
+        return current == nvml.DriverModel.DRIVER_MCDM
+    finally:
+        nvml.shutdown()
+
+
+def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
+    if api_name is not None and not isinstance(api_name, str):
+        device = api_name
+        api_name = None
+
+    is_oom = err_or_exc in (
+        driver.CUresult.CUDA_ERROR_OUT_OF_MEMORY,
+        runtime.cudaError_t.cudaErrorMemoryAllocation,
+    ) or "CUDA_ERROR_OUT_OF_MEMORY" in str(err_or_exc)
+
+    if not is_oom:
+        return
+    try:
+        is_windows_mcdm = is_windows_mcdm_device(device)
+    except Exception:
+        # If MCDM detection fails, leave the primary test failure visible.
+        return
+    if not is_windows_mcdm:
+        return
+
+    api_context = f"{api_name} " if api_name else ""
+    pytest.xfail(f"{api_context}could not reserve VA for mempool operations on Windows MCDM")
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
index e3eefb1fdd7..e12d53d9665 100644
--- a/cuda_bindings/tests/test_cuda.py
+++ b/cuda_bindings/tests/test_cuda.py
@@ -12,6 +12,7 @@
 import cuda.bindings.driver as cuda
 import cuda.bindings.runtime as cudart
 from cuda.bindings import driver
+from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 
 
 def driverVersionLessThan(target):
@@ -270,6 +271,7 @@ def test_cuda_memPool_attr():
     attr_list = [None] * 8
 
     err, pool = cuda.cuMemPoolCreate(poolProps)
+    xfail_if_mempool_oom(err, "cuMemPoolCreate", poolProps.location.id)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
     for idx, attr in enumerate(
@@ -468,6 +470,12 @@ def test_cuda_graphMem_attr(device):
     params.bytesize = allocSize
 
     err, allocNode = cuda.cuGraphAddMemAllocNode(graph, None, 0, params)
+    if err == cuda.CUresult.CUDA_ERROR_OUT_OF_MEMORY:
+        (destroy_err,) = cuda.cuGraphDestroy(graph)
+        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
+        (destroy_err,) = cuda.cuStreamDestroy(stream)
+        assert destroy_err == cuda.CUresult.CUDA_SUCCESS
+        xfail_if_mempool_oom(err, "cuGraphAddMemAllocNode", device)
     assert err == cuda.CUresult.CUDA_SUCCESS
     err, freeNode = cuda.cuGraphAddMemFreeNode(graph, [allocNode], 1, params.dptr)
     assert err == cuda.CUresult.CUDA_SUCCESS
diff --git a/cuda_bindings/tests/test_cudart.py b/cuda_bindings/tests/test_cudart.py
index 3fa5594a262..144d7e75b12 100644
--- a/cuda_bindings/tests/test_cudart.py
+++ b/cuda_bindings/tests/test_cudart.py
@@ -11,6 +11,7 @@
 import cuda.bindings.runtime as cudart
 from cuda import pathfinder
 from cuda.bindings import runtime
+from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
 
 
 def isSuccess(err):
@@ -432,6 +433,7 @@ def test_cudart_MemPool_attr():
     attr_list = [None] * 8
 
     err, pool = cudart.cudaMemPoolCreate(poolProps)
+    xfail_if_mempool_oom(err, "cudaMemPoolCreate", poolProps.location.id)
     assertSuccess(err)
 
     for idx, attr in enumerate(
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 85c5e75ff78..9f48686c30c 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -27,7 +27,17 @@
     PinnedMemoryResourceOptions,
     _device,
 )
-from cuda.core._utils.cuda_utils import handle_return
+from cuda.core._utils.cuda_utils import CUDAError, handle_return
+
+try:
+    from cuda.bindings._test_helpers.mempool import xfail_if_mempool_oom
+except ModuleNotFoundError:
+    # Older cuda.bindings artifacts (for example 12.9.x backports) do not ship
+    # this helper yet. In that case, keep the primary failure visible instead of
+    # xfail-ing the known Windows MCDM mempool setup issue.
+    def xfail_if_mempool_oom(err_or_exc, api_name=None, device=0):
+        return
+
 
 # Import shared test helpers for tests across subprojects.
 # PLEASE KEEP IN SYNC with copies in other conftest.py in this repo.
@@ -61,21 +71,56 @@ def skip_if_managed_memory_unsupported(device):
         pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
     try:
         ManagedMemoryResource()
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, device)
+        raise
     except RuntimeError as e:
         if "requires CUDA 13.0" in str(e):
             pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
         raise
 
 
-def create_managed_memory_resource_or_skip(*args, **kwargs):
+def create_managed_memory_resource_or_skip(*args, xfail_device=None, **kwargs):
+    # Keep the established "skip" helper name for call-site readability, even though
+    # Windows MCDM mempool OOM setup failures are xfailed instead of skipped.
     try:
         return ManagedMemoryResource(*args, **kwargs)
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, _device_id_from_resource_options(xfail_device, args, kwargs))
+        raise
     except RuntimeError as e:
         if "requires CUDA 13.0" in str(e):
             pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
         raise
 
 
+def create_pinned_memory_resource_or_xfail(*args, xfail_device=None, **kwargs):
+    try:
+        return PinnedMemoryResource(*args, **kwargs)
+    except CUDAError as e:
+        xfail_if_mempool_oom(e, xfail_device)
+        raise
+
+
+def _device_id_from_resource_options(device, args, kwargs):
+    if device is not None:
+        return device
+    options = kwargs.get("options")
+    if options is None and args:
+        options = args[0]
+    if options is None:
+        return 0
+    if isinstance(options, dict):
+        preferred_location = options.get("preferred_location")
+        preferred_location_type = options.get("preferred_location_type")
+    else:
+        preferred_location = getattr(options, "preferred_location", None)
+        preferred_location_type = getattr(options, "preferred_location_type", None)
+    if preferred_location_type in (None, "device") and isinstance(preferred_location, int) and preferred_location >= 0:
+        return preferred_location
+    return 0
+
+
 @pytest.fixture(scope="session", autouse=True)
 def session_setup():
     # Always init CUDA.
diff --git a/cuda_core/tests/test_managed_memory_warning.py b/cuda_core/tests/test_managed_memory_warning.py
index 78015978e72..5e6032ebe9e 100644
--- a/cuda_core/tests/test_managed_memory_warning.py
+++ b/cuda_core/tests/test_managed_memory_warning.py
@@ -13,8 +13,10 @@
 import pytest
 
 import cuda.bindings
+from conftest import xfail_if_mempool_oom
 from cuda.core import Device, ManagedMemoryResource, ManagedMemoryResourceOptions
 from cuda.core._memory._managed_memory_resource import reset_concurrent_access_warning
+from cuda.core._utils.cuda_utils import CUDAError
 
 _cuda_major = int(cuda.bindings.__version__.split(".")[0])
 
@@ -47,8 +49,12 @@ def device_without_concurrent_managed_access(init_cuda):
 @requires_cuda_13
 def test_default_pool_error_without_concurrent_access(device_without_concurrent_managed_access):
     """ManagedMemoryResource() raises RuntimeError when the default pool doesn't support managed."""
-    with pytest.raises(RuntimeError, match="does not support managed allocations"):
-        ManagedMemoryResource()
+    try:
+        with pytest.raises(RuntimeError, match="does not support managed allocations"):
+            ManagedMemoryResource()
+    except CUDAError as exc:
+        xfail_if_mempool_oom(exc, device_without_concurrent_managed_access)
+        raise
 
 
 @requires_cuda_13
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 85dd4a7ea2b..fb99895616d 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -22,6 +22,7 @@
 
 from conftest import (
     create_managed_memory_resource_or_skip,
+    create_pinned_memory_resource_or_xfail,
     skip_if_managed_memory_unsupported,
     skip_if_pinned_memory_unsupported,
 )
@@ -639,7 +640,7 @@ def test_non_managed_resources_report_not_managed(mr_kind):
         mr = DeviceMemoryResource(device)
     else:
         skip_if_pinned_memory_unsupported(device)
-        mr = PinnedMemoryResource()
+        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     assert mr.is_managed is False
     buf = mr.allocate(1024)
     assert buf.is_managed is False
@@ -684,7 +685,7 @@ def test_pinned_memory_resource_initialization(init_cuda):
 
     device.set_current()
 
-    mr = PinnedMemoryResource()
+    mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     assert mr.is_device_accessible
     assert mr.is_host_accessible
 
@@ -1581,7 +1582,7 @@ def test_memory_resource_alloc_zero_bytes(init_cuda, memory_resource_factory):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource:
         skip_if_pinned_memory_unsupported(device)
-        mr = MR()
+        mr = create_pinned_memory_resource_or_xfail(xfail_device=device)
     elif MR is ManagedMemoryResource:
         skip_if_managed_memory_unsupported(device)
         mr = create_managed_memory_resource_or_skip(MROps(preferred_location=device.device_id))