diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 1ea2df564c4..c0d7f09ee44 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -377,7 +377,7 @@ cdef class DeviceProperties: @property def gpu_overlap(self) -> bool: - """bool: Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead async_engine_count.""" + """bool: Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use :attr:`~DeviceProperties.async_engine_count` instead.""" return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)) @property @@ -662,7 +662,7 @@ cdef class DeviceProperties: @property def read_only_host_register_supported(self) -> bool: - """bool: True if device supports using the cuMemHostRegister flag CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU, False if not.""" + """bool: True if device supports using the cuMemHostRegister flag CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU, False if not.""" return bool( self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED) ) @@ -841,12 +841,12 @@ cdef class DeviceProperties: @property def mem_decompress_algorithm_mask(self) -> int: - """int: The returned valued shall be interpreted as a bitmask, where the individual bits are described by the CUmemDecompressAlgorithm enum.""" + """int: The returned value shall be interpreted as a bitmask, where the individual bits are described by the CUmemDecompressAlgorithm enum.""" return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK) @property def mem_decompress_maximum_length(self) -> int: - """int: The returned valued is the maximum length in bytes of a single decompress operation that is allowed.""" + """int: The returned value is the maximum length in 
bytes of a single decompress operation that is allowed.""" return self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH) @property @@ -897,7 +897,7 @@ cdef class DeviceProperties: @property def host_memory_pools_supported(self) -> bool: - """bool: Device suports HOST location with the cuMemAllocAsync and cuMemPool family of APIs.""" + """bool: Device supports HOST location with the cuMemAllocAsync and cuMemPool family of APIs.""" return bool( self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED) ) @@ -1033,7 +1033,7 @@ class Device: Parameters ---------- peer : Device | int - The peer device to check accessibility to. Can be a Device object or device ID. + The peer device to check accessibility to. Can be a :obj:`~_device.Device` object or device ID. """ peer = Device(peer) cdef int d1 = self.device_id @@ -1253,7 +1253,7 @@ class Device: Note ---- - The newly context will not be set as current. + The newly created context will not be set as current. Parameters ---------- @@ -1269,7 +1269,7 @@ class Device: raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189") def create_stream(self, obj: IsStreamT | None = None, options: StreamOptions | None = None) -> Stream: - """Create a Stream object. + """Create a :obj:`~_stream.Stream` object. New stream objects can be created in two different ways: @@ -1300,7 +1300,7 @@ class Device: return Stream._init(obj=obj, options=options, device_id=self._device_id, ctx=self._context) def create_event(self, options: EventOptions | None = None) -> Event: - """Create an Event object without recording it to a Stream. + """Create an :obj:`~_event.Event` object without recording it to a :obj:`~_stream.Stream`. 
Note ---- diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 4a0491d8650..d236aa5790f 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -211,7 +211,20 @@ cdef class Event: @classmethod def from_ipc_descriptor(cls, ipc_descriptor: IPCEventDescriptor) -> Event: - """Import an event that was exported from another process.""" + """Import an event that was exported from another process. + + Parameters + ---------- + ipc_descriptor : :obj:`~_memory._ipc.IPCEventDescriptor` + The IPC descriptor obtained from :attr:`~Event.ipc_descriptor` in + another process. + + Returns + ------- + :obj:`~_event.Event` + A new event backed by the imported IPC handle. + + """ cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) diff --git a/cuda_core/cuda/core/_linker.pyx b/cuda_core/cuda/core/_linker.pyx index 09aa9863cd7..cf784706e5e 100644 --- a/cuda_core/cuda/core/_linker.pyx +++ b/cuda_core/cuda/core/_linker.pyx @@ -188,7 +188,7 @@ class LinkerOptions: Attributes ---------- name : str, optional - Name of the linker. If the linking succeeds, the name is passed down to the generated `ObjectCode`. + Name of the linker. If the linking succeeds, the name is passed down to the generated :class:`ObjectCode`. arch : str, optional Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or ``compute_`` (for generating PTX). If not provided, the current device's architecture diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index bb6fd97df6f..65df7091e67 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -204,8 +204,9 @@ cdef class Buffer: Parameters ---------- - dst : :obj:`~_memory.Buffer` - Source buffer to copy data from + dst : :obj:`~_memory.Buffer`, optional + Destination buffer to copy data to. 
If not provided, a new buffer + is allocated using this buffer's memory resource. stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` Keyword argument specifying the stream for the asynchronous copy diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index 9f8e4bcd534..57494c1e915 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -223,7 +223,7 @@ cdef class DeviceMemoryResource(_MemPool): Returns a tuple of sorted device IDs that currently have peer access to allocations from this memory pool. - When setting, accepts a sequence of Device objects or device IDs. + When setting, accepts a sequence of :obj:`~_device.Device` objects or device IDs. Setting to an empty sequence revokes all peer access. For non-owned pools (the default or current device pool), the state diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 2eaff7fb11b..7da61af5d48 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -257,7 +257,7 @@ cdef class KernelOccupancy: Returns ------- :obj:`~MaxPotentialBlockSizeOccupancyResult` - An object with `min_grid_size` amd `max_block_size` attributes encoding + An object with `min_grid_size` and `max_block_size` attributes encoding the suggested launch configuration. Note diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx index 194ef6da53f..cfc66451c86 100644 --- a/cuda_core/cuda/core/_program.pyx +++ b/cuda_core/cuda/core/_program.pyx @@ -173,7 +173,7 @@ class ProgramOptions: Attributes ---------- name : str, optional - Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`. + Name of the program. If the compilation succeeds, the name is passed down to the generated :class:`ObjectCode`. 
arch : str, optional Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or ``compute_`` (for generating PTX). If not provided, the current device's architecture @@ -272,13 +272,13 @@ class ProgramOptions: Disable the display of a diagnostic number for warning messages. Default: False diag_error : Union[int, list[int]], optional - Emit error for a specified diagnostic message number or comma separated list of numbers. + Emit error for a specified diagnostic message number or comma-separated list of numbers. Default: None diag_suppress : Union[int, list[int]], optional - Suppress a specified diagnostic message number or comma separated list of numbers. + Suppress a specified diagnostic message number or comma-separated list of numbers. Default: None diag_warn : Union[int, list[int]], optional - Emit warning for a specified diagnostic message number or comma separated lis of numbers. + Emit warning for a specified diagnostic message number or comma-separated list of numbers. Default: None brief_diagnostics : bool, optional Disable or enable showing source line and column info in a diagnostic. diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index ca13811cd3c..fdb617f0325 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -227,7 +227,7 @@ cdef class Stream: def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. - Creates an Event object (or reuses the given one) by + Creates an :obj:`~_event.Event` object (or reuses the given one) by recording on the stream. Parameters @@ -269,6 +269,13 @@ cdef class Stream: work is completed. This is done by recording a new :obj:`~_event.Event` on the stream and then waiting on it. + Parameters + ---------- + event_or_stream : :obj:`~_event.Event` | :obj:`~_stream.Stream` + The event or stream to wait for. 
Objects supporting the + ``__cuda_stream__`` protocol are also accepted and treated as + streams. + """ cdef Stream stream cdef EventHandle h_event @@ -332,7 +339,7 @@ cdef class Stream: Note ---- Stream lifetime is not managed, foreign object must remain - alive while this steam is active. + alive while this stream is active. Parameters ---------- diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx index f306c036b8c..d1a7e97e1b6 100644 --- a/cuda_core/cuda/core/system/_system.pyx +++ b/cuda_core/cuda/core/system/_system.pyx @@ -88,6 +88,11 @@ def get_driver_version_full(kernel_mode: bool = False) -> tuple[int, int, int]: def get_nvml_version() -> tuple[int, ...]: """ The version of the NVML library. + + Returns + ------- + version: tuple[int, ...] + Tuple of integers representing the NVML version components. """ if not CUDA_BINDINGS_NVML_IS_COMPATIBLE: raise RuntimeError("NVML library is not available") @@ -97,6 +102,11 @@ def get_nvml_version() -> tuple[int, ...]: def get_driver_branch() -> str: """ Retrieves the driver branch of the NVIDIA driver installed on the system. + + Returns + ------- + branch: str + The driver branch string (e.g., ``"560"`` or ``"open"``). """ if not CUDA_BINDINGS_NVML_IS_COMPATIBLE: raise RuntimeError("NVML library is not available") diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index 1761f2cc37c..7ded390b65c 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -1,4 +1,4 @@ -.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 .. currentmodule:: cuda.core @@ -68,7 +68,7 @@ Don't forget to use :meth:`Device.set_current`! 
s = dev.create_stream() Next, we compile the CUDA C++ kernel from earlier using the :class:`Program` class. -The result of the compilation is saved as a CUBIN. +The result of the compilation is saved as a CUBIN. Note the use of the ``name_expressions`` parameter to the :meth:`Program.compile` method to specify which kernel template instantiations to compile: .. code-block:: python