From 66546452187221a73b94aa1bbfa6e1b3ab2da96f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 6 May 2026 14:21:14 -0700 Subject: [PATCH 1/3] cuda.core: keep kernel-argument objects alive in graph kernel nodes `GraphDefinition.launch()` did not extend the lifetime of the Python kernel-argument objects to the lifetime of the graph. The `ParamHolder` built in `GN_launch` held the only references to those objects and was destroyed when `GN_launch` returned. The driver only stores the raw pointer values in the kernel node, so a `Buffer` reachable only through the call could be GC'd before the graph ran, leaving the graph with a stale device pointer. Attach the `kernel_args` tuple to the graph as a CUDA user object, mirroring the existing handling of `KernelHandle` and `EventHandle`. This reuses the `_py_host_destructor` path already used by the host callback machinery. Closes #2039 Co-authored-by: Cursor --- cuda_core/cuda/core/graph/_graph_node.pyx | 9 +++ cuda_core/cuda/core/graph/_utils.pxd | 2 + .../graph/test_graph_definition_lifetime.py | 79 ++++++++++++++++++- 3 files changed, 89 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index c9d1786caa6..e4e00d5c5f5 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -6,6 +6,8 @@ from __future__ import annotations +from cpython.ref cimport Py_INCREF + from libc.stddef cimport size_t from libc.stdint cimport uintptr_t from libc.string cimport memset as c_memset @@ -54,6 +56,7 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value from cuda.core.graph._utils cimport ( _attach_host_callback_to_graph, _attach_user_object, + _py_host_destructor, ) import weakref @@ -617,6 +620,12 @@ cdef inline KernelNode GN_launch(GraphNode self, LaunchConfig conf, Kernel ker, _attach_user_object(as_cu(h_graph), new KernelHandle(ker._h_kernel), _destroy_kernel_handle_copy) + cdef 
object kernel_args = ker_args.kernel_args + if kernel_args is not None: + Py_INCREF(kernel_args) + _attach_user_object(as_cu(h_graph), kernel_args, + _py_host_destructor) + return _registered(KernelNode._create_with_params( create_graph_node_handle(new_node, h_graph), conf.grid, conf.block, conf.shmem_size, diff --git a/cuda_core/cuda/core/graph/_utils.pxd b/cuda_core/cuda/core/graph/_utils.pxd index 63fdb00ac4f..13d3742cc05 100644 --- a/cuda_core/cuda/core/graph/_utils.pxd +++ b/cuda_core/cuda/core/graph/_utils.pxd @@ -7,6 +7,8 @@ from cuda.bindings cimport cydriver cdef bint _is_py_host_trampoline(cydriver.CUhostFn fn) noexcept nogil +cdef void _py_host_destructor(void* data) noexcept with gil + cdef void _attach_user_object( cydriver.CUgraph graph, void* ptr, cydriver.CUhostFn destroy) except * diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index e231016c8ac..c617ab38da7 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -3,13 +3,15 @@ """Tests for GraphDefinition resource lifetime management and RAII correctness.""" +import ctypes import gc +import weakref import pytest from helpers.graph_kernels import compile_common_kernels from helpers.misc import try_create_condition -from cuda.core import Device, EventOptions, Kernel, LaunchConfig +from cuda.core import Device, DeviceMemoryResource, EventOptions, Kernel, LaunchConfig from cuda.core.graph import ( ChildGraphNode, ConditionalNode, @@ -485,3 +487,78 @@ def test_kernel_node_reconstruction_preserves_validity(init_cuda): stream = Device().create_stream() graph.launch(stream) stream.sync() + + +# ============================================================================= +# Kernel argument lifetime — kernel nodes should keep argument objects alive +# ============================================================================= + + +def 
test_kernel_args_buffer_kept_alive_through_execution(init_cuda): + """Buffer passed as a kernel arg is kept alive by the graph, and the kernel + actually executes against its memory after the original Python ref drops. + + Without the user-object attachment, the ParamHolder is destroyed when the + kernel node is added, the Buffer is GC'd, and the graph is left with a + stale device pointer. + """ + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + add_one = compile_common_kernels().get_kernel("add_one") + buf = mr.allocate(ctypes.sizeof(ctypes.c_int), stream=dev.default_stream) + buf.fill(0, stream=dev.default_stream) + dev.default_stream.sync() + buf_weak = weakref.ref(buf) + dptr = int(buf.handle) + + g = GraphDefinition() + g.launch(LaunchConfig(grid=1, block=1), add_one, buf) + + del buf + gc.collect() + assert buf_weak() is not None # graph kept the Buffer alive + + stream = dev.create_stream() + g.instantiate().launch(stream) + stream.sync() + + out = (ctypes.c_int * 1)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, ctypes.sizeof(ctypes.c_int))) + assert out[0] == 1 + + +def test_kernel_args_survive_graph_clone(init_cuda): + """Cloned graph keeps Buffer alive via CUDA user objects. + + A graph clone does not inherit Python-level references, so only user + objects (which propagate through cuGraphClone) can keep the args alive. 
+ """ + from cuda.core._utils.cuda_utils import driver, handle_return + + _skip_if_no_mempool() + dev = Device() + mr = DeviceMemoryResource(dev) + add_one = compile_common_kernels().get_kernel("add_one") + buf = mr.allocate(ctypes.sizeof(ctypes.c_int), stream=dev.default_stream) + buf.fill(0, stream=dev.default_stream) + dev.default_stream.sync() + dptr = int(buf.handle) + + g = GraphDefinition() + g.launch(LaunchConfig(grid=1, block=1), add_one, buf) + cloned_cu_graph = handle_return(driver.cuGraphClone(driver.CUgraph(g.handle))) + + del buf, g + gc.collect() + + graph_exec = handle_return(driver.cuGraphInstantiate(cloned_cu_graph, 0)) + stream = dev.create_stream() + handle_return(driver.cuGraphLaunch(graph_exec, driver.CUstream(int(stream.handle)))) + stream.sync() + + out = (ctypes.c_int * 1)(0) + handle_return(driver.cuMemcpyDtoH(out, dptr, ctypes.sizeof(ctypes.c_int))) + assert out[0] == 1 From a9271aad6e665c28e88f40f24b4eb039f1befb1e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 6 May 2026 15:17:06 -0700 Subject: [PATCH 2/3] test: assert kernel-arg Buffer is freed when graph is released Addresses review feedback (PR #2041): the existing test only proved the graph kept the Buffer alive, not that the user-object machinery actually releases it once the graph is destroyed. Without the symmetric check, a working attachment is indistinguishable from a permanent leak. 
Co-authored-by: Cursor --- .../tests/graph/test_graph_definition_lifetime.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py index c617ab38da7..360f66c91ba 100644 --- a/cuda_core/tests/graph/test_graph_definition_lifetime.py +++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py @@ -494,9 +494,10 @@ def test_kernel_node_reconstruction_preserves_validity(init_cuda): # ============================================================================= -def test_kernel_args_buffer_kept_alive_through_execution(init_cuda): - """Buffer passed as a kernel arg is kept alive by the graph, and the kernel - actually executes against its memory after the original Python ref drops. +def test_kernel_args_buffer_lifetime(init_cuda): + """Buffer passed as a kernel arg is kept alive by the graph, the kernel + executes against its memory after the original Python ref drops, and the + Buffer is released once the graph is destroyed. Without the user-object attachment, the ParamHolder is destroyed when the kernel node is added, the Buffer is GC'd, and the graph is left with a @@ -529,6 +530,10 @@ def test_kernel_args_buffer_kept_alive_through_execution(init_cuda): handle_return(driver.cuMemcpyDtoH(out, dptr, ctypes.sizeof(ctypes.c_int))) assert out[0] == 1 + del g + gc.collect() + assert buf_weak() is None # graph released, Buffer freed + def test_kernel_args_survive_graph_clone(init_cuda): """Cloned graph keeps Buffer alive via CUDA user objects. 
From 9f2c8f2aba6825b8b3648e03c820631d55c879d7 Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Wed, 6 May 2026 16:03:53 -0700
Subject: [PATCH 3/3] test: poll for kernel-arg Buffer release to handle async cleanup

The freeing assertion at the end of test_kernel_args_buffer_lifetime
failed on free-threaded Python (py3.14t) because cuGraphExecDestroy
releases its user-object references via an asynchronous DPC, and
free-threaded CPython's deferred ref counting can need an extra GC pass
to settle. Poll the weakref with a bounded timeout and per-iteration GC
instead of asserting eagerly.

The poll evaluates the predicate at least once and re-checks it at the
deadline, so a release that lands during the final sleep interval is
not misreported as a failure.

Co-authored-by: Cursor
---
 .../graph/test_graph_definition_lifetime.py   | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/cuda_core/tests/graph/test_graph_definition_lifetime.py b/cuda_core/tests/graph/test_graph_definition_lifetime.py
index 360f66c91ba..c53009a5724 100644
--- a/cuda_core/tests/graph/test_graph_definition_lifetime.py
+++ b/cuda_core/tests/graph/test_graph_definition_lifetime.py
@@ -5,12 +5,38 @@
 
 import ctypes
 import gc
+import time
 import weakref
 
 import pytest
 from helpers.graph_kernels import compile_common_kernels
 from helpers.misc import try_create_condition
 
+
+def _wait_until(predicate, timeout=2.0, interval=0.01):
+    """Poll predicate() until True or timeout, driving gc each iteration.
+
+    Used for assertions about resource cleanup that may be delayed by CUDA's
+    asynchronous user-object destructor pump (DPC) or, on free-threaded
+    Python, by deferred reference-count processing. A bounded poll keeps the
+    test correct without depending on undocumented driver timing guarantees.
+
+    The predicate is always evaluated at least once and is re-checked after
+    the final sleep, so a condition that becomes true right at the deadline
+    (or a non-positive timeout) is not misreported as a failure.
+    """
+    deadline = time.monotonic() + timeout
+    while True:
+        gc.collect()
+        if predicate():
+            return
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            raise AssertionError(f"condition not satisfied within {timeout}s")
+        # Clamp the sleep so the last check happens at the deadline, not after it.
+        time.sleep(min(interval, remaining))
+
+
 from cuda.core import Device, DeviceMemoryResource, EventOptions, Kernel, LaunchConfig
 from cuda.core.graph import (
     ChildGraphNode,
@@ -502,6 +528,11 @@ def test_kernel_args_buffer_lifetime(init_cuda):
     Without the user-object attachment, the ParamHolder is destroyed when the
     kernel node is added, the Buffer is GC'd, and the graph is left with a
     stale device pointer.
+
+    The final freeing assertion uses a bounded poll because CUgraphExec
+    releases its user-object references via an asynchronous DPC, and on
+    free-threaded Python the resulting Py_DECREF chain may need an extra
+    GC pass to settle.
     """
     from cuda.core._utils.cuda_utils import driver, handle_return
 
@@ -531,8 +562,7 @@ def test_kernel_args_buffer_lifetime(init_cuda):
     assert out[0] == 1
 
     del g
-    gc.collect()
-    assert buf_weak() is None  # graph released, Buffer freed
+    _wait_until(lambda: buf_weak() is None)
 
 
 def test_kernel_args_survive_graph_clone(init_cuda):