From f036e2869705e21304fb065d3ec26269080aea73 Mon Sep 17 00:00:00 2001
From: Marwand Ayubi
Date: Sat, 17 Jan 2026 14:52:50 +0100
Subject: [PATCH] Fix use-after-free in memory pool shrinker causing kernel crashes

During memory pressure, kswapd invokes shrinker callbacks via shrink_slab.
A race condition exists where nv_mem_pool_destroy() can free the shrinker
while kswapd is still iterating the shrinker list, causing the kernel to
call stale function pointers and crash.

Changes:
- Move nv_mem_pool_shrinker_free() so it executes first in the destroy
  sequence
- Add synchronize_rcu() after shrinker unregistration to ensure all RCU
  readers have completed before continuing destruction
- Set the shrinker pointer to NULL after free to prevent a dangling
  reference
- Split DRM fence context destruction into prepare + final phases to
  signal fences before drm_gem_object_release()

Tested on RTX 5090 with kernel 6.18.5 - system stable after the fix.
---
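Notes:

The ordering rule the nv-vm.c changes enforce, shown as a standalone sketch.
The pool type, callbacks, and names below are illustrative placeholders
rather than the driver's nv_page_pool_t, and the sketch assumes a kernel
that provides the shrinker_alloc()/shrinker_register() API (the
NV_SHRINKER_ALLOC_PRESENT case in the patch):

    #include <linux/shrinker.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    /* Placeholder pool; stands in for the driver's page pool. */
    struct demo_pool {
        struct shrinker *shrinker;
        unsigned long nr_cached;
    };

    static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
    {
        struct demo_pool *pool = s->private_data;

        return READ_ONCE(pool->nr_cached);
    }

    static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
    {
        /* A real implementation would release cached pages here. */
        return SHRINK_STOP;
    }

    static struct demo_pool *demo_pool_create(void)
    {
        struct demo_pool *pool = kzalloc(sizeof(*pool), GFP_KERNEL);

        if (!pool)
            return NULL;

        pool->shrinker = shrinker_alloc(0, "demo-pool");
        if (!pool->shrinker) {
            kfree(pool);
            return NULL;
        }
        pool->shrinker->count_objects = demo_count;
        pool->shrinker->scan_objects = demo_scan;
        pool->shrinker->private_data = pool;
        shrinker_register(pool->shrinker);
        return pool;
    }

    static void demo_pool_destroy(struct demo_pool *pool)
    {
        /*
         * Unregister first so shrink_slab() stops finding this shrinker,
         * then wait out an RCU grace period so a walker that already picked
         * it up has finished calling into it. Only then free the state the
         * callbacks dereference.
         */
        if (pool->shrinker) {
            shrinker_free(pool->shrinker);
            pool->shrinker = NULL;
            synchronize_rcu();
        }
        kfree(pool);
    }

The nv_mem_pool_destroy() hunk below applies exactly this order: shrinker
teardown first, page-list and mutex teardown only afterwards.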
 kernel-open/nvidia-drm/nvidia-drm-fence.c | 75 +++++++++++++++++++----
 kernel-open/nvidia-drm/nvidia-drm-gem.c   | 12 +++-
 kernel-open/nvidia-drm/nvidia-drm-gem.h   |  2 +
 kernel-open/nvidia/nv-vm.c                | 22 ++++++-
 4 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/kernel-open/nvidia-drm/nvidia-drm-fence.c b/kernel-open/nvidia-drm/nvidia-drm-fence.c
index 7af1ed7f1..a189c7e72 100644
--- a/kernel-open/nvidia-drm/nvidia-drm-fence.c
+++ b/kernel-open/nvidia-drm/nvidia-drm-fence.c
@@ -41,6 +41,8 @@
 struct nv_drm_fence_context;
 
 struct nv_drm_fence_context_ops {
+    /* Called before drm_gem_object_release() to stop callbacks and signal fences */
+    void (*prepare_destroy)(struct nv_drm_fence_context *nv_fence_context);
     void (*destroy)(struct nv_drm_fence_context *nv_fence_context);
 };
 
@@ -205,7 +207,12 @@ to_nv_prime_fence_context(struct nv_drm_fence_context *nv_fence_context) {
     return container_of(nv_fence_context, struct nv_drm_prime_fence_context, base);
 }
 
-static void __nv_drm_prime_fence_context_destroy(
+/*
+ * Prepare for destruction - stop callbacks and signal fences.
+ * This must be called BEFORE drm_gem_object_release() to prevent
+ * race conditions with kernel shrinker/drm_exec infrastructure.
+ */
+static void __nv_drm_prime_fence_context_prepare_destroy(
     struct nv_drm_fence_context *nv_fence_context)
 {
     struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
@@ -224,6 +231,18 @@ static void __nv_drm_prime_fence_context_destroy(
     nv_drm_gem_prime_force_fence_signal(nv_prime_fence_context);
 
     spin_unlock(&nv_prime_fence_context->lock);
+}
+
+/*
+ * Final destruction - free NVKMS resources and the structure itself.
+ * Called after drm_gem_object_release() has completed.
+ */
+static void __nv_drm_prime_fence_context_destroy(
+    struct nv_drm_fence_context *nv_fence_context)
+{
+    struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
+    struct nv_drm_prime_fence_context *nv_prime_fence_context =
+        to_nv_prime_fence_context(nv_fence_context);
 
     /* Free nvkms resources */
 
@@ -238,6 +257,7 @@ static void __nv_drm_prime_fence_context_destroy(
 }
 
 static struct nv_drm_fence_context_ops nv_drm_prime_fence_context_ops = {
+    .prepare_destroy = __nv_drm_prime_fence_context_prepare_destroy,
     .destroy = __nv_drm_prime_fence_context_destroy,
 };
 
@@ -401,6 +421,21 @@ static inline struct nv_drm_fence_context *to_nv_fence_context(
     return NULL;
 }
 
+/*
+ * Prepare fence context for release - stop callbacks and signal fences.
+ * Called BEFORE drm_gem_object_release() to prevent race with kernel
+ * shrinker/drm_exec.
+ */
+static void
+__nv_drm_fence_context_gem_prepare_release(struct nv_drm_gem_object *nv_gem)
+{
+    struct nv_drm_fence_context *nv_fence_context = to_nv_fence_context(nv_gem);
+
+    if (nv_fence_context->ops->prepare_destroy) {
+        nv_fence_context->ops->prepare_destroy(nv_fence_context);
+    }
+}
+
 /*
  * Tear down of the 'struct nv_drm_fence_context' object is not expected
  * to be happen from any worker thread, if that happen it causes dead-lock
@@ -416,6 +451,7 @@ __nv_drm_fence_context_gem_free(struct nv_drm_gem_object *nv_gem)
 }
 
 const struct nv_drm_gem_object_funcs nv_fence_context_gem_ops = {
+    .prepare_release = __nv_drm_fence_context_gem_prepare_release,
     .free = __nv_drm_fence_context_gem_free,
 };
 
@@ -1112,7 +1148,12 @@ __nv_drm_semsurf_ctx_reg_callbacks(struct nv_drm_semsurf_fence_ctx *ctx)
     }
 }
 
-static void __nv_drm_semsurf_fence_ctx_destroy(
+/*
+ * Prepare for destruction - stop callbacks, timers, and signal fences.
+ * This must be called BEFORE drm_gem_object_release() to prevent
+ * race conditions with kernel shrinker/drm_exec infrastructure.
+ */
+static void __nv_drm_semsurf_fence_ctx_prepare_destroy(
     struct nv_drm_fence_context *nv_fence_context)
 {
     struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
@@ -1154,22 +1195,17 @@ static void __nv_drm_semsurf_fence_ctx_destroy(
                                            pendingNvKmsCallback);
     }
 
-    nvKms->freeSemaphoreSurface(nv_dev->pDevice, ctx->pSemSurface);
-
     /*
-     * Now that the semaphore surface, the timer, and the workthread are gone:
-     *
-     * -No more RM/NVKMS callbacks will arrive, nor are any in progress. Freeing
-     *  the semaphore surface cancels all its callbacks associated with this
-     *  instance of it, and idles any pending callbacks.
+     * Now that the timer and the workthread are gone:
      *
      * -No more timer callbacks will arrive, nor are any in flight.
      *
      * -The workthread has been idled and is no longer running.
      *
-     * Further, given the destructor is running, no other references to the
-     * fence context exist, so this code can assume no concurrent access to the
-     * fence context's data will happen from here on out.
+     * Clean up local callback data and force-signal all pending fences.
+     * This must happen BEFORE drm_gem_object_release() so the kernel's
+     * drm_exec/shrinker infrastructure doesn't try to access our dma_resv
+     * while we still have active fences.
      */
 
     if (ctx->callback.local) {
@@ -1179,6 +1215,20 @@ static void __nv_drm_semsurf_fence_ctx_destroy(
     }
 
     __nv_drm_semsurf_force_complete_pending(ctx);
+}
+
+/*
+ * Final destruction - free NVKMS resources and the structure itself.
+ * Called after drm_gem_object_release() has completed.
+ */
+static void __nv_drm_semsurf_fence_ctx_destroy(
+    struct nv_drm_fence_context *nv_fence_context)
+{
+    struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
+    struct nv_drm_semsurf_fence_ctx *ctx =
+        to_semsurf_fence_ctx(nv_fence_context);
+
+    nvKms->freeSemaphoreSurface(nv_dev->pDevice, ctx->pSemSurface);
 
     nv_drm_free(nv_fence_context);
 }
@@ -1218,6 +1268,7 @@ __nv_drm_semsurf_ctx_timeout_callback(nv_drm_timer *timer)
 
 static struct nv_drm_fence_context_ops nv_drm_semsurf_fence_ctx_ops = {
+    .prepare_destroy = __nv_drm_semsurf_fence_ctx_prepare_destroy,
     .destroy = __nv_drm_semsurf_fence_ctx_destroy,
 };
 
diff --git a/kernel-open/nvidia-drm/nvidia-drm-gem.c b/kernel-open/nvidia-drm/nvidia-drm-gem.c
index 65477cd00..f5d6f9d34 100644
--- a/kernel-open/nvidia-drm/nvidia-drm-gem.c
+++ b/kernel-open/nvidia-drm/nvidia-drm-gem.c
@@ -48,7 +48,17 @@ void nv_drm_gem_free(struct drm_gem_object *gem)
 {
     struct nv_drm_gem_object *nv_gem = to_nv_gem_object(gem);
 
-    /* Cleanup core gem object */
+    /*
+     * Prepare for release - stop callbacks and signal fences BEFORE
+     * releasing the core gem object. This prevents race conditions where
+     * the kernel's drm_exec/shrinker infrastructure can access the object
+     * via dma_resv while it's being destroyed.
+     */
+    if (nv_gem->ops->prepare_release) {
+        nv_gem->ops->prepare_release(nv_gem);
+    }
+
+    /* Cleanup core gem object - now safe since fences are detached */
     drm_gem_object_release(&nv_gem->base);
 
 #if !defined(NV_DRM_GEM_OBJECT_HAS_RESV)
diff --git a/kernel-open/nvidia-drm/nvidia-drm-gem.h b/kernel-open/nvidia-drm/nvidia-drm-gem.h
index efb590ec2..b51cc7710 100644
--- a/kernel-open/nvidia-drm/nvidia-drm-gem.h
+++ b/kernel-open/nvidia-drm/nvidia-drm-gem.h
@@ -45,6 +45,8 @@
 struct nv_drm_gem_object;
 
 struct nv_drm_gem_object_funcs {
+    /* Called before drm_gem_object_release() to stop callbacks and signal fences */
+    void (*prepare_release)(struct nv_drm_gem_object *nv_gem);
     void (*free)(struct nv_drm_gem_object *nv_gem);
     struct sg_table *(*prime_get_sg_table)(struct nv_drm_gem_object *nv_gem);
     void *(*prime_vmap)(struct nv_drm_gem_object *nv_gem);
diff --git a/kernel-open/nvidia/nv-vm.c b/kernel-open/nvidia/nv-vm.c
index c9e49cde1..a2c2df1c8 100644
--- a/kernel-open/nvidia/nv-vm.c
+++ b/kernel-open/nvidia/nv-vm.c
@@ -409,6 +409,8 @@ typedef struct nv_page_pool_t
 
 nv_page_pool_t *sysmem_page_pools[MAX_NUMNODES][NV_MAX_PAGE_ORDER + 1];
 
+#include <linux/rcupdate.h>
+
 #ifdef NV_SHRINKER_ALLOC_PRESENT
 static nv_page_pool_t *nv_mem_pool_get_from_shrinker(struct shrinker *shrinker)
 {
@@ -420,6 +422,13 @@ static void nv_mem_pool_shrinker_free(nv_page_pool_t *mem_pool)
     if (mem_pool->shrinker != NULL)
     {
         shrinker_free(mem_pool->shrinker);
+        mem_pool->shrinker = NULL;
+
+        /*
+         * Ensure RCU grace period completes before continuing destruction.
+         * This prevents use-after-free if kswapd is iterating shrinkers.
+         */
+        synchronize_rcu();
     }
 }
 
@@ -445,6 +454,13 @@ static void nv_mem_pool_shrinker_free(nv_page_pool_t *mem_pool)
     if (mem_pool->shrinker != NULL)
     {
         unregister_shrinker(mem_pool->shrinker);
+        mem_pool->shrinker = NULL;
+
+        /*
+         * Ensure RCU grace period completes before continuing destruction.
+         * This prevents use-after-free if kswapd is iterating shrinkers.
+         */
+        synchronize_rcu();
     }
 }
 
@@ -692,6 +708,9 @@ nv_mem_pool_destroy(nv_page_pool_t *mem_pool)
 {
     NV_STATUS status;
 
+    // Unregister shrinker FIRST to prevent callbacks during cleanup
+    nv_mem_pool_shrinker_free(mem_pool);
+
     status = os_acquire_mutex(mem_pool->lock);
     WARN_ON(status != NV_OK);
     nv_mem_pool_free_page_list(&mem_pool->dirty_list, mem_pool->order);
@@ -706,8 +725,6 @@ nv_mem_pool_destroy(nv_page_pool_t *mem_pool)
     nv_mem_pool_free_page_list(&mem_pool->clean_list, mem_pool->order);
     os_release_mutex(mem_pool->lock);
 
-    nv_mem_pool_shrinker_free(mem_pool);
-
     os_free_mutex(mem_pool->lock);
 
     NV_KFREE(mem_pool, sizeof(*mem_pool));
@@ -759,6 +776,7 @@ nv_page_pool_t* nv_mem_pool_init(int node_id, unsigned int order)
 
     nv_mem_pool_shrinker_register(mem_pool, shrinker);
     mem_pool->shrinker = shrinker;
+
     return mem_pool;
 
 failed:
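
Footnote (illustrative only - placeholder types and names, not nvidia-drm's):
the fence-context half of the patch applies the same discipline to GEM
teardown. Anything another kernel path can still reach through the object,
such as fences published via its reservation object, is signalled and dropped
in a prepare step before the core object state is released; only then is the
memory freed. A minimal sketch of that two-phase shape:

    #include <linux/dma-fence.h>
    #include <linux/slab.h>

    /* Stand-ins for nv_drm_gem_object and nv_drm_gem_object_funcs. */
    struct demo_obj;

    struct demo_obj_funcs {
        /* Optional: runs before the core object (and its dma_resv) goes away. */
        void (*prepare_release)(struct demo_obj *obj);
        void (*free)(struct demo_obj *obj);
    };

    struct demo_obj {
        const struct demo_obj_funcs *ops;
        struct dma_fence *pending;   /* fence other code may still be waiting on */
    };

    /* Phase 1: make the object inert while it is still fully constructed. */
    static void demo_prepare_release(struct demo_obj *obj)
    {
        if (obj->pending) {
            dma_fence_signal(obj->pending);
            dma_fence_put(obj->pending);
            obj->pending = NULL;
        }
    }

    /* Phase 2: free the memory once nothing external refers to it. */
    static void demo_free(struct demo_obj *obj)
    {
        kfree(obj);
    }

    static const struct demo_obj_funcs demo_funcs = {
        .prepare_release = demo_prepare_release,
        .free            = demo_free,
    };

    /* Mirrors the reordered nv_drm_gem_free(): prepare, release core, then free. */
    static void demo_obj_teardown(struct demo_obj *obj)
    {
        if (obj->ops->prepare_release)
            obj->ops->prepare_release(obj);

        /* drm_gem_object_release() would run here in the driver. */

        obj->ops->free(obj);
    }

In the driver this split is what the new prepare_destroy/prepare_release
hooks express; the final destroy/free callbacks then only release NVKMS
resources and memory.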