75 changes: 63 additions & 12 deletions kernel-open/nvidia-drm/nvidia-drm-fence.c
@@ -41,6 +41,8 @@
struct nv_drm_fence_context;

struct nv_drm_fence_context_ops {
/* Called before drm_gem_object_release() to stop callbacks and signal fences */
void (*prepare_destroy)(struct nv_drm_fence_context *nv_fence_context);
void (*destroy)(struct nv_drm_fence_context *nv_fence_context);
};
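
The ops pair splits teardown into a quiesce phase and a free phase. A minimal
sketch of the intended call sequence, as driven from the GEM free path (the
helper name here is illustrative, not part of this patch):

static void nv_fence_context_teardown(struct nv_drm_fence_context *ctx)
{
    /* Phase 1: stop callbacks and force-signal fences (quiesce). */
    if (ctx->ops->prepare_destroy)
        ctx->ops->prepare_destroy(ctx);

    /* drm_gem_object_release() runs between the two phases, once no
     * unsignaled fence can be reached through the object. */

    /* Phase 2: free NVKMS resources and the context itself. */
    ctx->ops->destroy(ctx);
}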

@@ -205,7 +207,12 @@ to_nv_prime_fence_context(struct nv_drm_fence_context *nv_fence_context) {
return container_of(nv_fence_context, struct nv_drm_prime_fence_context, base);
}

static void __nv_drm_prime_fence_context_destroy(
/*
* Prepare for destruction - stop callbacks and signal fences.
* This must be called BEFORE drm_gem_object_release() to prevent
* race conditions with kernel shrinker/drm_exec infrastructure.
*/
static void __nv_drm_prime_fence_context_prepare_destroy(
struct nv_drm_fence_context *nv_fence_context)
{
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
@@ -224,6 +231,18 @@ static void __nv_drm_prime_fence_context_destroy(
nv_drm_gem_prime_force_fence_signal(nv_prime_fence_context);

spin_unlock(&nv_prime_fence_context->lock);
}

/*
* Final destruction - free NVKMS resources and the structure itself.
* Called after drm_gem_object_release() has completed.
*/
static void __nv_drm_prime_fence_context_destroy(
struct nv_drm_fence_context *nv_fence_context)
{
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
struct nv_drm_prime_fence_context *nv_prime_fence_context =
to_nv_prime_fence_context(nv_fence_context);

/* Free nvkms resources */

@@ -238,6 +257,7 @@ static void __nv_drm_prime_fence_context_destroy(
}

static struct nv_drm_fence_context_ops nv_drm_prime_fence_context_ops = {
.prepare_destroy = __nv_drm_prime_fence_context_prepare_destroy,
.destroy = __nv_drm_prime_fence_context_destroy,
};
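
For the prime context, the prepare phase amounts to stopping the NVKMS
callback and force-signaling everything still pending under the context
spinlock. A minimal sketch of the force-signal step, assuming pending fences
sit on a list in a bookkeeping wrapper (all names here are hypothetical):

#include <linux/dma-fence.h>
#include <linux/list.h>

struct pending_fence {
    struct dma_fence base;
    struct list_head node;
};

static void force_signal_pending(struct list_head *pending)
{
    struct pending_fence *pf, *tmp;

    list_for_each_entry_safe(pf, tmp, pending, node) {
        list_del_init(&pf->node);
        dma_fence_signal(&pf->base); /* wakes waiters, runs callbacks */
        dma_fence_put(&pf->base);    /* drop the list's reference */
    }
}

After this runs, every fence the object can expose is signaled, which is
what makes the later drm_gem_object_release() safe.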

@@ -401,6 +421,21 @@ static inline struct nv_drm_fence_context *to_nv_fence_context(
return NULL;
}

/*
* Prepare fence context for release - stop callbacks and signal fences.
* Called BEFORE drm_gem_object_release() to prevent race with kernel
* shrinker/drm_exec.
*/
static void
__nv_drm_fence_context_gem_prepare_release(struct nv_drm_gem_object *nv_gem)
{
struct nv_drm_fence_context *nv_fence_context = to_nv_fence_context(nv_gem);

if (nv_fence_context->ops->prepare_destroy) {
nv_fence_context->ops->prepare_destroy(nv_fence_context);
}
}

/*
* Teardown of the 'struct nv_drm_fence_context' object is not expected
* to happen from any worker thread; if it does, it causes a deadlock
@@ -416,6 +451,7 @@ __nv_drm_fence_context_gem_free(struct nv_drm_gem_object *nv_gem)
}

const struct nv_drm_gem_object_funcs nv_fence_context_gem_ops = {
.prepare_release = __nv_drm_fence_context_gem_prepare_release,
.free = __nv_drm_fence_context_gem_free,
};

@@ -1112,7 +1148,12 @@ __nv_drm_semsurf_ctx_reg_callbacks(struct nv_drm_semsurf_fence_ctx *ctx)
}
}

static void __nv_drm_semsurf_fence_ctx_destroy(
/*
* Prepare for destruction - stop callbacks, timers, and signal fences.
* This must be called BEFORE drm_gem_object_release() to prevent
* race conditions with kernel shrinker/drm_exec infrastructure.
*/
static void __nv_drm_semsurf_fence_ctx_prepare_destroy(
struct nv_drm_fence_context *nv_fence_context)
{
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
@@ -1154,22 +1195,17 @@ static void __nv_drm_semsurf_fence_ctx_destroy(
pendingNvKmsCallback);
}

nvKms->freeSemaphoreSurface(nv_dev->pDevice, ctx->pSemSurface);

/*
* Now that the semaphore surface, the timer, and the workthread are gone:
*
* -No more RM/NVKMS callbacks will arrive, nor are any in progress. Freeing
* the semaphore surface cancels all its callbacks associated with this
* instance of it, and idles any pending callbacks.
* Now that the timer and the workthread are gone:
*
* -No more timer callbacks will arrive, nor are any in flight.
*
* -The workthread has been idled and is no longer running.
*
* Further, given the destructor is running, no other references to the
* fence context exist, so this code can assume no concurrent access to the
* fence context's data will happen from here on out.
* Clean up local callback data and force-signal all pending fences.
* This must happen BEFORE drm_gem_object_release() so the kernel's
* drm_exec/shrinker infrastructure doesn't try to access our dma_resv
* while we still have active fences.
*/

if (ctx->callback.local) {
@@ -1179,6 +1215,20 @@ static void __nv_drm_semsurf_fence_ctx_destroy(
}

__nv_drm_semsurf_force_complete_pending(ctx);
}

/*
* Final destruction - free NVKMS resources and the structure itself.
* Called after drm_gem_object_release() has completed.
*/
static void __nv_drm_semsurf_fence_ctx_destroy(
struct nv_drm_fence_context *nv_fence_context)
{
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
struct nv_drm_semsurf_fence_ctx *ctx =
to_semsurf_fence_ctx(nv_fence_context);

nvKms->freeSemaphoreSurface(nv_dev->pDevice, ctx->pSemSurface);

nv_drm_free(nv_fence_context);
}
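
The semaphore-surface context has more asynchronous sources to quiesce than
the prime context: NVKMS callbacks, a timer, and a workthread. The prepare
step follows the usual kernel quiesce ordering; a generic sketch with stock
primitives (nvidia-drm actually uses its own nv_drm_timer/workthread
wrappers, so this is the shape, not the literal code):

#include <linux/timer.h>
#include <linux/workqueue.h>

struct async_ctx_sketch {
    struct timer_list timer;
    struct work_struct work;
};

static void quiesce(struct async_ctx_sketch *ctx)
{
    del_timer_sync(&ctx->timer);  /* returns with no handler running */
    cancel_work_sync(&ctx->work); /* waits out queued or running work */
}

Only after both calls return can pending fences be force-signaled without
racing a late callback.
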
@@ -1218,6 +1268,7 @@ __nv_drm_semsurf_ctx_timeout_callback(nv_drm_timer *timer)

static struct nv_drm_fence_context_ops
nv_drm_semsurf_fence_ctx_ops = {
.prepare_destroy = __nv_drm_semsurf_fence_ctx_prepare_destroy,
.destroy = __nv_drm_semsurf_fence_ctx_destroy,
};

12 changes: 11 additions & 1 deletion kernel-open/nvidia-drm/nvidia-drm-gem.c
@@ -48,7 +48,17 @@ void nv_drm_gem_free(struct drm_gem_object *gem)
{
struct nv_drm_gem_object *nv_gem = to_nv_gem_object(gem);

/* Cleanup core gem object */
/*
* Prepare for release - stop callbacks and signal fences BEFORE
* releasing the core gem object. This prevents race conditions where
* the kernel's drm_exec/shrinker infrastructure can access the object
* via dma_resv while it's being destroyed.
*/
if (nv_gem->ops->prepare_release) {
nv_gem->ops->prepare_release(nv_gem);
}

/* Cleanup core gem object - now safe since fences are detached */
drm_gem_object_release(&nv_gem->base);

#if !defined(NV_DRM_GEM_OBJECT_HAS_RESV)
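
For a sense of the reader side this ordering protects: shrinker/drm_exec-style
code may inspect a GEM object's reservation object concurrently, for example
to ask whether the object is idle. A sketch, assuming a recent kernel with
enum dma_resv_usage:

#include <linux/dma-resv.h>
#include <drm/drm_gem.h>

static bool gem_object_idle(struct drm_gem_object *obj)
{
    /* True only once every fence in the resv has signaled. */
    return dma_resv_test_signaled(obj->resv, DMA_RESV_USAGE_BOOKKEEP);
}

If unsignaled fences could still be reached while drm_gem_object_release()
tears the reservation down, a path like this would be a use-after-free;
signaling everything in prepare_release first closes that window.
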
2 changes: 2 additions & 0 deletions kernel-open/nvidia-drm/nvidia-drm-gem.h
@@ -45,6 +45,8 @@
struct nv_drm_gem_object;

struct nv_drm_gem_object_funcs {
/* Called before drm_gem_object_release() to stop callbacks and signal fences */
void (*prepare_release)(struct nv_drm_gem_object *nv_gem);
void (*free)(struct nv_drm_gem_object *nv_gem);
struct sg_table *(*prime_get_sg_table)(struct nv_drm_gem_object *nv_gem);
void *(*prime_vmap)(struct nv_drm_gem_object *nv_gem);
22 changes: 20 additions & 2 deletions kernel-open/nvidia/nv-vm.c
@@ -409,6 +409,8 @@ typedef struct nv_page_pool_t

nv_page_pool_t *sysmem_page_pools[MAX_NUMNODES][NV_MAX_PAGE_ORDER + 1];

#include <linux/rcupdate.h>

#ifdef NV_SHRINKER_ALLOC_PRESENT
static nv_page_pool_t *nv_mem_pool_get_from_shrinker(struct shrinker *shrinker)
{
@@ -420,6 +422,13 @@ static void nv_mem_pool_shrinker_free(nv_page_pool_t *mem_pool)
if (mem_pool->shrinker != NULL)
{
shrinker_free(mem_pool->shrinker);
mem_pool->shrinker = NULL;

/*
* Ensure RCU grace period completes before continuing destruction.
* This prevents use-after-free if kswapd is iterating shrinkers.
*/
synchronize_rcu();
}
}
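
The grace period matters because, since the shrinker_alloc() rework (roughly
kernel 6.7), reclaim walks shrinkers locklessly under RCU. An illustrative
reader of the shape synchronize_rcu() waits out (a sketch, not the actual
shrink_slab() code):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/shrinker.h>

static void walk_shrinkers_sketch(struct list_head *shrinkers)
{
    struct shrinker *s;

    rcu_read_lock();
    list_for_each_entry_rcu(s, shrinkers, list) {
        /* s->count_objects()/s->scan_objects() may run here, after
         * shrinker_free() has unlinked s but before a grace period
         * has elapsed. */
    }
    rcu_read_unlock();
}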

@@ -445,6 +454,13 @@ static void nv_mem_pool_shrinker_free(nv_page_pool_t *mem_pool)
if (mem_pool->shrinker != NULL)
{
unregister_shrinker(mem_pool->shrinker);
mem_pool->shrinker = NULL;

/*
* Ensure RCU grace period completes before continuing destruction.
* This prevents use-after-free if kswapd is iterating shrinkers.
*/
synchronize_rcu();
}
}

@@ -692,6 +708,9 @@ nv_mem_pool_destroy(nv_page_pool_t *mem_pool)
{
NV_STATUS status;

// Unregister shrinker FIRST to prevent callbacks during cleanup
nv_mem_pool_shrinker_free(mem_pool);

status = os_acquire_mutex(mem_pool->lock);
WARN_ON(status != NV_OK);
nv_mem_pool_free_page_list(&mem_pool->dirty_list, mem_pool->order);
@@ -706,8 +725,6 @@ nv_mem_pool_destroy(nv_page_pool_t *mem_pool)
nv_mem_pool_free_page_list(&mem_pool->clean_list, mem_pool->order);
os_release_mutex(mem_pool->lock);

nv_mem_pool_shrinker_free(mem_pool);

os_free_mutex(mem_pool->lock);

NV_KFREE(mem_pool, sizeof(*mem_pool));
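
Moving nv_mem_pool_shrinker_free() to the top of nv_mem_pool_destroy()
matters because a scan callback operates on the very lists the destructor
frees. A sketch of such a callback (the helper is the one above; field names
are hypothetical) makes the ordering requirement clear:

static unsigned long
pool_scan_sketch(struct shrinker *s, struct shrink_control *sc)
{
    nv_page_pool_t *pool = nv_mem_pool_get_from_shrinker(s);
    unsigned long freed = 0;

    /* Reclaims pages from pool->clean_list under pool->lock; must
     * never run concurrently with nv_mem_pool_destroy() freeing
     * those same lists. */

    return freed ? freed : SHRINK_STOP;
}

Unregistering first, plus the RCU grace period above, guarantees no such
callback is in flight by the time the lists are torn down.
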
@@ -759,6 +776,7 @@ nv_page_pool_t* nv_mem_pool_init(int node_id, unsigned int order)
nv_mem_pool_shrinker_register(mem_pool, shrinker);

mem_pool->shrinker = shrinker;

return mem_pool;

failed:
Expand Down