path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
author     Emily Deng <Emily.Deng@amd.com>          2018-04-15 22:07:02 -0400
committer  Alex Deucher <alexander.deucher@amd.com> 2018-05-15 14:43:17 -0400
commit     8ee3a52e3f35e064a3bf82f21dc74ddaf9843648 (patch)
tree       b18d640c5c28dadc6ba3d4673d6ad6b0b18edb73 /drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
parent     6f752ec2c20c6a575da29d5b297980f376830e6b (diff)
drm/gpu-sched: fix force APP kill hang(v4)
issue:
VMC page faults occur when an application is force-killed during a
3dmark test. The cause is that in entity_fini() we manually signal all
the jobs in the entity's queue, which confuses the sync/dependency
mechanism:

1) A page fault occurs in SDMA's clear job, which operates on a shadow
   buffer: the shadow buffer's GART table is cleaned up by
   ttm_bo_release(), because the fence in its reservation object was
   fake-signaled by entity_fini() after SIGKILL was received.

2) A page fault occurs in a GFX job because, during the lifetime of the
   GFX job, we manually fake-signal all jobs from its entity in
   entity_fini(); the unmapping/clear-PTE job that depends on those
   result fences is thereby satisfied, so SDMA starts clearing the PTEs
   and causes a GFX page fault.

fix:
1) In the SIGKILL case, at least wait in entity_fini() for all
   already-scheduled jobs to complete.

2) If a signaled fence would clear some entity's dependency, mark that
   entity guilty to prevent its jobs from really running, since their
   dependency was fake-signaled.

v2: split drm_sched_entity_fini() into two functions:
1) The first does the waiting, removes the entity from the runqueue and
   returns an error when the process was killed.
2) The second goes over the entity, installs it as the completion
   signal for the remaining jobs and signals all jobs with an error
   code.

v3:
1) Replace the fini1 and fini2 names with better ones.
2) Call the first part before the VM teardown in
   amdgpu_driver_postclose_kms() and the second part after the VM
   teardown.
3) Keep the original drm_sched_entity_fini() to refine the code.

v4:
1) Rename entity->finished to entity->last_scheduled.
2) Rename drm_sched_entity_fini_job_cb() to
   drm_sched_entity_kill_jobs_cb().
3) Pass NULL to drm_sched_entity_kill_jobs_cb() if -ENOENT.
4) Change the type of entity->fini_status to "int".
5) Remove the check on entity->finished.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
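The ordering requirement from v3 means the driver's postclose path ends
up looking roughly like the sketch below. This is a simplified
illustration of the description above, not the exact upstream code: the
function name amdgpu_postclose_teardown_sketch is hypothetical,
amdgpu_vm_fini() stands in for the VM teardown step, and error handling
is omitted.

/*
 * Sketch of the two-phase context teardown described in v3:
 * phase one waits for already-scheduled jobs and removes the entities
 * from their runqueues, the VM is torn down in between, and phase two
 * signals the remaining queued jobs with an error and frees the
 * contexts.
 */
static void amdgpu_postclose_teardown_sketch(struct amdgpu_device *adev,
					     struct amdgpu_fpriv *fpriv)
{
	/* Phase 1: wait for in-flight jobs; entities leave the runqueue. */
	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);

	/* The VM teardown is now safe: scheduled jobs have completed. */
	amdgpu_vm_fini(adev, &fpriv->vm);

	/* Phase 2: signal remaining jobs with an error, free the contexts. */
	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
}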
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c  64
1 file changed, 56 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 09d35051fdd6..eb80edfb1b0a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -111,8 +111,9 @@ failed:
 	return r;
 }
 
-static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
+static void amdgpu_ctx_fini(struct kref *ref)
 {
+	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
 	struct amdgpu_device *adev = ctx->adev;
 	unsigned i, j;
 
@@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
 	kfree(ctx->fences);
 	ctx->fences = NULL;
 
-	for (i = 0; i < adev->num_rings; i++)
-		drm_sched_entity_fini(&adev->rings[i]->sched,
-				      &ctx->rings[i].entity);
-
 	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
 
 	mutex_destroy(&ctx->lock);
+
+	kfree(ctx);
 }
 
 static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
@@ -170,12 +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
 static void amdgpu_ctx_do_release(struct kref *ref)
 {
 	struct amdgpu_ctx *ctx;
+	u32 i;
 
 	ctx = container_of(ref, struct amdgpu_ctx, refcount);
 
-	amdgpu_ctx_fini(ctx);
+	for (i = 0; i < ctx->adev->num_rings; i++)
+		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
+				      &ctx->rings[i].entity);
 
-	kfree(ctx);
+	amdgpu_ctx_fini(ref);
 }
 
 static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
@@ -435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
 	idr_init(&mgr->ctx_handles);
 }
 
+void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
+{
+	struct amdgpu_ctx *ctx;
+	struct idr *idp;
+	uint32_t id, i;
+
+	idp = &mgr->ctx_handles;
+
+	idr_for_each_entry(idp, ctx, id) {
+
+		if (!ctx->adev)
+			return;
+
+		for (i = 0; i < ctx->adev->num_rings; i++)
+			if (kref_read(&ctx->refcount) == 1)
+				drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
+							    &ctx->rings[i].entity);
+			else
+				DRM_ERROR("ctx %p is still alive\n", ctx);
+	}
+}
+
+void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
+{
+	struct amdgpu_ctx *ctx;
+	struct idr *idp;
+	uint32_t id, i;
+
+	idp = &mgr->ctx_handles;
+
+	idr_for_each_entry(idp, ctx, id) {
+
+		if (!ctx->adev)
+			return;
+
+		for (i = 0; i < ctx->adev->num_rings; i++)
+			if (kref_read(&ctx->refcount) == 1)
+				drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
+							 &ctx->rings[i].entity);
+			else
+				DRM_ERROR("ctx %p is still alive\n", ctx);
+	}
+}
+
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
 {
 	struct amdgpu_ctx *ctx;
 	struct idr *idp;
 	uint32_t id;
 
+	amdgpu_ctx_mgr_entity_cleanup(mgr);
+
 	idp = &mgr->ctx_handles;
 
 	idr_for_each_entry(idp, ctx, id) {
-		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
+		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
 			DRM_ERROR("ctx %p is still alive\n", ctx);
 	}
 
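The final hunk passes amdgpu_ctx_fini() directly to kref_put(), relying
on the standard kernel kref release pattern: kref_put() invokes the
release callback with a pointer to the embedded struct kref, and the
callback recovers the outer object with container_of(), exactly as the
reworked amdgpu_ctx_fini() now does. A minimal sketch of that pattern,
using a hypothetical my_ctx type rather than amdgpu code:

#include <linux/kref.h>
#include <linux/slab.h>

struct my_ctx {
	struct kref refcount;
	/* ... payload ... */
};

/* Release callback: invoked by kref_put() when the count drops to zero. */
static void my_ctx_release(struct kref *ref)
{
	/* Recover the outer object from the embedded kref. */
	struct my_ctx *ctx = container_of(ref, struct my_ctx, refcount);

	kfree(ctx);
}

/* Drop one reference; my_ctx_release() runs on the final put. */
static void my_ctx_put(struct my_ctx *ctx)
{
	kref_put(&ctx->refcount, my_ctx_release);
}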