aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMonk Liu <Monk.Liu@amd.com>2017-10-25 04:21:08 -0400
committerAlex Deucher <alexander.deucher@amd.com>2017-12-04 16:41:30 -0500
commit48f05f2955e4a3183b219d6dfdb1c28e17d03da7 (patch)
treea444ff5d0e61958d30b7105f999d8695e994581b
parent3a393cf96ab38c72565fda106a825302828b7e05 (diff)
amd/scheduler: implement job skip feature (v3)
Jobs are skipped in two cases: 1) when the entity behind a job is marked guilty, the job popped from that entity's queue is dropped in the sched_main loop; 2) in job_recovery(), the scheduling of a job is skipped if its karma is detected above the limit, and likewise for other jobs sharing the same fence context. This approach is needed because job_recovery() cannot access job->entity, since the entity may already be dead. v2: some logic fixes. v3: when an entity is detected guilty, don't drop the job at the popping stage; instead set its fence error to -ECANCELED. In run_job(), skip the scheduling if either: 1) fence->error < 0, or 2) a VRAM loss occurred on this job. This way we can unify the job-skipping logic. With this feature we can introduce a new GPU recovery feature. Signed-off-by: Monk Liu <Monk.Liu@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_job.c13
-rw-r--r--drivers/gpu/drm/amd/scheduler/gpu_scheduler.c39
2 files changed, 31 insertions, 21 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index f60662e03761..0a90c768dbc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
180 180
181static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) 181static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
182{ 182{
183 struct dma_fence *fence = NULL; 183 struct dma_fence *fence = NULL, *finished;
184 struct amdgpu_device *adev; 184 struct amdgpu_device *adev;
185 struct amdgpu_job *job; 185 struct amdgpu_job *job;
186 int r; 186 int r;
@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
190 return NULL; 190 return NULL;
191 } 191 }
192 job = to_amdgpu_job(sched_job); 192 job = to_amdgpu_job(sched_job);
193 finished = &job->base.s_fence->finished;
193 adev = job->adev; 194 adev = job->adev;
194 195
195 BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); 196 BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
196 197
197 trace_amdgpu_sched_run_job(job); 198 trace_amdgpu_sched_run_job(job);
198 /* skip ib schedule when vram is lost */ 199
199 if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { 200 if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
200 dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); 201 dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
201 DRM_ERROR("Skip scheduling IBs!\n"); 202
203 if (finished->error < 0) {
204 DRM_INFO("Skip scheduling IBs!\n");
202 } else { 205 } else {
203 r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, 206 r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
204 &fence); 207 &fence);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index f116de798204..941b5920b97b 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -345,6 +345,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
345 if (amd_sched_entity_add_dependency_cb(entity)) 345 if (amd_sched_entity_add_dependency_cb(entity))
346 return NULL; 346 return NULL;
347 347
348 /* skip jobs from entity that marked guilty */
349 if (entity->guilty && atomic_read(entity->guilty))
350 dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
351
348 spsc_queue_pop(&entity->job_queue); 352 spsc_queue_pop(&entity->job_queue);
349 return sched_job; 353 return sched_job;
350} 354}
@@ -441,14 +445,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
441 job->sched->ops->timedout_job(job); 445 job->sched->ops->timedout_job(job);
442} 446}
443 447
444static void amd_sched_set_guilty(struct amd_sched_job *s_job,
445 struct amd_sched_entity *s_entity)
446{
447 if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
448 if (s_entity->guilty)
449 atomic_set(s_entity->guilty, 1);
450}
451
452void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) 448void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
453{ 449{
454 struct amd_sched_job *s_job; 450 struct amd_sched_job *s_job;
@@ -468,21 +464,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
468 spin_unlock(&sched->job_list_lock); 464 spin_unlock(&sched->job_list_lock);
469 465
470 if (bad) { 466 if (bad) {
471 bool found = false; 467 /* don't increase @bad's karma if it's from KERNEL RQ,
472 468 * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs)
473 for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) { 469 * corrupt but keep in mind that kernel jobs always considered good.
470 */
471 for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
474 struct amd_sched_rq *rq = &sched->sched_rq[i]; 472 struct amd_sched_rq *rq = &sched->sched_rq[i];
475 473
476 spin_lock(&rq->lock); 474 spin_lock(&rq->lock);
477 list_for_each_entry_safe(entity, tmp, &rq->entities, list) { 475 list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
478 if (bad->s_fence->scheduled.context == entity->fence_context) { 476 if (bad->s_fence->scheduled.context == entity->fence_context) {
479 found = true; 477 if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
480 amd_sched_set_guilty(bad, entity); 478 if (entity->guilty)
479 atomic_set(entity->guilty, 1);
481 break; 480 break;
482 } 481 }
483 } 482 }
484 spin_unlock(&rq->lock); 483 spin_unlock(&rq->lock);
485 if (found) 484 if (&entity->list != &rq->entities)
486 break; 485 break;
487 } 486 }
488 } 487 }
@@ -500,6 +499,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
500void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) 499void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
501{ 500{
502 struct amd_sched_job *s_job, *tmp; 501 struct amd_sched_job *s_job, *tmp;
502 bool found_guilty = false;
503 int r; 503 int r;
504 504
505 spin_lock(&sched->job_list_lock); 505 spin_lock(&sched->job_list_lock);
@@ -511,6 +511,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
511 list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { 511 list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
512 struct amd_sched_fence *s_fence = s_job->s_fence; 512 struct amd_sched_fence *s_fence = s_job->s_fence;
513 struct dma_fence *fence; 513 struct dma_fence *fence;
514 uint64_t guilty_context;
515
516 if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
517 found_guilty = true;
518 guilty_context = s_job->s_fence->scheduled.context;
519 }
520
521 if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
522 dma_fence_set_error(&s_fence->finished, -ECANCELED);
514 523
515 spin_unlock(&sched->job_list_lock); 524 spin_unlock(&sched->job_list_lock);
516 fence = sched->ops->run_job(s_job); 525 fence = sched->ops->run_job(s_job);
@@ -526,7 +535,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
526 r); 535 r);
527 dma_fence_put(fence); 536 dma_fence_put(fence);
528 } else { 537 } else {
529 DRM_ERROR("Failed to run job!\n");
530 amd_sched_process_job(NULL, &s_fence->cb); 538 amd_sched_process_job(NULL, &s_fence->cb);
531 } 539 }
532 spin_lock(&sched->job_list_lock); 540 spin_lock(&sched->job_list_lock);
@@ -664,7 +672,6 @@ static int amd_sched_main(void *param)
664 r); 672 r);
665 dma_fence_put(fence); 673 dma_fence_put(fence);
666 } else { 674 } else {
667 DRM_ERROR("Failed to run job!\n");
668 amd_sched_process_job(NULL, &s_fence->cb); 675 amd_sched_process_job(NULL, &s_fence->cb);
669 } 676 }
670 677