diff options
author | Monk Liu <Monk.Liu@amd.com> | 2017-10-25 04:21:08 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2017-12-04 16:41:30 -0500 |
commit | 48f05f2955e4a3183b219d6dfdb1c28e17d03da7 (patch) | |
tree | a444ff5d0e61958d30b7105f999d8695e994581b | |
parent | 3a393cf96ab38c72565fda106a825302828b7e05 (diff) |
amd/scheduler: implement job skip feature (v3)
jobs are skipped under two cases
1) when the entity behind this job is marked guilty, the job
popped from this entity's queue will be dropped in the sched_main loop.
2) in job_recovery(), skip scheduling the job if its karma is detected
above the limit, and also skip all other jobs sharing the
same fence context. This approach is used because job_recovery() cannot
access job->entity, since the entity may already be dead.
v2:
some logic fix
v3:
when an entity is detected as guilty, don't drop the job in the popping
stage; instead set its fence error to -ECANCELED
in run_job(), skip the scheduling if either: 1) fence->error < 0,
or 2) a VRAM LOST occurred on this job.
This way we can unify the job-skipping logic.
With this feature we can introduce a new GPU recover feature.
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 13 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 |
2 files changed, 31 insertions, 21 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index f60662e03761..0a90c768dbc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | |||
@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job, | |||
180 | 180 | ||
181 | static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) | 181 | static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) |
182 | { | 182 | { |
183 | struct dma_fence *fence = NULL; | 183 | struct dma_fence *fence = NULL, *finished; |
184 | struct amdgpu_device *adev; | 184 | struct amdgpu_device *adev; |
185 | struct amdgpu_job *job; | 185 | struct amdgpu_job *job; |
186 | int r; | 186 | int r; |
@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) | |||
190 | return NULL; | 190 | return NULL; |
191 | } | 191 | } |
192 | job = to_amdgpu_job(sched_job); | 192 | job = to_amdgpu_job(sched_job); |
193 | finished = &job->base.s_fence->finished; | ||
193 | adev = job->adev; | 194 | adev = job->adev; |
194 | 195 | ||
195 | BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); | 196 | BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); |
196 | 197 | ||
197 | trace_amdgpu_sched_run_job(job); | 198 | trace_amdgpu_sched_run_job(job); |
198 | /* skip ib schedule when vram is lost */ | 199 | |
199 | if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { | 200 | if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) |
200 | dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); | 201 | dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ |
201 | DRM_ERROR("Skip scheduling IBs!\n"); | 202 | |
203 | if (finished->error < 0) { | ||
204 | DRM_INFO("Skip scheduling IBs!\n"); | ||
202 | } else { | 205 | } else { |
203 | r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, | 206 | r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, |
204 | &fence); | 207 | &fence); |
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index f116de798204..941b5920b97b 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | |||
@@ -345,6 +345,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity) | |||
345 | if (amd_sched_entity_add_dependency_cb(entity)) | 345 | if (amd_sched_entity_add_dependency_cb(entity)) |
346 | return NULL; | 346 | return NULL; |
347 | 347 | ||
348 | /* skip jobs from entity that marked guilty */ | ||
349 | if (entity->guilty && atomic_read(entity->guilty)) | ||
350 | dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED); | ||
351 | |||
348 | spsc_queue_pop(&entity->job_queue); | 352 | spsc_queue_pop(&entity->job_queue); |
349 | return sched_job; | 353 | return sched_job; |
350 | } | 354 | } |
@@ -441,14 +445,6 @@ static void amd_sched_job_timedout(struct work_struct *work) | |||
441 | job->sched->ops->timedout_job(job); | 445 | job->sched->ops->timedout_job(job); |
442 | } | 446 | } |
443 | 447 | ||
444 | static void amd_sched_set_guilty(struct amd_sched_job *s_job, | ||
445 | struct amd_sched_entity *s_entity) | ||
446 | { | ||
447 | if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit) | ||
448 | if (s_entity->guilty) | ||
449 | atomic_set(s_entity->guilty, 1); | ||
450 | } | ||
451 | |||
452 | void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) | 448 | void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad) |
453 | { | 449 | { |
454 | struct amd_sched_job *s_job; | 450 | struct amd_sched_job *s_job; |
@@ -468,21 +464,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo | |||
468 | spin_unlock(&sched->job_list_lock); | 464 | spin_unlock(&sched->job_list_lock); |
469 | 465 | ||
470 | if (bad) { | 466 | if (bad) { |
471 | bool found = false; | 467 | /* don't increase @bad's karma if it's from KERNEL RQ, |
472 | 468 | * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs) | |
473 | for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) { | 469 | * corrupt but keep in mind that kernel jobs always considered good. |
470 | */ | ||
471 | for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) { | ||
474 | struct amd_sched_rq *rq = &sched->sched_rq[i]; | 472 | struct amd_sched_rq *rq = &sched->sched_rq[i]; |
475 | 473 | ||
476 | spin_lock(&rq->lock); | 474 | spin_lock(&rq->lock); |
477 | list_for_each_entry_safe(entity, tmp, &rq->entities, list) { | 475 | list_for_each_entry_safe(entity, tmp, &rq->entities, list) { |
478 | if (bad->s_fence->scheduled.context == entity->fence_context) { | 476 | if (bad->s_fence->scheduled.context == entity->fence_context) { |
479 | found = true; | 477 | if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit) |
480 | amd_sched_set_guilty(bad, entity); | 478 | if (entity->guilty) |
479 | atomic_set(entity->guilty, 1); | ||
481 | break; | 480 | break; |
482 | } | 481 | } |
483 | } | 482 | } |
484 | spin_unlock(&rq->lock); | 483 | spin_unlock(&rq->lock); |
485 | if (found) | 484 | if (&entity->list != &rq->entities) |
486 | break; | 485 | break; |
487 | } | 486 | } |
488 | } | 487 | } |
@@ -500,6 +499,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job) | |||
500 | void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) | 499 | void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) |
501 | { | 500 | { |
502 | struct amd_sched_job *s_job, *tmp; | 501 | struct amd_sched_job *s_job, *tmp; |
502 | bool found_guilty = false; | ||
503 | int r; | 503 | int r; |
504 | 504 | ||
505 | spin_lock(&sched->job_list_lock); | 505 | spin_lock(&sched->job_list_lock); |
@@ -511,6 +511,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) | |||
511 | list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { | 511 | list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) { |
512 | struct amd_sched_fence *s_fence = s_job->s_fence; | 512 | struct amd_sched_fence *s_fence = s_job->s_fence; |
513 | struct dma_fence *fence; | 513 | struct dma_fence *fence; |
514 | uint64_t guilty_context; | ||
515 | |||
516 | if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { | ||
517 | found_guilty = true; | ||
518 | guilty_context = s_job->s_fence->scheduled.context; | ||
519 | } | ||
520 | |||
521 | if (found_guilty && s_job->s_fence->scheduled.context == guilty_context) | ||
522 | dma_fence_set_error(&s_fence->finished, -ECANCELED); | ||
514 | 523 | ||
515 | spin_unlock(&sched->job_list_lock); | 524 | spin_unlock(&sched->job_list_lock); |
516 | fence = sched->ops->run_job(s_job); | 525 | fence = sched->ops->run_job(s_job); |
@@ -526,7 +535,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched) | |||
526 | r); | 535 | r); |
527 | dma_fence_put(fence); | 536 | dma_fence_put(fence); |
528 | } else { | 537 | } else { |
529 | DRM_ERROR("Failed to run job!\n"); | ||
530 | amd_sched_process_job(NULL, &s_fence->cb); | 538 | amd_sched_process_job(NULL, &s_fence->cb); |
531 | } | 539 | } |
532 | spin_lock(&sched->job_list_lock); | 540 | spin_lock(&sched->job_list_lock); |
@@ -664,7 +672,6 @@ static int amd_sched_main(void *param) | |||
664 | r); | 672 | r); |
665 | dma_fence_put(fence); | 673 | dma_fence_put(fence); |
666 | } else { | 674 | } else { |
667 | DRM_ERROR("Failed to run job!\n"); | ||
668 | amd_sched_process_job(NULL, &s_fence->cb); | 675 | amd_sched_process_job(NULL, &s_fence->cb); |
669 | } | 676 | } |
670 | 677 | ||