diff options
author | Monk Liu <Monk.Liu@amd.com> | 2017-10-25 04:21:08 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2017-12-04 16:41:30 -0500 |
commit | 48f05f2955e4a3183b219d6dfdb1c28e17d03da7 (patch) | |
tree | a444ff5d0e61958d30b7105f999d8695e994581b /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | |
parent | 3a393cf96ab38c72565fda106a825302828b7e05 (diff) |
amd/scheduler: implement job skip feature (v3)
jobs are skipped under two cases
1) when the entity behind this job is marked guilty, the job
popped from this entity's queue will be dropped in the sched_main loop.
2) in job_recovery(), skip the scheduling job if its detected karma is
above the limit, and also skip other jobs sharing the
same fence context. this approach is used because job_recovery() cannot
access job->entity, since the entity may already be dead.
v2:
some logic fix
v3:
when an entity is detected as guilty, don't drop the job in the popping
stage; instead, set its fence error to -ECANCELED
in run_job(), skip the scheduling if either: 1) fence->error < 0,
or 2) a VRAM LOST occurred on this job.
this way we can unify the job skipping logic.
with this feature we can introduce new gpu recover feature.
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 13 |
1 files changed, 8 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index f60662e03761..0a90c768dbc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | |||
@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job, | |||
180 | 180 | ||
181 | static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) | 181 | static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) |
182 | { | 182 | { |
183 | struct dma_fence *fence = NULL; | 183 | struct dma_fence *fence = NULL, *finished; |
184 | struct amdgpu_device *adev; | 184 | struct amdgpu_device *adev; |
185 | struct amdgpu_job *job; | 185 | struct amdgpu_job *job; |
186 | int r; | 186 | int r; |
@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) | |||
190 | return NULL; | 190 | return NULL; |
191 | } | 191 | } |
192 | job = to_amdgpu_job(sched_job); | 192 | job = to_amdgpu_job(sched_job); |
193 | finished = &job->base.s_fence->finished; | ||
193 | adev = job->adev; | 194 | adev = job->adev; |
194 | 195 | ||
195 | BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); | 196 | BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); |
196 | 197 | ||
197 | trace_amdgpu_sched_run_job(job); | 198 | trace_amdgpu_sched_run_job(job); |
198 | /* skip ib schedule when vram is lost */ | 199 | |
199 | if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) { | 200 | if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) |
200 | dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED); | 201 | dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ |
201 | DRM_ERROR("Skip scheduling IBs!\n"); | 202 | |
203 | if (finished->error < 0) { | ||
204 | DRM_INFO("Skip scheduling IBs!\n"); | ||
202 | } else { | 205 | } else { |
203 | r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, | 206 | r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, |
204 | &fence); | 207 | &fence); |