aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu
diff options
context:
space:
mode:
authorLucas Stach <l.stach@pengutronix.de>2018-08-06 09:12:48 -0400
committerAlex Deucher <alexander.deucher@amd.com>2018-08-06 16:58:00 -0400
commit4823e5da2ea9061011242db81334d6ebbd2ed0a5 (patch)
treebd7d771afeaf06390077524e0f983fea50abc8f3 /drivers/gpu
parent1e1dbd6fd10031bf46d9e44b6ad423e2ee39e2a7 (diff)
drm/scheduler: fix timeout worker setup for out of order job completions
drm_sched_job_finish() is a work item scheduled for each finished job on an unbound system workqueue. This means the workers can execute out of order with regard to the real hardware job completions. If this happens, queueing a timeout worker for the first job on the ring mirror list is wrong, as this may be a job which has already finished executing. Fix this by reorganizing the code to always queue the worker for the next job on the list, if that job hasn't finished yet. This is robust against a potential reordering of the finish workers. Also move out the timeout worker cancelling, so that we don't need to take the job list lock twice. As a small optimization, list_del is used to remove the job from the ring mirror list, as there is no need to reinit the list head in the job we are about to free. Signed-off-by: Lucas Stach <l.stach@pengutronix.de> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/drm/scheduler/gpu_scheduler.c30
1 file changed, 17 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 1b733229201e..a70c7f7fd6fe 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -552,24 +552,28 @@ static void drm_sched_job_finish(struct work_struct *work)
 						   finish_work);
 	struct drm_gpu_scheduler *sched = s_job->sched;
 
-	/* remove job from ring_mirror_list */
-	spin_lock(&sched->job_list_lock);
-	list_del_init(&s_job->node);
-	if (sched->timeout != MAX_SCHEDULE_TIMEOUT) {
-		struct drm_sched_job *next;
-
-		spin_unlock(&sched->job_list_lock);
-		cancel_delayed_work_sync(&s_job->work_tdr);
-		spin_lock(&sched->job_list_lock);
+	/*
+	 * Canceling the timeout without removing our job from the ring mirror
+	 * list is safe, as we will only end up in this worker if our jobs
+	 * finished fence has been signaled. So even if some another worker
+	 * manages to find this job as the next job in the list, the fence
+	 * signaled check below will prevent the timeout to be restarted.
+	 */
+	cancel_delayed_work_sync(&s_job->work_tdr);
 
-		/* queue TDR for next job */
-		next = list_first_entry_or_null(&sched->ring_mirror_list,
-						struct drm_sched_job, node);
+	spin_lock(&sched->job_list_lock);
+	/* queue TDR for next job */
+	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
+	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
+		struct drm_sched_job *next = list_next_entry(s_job, node);
 
-		if (next)
-			schedule_delayed_work(&next->work_tdr, sched->timeout);
+		if (!dma_fence_is_signaled(&next->s_fence->finished))
+			schedule_delayed_work(&next->work_tdr, sched->timeout);
 	}
+	/* remove job from ring_mirror_list */
+	list_del(&s_job->node);
 	spin_unlock(&sched->job_list_lock);
+
 	dma_fence_put(&s_job->s_fence->finished);
 	sched->ops->free_job(s_job);
 }