author     Monk Liu <Monk.Liu@amd.com>                 2017-05-11 01:36:44 -0400
committer  Alex Deucher <alexander.deucher@amd.com>    2017-05-24 17:40:40 -0400
commit     65781c78ad74e4260fbec92c0ecc05738044e177 (patch)
tree       2a9f38885d8e0bb02f48d33235db39b445e21cdc /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent     75fbed20e5e36130a381cc162b137aef349d1f81 (diff)
drm/amdgpu/SRIOV:implement guilty job TDR for(V2)
1. TDR will kick out the guilty job once its hang count exceeds the threshold given by the kernel parameter "job_hang_limit", so a bad command stream cannot hang the GPU indefinitely. By default this threshold is 1, so a job is kicked out after it hangs once.
2. When a job times out, the TDR routine no longer resets every sched/ring; instead it resets only the one indicated by @job of amdgpu_sriov_gpu_reset. That way we don't need to reset and recover every sched/ring when we already know which job caused the GPU hang.
3. Unblock sriov_gpu_reset for the AI family.

V2:
1. Kick out the guilty job only after the scheduler is parked.
2. Since parking the scheduler prior to the kickout already takes a while, do a last check on the job in question before doing the hw reset.

TODO:
1. When a job is considered guilty, set a flag in its fence status so the UMD side knows the fence signaled because the job hung, not because it completed.
2. If the GPU reset causes all video memory to be lost, we need to introduce a new TDR policy, e.g. drop all jobs not yet signaled and return ERROR DEVICE_LOST for all IOCTLs on this device. This will be implemented later.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
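For illustration, the threshold idea in point 1 can be sketched in isolation as below. This is a minimal userspace sketch, not the kernel implementation: struct sketch_job, sketch_invalidate_job() and sketch_job_kickout() are hypothetical stand-ins for the scheduler's amd_sched_invalidate_job()/amd_sched_job_kickout(), and the exact comparison used by the driver may differ.

/*
 * Minimal userspace sketch of the "guilty job" threshold check described
 * above.  Each TDR bumps a per-job hang counter; once it reaches the
 * configured job_hang_limit the job is kicked out instead of resubmitted.
 * All names below are illustrative, not the driver's.
 */
#include <stdbool.h>
#include <stdio.h>

struct sketch_job {
        int karma;        /* how many TDRs have blamed this job so far */
        bool kicked_out;  /* dropped from the scheduler's pending list */
};

/* Bump the per-job hang count and report whether it reached the limit. */
static bool sketch_invalidate_job(struct sketch_job *job, int hang_limit)
{
        return ++job->karma >= hang_limit;
}

static void sketch_job_kickout(struct sketch_job *job)
{
        job->kicked_out = true;  /* the real code removes it from the run list */
}

int main(void)
{
        struct sketch_job job = { 0, false };
        const int job_hang_limit = 1;  /* default per the commit message */

        /* Simulate three consecutive TDRs blaming the same job. */
        for (int tdr = 1; tdr <= 3 && !job.kicked_out; ++tdr) {
                if (sketch_invalidate_job(&job, job_hang_limit))
                        sketch_job_kickout(&job);
                printf("TDR %d: karma=%d kicked_out=%d\n",
                       tdr, job.karma, job.kicked_out);
        }
        return 0;
}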
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    43
1 file changed, 34 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41c18700e275..8b0f4864a885 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2617,7 +2617,7 @@ err:
  */
 int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
 {
-	int i, r = 0;
+	int i, j, r = 0;
 	int resched;
 	struct amdgpu_bo *bo, *tmp;
 	struct amdgpu_ring *ring;
@@ -2630,19 +2630,36 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
 
-	/* block scheduler */
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		ring = adev->rings[i];
+	/* we start from the ring trigger GPU hang */
+	j = job ? job->ring->idx : 0;
 
+	/* block scheduler */
+	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
+		ring = adev->rings[i % AMDGPU_MAX_RINGS];
 		if (!ring || !ring->sched.thread)
 			continue;
 
 		kthread_park(ring->sched.thread);
+
+		if (job && j != i)
+			continue;
+
+		/* here give the last chance to check if fence signaled
+		 * since we already pay some time on kthread_park */
+		if (job && dma_fence_is_signaled(&job->base.s_fence->finished)) {
+			kthread_unpark(ring->sched.thread);
+			goto give_up_reset;
+		}
+
+		if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
+			amd_sched_job_kickout(&job->base);
+
+		/* only do job_reset on the hang ring if @job not NULL */
 		amd_sched_hw_job_reset(&ring->sched);
-	}
 
-	/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
-	amdgpu_fence_driver_force_completion(adev);
+		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+		amdgpu_fence_driver_force_completion_ring(ring);
+	}
 
 	/* request to take full control of GPU before re-initialization */
 	if (job)
@@ -2695,20 +2712,28 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
 	}
 	dma_fence_put(fence);
 
-	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-		struct amdgpu_ring *ring = adev->rings[i];
+	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
+		ring = adev->rings[i % AMDGPU_MAX_RINGS];
 		if (!ring || !ring->sched.thread)
 			continue;
 
+		if (job && j != i) {
+			kthread_unpark(ring->sched.thread);
+			continue;
+		}
+
 		amd_sched_job_recovery(&ring->sched);
 		kthread_unpark(ring->sched.thread);
 	}
 
 	drm_helper_resume_force_mode(adev->ddev);
+give_up_reset:
 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
 		dev_info(adev->dev, "GPU reset failed\n");
+	} else {
+		dev_info(adev->dev, "GPU reset successed!\n");
 	}
 
 	adev->gfx.in_reset = false;
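The loop shape introduced by the patch (start at the guilty job's ring, wrap around with a modulo so every ring is still visited once, and skip the heavyweight reset work on every other ring) can be sketched in isolation as below. SKETCH_MAX_RINGS, guilty_ring and have_guilty_job are illustrative stand-ins for AMDGPU_MAX_RINGS, job->ring->idx and the "@job != NULL" check; this is not the driver code.

/*
 * Standalone sketch (assumed names, not driver code) of the ring iteration
 * used by amdgpu_sriov_gpu_reset() after this patch: visit every ring
 * exactly once, starting from the ring that triggered the hang, and only
 * run the job-reset path on that ring when a guilty job is known.
 */
#include <stdbool.h>
#include <stdio.h>

#define SKETCH_MAX_RINGS 8  /* stand-in for AMDGPU_MAX_RINGS */

int main(void)
{
        bool have_guilty_job = true;  /* corresponds to "job != NULL"  */
        int guilty_ring = 5;          /* corresponds to job->ring->idx */
        int j = have_guilty_job ? guilty_ring : 0;

        for (int i = j; i < j + SKETCH_MAX_RINGS; ++i) {
                int idx = i % SKETCH_MAX_RINGS;  /* wrap past the last ring */

                printf("park scheduler of ring %d\n", idx);

                /* all other rings are only parked/unparked, not reset */
                if (have_guilty_job && i != j)
                        continue;

                printf("  -> hw job reset + force fence completion on ring %d\n",
                       idx);
        }
        return 0;
}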