diff options
author | Christian König <christian.koenig@amd.com> | 2018-08-21 04:45:29 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2018-08-27 12:11:16 -0400 |
commit | 12938fad234a3924cc9b82080db4f62fe1cf52bb (patch) | |
tree | 3b784794d8d219b95dd22cd5544a05177ea9ec10 /drivers/gpu/drm/amd/amdgpu | |
parent | 1849e73748be3c80bf752e4c4877fe90a8da4822 (diff) |
drm/amdgpu: cleanup GPU recovery check a bit (v2)
Have callers check amdgpu_device_should_recover_gpu() before calling
amdgpu_device_gpu_recover(), instead of passing it a force flag.
v2: rebase on KFD changes (Alex)
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 3 |
8 files changed, 38 insertions, 22 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 19ef7711d944..340e40d03d54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h | |||
@@ -1158,8 +1158,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); | |||
1158 | #define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev)) | 1158 | #define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev)) |
1159 | 1159 | ||
1160 | /* Common functions */ | 1160 | /* Common functions */ |
1161 | bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev); | ||
1161 | int amdgpu_device_gpu_recover(struct amdgpu_device *adev, | 1162 | int amdgpu_device_gpu_recover(struct amdgpu_device *adev, |
1162 | struct amdgpu_job* job, bool force); | 1163 | struct amdgpu_job* job); |
1163 | void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); | 1164 | void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); |
1164 | bool amdgpu_device_need_post(struct amdgpu_device *adev); | 1165 | bool amdgpu_device_need_post(struct amdgpu_device *adev); |
1165 | 1166 | ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index f8bbbb3a9504..3dbe675b6fe1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | |||
@@ -267,7 +267,8 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd) | |||
267 | { | 267 | { |
268 | struct amdgpu_device *adev = (struct amdgpu_device *)kgd; | 268 | struct amdgpu_device *adev = (struct amdgpu_device *)kgd; |
269 | 269 | ||
270 | amdgpu_device_gpu_recover(adev, NULL, false); | 270 | if (amdgpu_device_should_recover_gpu(adev)) |
271 | amdgpu_device_gpu_recover(adev, NULL); | ||
271 | } | 272 | } |
272 | 273 | ||
273 | int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, | 274 | int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c961e781430d..8f431740c424 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |||
@@ -3244,31 +3244,43 @@ error: | |||
3244 | } | 3244 | } |
3245 | 3245 | ||
3246 | /** | 3246 | /** |
3247 | * amdgpu_device_should_recover_gpu - check if we should try GPU recovery | ||
3248 | * | ||
3249 | * @adev: amdgpu device pointer | ||
3250 | * | ||
3251 | * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover | ||
3252 | * a hung GPU. | ||
3253 | */ | ||
3254 | bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) | ||
3255 | { | ||
3256 | if (!amdgpu_device_ip_check_soft_reset(adev)) { | ||
3257 | DRM_INFO("Timeout, but no hardware hang detected.\n"); | ||
3258 | return false; | ||
3259 | } | ||
3260 | |||
3261 | if (amdgpu_gpu_recovery == 0 || (amdgpu_gpu_recovery == -1 && | ||
3262 | !amdgpu_sriov_vf(adev))) { | ||
3263 | DRM_INFO("GPU recovery disabled.\n"); | ||
3264 | return false; | ||
3265 | } | ||
3266 | |||
3267 | return true; | ||
3268 | } | ||
3269 | |||
3270 | /** | ||
3247 | * amdgpu_device_gpu_recover - reset the asic and recover scheduler | 3271 | * amdgpu_device_gpu_recover - reset the asic and recover scheduler |
3248 | * | 3272 | * |
3249 | * @adev: amdgpu device pointer | 3273 | * @adev: amdgpu device pointer |
3250 | * @job: which job trigger hang | 3274 | * @job: which job trigger hang |
3251 | * @force: forces reset regardless of amdgpu_gpu_recovery | ||
3252 | * | 3275 | * |
3253 | * Attempt to reset the GPU if it has hung (all asics). | 3276 | * Attempt to reset the GPU if it has hung (all asics). |
3254 | * Returns 0 for success or an error on failure. | 3277 | * Returns 0 for success or an error on failure. |
3255 | */ | 3278 | */ |
3256 | int amdgpu_device_gpu_recover(struct amdgpu_device *adev, | 3279 | int amdgpu_device_gpu_recover(struct amdgpu_device *adev, |
3257 | struct amdgpu_job *job, bool force) | 3280 | struct amdgpu_job *job) |
3258 | { | 3281 | { |
3259 | int i, r, resched; | 3282 | int i, r, resched; |
3260 | 3283 | ||
3261 | if (!force && !amdgpu_device_ip_check_soft_reset(adev)) { | ||
3262 | DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); | ||
3263 | return 0; | ||
3264 | } | ||
3265 | |||
3266 | if (!force && (amdgpu_gpu_recovery == 0 || | ||
3267 | (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) { | ||
3268 | DRM_INFO("GPU recovery disabled.\n"); | ||
3269 | return 0; | ||
3270 | } | ||
3271 | |||
3272 | dev_info(adev->dev, "GPU reset begin!\n"); | 3284 | dev_info(adev->dev, "GPU reset begin!\n"); |
3273 | 3285 | ||
3274 | mutex_lock(&adev->lock_reset); | 3286 | mutex_lock(&adev->lock_reset); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 7056925eb386..da36731460b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | |||
@@ -701,7 +701,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data) | |||
701 | struct amdgpu_device *adev = dev->dev_private; | 701 | struct amdgpu_device *adev = dev->dev_private; |
702 | 702 | ||
703 | seq_printf(m, "gpu recover\n"); | 703 | seq_printf(m, "gpu recover\n"); |
704 | amdgpu_device_gpu_recover(adev, NULL, true); | 704 | amdgpu_device_gpu_recover(adev, NULL); |
705 | 705 | ||
706 | return 0; | 706 | return 0; |
707 | } | 707 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index 1abf5b5bac9e..b927e8798534 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | |||
@@ -105,8 +105,8 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work) | |||
105 | struct amdgpu_device *adev = container_of(work, struct amdgpu_device, | 105 | struct amdgpu_device *adev = container_of(work, struct amdgpu_device, |
106 | reset_work); | 106 | reset_work); |
107 | 107 | ||
108 | if (!amdgpu_sriov_vf(adev)) | 108 | if (!amdgpu_sriov_vf(adev) && amdgpu_device_should_recover_gpu(adev)) |
109 | amdgpu_device_gpu_recover(adev, NULL, false); | 109 | amdgpu_device_gpu_recover(adev, NULL); |
110 | } | 110 | } |
111 | 111 | ||
112 | /** | 112 | /** |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 391e2f7c03aa..265ff90f4e01 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | |||
@@ -37,7 +37,8 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job) | |||
37 | job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), | 37 | job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), |
38 | ring->fence_drv.sync_seq); | 38 | ring->fence_drv.sync_seq); |
39 | 39 | ||
40 | amdgpu_device_gpu_recover(ring->adev, job, false); | 40 | if (amdgpu_device_should_recover_gpu(ring->adev)) |
41 | amdgpu_device_gpu_recover(ring->adev, job); | ||
41 | } | 42 | } |
42 | 43 | ||
43 | int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, | 44 | int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, |
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 078f70faedcb..8cbb4655896a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | |||
@@ -266,8 +266,8 @@ flr_done: | |||
266 | } | 266 | } |
267 | 267 | ||
268 | /* Trigger recovery for world switch failure if no TDR */ | 268 | /* Trigger recovery for world switch failure if no TDR */ |
269 | if (amdgpu_lockup_timeout == 0) | 269 | if (amdgpu_device_should_recover_gpu(adev)) |
270 | amdgpu_device_gpu_recover(adev, NULL, true); | 270 | amdgpu_device_gpu_recover(adev, NULL); |
271 | } | 271 | } |
272 | 272 | ||
273 | static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, | 273 | static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, |
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 9fc1c37344ce..842567b53df5 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | |||
@@ -521,7 +521,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) | |||
521 | } | 521 | } |
522 | 522 | ||
523 | /* Trigger recovery due to world switch failure */ | 523 | /* Trigger recovery due to world switch failure */ |
524 | amdgpu_device_gpu_recover(adev, NULL, false); | 524 | if (amdgpu_device_should_recover_gpu(adev)) |
525 | amdgpu_device_gpu_recover(adev, NULL); | ||
525 | } | 526 | } |
526 | 527 | ||
527 | static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, | 528 | static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, |