aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorAndrey Grodzovsky <andrey.grodzovsky@amd.com>2018-11-29 15:14:27 -0500
committerAlex Deucher <alexander.deucher@amd.com>2018-12-03 11:15:14 -0500
commitd4535e2c018bba71b49edeb5e396183920f5d341 (patch)
tree894857dc0eff45db769fcaa8b0dbcd0d793d072f /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parenta82400b57abb6aff068bb3b21d1cccd63acbb863 (diff)
drm/amdgpu: Implement concurrent asic reset for XGMI.
Use per hive wq to concurrently send reset commands to all nodes in the hive.

v2:
Switch to system_highpri_wq after dropping dedicated queue.
Fix non XGMI code path KASAN error.
Stop the hive reset for each node loop if there is a reset failure on any of the nodes.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c44
1 files changed, 39 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bfd286c40631..9fd9f63adc08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2356 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2356 return amdgpu_device_asic_has_dc_support(adev->asic_type);
2357} 2357}
2358 2358
2359
/**
 * amdgpu_device_xgmi_reset_func - work handler that resets a single ASIC
 *
 * @__work: work_struct embedded in struct amdgpu_device as xgmi_reset_work
 *
 * Recovers the owning amdgpu_device from @__work, calls amdgpu_asic_reset()
 * on it and stores the return code in adev->asic_reset_res, where the code
 * that queued the work reads it after flush_work(). A non-zero result is
 * additionally reported through DRM_WARN.
 */
2360static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2361{
2362 struct amdgpu_device *adev =
2363 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2364
2365 adev->asic_reset_res = amdgpu_asic_reset(adev);
2366 if (adev->asic_reset_res)
2367 DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
2368 adev->asic_reset_res, adev->ddev->unique);
2369}
2370
2371
2359/** 2372/**
2360 * amdgpu_device_init - initialize the driver 2373 * amdgpu_device_init - initialize the driver
2361 * 2374 *
@@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
2454 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 2467 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
2455 amdgpu_device_delay_enable_gfx_off); 2468 amdgpu_device_delay_enable_gfx_off);
2456 2469
2470 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
2471
2457 adev->gfx.gfx_off_req_count = 1; 2472 adev->gfx.gfx_off_req_count = 1;
2458 adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; 2473 adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
2459 2474
@@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
3331 */ 3346 */
3332 if (need_full_reset) { 3347 if (need_full_reset) {
3333 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3348 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3334 r = amdgpu_asic_reset(tmp_adev); 3349 /* For XGMI run all resets in parallel to speed up the process */
3335 if (r) 3350 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
3336 DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", 3351 if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
3352 r = -EALREADY;
3353 } else
3354 r = amdgpu_asic_reset(tmp_adev);
3355
3356 if (r) {
3357 DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s",
3337 r, tmp_adev->ddev->unique); 3358 r, tmp_adev->ddev->unique);
3359 break;
3360 }
3361 }
3362
3363 /* For XGMI wait for all PSP resets to complete before proceeding */
3364 if (!r) {
3365 list_for_each_entry(tmp_adev, device_list_handle,
3366 gmc.xgmi.head) {
3367 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
3368 flush_work(&tmp_adev->xgmi_reset_work);
3369 r = tmp_adev->asic_reset_res;
3370 if (r)
3371 break;
3372 }
3373 }
3338 } 3374 }
3339 } 3375 }
3340 3376
@@ -3521,8 +3557,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
3521 if (tmp_adev == adev) 3557 if (tmp_adev == adev)
3522 continue; 3558 continue;
3523 3559
3524 dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique);
3525
3526 amdgpu_device_lock_adev(tmp_adev); 3560 amdgpu_device_lock_adev(tmp_adev);
3527 r = amdgpu_device_pre_asic_reset(tmp_adev, 3561 r = amdgpu_device_pre_asic_reset(tmp_adev,
3528 NULL, 3562 NULL,