diff options
author | Andrey Grodzovsky <andrey.grodzovsky@amd.com> | 2018-11-29 15:14:27 -0500 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2018-12-03 11:15:14 -0500 |
commit | d4535e2c018bba71b49edeb5e396183920f5d341 (patch) | |
tree | 894857dc0eff45db769fcaa8b0dbcd0d793d072f /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
parent | a82400b57abb6aff068bb3b21d1cccd63acbb863 (diff) |
drm/amdgpu: Implement concurrent asic reset for XGMI.
Use a per-hive workqueue (wq) to concurrently send reset commands to all
nodes in the hive.
v2:
Switch to system_highpri_wq after dropping the dedicated queue.
Fix a KASAN error in the non-XGMI code path.
Stop the hive's per-node reset loop if the reset fails
on any of the nodes.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 |
1 files changed, 39 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bfd286c40631..9fd9f63adc08 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |||
@@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) | |||
2356 | return amdgpu_device_asic_has_dc_support(adev->asic_type); | 2356 | return amdgpu_device_asic_has_dc_support(adev->asic_type); |
2357 | } | 2357 | } |
2358 | 2358 | ||
2359 | |||
2360 | static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) | ||
2361 | { | ||
2362 | struct amdgpu_device *adev = | ||
2363 | container_of(__work, struct amdgpu_device, xgmi_reset_work); | ||
2364 | |||
2365 | adev->asic_reset_res = amdgpu_asic_reset(adev); | ||
2366 | if (adev->asic_reset_res) | ||
2367 | DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", | ||
2368 | adev->asic_reset_res, adev->ddev->unique); | ||
2369 | } | ||
2370 | |||
2371 | |||
2359 | /** | 2372 | /** |
2360 | * amdgpu_device_init - initialize the driver | 2373 | * amdgpu_device_init - initialize the driver |
2361 | * | 2374 | * |
@@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2454 | INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, | 2467 | INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, |
2455 | amdgpu_device_delay_enable_gfx_off); | 2468 | amdgpu_device_delay_enable_gfx_off); |
2456 | 2469 | ||
2470 | INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); | ||
2471 | |||
2457 | adev->gfx.gfx_off_req_count = 1; | 2472 | adev->gfx.gfx_off_req_count = 1; |
2458 | adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; | 2473 | adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; |
2459 | 2474 | ||
@@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, | |||
3331 | */ | 3346 | */ |
3332 | if (need_full_reset) { | 3347 | if (need_full_reset) { |
3333 | list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { | 3348 | list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { |
3334 | r = amdgpu_asic_reset(tmp_adev); | 3349 | /* For XGMI run all resets in parallel to speed up the process */ |
3335 | if (r) | 3350 | if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { |
3336 | DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", | 3351 | if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) |
3352 | r = -EALREADY; | ||
3353 | } else | ||
3354 | r = amdgpu_asic_reset(tmp_adev); | ||
3355 | |||
3356 | if (r) { | ||
3357 | DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s", | ||
3337 | r, tmp_adev->ddev->unique); | 3358 | r, tmp_adev->ddev->unique); |
3359 | break; | ||
3360 | } | ||
3361 | } | ||
3362 | |||
3363 | /* For XGMI wait for all PSP resets to complete before proceed */ | ||
3364 | if (!r) { | ||
3365 | list_for_each_entry(tmp_adev, device_list_handle, | ||
3366 | gmc.xgmi.head) { | ||
3367 | if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { | ||
3368 | flush_work(&tmp_adev->xgmi_reset_work); | ||
3369 | r = tmp_adev->asic_reset_res; | ||
3370 | if (r) | ||
3371 | break; | ||
3372 | } | ||
3373 | } | ||
3338 | } | 3374 | } |
3339 | } | 3375 | } |
3340 | 3376 | ||
@@ -3521,8 +3557,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ | |||
3521 | if (tmp_adev == adev) | 3557 | if (tmp_adev == adev) |
3522 | continue; | 3558 | continue; |
3523 | 3559 | ||
3524 | dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique); | ||
3525 | |||
3526 | amdgpu_device_lock_adev(tmp_adev); | 3560 | amdgpu_device_lock_adev(tmp_adev); |
3527 | r = amdgpu_device_pre_asic_reset(tmp_adev, | 3561 | r = amdgpu_device_pre_asic_reset(tmp_adev, |
3528 | NULL, | 3562 | NULL, |