path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
author    Andrey Grodzovsky <andrey.grodzovsky@amd.com>  2018-11-22 18:57:08 -0500
committer Alex Deucher <alexander.deucher@amd.com>       2018-11-28 15:55:36 -0500
commit    26bc534094ed45fdedef6b4ce8b96030340c5ce7 (patch)
tree      63e9070146eb3da91e2f340e33afaf15da83c5bb /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent    ed2bf5229c53f20cfbca7a34fcbafa91c3168e1e (diff)
drm/amdgpu: Refactor GPU reset for XGMI hive case
For the XGMI hive case, do the reset in steps, where each step iterates over all devices in the hive. This is especially important for the ASIC reset, since all PSP FW instances in the hive must come up within a limited time (around 1 sec) to properly negotiate the link. Do this by refactoring amdgpu_device_gpu_recover and amdgpu_device_reset into pre_asic_reset, asic_reset and post_asic_reset functions, where each part is executed for all the GPUs in the hive before going to the next step.

v2: Update names for amdgpu_device_lock/unlock functions.
v3: Introduce per-hive locking to avoid multiple resets for GPUs in the same hive.
v4: Remove delayed_workqueue()/ttm_bo_unlock_delayed_workqueue() - they were copy & pasted over from radeon and there is no reason for them on amdgpu any more.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
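In outline, the refactored recovery path runs each phase to completion across the whole hive before starting the next, rather than running the entire reset sequence one device at a time. The following is a minimal, self-contained C sketch of that ordering only; fake_dev and the three helpers below are illustrative stand-ins, not the amdgpu API:

#include <stdbool.h>
#include <stdio.h>

#define HIVE_NODES 4

struct fake_dev {
	int id;			/* stand-in for struct amdgpu_device */
};

static bool pre_asic_reset(struct fake_dev *d)
{
	/* block schedulers, try soft reset; pretend it failed so a
	 * full reset is needed (mirrors need_full_reset_arg) */
	printf("dev%d: pre asic reset (suspend IPs)\n", d->id);
	return true;
}

static void do_asic_reset(struct fake_dev *devs, int n, bool need_full_reset)
{
	int i;

	/* The phase boundary that motivates the patch: the ASIC reset
	 * runs back to back on every node first, so all PSP FW
	 * instances restart inside the ~1 sec negotiation window... */
	if (need_full_reset)
		for (i = 0; i < n; i++)
			printf("dev%d: asic reset\n", devs[i].id);

	/* ...and only then is each node resumed and IB-tested. */
	for (i = 0; i < n; i++)
		printf("dev%d: resume + ib tests\n", devs[i].id);
}

static void post_asic_reset(struct fake_dev *d)
{
	printf("dev%d: restart schedulers\n", d->id);
}

int main(void)
{
	struct fake_dev hive[HIVE_NODES] = { {0}, {1}, {2}, {3} };
	bool need_full_reset = false;
	int i;

	for (i = 0; i < HIVE_NODES; i++)
		need_full_reset |= pre_asic_reset(&hive[i]);
	do_asic_reset(hive, HIVE_NODES, need_full_reset);
	for (i = 0; i < HIVE_NODES; i++)
		post_asic_reset(&hive[i]);
	return 0;
}

Before the refactor the whole sequence ran per device, so on a hive the second node's ASIC reset could land outside the first node's PSP negotiation window.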
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  372
1 file changed, 255 insertions(+), 117 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5a95cea58d46..8eaa40eb1c4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3161,86 +3161,6 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
 	return 0;
 }
 
-/**
- * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
- *
- * @adev: amdgpu device pointer
- *
- * attempt to do soft-reset or full-reset and reinitialize Asic
- * return 0 means succeeded otherwise failed
- */
-static int amdgpu_device_reset(struct amdgpu_device *adev)
-{
-	bool need_full_reset, vram_lost = 0;
-	int r;
-
-	need_full_reset = amdgpu_device_ip_need_full_reset(adev);
-
-	if (!need_full_reset) {
-		amdgpu_device_ip_pre_soft_reset(adev);
-		r = amdgpu_device_ip_soft_reset(adev);
-		amdgpu_device_ip_post_soft_reset(adev);
-		if (r || amdgpu_device_ip_check_soft_reset(adev)) {
-			DRM_INFO("soft reset failed, will fallback to full reset!\n");
-			need_full_reset = true;
-		}
-	}
-
-	if (need_full_reset) {
-		r = amdgpu_device_ip_suspend(adev);
-
-retry:
-		r = amdgpu_asic_reset(adev);
-		/* post card */
-		amdgpu_atom_asic_init(adev->mode_info.atom_context);
-
-		if (!r) {
-			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
-			r = amdgpu_device_ip_resume_phase1(adev);
-			if (r)
-				goto out;
-
-			vram_lost = amdgpu_device_check_vram_lost(adev);
-			if (vram_lost) {
-				DRM_ERROR("VRAM is lost!\n");
-				atomic_inc(&adev->vram_lost_counter);
-			}
-
-			r = amdgpu_gtt_mgr_recover(
-				&adev->mman.bdev.man[TTM_PL_TT]);
-			if (r)
-				goto out;
-
-			r = amdgpu_device_fw_loading(adev);
-			if (r)
-				return r;
-
-			r = amdgpu_device_ip_resume_phase2(adev);
-			if (r)
-				goto out;
-
-			if (vram_lost)
-				amdgpu_device_fill_reset_magic(adev);
-		}
-	}
-
-out:
-	if (!r) {
-		amdgpu_irq_gpu_reset_resume_helper(adev);
-		r = amdgpu_ib_ring_tests(adev);
-		if (r) {
-			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
-			r = amdgpu_device_ip_suspend(adev);
-			need_full_reset = true;
-			goto retry;
-		}
-	}
-
-	if (!r)
-		r = amdgpu_device_recover_vram(adev);
-
-	return r;
-}
-
 /**
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
@@ -3339,31 +3259,13 @@ disabled:
 	return false;
 }
 
-/**
- * amdgpu_device_gpu_recover - reset the asic and recover scheduler
- *
- * @adev: amdgpu device pointer
- * @job: which job trigger hang
- *
- * Attempt to reset the GPU if it has hung (all asics).
- * Returns 0 for success or an error on failure.
- */
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-			      struct amdgpu_job *job)
-{
-	int i, r, resched;
-
-	dev_info(adev->dev, "GPU reset begin!\n");
-
-	mutex_lock(&adev->lock_reset);
-	atomic_inc(&adev->gpu_reset_counter);
-	adev->in_gpu_reset = 1;
-
-	/* Block kfd */
-	amdgpu_amdkfd_pre_reset(adev);
 
-	/* block TTM */
-	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+					struct amdgpu_job *job,
+					bool *need_full_reset_arg)
+{
+	int i, r = 0;
+	bool need_full_reset = *need_full_reset_arg;
 
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -3383,10 +3285,123 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
-	if (amdgpu_sriov_vf(adev))
-		r = amdgpu_device_reset_sriov(adev, job ? false : true);
-	else
-		r = amdgpu_device_reset(adev);
+
+
+	if (!amdgpu_sriov_vf(adev)) {
+
+		if (!need_full_reset)
+			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
+
+		if (!need_full_reset) {
+			amdgpu_device_ip_pre_soft_reset(adev);
+			r = amdgpu_device_ip_soft_reset(adev);
+			amdgpu_device_ip_post_soft_reset(adev);
+			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
+				DRM_INFO("soft reset failed, will fallback to full reset!\n");
+				need_full_reset = true;
+			}
+		}
+
+		if (need_full_reset)
+			r = amdgpu_device_ip_suspend(adev);
+
+		*need_full_reset_arg = need_full_reset;
+	}
+
+	return r;
+}
+
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+				struct list_head *device_list_handle,
+				bool *need_full_reset_arg)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
+	int r = 0;
+
+	/*
+	 * ASIC reset has to be done on all XGMI hive nodes ASAP
+	 * to allow proper links negotiation in FW (within 1 sec)
+	 */
+	if (need_full_reset) {
+		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+			r = amdgpu_asic_reset(tmp_adev);
+			if (r)
+				DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
+					 r, tmp_adev->ddev->unique);
+		}
+	}
+
+
+	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		if (need_full_reset) {
+			/* post card */
+			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
+				DRM_WARN("asic atom init failed!");
+
+			if (!r) {
+				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
+				r = amdgpu_device_ip_resume_phase1(tmp_adev);
+				if (r)
+					goto out;
+
+				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+				if (vram_lost) {
+					DRM_ERROR("VRAM is lost!\n");
+					atomic_inc(&tmp_adev->vram_lost_counter);
+				}
+
+				r = amdgpu_gtt_mgr_recover(
+					&tmp_adev->mman.bdev.man[TTM_PL_TT]);
+				if (r)
+					goto out;
+
+				r = amdgpu_device_fw_loading(tmp_adev);
+				if (r)
+					return r;
+
+				r = amdgpu_device_ip_resume_phase2(tmp_adev);
+				if (r)
+					goto out;
+
+				if (vram_lost)
+					amdgpu_device_fill_reset_magic(tmp_adev);
+
+				/* Update PSP FW topology after reset */
+				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
+			}
+		}
+
+
+out:
+		if (!r) {
+			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
+			r = amdgpu_ib_ring_tests(tmp_adev);
+			if (r) {
+				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
+				r = amdgpu_device_ip_suspend(tmp_adev);
+				need_full_reset = true;
+				r = -EAGAIN;
+				goto end;
+			}
+		}
+
+		if (!r)
+			r = amdgpu_device_recover_vram(tmp_adev);
+		else
+			tmp_adev->asic_reset_res = r;
+	}
+
+end:
+	*need_full_reset_arg = need_full_reset;
+	return r;
+}
+
+static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
+					  struct amdgpu_job *job)
+{
+	int i;
 
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
@@ -3398,7 +3413,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		 * or all rings (in the case @job is NULL)
 		 * after above amdgpu_reset accomplished
 		 */
-		if ((!job || job->base.sched == &ring->sched) && !r)
+		if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res)
 			drm_sched_job_recovery(&ring->sched);
 
 		kthread_unpark(ring->sched.thread);
@@ -3408,21 +3423,144 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		drm_helper_resume_force_mode(adev->ddev);
 	}
 
-	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+	adev->asic_reset_res = 0;
+}
 
-	if (r) {
-		/* bad news, how to tell it to userspace ? */
-		dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
-	} else {
-		dev_info(adev->dev, "GPU reset(%d) succeeded!\n",atomic_read(&adev->gpu_reset_counter));
-	}
+static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
+{
+	mutex_lock(&adev->lock_reset);
+	atomic_inc(&adev->gpu_reset_counter);
+	adev->in_gpu_reset = 1;
+	/* Block kfd */
+	amdgpu_amdkfd_pre_reset(adev);
+}
 
+static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+{
 	/*unlock kfd */
 	amdgpu_amdkfd_post_reset(adev);
 	amdgpu_vf_error_trans_all(adev);
 	adev->in_gpu_reset = 0;
 	mutex_unlock(&adev->lock_reset);
+}
+
+
+/**
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
+ *
+ * @adev: amdgpu device pointer
+ * @job: which job trigger hang
+ *
+ * Attempt to reset the GPU if it has hung (all asics).
+ * Attempt to do soft-reset or full-reset and reinitialize Asic
+ * Returns 0 for success or an error on failure.
+ */
+
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+			      struct amdgpu_job *job)
+{
+	int r;
+	struct amdgpu_hive_info *hive = NULL;
+	bool need_full_reset = false;
+	struct amdgpu_device *tmp_adev = NULL;
+	struct list_head device_list, *device_list_handle = NULL;
+
+	INIT_LIST_HEAD(&device_list);
+
+	dev_info(adev->dev, "GPU reset begin!\n");
+
+	/*
+	 * In case of XGMI hive disallow concurrent resets to be triggered
+	 * by different nodes. No point also since the one node already executing
+	 * reset will also reset all the other nodes in the hive.
+	 */
+	hive = amdgpu_get_xgmi_hive(adev);
+	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
+	    !mutex_trylock(&hive->hive_lock))
+		return 0;
+
+	/* Start with adev pre asic reset first for soft reset check.*/
+	amdgpu_device_lock_adev(adev);
+	r = amdgpu_device_pre_asic_reset(adev,
+					 job,
+					 &need_full_reset);
+	if (r) {
+		/*TODO Should we stop ?*/
+		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
+			  r, adev->ddev->unique);
+		adev->asic_reset_res = r;
+	}
+
+	/* Build list of devices to reset */
+	if (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
+		if (!hive) {
+			amdgpu_device_unlock_adev(adev);
+			return -ENODEV;
+		}
+
+		/*
+		 * In case we are in XGMI hive mode device reset is done for all the
+		 * nodes in the hive to retrain all XGMI links and hence the reset
+		 * sequence is executed in loop on all nodes.
+		 */
+		device_list_handle = &hive->device_list;
+	} else {
+		list_add_tail(&adev->gmc.xgmi.head, &device_list);
+		device_list_handle = &device_list;
+	}
+
+retry:	/* Rest of adevs pre asic reset from XGMI hive. */
+	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+
+		if (tmp_adev == adev)
+			continue;
+
+		dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique);
+
+		amdgpu_device_lock_adev(tmp_adev);
+		r = amdgpu_device_pre_asic_reset(tmp_adev,
+						 NULL,
+						 &need_full_reset);
+		/*TODO Should we stop ?*/
+		if (r) {
+			DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
+				  r, tmp_adev->ddev->unique);
+			tmp_adev->asic_reset_res = r;
+		}
+	}
+
+	/* Actual ASIC resets if needed.*/
+	/* TODO Implement XGMI hive reset logic for SRIOV */
+	if (amdgpu_sriov_vf(adev)) {
+		r = amdgpu_device_reset_sriov(adev, job ? false : true);
+		if (r)
+			adev->asic_reset_res = r;
+	} else {
+		r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
+		if (r && r == -EAGAIN)
+			goto retry;
+	}
+
+	/* Post ASIC reset for all devs .*/
+	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job : NULL);
+
+		if (r) {
+			/* bad news, how to tell it to userspace ? */
+			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
+			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+		} else {
+			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
+		}
+
+		amdgpu_device_unlock_adev(tmp_adev);
+	}
+
+	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
+		mutex_unlock(&hive->hive_lock);
+
+	if (r)
+		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 	return r;
 }
 