aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorMonk Liu <Monk.Liu@amd.com>2017-12-24 22:59:27 -0500
committerAlex Deucher <alexander.deucher@amd.com>2018-03-01 11:52:41 -0500
commitc41d1cf62d3615294c1dee291b05ee3220a4de6c (patch)
tree093f0edfda55ac39e070ee685900fc5bd1226eac /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent711826656bebb09b814349fac21cb13f88f92665 (diff)
drm/amdgpu: cleanups for vram lost handling
1)create a routine "handle_vram_lost" to do the vram recovery, and put it into amdgpu_device_reset/reset_sriov, this way no need of the extra paramter to hold the VRAM LOST information and the related macros can be removed. 3)show vram_recover failure if time out, and set TMO equal to lockup_timeout if vram_recover is under SRIOV runtime mode. 4)report error if any ip reset failed for SR-IOV Signed-off-by: Monk Liu <Monk.Liu@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c137
1 file changed, 72 insertions, 65 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 64bd30075951..856378434ea2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1591,6 +1591,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
1591 1591
1592 r = block->version->funcs->hw_init(adev); 1592 r = block->version->funcs->hw_init(adev);
1593 DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); 1593 DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
1594 if (r)
1595 return r;
1594 } 1596 }
1595 } 1597 }
1596 1598
@@ -1624,6 +1626,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
1624 1626
1625 r = block->version->funcs->hw_init(adev); 1627 r = block->version->funcs->hw_init(adev);
1626 DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); 1628 DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
1629 if (r)
1630 return r;
1627 } 1631 }
1628 } 1632 }
1629 1633
@@ -2470,17 +2474,71 @@ err:
2470 return r; 2474 return r;
2471} 2475}
2472 2476
2477static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
2478{
2479 struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
2480 struct amdgpu_bo *bo, *tmp;
2481 struct dma_fence *fence = NULL, *next = NULL;
2482 long r = 1;
2483 int i = 0;
2484 long tmo;
2485
2486 if (amdgpu_sriov_runtime(adev))
2487 tmo = msecs_to_jiffies(amdgpu_lockup_timeout);
2488 else
2489 tmo = msecs_to_jiffies(100);
2490
2491 DRM_INFO("recover vram bo from shadow start\n");
2492 mutex_lock(&adev->shadow_list_lock);
2493 list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
2494 next = NULL;
2495 amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
2496 if (fence) {
2497 r = dma_fence_wait_timeout(fence, false, tmo);
2498 if (r == 0)
2499 pr_err("wait fence %p[%d] timeout\n", fence, i);
2500 else if (r < 0)
2501 pr_err("wait fence %p[%d] interrupted\n", fence, i);
2502 if (r < 1) {
2503 dma_fence_put(fence);
2504 fence = next;
2505 break;
2506 }
2507 i++;
2508 }
2509
2510 dma_fence_put(fence);
2511 fence = next;
2512 }
2513 mutex_unlock(&adev->shadow_list_lock);
2514
2515 if (fence) {
2516 r = dma_fence_wait_timeout(fence, false, tmo);
2517 if (r == 0)
2518 pr_err("wait fence %p[%d] timeout\n", fence, i);
2519 else if (r < 0)
2520 pr_err("wait fence %p[%d] interrupted\n", fence, i);
2521
2522 }
2523 dma_fence_put(fence);
2524
2525 if (r > 0)
2526 DRM_INFO("recover vram bo from shadow done\n");
2527 else
2528 DRM_ERROR("recover vram bo from shadow failed\n");
2529
2530 return (r > 0?0:1);
2531}
2532
2473/* 2533/*
2474 * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough 2534 * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
2475 * 2535 *
2476 * @adev: amdgpu device pointer 2536 * @adev: amdgpu device pointer
2477 * @reset_flags: output param tells caller the reset result
2478 * 2537 *
2479 * attempt to do soft-reset or full-reset and reinitialize Asic 2538 * attempt to do soft-reset or full-reset and reinitialize Asic
2480 * return 0 means successed otherwise failed 2539 * return 0 means successed otherwise failed
2481*/ 2540*/
2482static int amdgpu_device_reset(struct amdgpu_device *adev, 2541static int amdgpu_device_reset(struct amdgpu_device *adev)
2483 uint64_t* reset_flags)
2484{ 2542{
2485 bool need_full_reset, vram_lost = 0; 2543 bool need_full_reset, vram_lost = 0;
2486 int r; 2544 int r;
@@ -2495,7 +2553,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
2495 DRM_INFO("soft reset failed, will fallback to full reset!\n"); 2553 DRM_INFO("soft reset failed, will fallback to full reset!\n");
2496 need_full_reset = true; 2554 need_full_reset = true;
2497 } 2555 }
2498
2499 } 2556 }
2500 2557
2501 if (need_full_reset) { 2558 if (need_full_reset) {
@@ -2544,13 +2601,8 @@ out:
2544 } 2601 }
2545 } 2602 }
2546 2603
2547 if (reset_flags) { 2604 if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost))
2548 if (vram_lost) 2605 r = amdgpu_device_handle_vram_lost(adev);
2549 (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
2550
2551 if (need_full_reset)
2552 (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
2553 }
2554 2606
2555 return r; 2607 return r;
2556} 2608}
@@ -2559,14 +2611,11 @@ out:
2559 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 2611 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
2560 * 2612 *
2561 * @adev: amdgpu device pointer 2613 * @adev: amdgpu device pointer
2562 * @reset_flags: output param tells caller the reset result
2563 * 2614 *
2564 * do VF FLR and reinitialize Asic 2615 * do VF FLR and reinitialize Asic
2565 * return 0 means successed otherwise failed 2616 * return 0 means successed otherwise failed
2566*/ 2617*/
2567static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 2618static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor)
2568 uint64_t *reset_flags,
2569 bool from_hypervisor)
2570{ 2619{
2571 int r; 2620 int r;
2572 2621
@@ -2587,28 +2636,20 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
2587 2636
2588 /* now we are okay to resume SMC/CP/SDMA */ 2637 /* now we are okay to resume SMC/CP/SDMA */
2589 r = amdgpu_device_ip_reinit_late_sriov(adev); 2638 r = amdgpu_device_ip_reinit_late_sriov(adev);
2639 amdgpu_virt_release_full_gpu(adev, true);
2590 if (r) 2640 if (r)
2591 goto error; 2641 goto error;
2592 2642
2593 amdgpu_irq_gpu_reset_resume_helper(adev); 2643 amdgpu_irq_gpu_reset_resume_helper(adev);
2594 r = amdgpu_ib_ring_tests(adev); 2644 r = amdgpu_ib_ring_tests(adev);
2595 if (r)
2596 dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
2597 2645
2598error: 2646 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
2599 /* release full control of GPU after ib test */ 2647 atomic_inc(&adev->vram_lost_counter);
2600 amdgpu_virt_release_full_gpu(adev, true); 2648 r = amdgpu_device_handle_vram_lost(adev);
2601
2602 if (reset_flags) {
2603 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
2604 (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
2605 atomic_inc(&adev->vram_lost_counter);
2606 }
2607
2608 /* VF FLR or hotlink reset is always full-reset */
2609 (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
2610 } 2649 }
2611 2650
2651error:
2652
2612 return r; 2653 return r;
2613} 2654}
2614 2655
@@ -2626,7 +2667,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
2626 struct amdgpu_job *job, bool force) 2667 struct amdgpu_job *job, bool force)
2627{ 2668{
2628 struct drm_atomic_state *state = NULL; 2669 struct drm_atomic_state *state = NULL;
2629 uint64_t reset_flags = 0;
2630 int i, r, resched; 2670 int i, r, resched;
2631 2671
2632 if (!force && !amdgpu_device_ip_check_soft_reset(adev)) { 2672 if (!force && !amdgpu_device_ip_check_soft_reset(adev)) {
@@ -2672,42 +2712,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
2672 } 2712 }
2673 2713
2674 if (amdgpu_sriov_vf(adev)) 2714 if (amdgpu_sriov_vf(adev))
2675 r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true); 2715 r = amdgpu_device_reset_sriov(adev, job ? false : true);
2676 else 2716 else
2677 r = amdgpu_device_reset(adev, &reset_flags); 2717 r = amdgpu_device_reset(adev);
2678
2679 if (!r) {
2680 if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
2681 (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
2682 struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
2683 struct amdgpu_bo *bo, *tmp;
2684 struct dma_fence *fence = NULL, *next = NULL;
2685
2686 DRM_INFO("recover vram bo from shadow\n");
2687 mutex_lock(&adev->shadow_list_lock);
2688 list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
2689 next = NULL;
2690 amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
2691 if (fence) {
2692 r = dma_fence_wait(fence, false);
2693 if (r) {
2694 WARN(r, "recovery from shadow isn't completed\n");
2695 break;
2696 }
2697 }
2698
2699 dma_fence_put(fence);
2700 fence = next;
2701 }
2702 mutex_unlock(&adev->shadow_list_lock);
2703 if (fence) {
2704 r = dma_fence_wait(fence, false);
2705 if (r)
2706 WARN(r, "recovery from shadow isn't completed\n");
2707 }
2708 dma_fence_put(fence);
2709 }
2710 }
2711 2718
2712 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2719 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2713 struct amdgpu_ring *ring = adev->rings[i]; 2720 struct amdgpu_ring *ring = adev->rings[i];