author | Monk Liu <Monk.Liu@amd.com> | 2017-12-24 22:59:27 -0500
---|---|---
committer | Alex Deucher <alexander.deucher@amd.com> | 2018-03-01 11:52:41 -0500
commit | c41d1cf62d3615294c1dee291b05ee3220a4de6c (patch) |
tree | 093f0edfda55ac39e070ee685900fc5bd1226eac /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |
parent | 711826656bebb09b814349fac21cb13f88f92665 (diff) |
drm/amdgpu: cleanups for vram lost handling
1) create a routine "handle_vram_lost" to do the vram
recovery and put it into amdgpu_device_reset/reset_sriov;
this way there is no need for an extra parameter to hold the
VRAM LOST information, and the related macros can be removed.
2) show a vram_recover failure if it times out, and set the TMO equal to
lockup_timeout if vram_recover runs under SRIOV runtime mode.
3) report an error if any IP reset fails for SR-IOV
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 137
1 file changed, 72 insertions, 65 deletions
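Before the hunks, it helps to see the shape of the change: VRAM recovery moves out of `amdgpu_device_gpu_recover()` and into the reset paths themselves, which is what lets the `reset_flags` out-parameter and the `AMDGPU_RESET_INFO_*` macros disappear. Below is a minimal standalone model of the post-patch control flow (plain C, compilable on its own; `device_reset`, `handle_vram_lost`, and the boolean parameters are illustrative stand-ins, not kernel symbols):

```c
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for amdgpu_device_handle_vram_lost():
 * copies buffer contents back from their shadow copies. */
static int handle_vram_lost(void)
{
	printf("recover vram bo from shadow\n");
	return 0;	/* 0 = recovery succeeded, 1 = recovery failed */
}

/* Model of the post-patch amdgpu_device_reset(): recovery now runs
 * inside the reset path, so callers no longer inspect reset flags. */
static int device_reset(bool need_full_reset, bool is_apu, bool vram_lost)
{
	int r = 0;	/* result of the soft/full reset sequence */

	if (!r && ((need_full_reset && !is_apu) || vram_lost))
		r = handle_vram_lost();

	return r;
}

int main(void)
{
	/* A dGPU full reset triggers recovery; an APU soft reset does not. */
	printf("dGPU full reset -> %d\n", device_reset(true, false, false));
	printf("APU soft reset  -> %d\n", device_reset(false, true, false));
	return 0;
}
```

The `!is_apu` term mirrors the `!(adev->flags & AMD_IS_APU)` guard in the patch; the usual reading is that an APU's "VRAM" is carved out of system memory, which survives a GPU reset, so a full reset alone does not imply lost contents there.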
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 64bd30075951..856378434ea2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1591,6 +1591,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
 
 			r = block->version->funcs->hw_init(adev);
 			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
+			if (r)
+				return r;
 		}
 	}
 
@@ -1624,6 +1626,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
 
 			r = block->version->funcs->hw_init(adev);
 			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
+			if (r)
+				return r;
 		}
 	}
 
@@ -2470,17 +2474,71 @@ err:
 	return r;
 }
 
+static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
+{
+	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
+	struct amdgpu_bo *bo, *tmp;
+	struct dma_fence *fence = NULL, *next = NULL;
+	long r = 1;
+	int i = 0;
+	long tmo;
+
+	if (amdgpu_sriov_runtime(adev))
+		tmo = msecs_to_jiffies(amdgpu_lockup_timeout);
+	else
+		tmo = msecs_to_jiffies(100);
+
+	DRM_INFO("recover vram bo from shadow start\n");
+	mutex_lock(&adev->shadow_list_lock);
+	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+		next = NULL;
+		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
+		if (fence) {
+			r = dma_fence_wait_timeout(fence, false, tmo);
+			if (r == 0)
+				pr_err("wait fence %p[%d] timeout\n", fence, i);
+			else if (r < 0)
+				pr_err("wait fence %p[%d] interrupted\n", fence, i);
+			if (r < 1) {
+				dma_fence_put(fence);
+				fence = next;
+				break;
+			}
+			i++;
+		}
+
+		dma_fence_put(fence);
+		fence = next;
+	}
+	mutex_unlock(&adev->shadow_list_lock);
+
+	if (fence) {
+		r = dma_fence_wait_timeout(fence, false, tmo);
+		if (r == 0)
+			pr_err("wait fence %p[%d] timeout\n", fence, i);
+		else if (r < 0)
+			pr_err("wait fence %p[%d] interrupted\n", fence, i);
+
+	}
+	dma_fence_put(fence);
+
+	if (r > 0)
+		DRM_INFO("recover vram bo from shadow done\n");
+	else
+		DRM_ERROR("recover vram bo from shadow failed\n");
+
+	return (r > 0?0:1);
+}
+
 /*
  * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
  *
  * @adev: amdgpu device pointer
- * @reset_flags: output param tells caller the reset result
 *
 * attempt to do soft-reset or full-reset and reinitialize Asic
 * return 0 means successed otherwise failed
 */
-static int amdgpu_device_reset(struct amdgpu_device *adev,
-			       uint64_t* reset_flags)
+static int amdgpu_device_reset(struct amdgpu_device *adev)
 {
 	bool need_full_reset, vram_lost = 0;
 	int r;
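A note on the waits in `amdgpu_device_handle_vram_lost()` above: `dma_fence_wait_timeout()` returns a negative error code if the wait was interrupted, 0 if it timed out, and the remaining timeout in jiffies (at least 1) if the fence signaled. That is why `r < 1` serves as the combined failure test and why `r > 0` at the end means the final wait succeeded. A minimal standalone model of that tri-state handling (plain C; `fake_wait_timeout` is a hypothetical stand-in for the kernel call):

```c
#include <stdio.h>

/* Hypothetical stand-in for dma_fence_wait_timeout(): returns a
 * negative errno on interruption, 0 on timeout, or the number of
 * jiffies remaining on success -- the same tri-state contract. */
static long fake_wait_timeout(long outcome)
{
	return outcome;
}

static const char *classify(long r)
{
	if (r == 0)
		return "timeout";
	if (r < 0)
		return "interrupted";
	return "signaled";	/* r >= 1: remaining jiffies */
}

int main(void)
{
	long outcomes[] = { -512 /* -ERESTARTSYS */, 0, 42 };

	for (int i = 0; i < 3; i++) {
		long r = fake_wait_timeout(outcomes[i]);

		/* r < 1 covers both failure cases, matching the patch. */
		printf("r=%ld -> %s%s\n", r, classify(r),
		       r < 1 ? " (bail out)" : "");
	}
	return 0;
}
```

The remaining hunks replace the old flag-based plumbing with direct calls to the helper: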
@@ -2495,7 +2553,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
 			DRM_INFO("soft reset failed, will fallback to full reset!\n");
 			need_full_reset = true;
 		}
-
 	}
 
 	if (need_full_reset) {
@@ -2544,13 +2601,8 @@ out:
 		}
 	}
 
-	if (reset_flags) {
-		if (vram_lost)
-			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
-
-		if (need_full_reset)
-			(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
-	}
+	if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost))
+		r = amdgpu_device_handle_vram_lost(adev);
 
 	return r;
 }
@@ -2559,14 +2611,11 @@ out:
 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
 *
 * @adev: amdgpu device pointer
- * @reset_flags: output param tells caller the reset result
 *
 * do VF FLR and reinitialize Asic
 * return 0 means successed otherwise failed
 */
-static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-				     uint64_t *reset_flags,
-				     bool from_hypervisor)
+static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor)
 {
 	int r;
 
@@ -2587,28 +2636,20 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 
 	/* now we are okay to resume SMC/CP/SDMA */
 	r = amdgpu_device_ip_reinit_late_sriov(adev);
+	amdgpu_virt_release_full_gpu(adev, true);
 	if (r)
 		goto error;
 
 	amdgpu_irq_gpu_reset_resume_helper(adev);
 	r = amdgpu_ib_ring_tests(adev);
-	if (r)
-		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
 
-error:
-	/* release full control of GPU after ib test */
-	amdgpu_virt_release_full_gpu(adev, true);
-
-	if (reset_flags) {
-		if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
-			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
-			atomic_inc(&adev->vram_lost_counter);
-		}
-
-		/* VF FLR or hotlink reset is always full-reset */
-		(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
+	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+		atomic_inc(&adev->vram_lost_counter);
+		r = amdgpu_device_handle_vram_lost(adev);
 	}
 
+error:
+
 	return r;
 }
 
@@ -2626,7 +2667,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			      struct amdgpu_job *job, bool force)
 {
 	struct drm_atomic_state *state = NULL;
-	uint64_t reset_flags = 0;
 	int i, r, resched;
 
 	if (!force && !amdgpu_device_ip_check_soft_reset(adev)) {
@@ -2672,42 +2712,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	}
 
 	if (amdgpu_sriov_vf(adev))
-		r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true);
+		r = amdgpu_device_reset_sriov(adev, job ? false : true);
 	else
-		r = amdgpu_device_reset(adev, &reset_flags);
-
-	if (!r) {
-		if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
-			(reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
-			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
-			struct amdgpu_bo *bo, *tmp;
-			struct dma_fence *fence = NULL, *next = NULL;
-
-			DRM_INFO("recover vram bo from shadow\n");
-			mutex_lock(&adev->shadow_list_lock);
-			list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
-				next = NULL;
-				amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
-				if (fence) {
-					r = dma_fence_wait(fence, false);
-					if (r) {
-						WARN(r, "recovery from shadow isn't completed\n");
-						break;
-					}
-				}
-
-				dma_fence_put(fence);
-				fence = next;
-			}
-			mutex_unlock(&adev->shadow_list_lock);
-			if (fence) {
-				r = dma_fence_wait(fence, false);
-				if (r)
-					WARN(r, "recovery from shadow isn't completed\n");
-			}
-			dma_fence_put(fence);
-		}
-	}
+		r = amdgpu_device_reset(adev);
 
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
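For context on the `atomic_inc(&adev->vram_lost_counter)` above: that counter is what userspace consults to learn that its buffer contents may have been lost across a reset. A sketch of such a query using libdrm's amdgpu wrapper (assuming `amdgpu_query_info()` and `AMDGPU_INFO_VRAM_LOST_COUNTER` as exposed by libdrm; the render-node path is an assumption):

```c
/* build: cc query.c $(pkg-config --cflags --libs libdrm_amdgpu) */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#include <amdgpu.h>
#include <amdgpu_drm.h>

int main(void)
{
	uint32_t major, minor, count = 0;
	amdgpu_device_handle dev;

	/* /dev/dri/renderD128 is an assumption; pick your render node. */
	int fd = open("/dev/dri/renderD128", O_RDWR);
	if (fd < 0 || amdgpu_device_initialize(fd, &major, &minor, &dev)) {
		fprintf(stderr, "no amdgpu device\n");
		return 1;
	}

	/* Each reset that loses VRAM bumps this counter; comparing it
	 * against a saved value tells userspace its buffers are gone. */
	if (amdgpu_query_info(dev, AMDGPU_INFO_VRAM_LOST_COUNTER,
			      sizeof(count), &count) == 0)
		printf("vram_lost_counter = %u\n", count);

	amdgpu_device_deinitialize(dev);
	close(fd);
	return 0;
}
```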