author:    Monk Liu <Monk.Liu@amd.com>               2017-10-25 04:37:02 -0400
committer: Alex Deucher <alexander.deucher@amd.com>  2017-12-04 16:41:30 -0500
commit:    5740682e66cef57626a328d237698cad329c0449
tree:      6dd15cc6cb5cbcc511dd7f6bded375e5b01575b0
parent:    48f05f2955e4a3183b219d6dfdb1c28e17d03da7
drm/amdgpu: implement new GPU recover (v3)
1. The new implementation is named amdgpu_gpu_recover, which gives a
   better hint of what it does than gpu_reset.
2. gpu_recover unifies the bare-metal and SR-IOV paths; only the asic
   reset part is implemented differently.
3. gpu_recover increases the hang job's karma and marks its
   entity/context as guilty once it exceeds the limit.

V2:
4. In the scheduler main routine, a job from a guilty context is
   immediately fake-signaled after it is popped from the queue, and its
   fence is set with the "-ECANCELED" error (points 3-5 are illustrated
   in the sketch after the sign-offs below).
5. In the scheduler recovery routine, all jobs from the guilty entity
   are dropped.
6. In the run_job() routine, the real IB submission is skipped if the
   @skip parameter is true or VRAM loss occurred.

V3:
7. Replace the deprecated gpu reset with the new gpu recover.
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
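
The scheduler-side karma and guilty-context handling described in points
3-5 lives in the GPU scheduler and is not part of the diff below. As a
rough orientation, here is a minimal sketch of that flow; the helper
amd_sched_invalidate_job() is real (it appears in the removed code
below), but the surrounding function is hypothetical glue, not this
patch's code:

/* Illustrative sketch only, not part of this patch. Mirrors points
 * 3-5: bump the hang job's karma, and once the limit is exceeded,
 * never run the job again; instead fake-signal its fence with
 * -ECANCELED so user space sees the cancellation.
 */
static void sched_drop_if_guilty(struct amd_sched_job *s_job,
                                 struct dma_fence *finished_fence,
                                 int hang_limit)
{
        if (amd_sched_invalidate_job(s_job, hang_limit)) {
                dma_fence_set_error(finished_fence, -ECANCELED);
                dma_fence_signal(finished_fence);
        }
}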
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 322
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |   2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |   5
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |   2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |   2
8 files changed, 166 insertions, 184 deletions
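
For orientation before the diff: the two old entry points,
amdgpu_gpu_reset() and amdgpu_sriov_gpu_reset(), collapse into a single
amdgpu_gpu_recover(). A short sketch of the new calling convention
follows; the caller names are hypothetical, but the argument usage
matches the call sites changed below:

/* Sketch of the unified entry point. The scheduler timeout path
 * passes the offending job; externally requested resets (debugfs,
 * IRQ reset work, FLR mailbox handlers) pass NULL, which recovers
 * all rings.
 */
static void example_timeout_path(struct amdgpu_job *job)
{
        amdgpu_gpu_recover(job->adev, job);     /* blame this job */
}

static void example_external_reset(struct amdgpu_device *adev)
{
        amdgpu_gpu_recover(adev, NULL);         /* no specific job */
}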
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 88fa19b1a802..5714b7e8cb09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -178,6 +178,10 @@ extern int amdgpu_cik_support;
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
 
+/* GPU RESET flags */
+#define AMDGPU_RESET_INFO_VRAM_LOST (1 << 0)
+#define AMDGPU_RESET_INFO_FULLRESET (1 << 1)
+
 struct amdgpu_device;
 struct amdgpu_ib;
 struct amdgpu_cs_parser;
@@ -1833,7 +1837,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
 
 /* Common functions */
-int amdgpu_gpu_reset(struct amdgpu_device *adev);
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
 bool amdgpu_need_backup(struct amdgpu_device *adev);
 void amdgpu_pci_config_reset(struct amdgpu_device *adev);
 bool amdgpu_need_post(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e521850e9409..e287eeda2dab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2827,163 +2827,172 @@ err:
 	return r;
 }
 
-/**
- * amdgpu_sriov_gpu_reset - reset the asic
- *
- * @adev: amdgpu device pointer
- * @job: which job trigger hang
- *
- * Attempt the reset the GPU if it has hung (all asics).
- * for SRIOV case.
- * Returns 0 for success or an error on failure.
- */
-int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
-{
-	int i, j, r = 0;
-	int resched;
-	struct amdgpu_bo *bo, *tmp;
-	struct amdgpu_ring *ring;
-	struct dma_fence *fence = NULL, *next = NULL;
-
-	mutex_lock(&adev->virt.lock_reset);
-	atomic_inc(&adev->gpu_reset_counter);
-	adev->in_sriov_reset = true;
-
-	/* block TTM */
-	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
-
-	/* we start from the ring trigger GPU hang */
-	j = job ? job->ring->idx : 0;
-
-	/* block scheduler */
-	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
-		ring = adev->rings[i % AMDGPU_MAX_RINGS];
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		kthread_park(ring->sched.thread);
-
-		if (job && j != i)
-			continue;
-
-		/* here give the last chance to check if job removed from mirror-list
-		 * since we already pay some time on kthread_park */
-		if (job && list_empty(&job->base.node)) {
-			kthread_unpark(ring->sched.thread);
-			goto give_up_reset;
-		}
-
-		if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
-			amd_sched_job_kickout(&job->base);
-
-		/* only do job_reset on the hang ring if @job not NULL */
-		amd_sched_hw_job_reset(&ring->sched, NULL);
-
-		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
-		amdgpu_fence_driver_force_completion(ring);
-	}
-
-	/* request to take full control of GPU before re-initialization */
-	if (job)
-		amdgpu_virt_reset_gpu(adev);
-	else
-		amdgpu_virt_request_full_gpu(adev, true);
-
-
-	/* Resume IP prior to SMC */
-	amdgpu_sriov_reinit_early(adev);
-
-	/* we need recover gart prior to run SMC/CP/SDMA resume */
-	amdgpu_ttm_recover_gart(adev);
-
-	/* now we are okay to resume SMC/CP/SDMA */
-	amdgpu_sriov_reinit_late(adev);
-
-	amdgpu_irq_gpu_reset_resume_helper(adev);
-
-	if (amdgpu_ib_ring_tests(adev))
-		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
-
-	/* release full control of GPU after ib test */
-	amdgpu_virt_release_full_gpu(adev, true);
-
-	DRM_INFO("recover vram bo from shadow\n");
-
-	ring = adev->mman.buffer_funcs_ring;
-	mutex_lock(&adev->shadow_list_lock);
-	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
-		next = NULL;
-		amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
-		if (fence) {
-			r = dma_fence_wait(fence, false);
-			if (r) {
-				WARN(r, "recovery from shadow isn't completed\n");
-				break;
-			}
-		}
-
-		dma_fence_put(fence);
-		fence = next;
-	}
-	mutex_unlock(&adev->shadow_list_lock);
-
-	if (fence) {
-		r = dma_fence_wait(fence, false);
-		if (r)
-			WARN(r, "recovery from shadow isn't completed\n");
-	}
-	dma_fence_put(fence);
-
-	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
-		ring = adev->rings[i % AMDGPU_MAX_RINGS];
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		if (job && j != i) {
-			kthread_unpark(ring->sched.thread);
-			continue;
-		}
-
-		amd_sched_job_recovery(&ring->sched);
-		kthread_unpark(ring->sched.thread);
-	}
-
-	drm_helper_resume_force_mode(adev->ddev);
-give_up_reset:
-	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
-	if (r) {
-		/* bad news, how to tell it to userspace ? */
-		dev_info(adev->dev, "GPU reset failed\n");
-	} else {
-		dev_info(adev->dev, "GPU reset successed!\n");
-	}
-
-	adev->in_sriov_reset = false;
-	mutex_unlock(&adev->virt.lock_reset);
-	return r;
-}
+/*
+ * amdgpu_reset - reset ASIC/GPU for bare-metal or passthrough
+ *
+ * @adev: amdgpu device pointer
+ * @reset_flags: output param tells caller the reset result
+ *
+ * attempt to do soft-reset or full-reset and reinitialize Asic
+ * return 0 means successed otherwise failed
+ */
+static int amdgpu_reset(struct amdgpu_device *adev, uint64_t* reset_flags)
+{
+	bool need_full_reset, vram_lost = 0;
+	int r;
+
+	need_full_reset = amdgpu_need_full_reset(adev);
+
+	if (!need_full_reset) {
+		amdgpu_pre_soft_reset(adev);
+		r = amdgpu_soft_reset(adev);
+		amdgpu_post_soft_reset(adev);
+		if (r || amdgpu_check_soft_reset(adev)) {
+			DRM_INFO("soft reset failed, will fallback to full reset!\n");
+			need_full_reset = true;
+		}
+
+	}
+
+	if (need_full_reset) {
+		r = amdgpu_suspend(adev);
+
+retry:
+		amdgpu_atombios_scratch_regs_save(adev);
+		r = amdgpu_asic_reset(adev);
+		amdgpu_atombios_scratch_regs_restore(adev);
+		/* post card */
+		amdgpu_atom_asic_init(adev->mode_info.atom_context);
+
+		if (!r) {
+			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
+			r = amdgpu_resume_phase1(adev);
+			if (r)
+				goto out;
+
+			vram_lost = amdgpu_check_vram_lost(adev);
+			if (vram_lost) {
+				DRM_ERROR("VRAM is lost!\n");
+				atomic_inc(&adev->vram_lost_counter);
+			}
+
+			r = amdgpu_ttm_recover_gart(adev);
+			if (r)
+				goto out;
+
+			r = amdgpu_resume_phase2(adev);
+			if (r)
+				goto out;
+
+			if (vram_lost)
+				amdgpu_fill_reset_magic(adev);
+		}
+	}
+
+out:
+	if (!r) {
+		amdgpu_irq_gpu_reset_resume_helper(adev);
+		r = amdgpu_ib_ring_tests(adev);
+		if (r) {
+			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
+			r = amdgpu_suspend(adev);
+			need_full_reset = true;
+			goto retry;
+		}
+	}
+
+	if (reset_flags) {
+		if (vram_lost)
+			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
+
+		if (need_full_reset)
+			(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
+	}
+
+	return r;
+}
+
+/*
+ * amdgpu_reset_sriov - reset ASIC for SR-IOV vf
+ *
+ * @adev: amdgpu device pointer
+ * @reset_flags: output param tells caller the reset result
+ *
+ * do VF FLR and reinitialize Asic
+ * return 0 means successed otherwise failed
+ */
+static int amdgpu_reset_sriov(struct amdgpu_device *adev, uint64_t *reset_flags, bool from_hypervisor)
+{
+	int r;
+
+	if (from_hypervisor)
+		r = amdgpu_virt_request_full_gpu(adev, true);
+	else
+		r = amdgpu_virt_reset_gpu(adev);
+	if (r)
+		return r;
+
+	/* Resume IP prior to SMC */
+	r = amdgpu_sriov_reinit_early(adev);
+	if (r)
+		goto error;
+
+	/* we need recover gart prior to run SMC/CP/SDMA resume */
+	amdgpu_ttm_recover_gart(adev);
+
+	/* now we are okay to resume SMC/CP/SDMA */
+	r = amdgpu_sriov_reinit_late(adev);
+	if (r)
+		goto error;
+
+	amdgpu_irq_gpu_reset_resume_helper(adev);
+	r = amdgpu_ib_ring_tests(adev);
+	if (r)
+		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+error:
+	/* release full control of GPU after ib test */
+	amdgpu_virt_release_full_gpu(adev, true);
+
+	if (reset_flags) {
+		/* will get vram_lost from GIM in future, now all
+		 * reset request considered VRAM LOST
+		 */
+		(*reset_flags) |= ~AMDGPU_RESET_INFO_VRAM_LOST;
+		atomic_inc(&adev->vram_lost_counter);
+
+		/* VF FLR or hotlink reset is always full-reset */
+		(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
+	}
+
+	return r;
+}
 
 /**
- * amdgpu_gpu_reset - reset the asic
+ * amdgpu_gpu_recover - reset the asic and recover scheduler
  *
  * @adev: amdgpu device pointer
+ * @job: which job trigger hang
  *
- * Attempt the reset the GPU if it has hung (all asics).
+ * Attempt to reset the GPU if it has hung (all asics).
  * Returns 0 for success or an error on failure.
  */
-int amdgpu_gpu_reset(struct amdgpu_device *adev)
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
 {
 	struct drm_atomic_state *state = NULL;
-	int i, r;
-	int resched;
-	bool need_full_reset, vram_lost = false;
+	uint64_t reset_flags = 0;
+	int i, r, resched;
 
 	if (!amdgpu_check_soft_reset(adev)) {
 		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
 		return 0;
 	}
 
+	dev_info(adev->dev, "GPU reset begin!\n");
+
+	mutex_lock(&adev->virt.lock_reset);
 	atomic_inc(&adev->gpu_reset_counter);
+	adev->in_sriov_reset = 1;
 
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
@@ -2997,69 +3006,26 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 
 		if (!ring || !ring->sched.thread)
 			continue;
+
+		/* only focus on the ring hit timeout if &job not NULL */
+		if (job && job->ring->idx != i)
+			continue;
+
 		kthread_park(ring->sched.thread);
-		amd_sched_hw_job_reset(&ring->sched, NULL);
+		amd_sched_hw_job_reset(&ring->sched, &job->base);
+
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
-	need_full_reset = amdgpu_need_full_reset(adev);
-
-	if (!need_full_reset) {
-		amdgpu_pre_soft_reset(adev);
-		r = amdgpu_soft_reset(adev);
-		amdgpu_post_soft_reset(adev);
-		if (r || amdgpu_check_soft_reset(adev)) {
-			DRM_INFO("soft reset failed, will fallback to full reset!\n");
-			need_full_reset = true;
-		}
-	}
-
-	if (need_full_reset) {
-		r = amdgpu_suspend(adev);
-
-retry:
-		amdgpu_atombios_scratch_regs_save(adev);
-		r = amdgpu_asic_reset(adev);
-		amdgpu_atombios_scratch_regs_restore(adev);
-		/* post card */
-		amdgpu_atom_asic_init(adev->mode_info.atom_context);
+	if (amdgpu_sriov_vf(adev))
+		r = amdgpu_reset_sriov(adev, &reset_flags, job ? false : true);
+	else
+		r = amdgpu_reset(adev, &reset_flags);
 
-		if (!r) {
-			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
-			r = amdgpu_resume_phase1(adev);
-			if (r)
-				goto out;
-			vram_lost = amdgpu_check_vram_lost(adev);
-			if (vram_lost) {
-				DRM_ERROR("VRAM is lost!\n");
-				atomic_inc(&adev->vram_lost_counter);
-			}
-			r = amdgpu_ttm_recover_gart(adev);
-			if (r)
-				goto out;
-			r = amdgpu_resume_phase2(adev);
-			if (r)
-				goto out;
-			if (vram_lost)
-				amdgpu_fill_reset_magic(adev);
-		}
-	}
-out:
 	if (!r) {
-		amdgpu_irq_gpu_reset_resume_helper(adev);
-		r = amdgpu_ib_ring_tests(adev);
-		if (r) {
-			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
-			r = amdgpu_suspend(adev);
-			need_full_reset = true;
-			goto retry;
-		}
-		/**
-		 * recovery vm page tables, since we cannot depend on VRAM is
-		 * consistent after gpu full reset.
-		 */
-		if (need_full_reset && amdgpu_need_backup(adev)) {
+		if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
+			(reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
 			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
 			struct amdgpu_bo *bo, *tmp;
 			struct dma_fence *fence = NULL, *next = NULL;
@@ -3088,40 +3054,56 @@ out:
 		}
 		dma_fence_put(fence);
 	}
+
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
 		if (!ring || !ring->sched.thread)
 			continue;
 
+		/* only focus on the ring hit timeout if &job not NULL */
+		if (job && job->ring->idx != i)
+			continue;
+
 		amd_sched_job_recovery(&ring->sched);
 		kthread_unpark(ring->sched.thread);
 	}
 	} else {
-		dev_err(adev->dev, "asic resume failed (%d).\n", r);
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-			if (adev->rings[i] && adev->rings[i]->sched.thread) {
-				kthread_unpark(adev->rings[i]->sched.thread);
-			}
+			struct amdgpu_ring *ring = adev->rings[i];
+
+			if (!ring || !ring->sched.thread)
+				continue;
+
+			/* only focus on the ring hit timeout if &job not NULL */
+			if (job && job->ring->idx != i)
+				continue;
+
+			kthread_unpark(adev->rings[i]->sched.thread);
 		}
 	}
 
 	if (amdgpu_device_has_dc_support(adev)) {
-		r = drm_atomic_helper_resume(adev->ddev, state);
+		if (drm_atomic_helper_resume(adev->ddev, state))
+			dev_info(adev->dev, "drm resume failed:%d\n", r);
 		amdgpu_dm_display_resume(adev);
-	} else
+	} else {
 		drm_helper_resume_force_mode(adev->ddev);
+	}
 
 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
-		dev_info(adev->dev, "GPU reset failed\n");
-	}
-	else {
-		dev_info(adev->dev, "GPU reset successed!\n");
+		dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+	} else {
+		dev_info(adev->dev, "GPU reset(%d) successed!\n",atomic_read(&adev->gpu_reset_counter));
 	}
 
 	amdgpu_vf_error_trans_all(adev);
+	adev->in_sriov_reset = 0;
+	mutex_unlock(&adev->virt.lock_reset);
 	return r;
 }
 
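Worth noting from the hunks above: both reset backends report their
outcome through the uint64_t reset_flags out-parameter, and
amdgpu_gpu_recover() decides from those bits whether buffer objects
must be restored from their shadow copies. A condensed sketch of that
predicate; the helper name is hypothetical, but its body is the
condition amdgpu_gpu_recover() uses above:

/* A dGPU full reset, or any reset that lost VRAM, forces recovery of
 * shadowed buffers (e.g. VM page tables), since VRAM contents cannot
 * be trusted afterwards.
 */
static bool example_need_shadow_recovery(struct amdgpu_device *adev,
                                         uint64_t reset_flags)
{
        return ((reset_flags & AMDGPU_RESET_INFO_FULLRESET) &&
                !(adev->flags & AMD_IS_APU)) ||
               (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST);
}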
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index eda89dfdef5b..604ac03a42e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -694,25 +694,25 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 }
 
 /**
- * amdgpu_debugfs_gpu_reset - manually trigger a gpu reset
+ * amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
  *
  * Manually trigger a gpu reset at the next fence wait.
  */
-static int amdgpu_debugfs_gpu_reset(struct seq_file *m, void *data)
+static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
 {
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
 	struct drm_device *dev = node->minor->dev;
 	struct amdgpu_device *adev = dev->dev_private;
 
-	seq_printf(m, "gpu reset\n");
-	amdgpu_gpu_reset(adev);
+	seq_printf(m, "gpu recover\n");
+	amdgpu_gpu_recover(adev, NULL);
 
 	return 0;
 }
 
 static const struct drm_info_list amdgpu_debugfs_fence_list[] = {
 	{"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL},
-	{"amdgpu_gpu_reset", &amdgpu_debugfs_gpu_reset, 0, NULL}
+	{"amdgpu_gpu_recover", &amdgpu_debugfs_gpu_recover, 0, NULL}
 };
 
 static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 32590e4f9f7a..c340774082ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
 						  reset_work);
 
 	if (!amdgpu_sriov_vf(adev))
-		amdgpu_gpu_reset(adev);
+		amdgpu_gpu_recover(adev, NULL);
 }
 
 /* Disable *all* interrupts */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 0a90c768dbc1..18770a880393 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,10 +37,7 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
 		  atomic_read(&job->ring->fence_drv.last_seq),
 		  job->ring->fence_drv.sync_seq);
 
-	if (amdgpu_sriov_vf(job->adev))
-		amdgpu_sriov_gpu_reset(job->adev, job);
-	else
-		amdgpu_gpu_reset(job->adev);
+	amdgpu_gpu_recover(job->adev, job);
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index d149aca71a44..20bdb8fb0b8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -288,7 +288,6 @@ int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
 int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
-int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job);
 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
 int amdgpu_virt_fw_reserve_get_checksum(void *obj, unsigned long obj_size,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f91aab38637c..c32d0b0868e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -254,7 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery due to world switch failure */
-	amdgpu_sriov_gpu_reset(adev, NULL);
+	amdgpu_gpu_recover(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 27b03c773b1b..818ec0fe2f51 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -519,7 +519,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery due to world switch failure */
-	amdgpu_sriov_gpu_reset(adev, NULL);
+	amdgpu_gpu_recover(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,