author     Monk Liu <Monk.Liu@amd.com>                2017-10-25 04:37:02 -0400
committer  Alex Deucher <alexander.deucher@amd.com>   2017-12-04 16:41:30 -0500
commit     5740682e66cef57626a328d237698cad329c0449 (patch)
tree       6dd15cc6cb5cbcc511dd7f6bded375e5b01575b0
parent     48f05f2955e4a3183b219d6dfdb1c28e17d03da7 (diff)
drm/amdgpu:implement new GPU recover(v3)
1. New implementation, named amdgpu_gpu_recover, which gives a better hint
   of what it does than gpu_reset.
2. gpu_recover unifies bare-metal and SR-IOV; only the asic reset part is
   implemented differently.
3. gpu_recover will increase the hang job's karma and mark its
   entity/context as guilty if it exceeds the limit.

V2:
4. In the scheduler main routine, a job from a guilty context is
   immediately fake signaled after it is popped from the queue, and its
   fence is set with the "-ECANCELED" error.
5. In the scheduler recovery routine, all jobs from the guilty entity are
   dropped.
6. In the run_job() routine, the real IB submission is skipped if the
   @skip parameter equals true or VRAM loss occurred.

V3:
7. Replace the deprecated gpu reset; use the new gpu recover.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
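For reference, a minimal sketch of the guilty-job flow in points 3-5, not
part of this diff: amd_sched_invalidate_job(), amd_sched_job_kickout() and
amdgpu_job_hang_limit are real names that appear elsewhere in this patch,
while the fake-signal shape below is an assumption standing in for the
scheduler-side change of this series.

/* Sketch only: point 3 -- bump the hang job's karma; once it passes the
 * limit, kick the job out of the mirror list (its entity/context is then
 * treated as guilty by the scheduler patch of this series).
 */
static void example_punish_hang_job(struct amdgpu_job *job)
{
	if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
		amd_sched_job_kickout(&job->base);
}

/* Sketch only: points 4-5 -- a job popped from a guilty context is fake
 * signaled with -ECANCELED instead of reaching the hardware; jobs still
 * queued on the guilty entity are dropped the same way during recovery.
 */
static void example_cancel_guilty_job(struct amd_sched_job *s_job)
{
	dma_fence_set_error(&s_job->s_fence->finished, -ECANCELED);
	amd_sched_fence_finished(s_job->s_fence);
}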
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu.h            6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c   322
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c     10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c        2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c        5
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h       1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c          2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c          2
8 files changed, 166 insertions, 184 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 88fa19b1a802..5714b7e8cb09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -178,6 +178,10 @@ extern int amdgpu_cik_support;
 #define CIK_CURSOR_WIDTH 128
 #define CIK_CURSOR_HEIGHT 128
 
+/* GPU RESET flags */
+#define AMDGPU_RESET_INFO_VRAM_LOST  (1 << 0)
+#define AMDGPU_RESET_INFO_FULLRESET  (1 << 1)
+
 struct amdgpu_device;
 struct amdgpu_ib;
 struct amdgpu_cs_parser;
@@ -1833,7 +1837,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
 
 /* Common functions */
-int amdgpu_gpu_reset(struct amdgpu_device *adev);
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
 bool amdgpu_need_backup(struct amdgpu_device *adev);
 void amdgpu_pci_config_reset(struct amdgpu_device *adev);
 bool amdgpu_need_post(struct amdgpu_device *adev);
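The two AMDGPU_RESET_INFO flags above form the out-parameter contract for
the new reset helpers. A condensed sketch of the check amdgpu_gpu_recover()
performs in the amdgpu_device.c hunks below (the function name here is
illustrative only):

/* Shadow-BO recovery is only worth it after a full reset on a dGPU, or
 * whenever the reset path reported VRAM as lost.
 */
static bool example_needs_vram_recovery(struct amdgpu_device *adev,
					uint64_t reset_flags)
{
	return ((reset_flags & AMDGPU_RESET_INFO_FULLRESET) &&
		!(adev->flags & AMD_IS_APU)) ||
	       (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST);
}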
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e521850e9409..e287eeda2dab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2827,163 +2827,172 @@ err:
 	return r;
 }
 
-/**
- * amdgpu_sriov_gpu_reset - reset the asic
+/*
+ * amdgpu_reset - reset ASIC/GPU for bare-metal or passthrough
  *
  * @adev: amdgpu device pointer
- * @job: which job trigger hang
+ * @reset_flags: output param tells caller the reset result
  *
- * Attempt the reset the GPU if it has hung (all asics).
- * for SRIOV case.
- * Returns 0 for success or an error on failure.
- */
-int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
+ * attempt to do soft-reset or full-reset and reinitialize Asic
+ * return 0 means successed otherwise failed
+*/
+static int amdgpu_reset(struct amdgpu_device *adev, uint64_t* reset_flags)
 {
-	int i, j, r = 0;
-	int resched;
-	struct amdgpu_bo *bo, *tmp;
-	struct amdgpu_ring *ring;
-	struct dma_fence *fence = NULL, *next = NULL;
+	bool need_full_reset, vram_lost = 0;
+	int r;
 
-	mutex_lock(&adev->virt.lock_reset);
-	atomic_inc(&adev->gpu_reset_counter);
-	adev->in_sriov_reset = true;
+	need_full_reset = amdgpu_need_full_reset(adev);
 
-	/* block TTM */
-	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+	if (!need_full_reset) {
+		amdgpu_pre_soft_reset(adev);
+		r = amdgpu_soft_reset(adev);
+		amdgpu_post_soft_reset(adev);
+		if (r || amdgpu_check_soft_reset(adev)) {
+			DRM_INFO("soft reset failed, will fallback to full reset!\n");
+			need_full_reset = true;
+		}
 
-	/* we start from the ring trigger GPU hang */
-	j = job ? job->ring->idx : 0;
+	}
 
-	/* block scheduler */
-	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
-		ring = adev->rings[i % AMDGPU_MAX_RINGS];
-		if (!ring || !ring->sched.thread)
-			continue;
+	if (need_full_reset) {
+		r = amdgpu_suspend(adev);
 
-		kthread_park(ring->sched.thread);
+retry:
+		amdgpu_atombios_scratch_regs_save(adev);
+		r = amdgpu_asic_reset(adev);
+		amdgpu_atombios_scratch_regs_restore(adev);
+		/* post card */
+		amdgpu_atom_asic_init(adev->mode_info.atom_context);
 
-		if (job && j != i)
-			continue;
+		if (!r) {
+			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
+			r = amdgpu_resume_phase1(adev);
+			if (r)
+				goto out;
 
-		/* here give the last chance to check if job removed from mirror-list
-		 * since we already pay some time on kthread_park */
-		if (job && list_empty(&job->base.node)) {
-			kthread_unpark(ring->sched.thread);
-			goto give_up_reset;
+			vram_lost = amdgpu_check_vram_lost(adev);
+			if (vram_lost) {
+				DRM_ERROR("VRAM is lost!\n");
+				atomic_inc(&adev->vram_lost_counter);
+			}
+
+			r = amdgpu_ttm_recover_gart(adev);
+			if (r)
+				goto out;
+
+			r = amdgpu_resume_phase2(adev);
+			if (r)
+				goto out;
+
+			if (vram_lost)
+				amdgpu_fill_reset_magic(adev);
 		}
+	}
 
-	if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
-		amd_sched_job_kickout(&job->base);
+out:
+	if (!r) {
+		amdgpu_irq_gpu_reset_resume_helper(adev);
+		r = amdgpu_ib_ring_tests(adev);
+		if (r) {
+			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
+			r = amdgpu_suspend(adev);
+			need_full_reset = true;
+			goto retry;
+		}
+	}
 
-	/* only do job_reset on the hang ring if @job not NULL */
-	amd_sched_hw_job_reset(&ring->sched, NULL);
+	if (reset_flags) {
+		if (vram_lost)
+			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
 
-	/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
-	amdgpu_fence_driver_force_completion(ring);
+		if (need_full_reset)
+			(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
 	}
 
-	/* request to take full control of GPU before re-initialization */
-	if (job)
-		amdgpu_virt_reset_gpu(adev);
-	else
-		amdgpu_virt_request_full_gpu(adev, true);
+	return r;
+}
+
+/*
+ * amdgpu_reset_sriov - reset ASIC for SR-IOV vf
+ *
+ * @adev: amdgpu device pointer
+ * @reset_flags: output param tells caller the reset result
+ *
+ * do VF FLR and reinitialize Asic
+ * return 0 means successed otherwise failed
+*/
+static int amdgpu_reset_sriov(struct amdgpu_device *adev, uint64_t *reset_flags, bool from_hypervisor)
+{
+	int r;
 
+	if (from_hypervisor)
+		r = amdgpu_virt_request_full_gpu(adev, true);
+	else
+		r = amdgpu_virt_reset_gpu(adev);
+	if (r)
+		return r;
 
 	/* Resume IP prior to SMC */
-	amdgpu_sriov_reinit_early(adev);
+	r = amdgpu_sriov_reinit_early(adev);
+	if (r)
+		goto error;
 
 	/* we need recover gart prior to run SMC/CP/SDMA resume */
 	amdgpu_ttm_recover_gart(adev);
 
 	/* now we are okay to resume SMC/CP/SDMA */
-	amdgpu_sriov_reinit_late(adev);
+	r = amdgpu_sriov_reinit_late(adev);
+	if (r)
+		goto error;
 
 	amdgpu_irq_gpu_reset_resume_helper(adev);
-
-	if (amdgpu_ib_ring_tests(adev))
+	r = amdgpu_ib_ring_tests(adev);
+	if (r)
 		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
 
+error:
 	/* release full control of GPU after ib test */
 	amdgpu_virt_release_full_gpu(adev, true);
 
-	DRM_INFO("recover vram bo from shadow\n");
-
-	ring = adev->mman.buffer_funcs_ring;
-	mutex_lock(&adev->shadow_list_lock);
-	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
-		next = NULL;
-		amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
-		if (fence) {
-			r = dma_fence_wait(fence, false);
-			if (r) {
-				WARN(r, "recovery from shadow isn't completed\n");
-				break;
-			}
-		}
-
-		dma_fence_put(fence);
-		fence = next;
-	}
-	mutex_unlock(&adev->shadow_list_lock);
-
-	if (fence) {
-		r = dma_fence_wait(fence, false);
-		if (r)
-			WARN(r, "recovery from shadow isn't completed\n");
-	}
-	dma_fence_put(fence);
-
-	for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
-		ring = adev->rings[i % AMDGPU_MAX_RINGS];
-		if (!ring || !ring->sched.thread)
-			continue;
-
-		if (job && j != i) {
-			kthread_unpark(ring->sched.thread);
-			continue;
-		}
-
-		amd_sched_job_recovery(&ring->sched);
-		kthread_unpark(ring->sched.thread);
-	}
+	if (reset_flags) {
+		/* will get vram_lost from GIM in future, now all
+		 * reset request considered VRAM LOST
+		 */
+		(*reset_flags) |= ~AMDGPU_RESET_INFO_VRAM_LOST;
+		atomic_inc(&adev->vram_lost_counter);
 
-	drm_helper_resume_force_mode(adev->ddev);
-give_up_reset:
-	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
-	if (r) {
-		/* bad news, how to tell it to userspace ? */
-		dev_info(adev->dev, "GPU reset failed\n");
-	} else {
-		dev_info(adev->dev, "GPU reset successed!\n");
+		/* VF FLR or hotlink reset is always full-reset */
+		(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
 	}
 
-	adev->in_sriov_reset = false;
-	mutex_unlock(&adev->virt.lock_reset);
 	return r;
 }
 
 /**
- * amdgpu_gpu_reset - reset the asic
+ * amdgpu_gpu_recover - reset the asic and recover scheduler
  *
  * @adev: amdgpu device pointer
+ * @job: which job trigger hang
  *
- * Attempt the reset the GPU if it has hung (all asics).
+ * Attempt to reset the GPU if it has hung (all asics).
  * Returns 0 for success or an error on failure.
  */
-int amdgpu_gpu_reset(struct amdgpu_device *adev)
+int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job)
 {
 	struct drm_atomic_state *state = NULL;
-	int i, r;
-	int resched;
-	bool need_full_reset, vram_lost = false;
+	uint64_t reset_flags = 0;
+	int i, r, resched;
 
 	if (!amdgpu_check_soft_reset(adev)) {
 		DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
 		return 0;
 	}
 
+	dev_info(adev->dev, "GPU reset begin!\n");
+
+	mutex_lock(&adev->virt.lock_reset);
 	atomic_inc(&adev->gpu_reset_counter);
+	adev->in_sriov_reset = 1;
 
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
@@ -2997,69 +3006,26 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 
 		if (!ring || !ring->sched.thread)
 			continue;
+
+		/* only focus on the ring hit timeout if &job not NULL */
+		if (job && job->ring->idx != i)
+			continue;
+
 		kthread_park(ring->sched.thread);
-		amd_sched_hw_job_reset(&ring->sched, NULL);
+		amd_sched_hw_job_reset(&ring->sched, &job->base);
+
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
-	need_full_reset = amdgpu_need_full_reset(adev);
-
-	if (!need_full_reset) {
-		amdgpu_pre_soft_reset(adev);
-		r = amdgpu_soft_reset(adev);
-		amdgpu_post_soft_reset(adev);
-		if (r || amdgpu_check_soft_reset(adev)) {
-			DRM_INFO("soft reset failed, will fallback to full reset!\n");
-			need_full_reset = true;
-		}
-	}
-
-	if (need_full_reset) {
-		r = amdgpu_suspend(adev);
-
-retry:
-		amdgpu_atombios_scratch_regs_save(adev);
-		r = amdgpu_asic_reset(adev);
-		amdgpu_atombios_scratch_regs_restore(adev);
-		/* post card */
-		amdgpu_atom_asic_init(adev->mode_info.atom_context);
+	if (amdgpu_sriov_vf(adev))
+		r = amdgpu_reset_sriov(adev, &reset_flags, job ? false : true);
+	else
+		r = amdgpu_reset(adev, &reset_flags);
 
-		if (!r) {
-			dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
-			r = amdgpu_resume_phase1(adev);
-			if (r)
-				goto out;
-			vram_lost = amdgpu_check_vram_lost(adev);
-			if (vram_lost) {
-				DRM_ERROR("VRAM is lost!\n");
-				atomic_inc(&adev->vram_lost_counter);
-			}
-			r = amdgpu_ttm_recover_gart(adev);
-			if (r)
-				goto out;
-			r = amdgpu_resume_phase2(adev);
-			if (r)
-				goto out;
-			if (vram_lost)
-				amdgpu_fill_reset_magic(adev);
-		}
-	}
-out:
 	if (!r) {
-		amdgpu_irq_gpu_reset_resume_helper(adev);
-		r = amdgpu_ib_ring_tests(adev);
-		if (r) {
-			dev_err(adev->dev, "ib ring test failed (%d).\n", r);
-			r = amdgpu_suspend(adev);
-			need_full_reset = true;
-			goto retry;
-		}
-		/**
-		 * recovery vm page tables, since we cannot depend on VRAM is
-		 * consistent after gpu full reset.
-		 */
-		if (need_full_reset && amdgpu_need_backup(adev)) {
+		if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
+			(reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
 			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
 			struct amdgpu_bo *bo, *tmp;
 			struct dma_fence *fence = NULL, *next = NULL;
@@ -3088,40 +3054,56 @@ out:
 			}
 			dma_fence_put(fence);
 		}
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = adev->rings[i];
 
 			if (!ring || !ring->sched.thread)
 				continue;
 
+			/* only focus on the ring hit timeout if &job not NULL */
+			if (job && job->ring->idx != i)
+				continue;
+
 			amd_sched_job_recovery(&ring->sched);
 			kthread_unpark(ring->sched.thread);
 		}
 	} else {
-		dev_err(adev->dev, "asic resume failed (%d).\n", r);
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-			if (adev->rings[i] && adev->rings[i]->sched.thread) {
-				kthread_unpark(adev->rings[i]->sched.thread);
-			}
+			struct amdgpu_ring *ring = adev->rings[i];
+
+			if (!ring || !ring->sched.thread)
+				continue;
+
+			/* only focus on the ring hit timeout if &job not NULL */
+			if (job && job->ring->idx != i)
+				continue;
+
+			kthread_unpark(adev->rings[i]->sched.thread);
 		}
 	}
 
 	if (amdgpu_device_has_dc_support(adev)) {
-		r = drm_atomic_helper_resume(adev->ddev, state);
+		if (drm_atomic_helper_resume(adev->ddev, state))
+			dev_info(adev->dev, "drm resume failed:%d\n", r);
 		amdgpu_dm_display_resume(adev);
-	} else
+	} else {
 		drm_helper_resume_force_mode(adev->ddev);
+	}
 
 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+
 	if (r) {
 		/* bad news, how to tell it to userspace ? */
-		dev_info(adev->dev, "GPU reset failed\n");
-	}
-	else {
-		dev_info(adev->dev, "GPU reset successed!\n");
+		dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
+		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+	} else {
+		dev_info(adev->dev, "GPU reset(%d) successed!\n",atomic_read(&adev->gpu_reset_counter));
 	}
 
 	amdgpu_vf_error_trans_all(adev);
+	adev->in_sriov_reset = 0;
+	mutex_unlock(&adev->virt.lock_reset);
 	return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index eda89dfdef5b..604ac03a42e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -694,25 +694,25 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 }
 
 /**
- * amdgpu_debugfs_gpu_reset - manually trigger a gpu reset
+ * amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
  *
  * Manually trigger a gpu reset at the next fence wait.
  */
-static int amdgpu_debugfs_gpu_reset(struct seq_file *m, void *data)
+static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
 {
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
 	struct drm_device *dev = node->minor->dev;
 	struct amdgpu_device *adev = dev->dev_private;
 
-	seq_printf(m, "gpu reset\n");
-	amdgpu_gpu_reset(adev);
+	seq_printf(m, "gpu recover\n");
+	amdgpu_gpu_recover(adev, NULL);
 
 	return 0;
 }
 
 static const struct drm_info_list amdgpu_debugfs_fence_list[] = {
 	{"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL},
-	{"amdgpu_gpu_reset", &amdgpu_debugfs_gpu_reset, 0, NULL}
+	{"amdgpu_gpu_recover", &amdgpu_debugfs_gpu_recover, 0, NULL}
 };
 
 static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 32590e4f9f7a..c340774082ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
 						  reset_work);
 
 	if (!amdgpu_sriov_vf(adev))
-		amdgpu_gpu_reset(adev);
+		amdgpu_gpu_recover(adev, NULL);
 }
 
 /* Disable *all* interrupts */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 0a90c768dbc1..18770a880393 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,10 +37,7 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
 		  atomic_read(&job->ring->fence_drv.last_seq),
 		  job->ring->fence_drv.sync_seq);
 
-	if (amdgpu_sriov_vf(job->adev))
-		amdgpu_sriov_gpu_reset(job->adev, job);
-	else
-		amdgpu_gpu_reset(job->adev);
+	amdgpu_gpu_recover(job->adev, job);
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
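Point 6 of the commit message (the @skip handling in run_job()) lands in
the scheduler side of this series rather than in the hunk above; a sketch
of the intended shape, with illustrative names and an explicit vram_lost
flag standing in for the driver's VRAM-lost-counter comparison:

/* Sketch only: skip the real IB submission for a cancelled job or when
 * VRAM content was lost after the job was queued.
 */
static void example_run_job(struct amdgpu_job *job, bool skip, bool vram_lost)
{
	if (skip || vram_lost)
		return;	/* the finished fence still signals, but no IB hits the ring */

	/* ... the normal amdgpu_ib_schedule() path would run here ... */
}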
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index d149aca71a44..20bdb8fb0b8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -288,7 +288,6 @@ int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
 int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
-int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job);
 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
 int amdgpu_virt_fw_reserve_get_checksum(void *obj, unsigned long obj_size,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f91aab38637c..c32d0b0868e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -254,7 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery due to world switch failure */
-	amdgpu_sriov_gpu_reset(adev, NULL);
+	amdgpu_gpu_recover(adev, NULL);
 }
 
 static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 27b03c773b1b..818ec0fe2f51 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -519,7 +519,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
 	}
 
 	/* Trigger recovery due to world switch failure */
-	amdgpu_sriov_gpu_reset(adev, NULL);
+	amdgpu_gpu_recover(adev, NULL);
 }
 
 static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,