diff options
author | Monk Liu <Monk.Liu@amd.com> | 2017-10-25 04:37:02 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2017-12-04 16:41:30 -0500 |
commit | 5740682e66cef57626a328d237698cad329c0449 (patch) | |
tree | 6dd15cc6cb5cbcc511dd7f6bded375e5b01575b0 /drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | |
parent | 48f05f2955e4a3183b219d6dfdb1c28e17d03da7 (diff) |
drm/amdgpu:implement new GPU recover(v3)
1, the new implementation is named amdgpu_gpu_recover, which gives a better
hint of what it does compared with gpu_reset
2, gpu_recover unifies bare-metal and SR-IOV; only the asic reset
part is implemented differently
3, gpu_recover will increase the hung job's karma and mark its entity/context
as guilty if the karma exceeds the limit
V2:
4, in the scheduler main routine, a job from a guilty context will be immediately
fake-signaled after it is popped from the queue, and its fence will be set with
the "-ECANCELED" error
5, in the scheduler recovery routine, all jobs from the guilty entity are
dropped
6, in the run_job() routine, the real IB submission is skipped if the @skip parameter
equals true or if VRAM was lost.
V3:
7, replace the deprecated gpu reset with the new gpu recover
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 10 |
1 files changed, 5 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index eda89dfdef5b..604ac03a42e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | |||
@@ -694,25 +694,25 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data) | |||
694 | } | 694 | } |
695 | 695 | ||
696 | /** | 696 | /** |
697 | * amdgpu_debugfs_gpu_reset - manually trigger a gpu reset | 697 | * amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover |
698 | * | 698 | * |
699 | * Manually trigger a gpu reset at the next fence wait. | 699 | * Manually trigger a gpu reset at the next fence wait. |
700 | */ | 700 | */ |
701 | static int amdgpu_debugfs_gpu_reset(struct seq_file *m, void *data) | 701 | static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data) |
702 | { | 702 | { |
703 | struct drm_info_node *node = (struct drm_info_node *) m->private; | 703 | struct drm_info_node *node = (struct drm_info_node *) m->private; |
704 | struct drm_device *dev = node->minor->dev; | 704 | struct drm_device *dev = node->minor->dev; |
705 | struct amdgpu_device *adev = dev->dev_private; | 705 | struct amdgpu_device *adev = dev->dev_private; |
706 | 706 | ||
707 | seq_printf(m, "gpu reset\n"); | 707 | seq_printf(m, "gpu recover\n"); |
708 | amdgpu_gpu_reset(adev); | 708 | amdgpu_gpu_recover(adev, NULL); |
709 | 709 | ||
710 | return 0; | 710 | return 0; |
711 | } | 711 | } |
712 | 712 | ||
713 | static const struct drm_info_list amdgpu_debugfs_fence_list[] = { | 713 | static const struct drm_info_list amdgpu_debugfs_fence_list[] = { |
714 | {"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL}, | 714 | {"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL}, |
715 | {"amdgpu_gpu_reset", &amdgpu_debugfs_gpu_reset, 0, NULL} | 715 | {"amdgpu_gpu_recover", &amdgpu_debugfs_gpu_recover, 0, NULL} |
716 | }; | 716 | }; |
717 | 717 | ||
718 | static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = { | 718 | static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = { |