diff options
author | Alex Deucher <alexander.deucher@amd.com> | 2017-09-28 09:47:32 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2017-09-28 16:03:20 -0400 |
commit | e23b74aab5dc48d3e508a2bc171ccd152fb03803 (patch) | |
tree | b715c0cff8755ec4f0b51bc696080e982e881bb7 | |
parent | 6f87a895709eecc1542fe947e349364ad061ac00 (diff) |
drm/amdgpu: fix vf error handling
The error handling for virtual functions assumed a single
vf per VM and didn't properly account for bare metal. Make
the error arrays per device and add locking.
Reviewed-by: Gavin Wan <gavin.wan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c | 54 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h | 5 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 13 |
4 files changed, 54 insertions, 41 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3e84ddf9e3b5..fc0c1cde69ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |||
@@ -2040,6 +2040,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2040 | mutex_init(&adev->srbm_mutex); | 2040 | mutex_init(&adev->srbm_mutex); |
2041 | mutex_init(&adev->grbm_idx_mutex); | 2041 | mutex_init(&adev->grbm_idx_mutex); |
2042 | mutex_init(&adev->mn_lock); | 2042 | mutex_init(&adev->mn_lock); |
2043 | mutex_init(&adev->virt.vf_errors.lock); | ||
2043 | hash_init(adev->mn_hash); | 2044 | hash_init(adev->mn_hash); |
2044 | 2045 | ||
2045 | amdgpu_check_arguments(adev); | 2046 | amdgpu_check_arguments(adev); |
@@ -2125,7 +2126,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2125 | r = amdgpu_atombios_init(adev); | 2126 | r = amdgpu_atombios_init(adev); |
2126 | if (r) { | 2127 | if (r) { |
2127 | dev_err(adev->dev, "amdgpu_atombios_init failed\n"); | 2128 | dev_err(adev->dev, "amdgpu_atombios_init failed\n"); |
2128 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); | 2129 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); |
2129 | goto failed; | 2130 | goto failed; |
2130 | } | 2131 | } |
2131 | 2132 | ||
@@ -2136,7 +2137,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2136 | if (amdgpu_vpost_needed(adev)) { | 2137 | if (amdgpu_vpost_needed(adev)) { |
2137 | if (!adev->bios) { | 2138 | if (!adev->bios) { |
2138 | dev_err(adev->dev, "no vBIOS found\n"); | 2139 | dev_err(adev->dev, "no vBIOS found\n"); |
2139 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); | 2140 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); |
2140 | r = -EINVAL; | 2141 | r = -EINVAL; |
2141 | goto failed; | 2142 | goto failed; |
2142 | } | 2143 | } |
@@ -2144,7 +2145,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2144 | r = amdgpu_atom_asic_init(adev->mode_info.atom_context); | 2145 | r = amdgpu_atom_asic_init(adev->mode_info.atom_context); |
2145 | if (r) { | 2146 | if (r) { |
2146 | dev_err(adev->dev, "gpu post error!\n"); | 2147 | dev_err(adev->dev, "gpu post error!\n"); |
2147 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0); | 2148 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0); |
2148 | goto failed; | 2149 | goto failed; |
2149 | } | 2150 | } |
2150 | } else { | 2151 | } else { |
@@ -2156,7 +2157,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2156 | r = amdgpu_atomfirmware_get_clock_info(adev); | 2157 | r = amdgpu_atomfirmware_get_clock_info(adev); |
2157 | if (r) { | 2158 | if (r) { |
2158 | dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); | 2159 | dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); |
2159 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); | 2160 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); |
2160 | goto failed; | 2161 | goto failed; |
2161 | } | 2162 | } |
2162 | } else { | 2163 | } else { |
@@ -2164,7 +2165,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2164 | r = amdgpu_atombios_get_clock_info(adev); | 2165 | r = amdgpu_atombios_get_clock_info(adev); |
2165 | if (r) { | 2166 | if (r) { |
2166 | dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); | 2167 | dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); |
2167 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); | 2168 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); |
2168 | goto failed; | 2169 | goto failed; |
2169 | } | 2170 | } |
2170 | /* init i2c buses */ | 2171 | /* init i2c buses */ |
@@ -2175,7 +2176,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2175 | r = amdgpu_fence_driver_init(adev); | 2176 | r = amdgpu_fence_driver_init(adev); |
2176 | if (r) { | 2177 | if (r) { |
2177 | dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); | 2178 | dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); |
2178 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); | 2179 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); |
2179 | goto failed; | 2180 | goto failed; |
2180 | } | 2181 | } |
2181 | 2182 | ||
@@ -2185,7 +2186,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2185 | r = amdgpu_init(adev); | 2186 | r = amdgpu_init(adev); |
2186 | if (r) { | 2187 | if (r) { |
2187 | dev_err(adev->dev, "amdgpu_init failed\n"); | 2188 | dev_err(adev->dev, "amdgpu_init failed\n"); |
2188 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); | 2189 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); |
2189 | amdgpu_fini(adev); | 2190 | amdgpu_fini(adev); |
2190 | goto failed; | 2191 | goto failed; |
2191 | } | 2192 | } |
@@ -2205,7 +2206,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2205 | r = amdgpu_ib_pool_init(adev); | 2206 | r = amdgpu_ib_pool_init(adev); |
2206 | if (r) { | 2207 | if (r) { |
2207 | dev_err(adev->dev, "IB initialization failed (%d).\n", r); | 2208 | dev_err(adev->dev, "IB initialization failed (%d).\n", r); |
2208 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); | 2209 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); |
2209 | goto failed; | 2210 | goto failed; |
2210 | } | 2211 | } |
2211 | 2212 | ||
@@ -2254,7 +2255,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, | |||
2254 | r = amdgpu_late_init(adev); | 2255 | r = amdgpu_late_init(adev); |
2255 | if (r) { | 2256 | if (r) { |
2256 | dev_err(adev->dev, "amdgpu_late_init failed\n"); | 2257 | dev_err(adev->dev, "amdgpu_late_init failed\n"); |
2257 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); | 2258 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); |
2258 | goto failed; | 2259 | goto failed; |
2259 | } | 2260 | } |
2260 | 2261 | ||
@@ -2936,7 +2937,7 @@ out: | |||
2936 | } | 2937 | } |
2937 | } else { | 2938 | } else { |
2938 | dev_err(adev->dev, "asic resume failed (%d).\n", r); | 2939 | dev_err(adev->dev, "asic resume failed (%d).\n", r); |
2939 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r); | 2940 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r); |
2940 | for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { | 2941 | for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { |
2941 | if (adev->rings[i] && adev->rings[i]->sched.thread) { | 2942 | if (adev->rings[i] && adev->rings[i]->sched.thread) { |
2942 | kthread_unpark(adev->rings[i]->sched.thread); | 2943 | kthread_unpark(adev->rings[i]->sched.thread); |
@@ -2950,7 +2951,7 @@ out: | |||
2950 | if (r) { | 2951 | if (r) { |
2951 | /* bad news, how to tell it to userspace ? */ | 2952 | /* bad news, how to tell it to userspace ? */ |
2952 | dev_info(adev->dev, "GPU reset failed\n"); | 2953 | dev_info(adev->dev, "GPU reset failed\n"); |
2953 | amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); | 2954 | amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); |
2954 | } | 2955 | } |
2955 | else { | 2956 | else { |
2956 | dev_info(adev->dev, "GPU reset successed!\n"); | 2957 | dev_info(adev->dev, "GPU reset successed!\n"); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c index 45ac91861965..746b81339835 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c | |||
@@ -25,30 +25,21 @@ | |||
25 | #include "amdgpu_vf_error.h" | 25 | #include "amdgpu_vf_error.h" |
26 | #include "mxgpu_ai.h" | 26 | #include "mxgpu_ai.h" |
27 | 27 | ||
28 | #define AMDGPU_VF_ERROR_ENTRY_SIZE 16 | 28 | void amdgpu_vf_error_put(struct amdgpu_device *adev, |
29 | 29 | uint16_t sub_error_code, | |
30 | /* struct error_entry - amdgpu VF error information. */ | 30 | uint16_t error_flags, |
31 | struct amdgpu_vf_error_buffer { | 31 | uint64_t error_data) |
32 | int read_count; | ||
33 | int write_count; | ||
34 | uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE]; | ||
35 | uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE]; | ||
36 | uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE]; | ||
37 | }; | ||
38 | |||
39 | struct amdgpu_vf_error_buffer admgpu_vf_errors; | ||
40 | |||
41 | |||
42 | void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data) | ||
43 | { | 32 | { |
44 | int index; | 33 | int index; |
45 | uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code); | 34 | uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code); |
46 | 35 | ||
47 | index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE; | 36 | mutex_lock(&adev->virt.vf_errors.lock); |
48 | admgpu_vf_errors.code [index] = error_code; | 37 | index = adev->virt.vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE; |
49 | admgpu_vf_errors.flags [index] = error_flags; | 38 | adev->virt.vf_errors.code [index] = error_code; |
50 | admgpu_vf_errors.data [index] = error_data; | 39 | adev->virt.vf_errors.flags [index] = error_flags; |
51 | admgpu_vf_errors.write_count ++; | 40 | adev->virt.vf_errors.data [index] = error_data; |
41 | adev->virt.vf_errors.write_count ++; | ||
42 | mutex_unlock(&adev->virt.vf_errors.lock); | ||
52 | } | 43 | } |
53 | 44 | ||
54 | 45 | ||
@@ -58,7 +49,8 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev) | |||
58 | u32 data1, data2, data3; | 49 | u32 data1, data2, data3; |
59 | int index; | 50 | int index; |
60 | 51 | ||
61 | if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) { | 52 | if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || |
53 | (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) { | ||
62 | return; | 54 | return; |
63 | } | 55 | } |
64 | /* | 56 | /* |
@@ -68,18 +60,22 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev) | |||
68 | return; | 60 | return; |
69 | } | 61 | } |
70 | */ | 62 | */ |
63 | |||
64 | mutex_lock(&adev->virt.vf_errors.lock); | ||
71 | /* The errors are overlay of array, correct read_count as full. */ | 65 | /* The errors are overlay of array, correct read_count as full. */ |
72 | if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) { | 66 | if (adev->virt.vf_errors.write_count - adev->virt.vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) { |
73 | admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE; | 67 | adev->virt.vf_errors.read_count = adev->virt.vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE; |
74 | } | 68 | } |
75 | 69 | ||
76 | while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) { | 70 | while (adev->virt.vf_errors.read_count < adev->virt.vf_errors.write_count) { |
77 | index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE; | 71 | index =adev->virt.vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE; |
78 | data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]); | 72 | data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(adev->virt.vf_errors.code[index], |
79 | data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF; | 73 | adev->virt.vf_errors.flags[index]); |
80 | data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF; | 74 | data2 = adev->virt.vf_errors.data[index] & 0xFFFFFFFF; |
75 | data3 = (adev->virt.vf_errors.data[index] >> 32) & 0xFFFFFFFF; | ||
81 | 76 | ||
82 | adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3); | 77 | adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3); |
83 | admgpu_vf_errors.read_count ++; | 78 | adev->virt.vf_errors.read_count ++; |
84 | } | 79 | } |
80 | mutex_unlock(&adev->virt.vf_errors.lock); | ||
85 | } | 81 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h index 2a3278ec76ba..6436bd053325 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h | |||
@@ -56,7 +56,10 @@ enum AMDGIM_ERROR_CATEGORY { | |||
56 | AMDGIM_ERROR_CATEGORY_MAX | 56 | AMDGIM_ERROR_CATEGORY_MAX |
57 | }; | 57 | }; |
58 | 58 | ||
59 | void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data); | 59 | void amdgpu_vf_error_put(struct amdgpu_device *adev, |
60 | uint16_t sub_error_code, | ||
61 | uint16_t error_flags, | ||
62 | uint64_t error_data); | ||
60 | void amdgpu_vf_error_trans_all (struct amdgpu_device *adev); | 63 | void amdgpu_vf_error_trans_all (struct amdgpu_device *adev); |
61 | 64 | ||
62 | #endif /* __VF_ERROR_H__ */ | 65 | #endif /* __VF_ERROR_H__ */ |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index afcfb8bcfb65..e5fd0ff6b29d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | |||
@@ -36,6 +36,18 @@ struct amdgpu_mm_table { | |||
36 | uint64_t gpu_addr; | 36 | uint64_t gpu_addr; |
37 | }; | 37 | }; |
38 | 38 | ||
39 | #define AMDGPU_VF_ERROR_ENTRY_SIZE 16 | ||
40 | |||
41 | /* struct error_entry - amdgpu VF error information. */ | ||
42 | struct amdgpu_vf_error_buffer { | ||
43 | struct mutex lock; | ||
44 | int read_count; | ||
45 | int write_count; | ||
46 | uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE]; | ||
47 | uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE]; | ||
48 | uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE]; | ||
49 | }; | ||
50 | |||
39 | /** | 51 | /** |
40 | * struct amdgpu_virt_ops - amdgpu device virt operations | 52 | * struct amdgpu_virt_ops - amdgpu device virt operations |
41 | */ | 53 | */ |
@@ -59,6 +71,7 @@ struct amdgpu_virt { | |||
59 | struct work_struct flr_work; | 71 | struct work_struct flr_work; |
60 | struct amdgpu_mm_table mm_table; | 72 | struct amdgpu_mm_table mm_table; |
61 | const struct amdgpu_virt_ops *ops; | 73 | const struct amdgpu_virt_ops *ops; |
74 | struct amdgpu_vf_error_buffer vf_errors; | ||
62 | }; | 75 | }; |
63 | 76 | ||
64 | #define AMDGPU_CSA_SIZE (8 * 1024) | 77 | #define AMDGPU_CSA_SIZE (8 * 1024) |