Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1247
1 file changed, 1247 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
new file mode 100644
index 000000000000..d7c7090fade9
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -0,0 +1,1247 @@
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"

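/*
 * Per-block interrupt context: the top half copies IV entries into this
 * ring buffer and schedules ih_work; the bottom half drains the ring and
 * hands each entry to the IP's callback (cb).
 */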
struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* full of entries */
	unsigned char *ring;
	unsigned int ring_size;
	unsigned int element_size;
	unsigned int aligned_element_size;
	unsigned int rptr;
	unsigned int wptr;
};

struct ras_fs_data {
	char sysfs_name[32];
	char debugfs_name[32];
};

struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
};

struct ras_err_handler_data {
	/* point to bad pages array */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* the count of entries */
	int count;
	/* the space left for new entries */
	int space_left;
	/* last reserved entry's index + 1 */
	int last_reserved;
};

struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* debugfs */
	struct dentry *ent;
	/* sysfs */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

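/*
 * RAS error types are assumed to be single-bit flags (AMDGPU_RAS_ERROR__*),
 * so ffs() maps a type to its index in ras_error_string[] above.
 */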
#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO */
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

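/*
 * Debugfs write handler for error injection.  The expected input is an
 * optional error type prefix ("ue" or "ce") followed by an address and a
 * value, both either hex ("0x...") or decimal, e.g. "ue 0x0 0x1".
 */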
static ssize_t amdgpu_ras_debugfs_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_inject_if info = {
		.head = obj->head,
	};
	char val[64];
	char *str = val;
	/* leave room for a NUL terminator so the parsing below sees a string */
	ssize_t s = min_t(u64, sizeof(val) - 1, size);

	memset(val, 0, sizeof(val));

	if (*pos)
		return -EINVAL;

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	/* only care about ue/ce for now */
	if (memcmp(str, "ue", 2) == 0) {
		info.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		str += 2;
	} else if (memcmp(str, "ce", 2) == 0) {
		info.head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		str += 2;
	}

	if (sscanf(str, "0x%llx 0x%llx", &info.address, &info.value) != 2) {
		if (sscanf(str, "%llu %llu", &info.address, &info.value) != 2)
			return -EINVAL;
	}

	*pos = s;

	if (amdgpu_ras_error_inject(obj->adev, &info))
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = amdgpu_ras_debugfs_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

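/*
 * RAS objects live in the con->objs array, one static slot per RAS block,
 * and are reference counted through obj->use; put_obj() unlinks the object
 * from con->head once the last user drops it.
 */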
#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists. return obj instead? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	return amdgpu_ras_enable && (amdgpu_ras_mask & BIT(head->block));
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we create obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = head->block,
			.error_type = head->type,
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = head->block,
			.error_type = head->type,
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enables ras for us,
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;
	/* TODO: might read the registers to get the counts */

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = info->head.block,
		.inject_error_type = info->head.type,
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */

/* sysfs begin */

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}

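/*
 * Create the "ras" sysfs group on the device with a read-only "features"
 * attribute, backed by amdgpu_ras_sysfs_features_read() above.
 */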
static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
			&obj->sysfs_attr.attr,
			"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/* debugfs begin */
int amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
	struct dentry *ent;

	if (!obj || obj->ent)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	ent = debugfs_create_file(obj->fs_data.debugfs_name,
			S_IWUGO | S_IRUGO, con->dir,
			obj, &amdgpu_ras_debugfs_ops);

	if (IS_ERR(ent))
		return -EINVAL;

	obj->ent = ent;

	return 0;
}

int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return 0;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->dir);
	con->dir = NULL;

	return 0;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;
	struct dentry *root = minor->debugfs_root, *dir;

	dir = debugfs_create_dir("ras", root);
	if (IS_ERR(dir))
		return -EINVAL;

	con->dir = dir;

	amdgpu_ras_sysfs_create_feature_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data, maybe we need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* A ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave the recovery to the IP; here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need to get the ce count from a register, but
			 * not all IPs save a ce count; some just use one or
			 * two bits to indicate that a ce happened.
			 */
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

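/*
 * Called from an IP's interrupt handler (top half): copy the IV entry into
 * the per-object ring buffer and kick the bottom-half work item.
 */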
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* The ring might overflow here... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* recovery begin */
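/*
 * Bad-page handling: amdgpu_ras_add_bad_pages() records faulty VRAM page
 * frame numbers in con->eh_data, and amdgpu_ras_reserve_bad_pages() pins a
 * buffer object over each recorded page so it can no longer be handed out.
 */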
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* no need to free it actually. */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}

/* reserve vram with size@offset */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	int i = pages;
	int ret = 0;

	if (!con || !data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !data)
		return 0;

	mutex_lock(&con->recovery_lock);
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !data)
		return 0;

	mutex_lock(&con->recovery_lock);
	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_ras_release_vram(adev, &bo);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

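/*
 * Table of device/revision IDs known to support RAS, together with the
 * mask of RAS blocks enabled for them; amdgpu_ras_check_supported() below
 * still needs to be switched to a vbios table check (see its TODO).
 */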
struct ras_DID_capability {
	u16 did;
	u8 rid;
	u32 capability;
};

static const struct ras_DID_capability supported_DID_array[] = {
	{0x66a0, 0x00, AMDGPU_RAS_BLOCK_MASK},
	{0x66a0, 0x02, AMDGPU_RAS_BLOCK_MASK},
	{0x66a1, 0x00, AMDGPU_RAS_BLOCK_MASK},
	{0x66a1, 0x01, AMDGPU_RAS_BLOCK_MASK},
	{0x66a1, 0x04, AMDGPU_RAS_BLOCK_MASK},
	{0x66a3, 0x00, AMDGPU_RAS_BLOCK_MASK},
	{0x66a7, 0x00, AMDGPU_RAS_BLOCK_MASK},
};

static uint32_t amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
	/* TODO need check vbios table */
	int i;
	int did = adev->pdev->device;
	int rid = adev->pdev->revision;

	for (i = 0; i < ARRAY_SIZE(supported_DID_array); i++) {
		if (did == supported_DID_array[i].did &&
				rid == supported_DID_array[i].rid) {
			return supported_DID_array[i].capability;
		}
	}
	return 0;
}

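/*
 * Set up the RAS context: allocate the per-block object array, initialize
 * bad-page recovery, enable all supported features and create the
 * sysfs/debugfs nodes.
 */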
int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	uint32_t supported = amdgpu_ras_check_supported(adev);

	if (con || supported == 0)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	con->supported = supported;
	con->features = 0;
	INIT_LIST_HEAD(&con->head);

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	amdgpu_ras_enable_all_features(adev, 1);

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	amdgpu_ras_self_test(adev);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* do some fini work before IP fini as a dependency */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}