diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 232 |
1 files changed, 219 insertions, 13 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 22bd21efe6b1..7c8a4aedf07c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |||
@@ -90,6 +90,12 @@ struct ras_manager { | |||
90 | struct ras_err_data err_data; | 90 | struct ras_err_data err_data; |
91 | }; | 91 | }; |
92 | 92 | ||
93 | struct ras_badpage { | ||
94 | unsigned int bp; | ||
95 | unsigned int size; | ||
96 | unsigned int flags; | ||
97 | }; | ||
98 | |||
93 | const char *ras_error_string[] = { | 99 | const char *ras_error_string[] = { |
94 | "none", | 100 | "none", |
95 | "parity", | 101 | "parity", |
@@ -118,7 +124,8 @@ const char *ras_block_string[] = { | |||
118 | #define ras_err_str(i) (ras_error_string[ffs(i)]) | 124 | #define ras_err_str(i) (ras_error_string[ffs(i)]) |
119 | #define ras_block_str(i) (ras_block_string[i]) | 125 | #define ras_block_str(i) (ras_block_string[i]) |
120 | 126 | ||
121 | #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 | 127 | #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 |
128 | #define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2 | ||
122 | #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) | 129 | #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) |
123 | 130 | ||
124 | static void amdgpu_ras_self_test(struct amdgpu_device *adev) | 131 | static void amdgpu_ras_self_test(struct amdgpu_device *adev) |
@@ -237,8 +244,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, | |||
237 | 244 | ||
238 | return 0; | 245 | return 0; |
239 | } | 246 | } |
240 | /* | 247 | /** |
241 | * DOC: ras debugfs control interface | 248 | * DOC: AMDGPU RAS debugfs control interface |
242 | * | 249 | * |
243 | * It accepts struct ras_debug_if which has two members. | 250 | * It accepts struct ras_debug_if which has two members. |
244 | * | 251 | * |
@@ -521,6 +528,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, | |||
521 | enable ? "enable":"disable", | 528 | enable ? "enable":"disable", |
522 | ras_block_str(head->block), | 529 | ras_block_str(head->block), |
523 | ret); | 530 | ret); |
531 | if (ret == TA_RAS_STATUS__RESET_NEEDED) | ||
532 | return -EAGAIN; | ||
524 | return -EINVAL; | 533 | return -EINVAL; |
525 | } | 534 | } |
526 | 535 | ||
@@ -541,16 +550,32 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, | |||
541 | return -EINVAL; | 550 | return -EINVAL; |
542 | 551 | ||
543 | if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { | 552 | if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { |
544 | /* If ras is enabled by vbios, we set up ras object first in | 553 | if (enable) { |
545 | * both case. For enable, that is all what we need do. For | 554 | /* There is no harm to issue a ras TA cmd regardless of |
546 | * disable, we need perform a ras TA disable cmd after that. | 555 | * the current ras state. |
547 | */ | 556 | * If current state == target state, it will do nothing |
548 | ret = __amdgpu_ras_feature_enable(adev, head, 1); | 557 | * But sometimes it requests driver to reset and repost |
549 | if (ret) | 558 | * with error code -EAGAIN. |
550 | return ret; | 559 | */ |
560 | ret = amdgpu_ras_feature_enable(adev, head, 1); | ||
561 | /* With old ras TA, we might fail to enable ras. | ||
562 | * Log it and just setup the object. | ||
563 | * TODO need remove this WA in the future. | ||
564 | */ | ||
565 | if (ret == -EINVAL) { | ||
566 | ret = __amdgpu_ras_feature_enable(adev, head, 1); | ||
567 | if (!ret) | ||
568 | DRM_INFO("RAS INFO: %s setup object\n", | ||
569 | ras_block_str(head->block)); | ||
570 | } | ||
571 | } else { | ||
572 | /* setup the object then issue a ras TA disable cmd.*/ | ||
573 | ret = __amdgpu_ras_feature_enable(adev, head, 1); | ||
574 | if (ret) | ||
575 | return ret; | ||
551 | 576 | ||
552 | if (!enable) | ||
553 | ret = amdgpu_ras_feature_enable(adev, head, 0); | 577 | ret = amdgpu_ras_feature_enable(adev, head, 0); |
578 | } | ||
554 | } else | 579 | } else |
555 | ret = amdgpu_ras_feature_enable(adev, head, enable); | 580 | ret = amdgpu_ras_feature_enable(adev, head, enable); |
556 | 581 | ||
@@ -691,6 +716,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, | |||
691 | 716 | ||
692 | /* sysfs begin */ | 717 | /* sysfs begin */ |
693 | 718 | ||
719 | static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, | ||
720 | struct ras_badpage **bps, unsigned int *count); | ||
721 | |||
722 | static char *amdgpu_ras_badpage_flags_str(unsigned int flags) | ||
723 | { | ||
724 | switch (flags) { | ||
725 | case 0: | ||
726 | return "R"; | ||
727 | case 1: | ||
728 | return "P"; | ||
729 | case 2: | ||
730 | default: | ||
731 | return "F"; | ||
732 | }; | ||
733 | } | ||
734 | |||
735 | /* | ||
736 | * DOC: ras sysfs gpu_vram_bad_pages interface | ||
737 | * | ||
738 | * It allows user to read the bad pages of vram on the gpu through | ||
739 | * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages | ||
740 | * | ||
741 | * It outputs multiple lines, and each line stands for one gpu page. | ||
742 | * | ||
743 | * The format of one line is below, | ||
744 | * gpu pfn : gpu page size : flags | ||
745 | * | ||
746 | * gpu pfn and gpu page size are printed in hex format. | ||
747 | * flags can be one of below character, | ||
748 | * R: reserved, this gpu page is reserved and not able to use. | ||
749 | * P: pending for reserve, this gpu page is marked as bad, will be reserved | ||
750 | * in next window of page_reserve. | ||
751 | * F: unable to reserve. this gpu page can't be reserved due to some reasons. | ||
752 | * | ||
753 | * examples: | ||
754 | * 0x00000001 : 0x00001000 : R | ||
755 | * 0x00000002 : 0x00001000 : P | ||
756 | */ | ||
757 | |||
758 | static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, | ||
759 | struct kobject *kobj, struct bin_attribute *attr, | ||
760 | char *buf, loff_t ppos, size_t count) | ||
761 | { | ||
762 | struct amdgpu_ras *con = | ||
763 | container_of(attr, struct amdgpu_ras, badpages_attr); | ||
764 | struct amdgpu_device *adev = con->adev; | ||
765 | const unsigned int element_size = | ||
766 | sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; | ||
767 | unsigned int start = div64_ul(ppos + element_size - 1, element_size); | ||
768 | unsigned int end = div64_ul(ppos + count - 1, element_size); | ||
769 | ssize_t s = 0; | ||
770 | struct ras_badpage *bps = NULL; | ||
771 | unsigned int bps_count = 0; | ||
772 | |||
773 | memset(buf, 0, count); | ||
774 | |||
775 | if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) | ||
776 | return 0; | ||
777 | |||
778 | for (; start < end && start < bps_count; start++) | ||
779 | s += scnprintf(&buf[s], element_size + 1, | ||
780 | "0x%08x : 0x%08x : %1s\n", | ||
781 | bps[start].bp, | ||
782 | bps[start].size, | ||
783 | amdgpu_ras_badpage_flags_str(bps[start].flags)); | ||
784 | |||
785 | kfree(bps); | ||
786 | |||
787 | return s; | ||
788 | } | ||
789 | |||
694 | static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, | 790 | static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, |
695 | struct device_attribute *attr, char *buf) | 791 | struct device_attribute *attr, char *buf) |
696 | { | 792 | { |
@@ -731,9 +827,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) | |||
731 | &con->features_attr.attr, | 827 | &con->features_attr.attr, |
732 | NULL | 828 | NULL |
733 | }; | 829 | }; |
830 | struct bin_attribute *bin_attrs[] = { | ||
831 | &con->badpages_attr, | ||
832 | NULL | ||
833 | }; | ||
734 | struct attribute_group group = { | 834 | struct attribute_group group = { |
735 | .name = "ras", | 835 | .name = "ras", |
736 | .attrs = attrs, | 836 | .attrs = attrs, |
837 | .bin_attrs = bin_attrs, | ||
737 | }; | 838 | }; |
738 | 839 | ||
739 | con->features_attr = (struct device_attribute) { | 840 | con->features_attr = (struct device_attribute) { |
@@ -743,7 +844,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) | |||
743 | }, | 844 | }, |
744 | .show = amdgpu_ras_sysfs_features_read, | 845 | .show = amdgpu_ras_sysfs_features_read, |
745 | }; | 846 | }; |
847 | |||
848 | con->badpages_attr = (struct bin_attribute) { | ||
849 | .attr = { | ||
850 | .name = "gpu_vram_bad_pages", | ||
851 | .mode = S_IRUGO, | ||
852 | }, | ||
853 | .size = 0, | ||
854 | .private = NULL, | ||
855 | .read = amdgpu_ras_sysfs_badpages_read, | ||
856 | }; | ||
857 | |||
746 | sysfs_attr_init(attrs[0]); | 858 | sysfs_attr_init(attrs[0]); |
859 | sysfs_bin_attr_init(bin_attrs[0]); | ||
747 | 860 | ||
748 | return sysfs_create_group(&adev->dev->kobj, &group); | 861 | return sysfs_create_group(&adev->dev->kobj, &group); |
749 | } | 862 | } |
@@ -755,9 +868,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) | |||
755 | &con->features_attr.attr, | 868 | &con->features_attr.attr, |
756 | NULL | 869 | NULL |
757 | }; | 870 | }; |
871 | struct bin_attribute *bin_attrs[] = { | ||
872 | &con->badpages_attr, | ||
873 | NULL | ||
874 | }; | ||
758 | struct attribute_group group = { | 875 | struct attribute_group group = { |
759 | .name = "ras", | 876 | .name = "ras", |
760 | .attrs = attrs, | 877 | .attrs = attrs, |
878 | .bin_attrs = bin_attrs, | ||
761 | }; | 879 | }; |
762 | 880 | ||
763 | sysfs_remove_group(&adev->dev->kobj, &group); | 881 | sysfs_remove_group(&adev->dev->kobj, &group); |
@@ -1089,6 +1207,53 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) | |||
1089 | /* ih end */ | 1207 | /* ih end */ |
1090 | 1208 | ||
1091 | /* recovery begin */ | 1209 | /* recovery begin */ |
1210 | |||
1211 | /* return 0 on success. | ||
1212 | * caller needs to free bps. | ||
1213 | */ | ||
1214 | static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, | ||
1215 | struct ras_badpage **bps, unsigned int *count) | ||
1216 | { | ||
1217 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | ||
1218 | struct ras_err_handler_data *data; | ||
1219 | int i = 0; | ||
1220 | int ret = 0; | ||
1221 | |||
1222 | if (!con || !con->eh_data || !bps || !count) | ||
1223 | return -EINVAL; | ||
1224 | |||
1225 | mutex_lock(&con->recovery_lock); | ||
1226 | data = con->eh_data; | ||
1227 | if (!data || data->count == 0) { | ||
1228 | *bps = NULL; | ||
1229 | goto out; | ||
1230 | } | ||
1231 | |||
1232 | *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); | ||
1233 | if (!*bps) { | ||
1234 | ret = -ENOMEM; | ||
1235 | goto out; | ||
1236 | } | ||
1237 | |||
1238 | for (; i < data->count; i++) { | ||
1239 | (*bps)[i] = (struct ras_badpage){ | ||
1240 | .bp = data->bps[i].bp, | ||
1241 | .size = AMDGPU_GPU_PAGE_SIZE, | ||
1242 | .flags = 0, | ||
1243 | }; | ||
1244 | |||
1245 | if (data->last_reserved <= i) | ||
1246 | (*bps)[i].flags = 1; | ||
1247 | else if (data->bps[i].bo == NULL) | ||
1248 | (*bps)[i].flags = 2; | ||
1249 | } | ||
1250 | |||
1251 | *count = data->count; | ||
1252 | out: | ||
1253 | mutex_unlock(&con->recovery_lock); | ||
1254 | return ret; | ||
1255 | } | ||
1256 | |||
1092 | static void amdgpu_ras_do_recovery(struct work_struct *work) | 1257 | static void amdgpu_ras_do_recovery(struct work_struct *work) |
1093 | { | 1258 | { |
1094 | struct amdgpu_ras *ras = | 1259 | struct amdgpu_ras *ras = |
@@ -1340,6 +1505,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) | |||
1340 | } | 1505 | } |
1341 | /* recovery end */ | 1506 | /* recovery end */ |
1342 | 1507 | ||
1508 | /* return 0 if ras will reset gpu and repost.*/ | ||
1509 | int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, | ||
1510 | unsigned int block) | ||
1511 | { | ||
1512 | struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); | ||
1513 | |||
1514 | if (!ras) | ||
1515 | return -EINVAL; | ||
1516 | |||
1517 | ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET; | ||
1518 | return 0; | ||
1519 | } | ||
1520 | |||
1343 | /* | 1521 | /* |
1344 | * check hardware's ras ability which will be saved in hw_supported. | 1522 | * check hardware's ras ability which will be saved in hw_supported. |
1345 | * if hardware does not support ras, we can skip some ras initialization and | 1523 | * if hardware does not support ras, we can skip some ras initialization and |
@@ -1415,8 +1593,10 @@ recovery_out: | |||
1415 | return -EINVAL; | 1593 | return -EINVAL; |
1416 | } | 1594 | } |
1417 | 1595 | ||
1418 | /* do some init work after IP late init as dependence */ | 1596 | /* do some init work after IP late init as dependence. |
1419 | void amdgpu_ras_post_init(struct amdgpu_device *adev) | 1597 | * and it runs in resume/gpu reset/booting up cases. |
1598 | */ | ||
1599 | void amdgpu_ras_resume(struct amdgpu_device *adev) | ||
1420 | { | 1600 | { |
1421 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | 1601 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); |
1422 | struct ras_manager *obj, *tmp; | 1602 | struct ras_manager *obj, *tmp; |
@@ -1444,6 +1624,32 @@ void amdgpu_ras_post_init(struct amdgpu_device *adev) | |||
1444 | } | 1624 | } |
1445 | } | 1625 | } |
1446 | } | 1626 | } |
1627 | |||
1628 | if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) { | ||
1629 | con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET; | ||
1630 | /* setup ras obj state as disabled. | ||
1631 | * for init_by_vbios case. | ||
1632 | * if we want to enable ras, just enable it in a normal way. | ||
1633 | * If we want to disable it, need setup ras obj as enabled, | ||
1634 | * then issue another TA disable cmd. | ||
1635 | * See feature_enable_on_boot | ||
1636 | */ | ||
1637 | amdgpu_ras_disable_all_features(adev, 1); | ||
1638 | amdgpu_ras_reset_gpu(adev, 0); | ||
1639 | } | ||
1640 | } | ||
1641 | |||
1642 | void amdgpu_ras_suspend(struct amdgpu_device *adev) | ||
1643 | { | ||
1644 | struct amdgpu_ras *con = amdgpu_ras_get_context(adev); | ||
1645 | |||
1646 | if (!con) | ||
1647 | return; | ||
1648 | |||
1649 | amdgpu_ras_disable_all_features(adev, 0); | ||
1650 | /* Make sure all ras objects are disabled. */ | ||
1651 | if (con->features) | ||
1652 | amdgpu_ras_disable_all_features(adev, 1); | ||
1447 | } | 1653 | } |
1448 | 1654 | ||
1449 | /* do some fini work before IP fini as dependence */ | 1655 | /* do some fini work before IP fini as dependence */ |