aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c232
1 files changed, 219 insertions, 13 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 22bd21efe6b1..7c8a4aedf07c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -90,6 +90,12 @@ struct ras_manager {
90 struct ras_err_data err_data; 90 struct ras_err_data err_data;
91}; 91};
92 92
/*
 * One bad-page record in the form handed to the gpu_vram_bad_pages sysfs
 * reader; amdgpu_ras_badpages_read() fills an array of these.
 */
93struct ras_badpage {
94 unsigned int bp; /* copied from the handler's bps[i].bp; printed as the gpu pfn */
95 unsigned int size; /* filled with AMDGPU_GPU_PAGE_SIZE by amdgpu_ras_badpages_read() */
96 unsigned int flags; /* 0, 1 or 2; rendered as "R", "P" or "F" by amdgpu_ras_badpage_flags_str() */
97};
98
93const char *ras_error_string[] = { 99const char *ras_error_string[] = {
94 "none", 100 "none",
95 "parity", 101 "parity",
@@ -118,7 +124,8 @@ const char *ras_block_string[] = {
118#define ras_err_str(i) (ras_error_string[ffs(i)]) 124#define ras_err_str(i) (ras_error_string[ffs(i)])
119#define ras_block_str(i) (ras_block_string[i]) 125#define ras_block_str(i) (ras_block_string[i])
120 126
121#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 127#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
128#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
122#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) 129#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
123 130
124static void amdgpu_ras_self_test(struct amdgpu_device *adev) 131static void amdgpu_ras_self_test(struct amdgpu_device *adev)
@@ -237,8 +244,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
237 244
238 return 0; 245 return 0;
239} 246}
240/* 247/**
241 * DOC: ras debugfs control interface 248 * DOC: AMDGPU RAS debugfs control interface
242 * 249 *
243 * It accepts struct ras_debug_if who has two members. 250 * It accepts struct ras_debug_if who has two members.
244 * 251 *
@@ -521,6 +528,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
521 enable ? "enable":"disable", 528 enable ? "enable":"disable",
522 ras_block_str(head->block), 529 ras_block_str(head->block),
523 ret); 530 ret);
531 if (ret == TA_RAS_STATUS__RESET_NEEDED)
532 return -EAGAIN;
524 return -EINVAL; 533 return -EINVAL;
525 } 534 }
526 535
@@ -541,16 +550,32 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
541 return -EINVAL; 550 return -EINVAL;
542 551
543 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { 552 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
544 /* If ras is enabled by vbios, we set up ras object first in 553 if (enable) {
545 * both case. For enable, that is all what we need do. For 554 /* There is no harm to issue a ras TA cmd regardless of
546 * disable, we need perform a ras TA disable cmd after that. 555 * the current ras state.
547 */ 556 * If current state == target state, it will do nothing
548 ret = __amdgpu_ras_feature_enable(adev, head, 1); 557 * But sometimes it requests driver to reset and repost
549 if (ret) 558 * with error code -EAGAIN.
550 return ret; 559 */
560 ret = amdgpu_ras_feature_enable(adev, head, 1);
561 /* With old ras TA, we might fail to enable ras.
562 * Log it and just setup the object.
563 * TODO need remove this WA in the future.
564 */
565 if (ret == -EINVAL) {
566 ret = __amdgpu_ras_feature_enable(adev, head, 1);
567 if (!ret)
568 DRM_INFO("RAS INFO: %s setup object\n",
569 ras_block_str(head->block));
570 }
571 } else {
572 /* setup the object then issue a ras TA disable cmd.*/
573 ret = __amdgpu_ras_feature_enable(adev, head, 1);
574 if (ret)
575 return ret;
551 576
552 if (!enable)
553 ret = amdgpu_ras_feature_enable(adev, head, 0); 577 ret = amdgpu_ras_feature_enable(adev, head, 0);
578 }
554 } else 579 } else
555 ret = amdgpu_ras_feature_enable(adev, head, enable); 580 ret = amdgpu_ras_feature_enable(adev, head, enable);
556 581
@@ -691,6 +716,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
691 716
692/* sysfs begin */ 717/* sysfs begin */
693 718
719static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
720 struct ras_badpage **bps, unsigned int *count);
721
/*
 * Translate a bad-page state code into the one-letter tag used by the
 * gpu_vram_bad_pages sysfs file.
 *
 * @flags: 0 -> "R", 1 -> "P", 2 or any other value -> "F".
 *
 * Returns a pointer to a string literal; the result is const because the
 * storage must never be modified or freed by the caller.
 */
static const char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case 0:
		return "R";
	case 1:
		return "P";
	case 2:
	default:
		/* Unknown codes are reported the same way as code 2. */
		return "F";
	}
}
734
735/*
736 * DOC: ras sysfs gpu_vram_bad_pages interface
737 *
738 * It allows user to read the bad pages of vram on the gpu through
739 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
740 *
741 * It outputs multiple lines, and each line stands for one gpu page.
742 *
743 * The format of one line is below,
744 * gpu pfn : gpu page size : flags
745 *
746 * gpu pfn and gpu page size are printed in hex format.
747 * flags can be one of below character,
748 * R: reserved, this gpu page is reserved and cannot be used.
749 * P: pending for reserve, this gpu page is marked as bad and will be reserved
750 * in the next window of page_reserve.
751 * F: unable to reserve, this gpu page can't be reserved for some reason.
752 *
753 * examples:
754 * 0x00000001 : 0x00001000 : R
755 * 0x00000002 : 0x00001000 : P
756 */
757
758static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
759 struct kobject *kobj, struct bin_attribute *attr,
760 char *buf, loff_t ppos, size_t count)
761{
762 struct amdgpu_ras *con =
763 container_of(attr, struct amdgpu_ras, badpages_attr);
764 struct amdgpu_device *adev = con->adev;
765 const unsigned int element_size =
766 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
767 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
768 unsigned int end = div64_ul(ppos + count - 1, element_size);
769 ssize_t s = 0;
770 struct ras_badpage *bps = NULL;
771 unsigned int bps_count = 0;
772
773 memset(buf, 0, count);
774
775 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
776 return 0;
777
778 for (; start < end && start < bps_count; start++)
779 s += scnprintf(&buf[s], element_size + 1,
780 "0x%08x : 0x%08x : %1s\n",
781 bps[start].bp,
782 bps[start].size,
783 amdgpu_ras_badpage_flags_str(bps[start].flags));
784
785 kfree(bps);
786
787 return s;
788}
789
694static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, 790static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
695 struct device_attribute *attr, char *buf) 791 struct device_attribute *attr, char *buf)
696{ 792{
@@ -731,9 +827,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
731 &con->features_attr.attr, 827 &con->features_attr.attr,
732 NULL 828 NULL
733 }; 829 };
830 struct bin_attribute *bin_attrs[] = {
831 &con->badpages_attr,
832 NULL
833 };
734 struct attribute_group group = { 834 struct attribute_group group = {
735 .name = "ras", 835 .name = "ras",
736 .attrs = attrs, 836 .attrs = attrs,
837 .bin_attrs = bin_attrs,
737 }; 838 };
738 839
739 con->features_attr = (struct device_attribute) { 840 con->features_attr = (struct device_attribute) {
@@ -743,7 +844,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
743 }, 844 },
744 .show = amdgpu_ras_sysfs_features_read, 845 .show = amdgpu_ras_sysfs_features_read,
745 }; 846 };
847
848 con->badpages_attr = (struct bin_attribute) {
849 .attr = {
850 .name = "gpu_vram_bad_pages",
851 .mode = S_IRUGO,
852 },
853 .size = 0,
854 .private = NULL,
855 .read = amdgpu_ras_sysfs_badpages_read,
856 };
857
746 sysfs_attr_init(attrs[0]); 858 sysfs_attr_init(attrs[0]);
859 sysfs_bin_attr_init(bin_attrs[0]);
747 860
748 return sysfs_create_group(&adev->dev->kobj, &group); 861 return sysfs_create_group(&adev->dev->kobj, &group);
749} 862}
@@ -755,9 +868,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
755 &con->features_attr.attr, 868 &con->features_attr.attr,
756 NULL 869 NULL
757 }; 870 };
871 struct bin_attribute *bin_attrs[] = {
872 &con->badpages_attr,
873 NULL
874 };
758 struct attribute_group group = { 875 struct attribute_group group = {
759 .name = "ras", 876 .name = "ras",
760 .attrs = attrs, 877 .attrs = attrs,
878 .bin_attrs = bin_attrs,
761 }; 879 };
762 880
763 sysfs_remove_group(&adev->dev->kobj, &group); 881 sysfs_remove_group(&adev->dev->kobj, &group);
@@ -1089,6 +1207,53 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1089/* ih end */ 1207/* ih end */
1090 1208
1091/* recovery begin */ 1209/* recovery begin */
1210
1211/* return 0 on success.
1212 * caller need free bps.
1213 */
1214static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1215 struct ras_badpage **bps, unsigned int *count)
1216{
1217 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1218 struct ras_err_handler_data *data;
1219 int i = 0;
1220 int ret = 0;
1221
1222 if (!con || !con->eh_data || !bps || !count)
1223 return -EINVAL;
1224
1225 mutex_lock(&con->recovery_lock);
1226 data = con->eh_data;
1227 if (!data || data->count == 0) {
1228 *bps = NULL;
1229 goto out;
1230 }
1231
1232 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1233 if (!*bps) {
1234 ret = -ENOMEM;
1235 goto out;
1236 }
1237
1238 for (; i < data->count; i++) {
1239 (*bps)[i] = (struct ras_badpage){
1240 .bp = data->bps[i].bp,
1241 .size = AMDGPU_GPU_PAGE_SIZE,
1242 .flags = 0,
1243 };
1244
1245 if (data->last_reserved <= i)
1246 (*bps)[i].flags = 1;
1247 else if (data->bps[i].bo == NULL)
1248 (*bps)[i].flags = 2;
1249 }
1250
1251 *count = data->count;
1252out:
1253 mutex_unlock(&con->recovery_lock);
1254 return ret;
1255}
1256
1092static void amdgpu_ras_do_recovery(struct work_struct *work) 1257static void amdgpu_ras_do_recovery(struct work_struct *work)
1093{ 1258{
1094 struct amdgpu_ras *ras = 1259 struct amdgpu_ras *ras =
@@ -1340,6 +1505,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1340} 1505}
1341/* recovery end */ 1506/* recovery end */
1342 1507
1508/* return 0 if ras will reset gpu and repost.*/
1509int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1510 unsigned int block)
1511{
1512 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1513
1514 if (!ras)
1515 return -EINVAL;
1516
1517 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1518 return 0;
1519}
1520
1343/* 1521/*
1344 * check hardware's ras ability which will be saved in hw_supported. 1522 * check hardware's ras ability which will be saved in hw_supported.
1345 * if hardware does not support ras, we can skip some ras initialization and 1523 * if hardware does not support ras, we can skip some ras initialization and
@@ -1415,8 +1593,10 @@ recovery_out:
1415 return -EINVAL; 1593 return -EINVAL;
1416} 1594}
1417 1595
1418/* do some init work after IP late init as dependence */ 1596/* do some init work after IP late init as dependence.
1419void amdgpu_ras_post_init(struct amdgpu_device *adev) 1597 * and it runs in resume/gpu reset/booting up cases.
1598 */
1599void amdgpu_ras_resume(struct amdgpu_device *adev)
1420{ 1600{
1421 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1601 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1422 struct ras_manager *obj, *tmp; 1602 struct ras_manager *obj, *tmp;
@@ -1444,6 +1624,32 @@ void amdgpu_ras_post_init(struct amdgpu_device *adev)
1444 } 1624 }
1445 } 1625 }
1446 } 1626 }
1627
1628 if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
1629 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1630 /* setup ras obj state as disabled.
1631 * for init_by_vbios case.
1632 * if we want to enable ras, just enable it in a normal way.
1633 * If we want to disable it, we need to set up the ras obj as enabled,
1634 * then issue another TA disable cmd.
1635 * See feature_enable_on_boot
1636 */
1637 amdgpu_ras_disable_all_features(adev, 1);
1638 amdgpu_ras_reset_gpu(adev, 0);
1639 }
1640}
1641
1642void amdgpu_ras_suspend(struct amdgpu_device *adev)
1643{
1644 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1645
1646 if (!con)
1647 return;
1648
1649 amdgpu_ras_disable_all_features(adev, 0);
1650 /* Make sure all ras objects are disabled. */
1651 if (con->features)
1652 amdgpu_ras_disable_all_features(adev, 1);
1447} 1653}
1448 1654
1449/* do some fini work before IP fini as dependence */ 1655/* do some fini work before IP fini as dependence */