aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJonathan Kim <jonathan.kim@amd.com>2019-07-11 13:14:02 -0400
committerAlex Deucher <alexander.deucher@amd.com>2019-07-31 00:22:34 -0400
commit24f9aacfb0fbe724c94f6ffe24cc518bfdca4b1d (patch)
tree747e834609304eaf85977d6cf5ec3b12fbded112
parent64671c0fdc9193978cb93aaa79965e45b3cce437 (diff)
drm/amdgpu: adding xgmi error monitoring
Monitor xGMI errors via MC PIE status through the FICA registers.

Signed-off-by: Jonathan Kim <Jonathan.Kim@amd.com>
Reviewed-by: Kent Russell <Kent.Russell@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c38
1 files changed, 36 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 4e8d60eec0fe..65aae75f80fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -25,7 +25,7 @@
25#include "amdgpu.h" 25#include "amdgpu.h"
26#include "amdgpu_xgmi.h" 26#include "amdgpu_xgmi.h"
27#include "amdgpu_smu.h" 27#include "amdgpu_smu.h"
28 28#include "df/df_3_6_offset.h"
29 29
30static DEFINE_MUTEX(xgmi_mutex); 30static DEFINE_MUTEX(xgmi_mutex);
31 31
@@ -131,9 +131,37 @@ static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
131 131
132} 132}
133 133
134#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
135static ssize_t amdgpu_xgmi_show_error(struct device *dev,
136 struct device_attribute *attr,
137 char *buf)
138{
139 struct drm_device *ddev = dev_get_drvdata(dev);
140 struct amdgpu_device *adev = ddev->dev_private;
141 uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
142 uint64_t fica_out;
143 unsigned int error_count = 0;
144
145 ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
146 ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
134 147
135static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); 148 fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_ctl_in);
149 if (fica_out != 0x1f)
150 pr_err("xGMI error counters not enabled!\n");
151
152 fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_status_in);
153
154 if ((fica_out & 0xffff) == 2)
155 error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
136 156
157 adev->df_funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
158
159 return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
160}
161
162
163static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
164static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
137 165
138static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, 166static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
139 struct amdgpu_hive_info *hive) 167 struct amdgpu_hive_info *hive)
@@ -148,6 +176,12 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
148 return ret; 176 return ret;
149 } 177 }
150 178
179 /* Create xgmi error file */
180 ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
181 if (ret)
182 pr_err("failed to create xgmi_error\n");
183
184
151 /* Create sysfs link to hive info folder on the first device */ 185 /* Create sysfs link to hive info folder on the first device */
152 if (adev != hive->adev) { 186 if (adev != hive->adev) {
153 ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, 187 ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,