diff options
| author | Jonathan Kim <jonathan.kim@amd.com> | 2019-07-11 13:14:02 -0400 |
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2019-07-31 00:22:34 -0400 |
| commit | 24f9aacfb0fbe724c94f6ffe24cc518bfdca4b1d (patch) | |
| tree | 747e834609304eaf85977d6cf5ec3b12fbded112 | |
| parent | 64671c0fdc9193978cb93aaa79965e45b3cce437 (diff) | |
drm/amdgpu: add xgmi error monitoring
Monitor xGMI errors via the MC PIE status, accessed through the FICA registers.
Signed-off-by: Jonathan Kim <Jonathan.Kim@amd.com>
Reviewed-by: Kent Russell <Kent.Russell@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 38 |
1 file changed, 36 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 4e8d60eec0fe..65aae75f80fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include "amdgpu.h" | 25 | #include "amdgpu.h" |
| 26 | #include "amdgpu_xgmi.h" | 26 | #include "amdgpu_xgmi.h" |
| 27 | #include "amdgpu_smu.h" | 27 | #include "amdgpu_smu.h" |
| 28 | 28 | #include "df/df_3_6_offset.h" | |
| 29 | 29 | ||
| 30 | static DEFINE_MUTEX(xgmi_mutex); | 30 | static DEFINE_MUTEX(xgmi_mutex); |
| 31 | 31 | ||
| @@ -131,9 +131,37 @@ static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, | |||
| 131 | 131 | ||
| 132 | } | 132 | } |
| 133 | 133 | ||
| 134 | #define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) | ||
| 135 | static ssize_t amdgpu_xgmi_show_error(struct device *dev, | ||
| 136 | struct device_attribute *attr, | ||
| 137 | char *buf) | ||
| 138 | { | ||
| 139 | struct drm_device *ddev = dev_get_drvdata(dev); | ||
| 140 | struct amdgpu_device *adev = ddev->dev_private; | ||
| 141 | uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; | ||
| 142 | uint64_t fica_out; | ||
| 143 | unsigned int error_count = 0; | ||
| 144 | |||
| 145 | ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); | ||
| 146 | ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); | ||
| 134 | 147 | ||
| 135 | static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); | 148 | fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_ctl_in); |
| 149 | if (fica_out != 0x1f) | ||
| 150 | pr_err("xGMI error counters not enabled!\n"); | ||
| 151 | |||
| 152 | fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_status_in); | ||
| 153 | |||
| 154 | if ((fica_out & 0xffff) == 2) | ||
| 155 | error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); | ||
| 136 | 156 | ||
| 157 | adev->df_funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); | ||
| 158 | |||
| 159 | return snprintf(buf, PAGE_SIZE, "%d\n", error_count); | ||
| 160 | } | ||
| 161 | |||
| 162 | |||
| 163 | static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); | ||
| 164 | static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); | ||
| 137 | 165 | ||
| 138 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, | 166 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, |
| 139 | struct amdgpu_hive_info *hive) | 167 | struct amdgpu_hive_info *hive) |
| @@ -148,6 +176,12 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, | |||
| 148 | return ret; | 176 | return ret; |
| 149 | } | 177 | } |
| 150 | 178 | ||
| 179 | /* Create xgmi error file */ | ||
| 180 | ret = device_create_file(adev->dev, &dev_attr_xgmi_error); | ||
| 181 | if (ret) | ||
| 182 | pr_err("failed to create xgmi_error\n"); | ||
| 183 | |||
| 184 | |||
| 151 | /* Create sysfs link to hive info folder on the first device */ | 185 | /* Create sysfs link to hive info folder on the first device */ |
| 152 | if (adev != hive->adev) { | 186 | if (adev != hive->adev) { |
| 153 | ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, | 187 | ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, |
