diff options
author | David Nieto <dmartineznie@nvidia.com> | 2017-05-22 19:38:49 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-06-04 23:34:57 -0400 |
commit | 6bc36bded05ee497a474e5a718c49dc33eb235f1 (patch) | |
tree | caf557eaff74b2fa01609dfa3b933713647838d6 /drivers/gpu/nvgpu/gv11b | |
parent | 81172b5df4c7dc46bf46419074b30e0a73f5ddfb (diff) |
gpu: nvgpu: L2 cache tag ECC support
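Add support for L2 cache tag ECC error handling: the gv11b LTC interrupt handler now reads and clears the per-slice corrected/uncorrected ECC error counts, accumulates them per LTC, and the per-LTC counters are registered in gr_gv11b_create_sysfs.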
JIRA: GPUT19X-112
Change-Id: I9a8ebefe97814b341f57a024dfb126013adaac1c
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1489029
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b')
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/ecc_gv11b.h | 5 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/ltc_gv11b.c | 107 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | 27 |
3 files changed, 139 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h index 6b471655..4e1696f7 100644 --- a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h | |||
@@ -33,4 +33,9 @@ struct ecc_gr_t19x { | |||
33 | struct gk20a_ecc_stat gpccs_uncorrected_err_count; | 33 | struct gk20a_ecc_stat gpccs_uncorrected_err_count; |
34 | }; | 34 | }; |
35 | 35 | ||
36 | struct ecc_ltc_t19x { | ||
37 | struct gk20a_ecc_stat l2_cache_corrected_err_count; | ||
38 | struct gk20a_ecc_stat l2_cache_uncorrected_err_count; | ||
39 | }; | ||
40 | |||
36 | #endif | 41 | #endif |
diff --git a/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c b/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c index 23beca5d..b8a97ce3 100644 --- a/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/ltc_gv11b.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include "ltc_gv11b.h" | 20 | #include "ltc_gv11b.h" |
21 | 21 | ||
22 | #include <nvgpu/hw/gv11b/hw_ltc_gv11b.h> | 22 | #include <nvgpu/hw/gv11b/hw_ltc_gv11b.h> |
23 | #include <nvgpu/hw/gv11b/hw_mc_gv11b.h> | ||
23 | #include <nvgpu/hw/gv11b/hw_top_gv11b.h> | 24 | #include <nvgpu/hw/gv11b/hw_top_gv11b.h> |
24 | #include <nvgpu/hw/gv11b/hw_pri_ringmaster_gv11b.h> | 25 | #include <nvgpu/hw/gv11b/hw_pri_ringmaster_gv11b.h> |
25 | 26 | ||
@@ -74,6 +75,111 @@ static void gv11b_ltc_init_fs_state(struct gk20a *g) | |||
74 | ltc_intr); | 75 | ltc_intr); |
75 | } | 76 | } |
76 | 77 | ||
78 | static void gv11b_ltc_isr(struct gk20a *g) | ||
79 | { | ||
80 | u32 mc_intr, ltc_intr3; | ||
81 | unsigned int ltc, slice; | ||
82 | u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE); | ||
83 | u32 lts_stride = nvgpu_get_litter_value(g, GPU_LIT_LTS_STRIDE); | ||
84 | u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; | ||
85 | u32 corrected_delta, uncorrected_delta; | ||
86 | u32 corrected_overflow, uncorrected_overflow; | ||
87 | u32 ltc_corrected, ltc_uncorrected; | ||
88 | |||
89 | mc_intr = gk20a_readl(g, mc_intr_ltc_r()); | ||
90 | for (ltc = 0; ltc < g->ltc_count; ltc++) { | ||
91 | if ((mc_intr & 1 << ltc) == 0) | ||
92 | continue; | ||
93 | ltc_corrected = ltc_uncorrected = 0; | ||
94 | |||
95 | for (slice = 0; slice < g->gr.slices_per_ltc; slice++) { | ||
96 | u32 offset = ltc_stride * ltc + lts_stride * slice; | ||
97 | ltc_intr3 = gk20a_readl(g, ltc_ltc0_lts0_intr3_r() + | ||
98 | offset); | ||
99 | |||
100 | /* Detect and handle ECC PARITY errors */ | ||
101 | |||
102 | if (ltc_intr3 & | ||
103 | (ltc_ltcs_ltss_intr3_ecc_uncorrected_m() | | ||
104 | ltc_ltcs_ltss_intr3_ecc_corrected_m())) { | ||
105 | |||
106 | ecc_status = gk20a_readl(g, | ||
107 | ltc_ltc0_lts0_l2_cache_ecc_status_r() + | ||
108 | offset); | ||
109 | ecc_addr = gk20a_readl(g, | ||
110 | ltc_ltc0_lts0_l2_cache_ecc_address_r() + | ||
111 | offset); | ||
112 | corrected_cnt = gk20a_readl(g, | ||
113 | ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r() + offset); | ||
114 | uncorrected_cnt = gk20a_readl(g, | ||
115 | ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r() + offset); | ||
116 | |||
117 | corrected_delta = | ||
118 | ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v(corrected_cnt); | ||
119 | uncorrected_delta = | ||
120 | ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v(uncorrected_cnt); | ||
121 | corrected_overflow = ecc_status & | ||
122 | ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m(); | ||
123 | |||
124 | uncorrected_overflow = ecc_status & | ||
125 | ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m(); | ||
126 | |||
127 | /* clear the interrupt */ | ||
128 | if ((corrected_delta > 0) || corrected_overflow) { | ||
129 | gk20a_writel(g, ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r() + offset, 0); | ||
130 | } | ||
131 | if ((uncorrected_delta > 0) || uncorrected_overflow) { | ||
132 | gk20a_writel(g, | ||
133 | ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r() + offset, 0); | ||
134 | } | ||
135 | |||
136 | gk20a_writel(g, ltc_ltc0_lts0_l2_cache_ecc_status_r() + offset, | ||
137 | ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f()); | ||
138 | |||
139 | /* update counters per slice */ | ||
140 | if (corrected_overflow) | ||
141 | corrected_delta += (0x1UL << ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s()); | ||
142 | if (uncorrected_overflow) | ||
143 | uncorrected_delta += (0x1UL << ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s()); | ||
144 | |||
145 | ltc_corrected += corrected_delta; | ||
146 | ltc_uncorrected += uncorrected_delta; | ||
147 | nvgpu_log(g, gpu_dbg_intr, | ||
148 | "ltc:%d lts: %d cache ecc interrupt intr: 0x%x", ltc, slice, ltc_intr3); | ||
149 | |||
150 | if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m()) | ||
151 | nvgpu_log(g, gpu_dbg_intr, "rstg ecc error corrected"); | ||
152 | if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m()) | ||
153 | nvgpu_log(g, gpu_dbg_intr, "rstg ecc error uncorrected"); | ||
154 | if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m()) | ||
155 | nvgpu_log(g, gpu_dbg_intr, "tstg ecc error corrected"); | ||
156 | if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m()) | ||
157 | nvgpu_log(g, gpu_dbg_intr, "tstg ecc error uncorrected"); | ||
158 | if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m()) | ||
159 | nvgpu_log(g, gpu_dbg_intr, "dstg ecc error corrected"); | ||
160 | if (ecc_status & ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m()) | ||
161 | nvgpu_log(g, gpu_dbg_intr, "dstg ecc error uncorrected"); | ||
162 | |||
163 | if (corrected_overflow || uncorrected_overflow) | ||
164 | nvgpu_info(g, "ecc counter overflow!"); | ||
165 | |||
166 | nvgpu_log(g, gpu_dbg_intr, | ||
167 | "ecc error address: 0x%x", ecc_addr); | ||
168 | |||
169 | } | ||
170 | |||
171 | } | ||
172 | g->ecc.ltc.t19x.l2_cache_corrected_err_count.counters[ltc] += | ||
173 | ltc_corrected; | ||
174 | g->ecc.ltc.t19x.l2_cache_uncorrected_err_count.counters[ltc] += | ||
175 | ltc_uncorrected; | ||
176 | |||
177 | } | ||
178 | |||
179 | /* fallback to other interrupts */ | ||
180 | gp10b_ltc_isr(g); | ||
181 | } | ||
182 | |||
77 | static u32 gv11b_ltc_cbc_fix_config(struct gk20a *g, int base) | 183 | static u32 gv11b_ltc_cbc_fix_config(struct gk20a *g, int base) |
78 | { | 184 | { |
79 | u32 val = gk20a_readl(g, ltc_ltcs_ltss_cbc_num_active_ltcs_r()); | 185 | u32 val = gk20a_readl(g, ltc_ltcs_ltss_cbc_num_active_ltcs_r()); |
@@ -93,4 +199,5 @@ void gv11b_init_ltc(struct gpu_ops *gops) | |||
93 | gops->ltc.set_zbc_s_entry = gv11b_ltc_set_zbc_stencil_entry; | 199 | gops->ltc.set_zbc_s_entry = gv11b_ltc_set_zbc_stencil_entry; |
94 | gops->ltc.init_fs_state = gv11b_ltc_init_fs_state; | 200 | gops->ltc.init_fs_state = gv11b_ltc_init_fs_state; |
95 | gops->ltc.cbc_fix_config = gv11b_ltc_cbc_fix_config; | 201 | gops->ltc.cbc_fix_config = gv11b_ltc_cbc_fix_config; |
202 | gops->ltc.isr = gv11b_ltc_isr; | ||
96 | } | 203 | } |
diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c index 8733cae9..432af7c1 100644 --- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | |||
@@ -177,6 +177,9 @@ static struct device_attribute *dev_attr_fecs_ecc_uncorrected_err_count_array; | |||
177 | static struct device_attribute *dev_attr_gpccs_ecc_corrected_err_count_array; | 177 | static struct device_attribute *dev_attr_gpccs_ecc_corrected_err_count_array; |
178 | static struct device_attribute *dev_attr_gpccs_ecc_uncorrected_err_count_array; | 178 | static struct device_attribute *dev_attr_gpccs_ecc_uncorrected_err_count_array; |
179 | 179 | ||
180 | static struct device_attribute *dev_attr_l2_cache_ecc_corrected_err_count_array; | ||
181 | static struct device_attribute *dev_attr_l2_cache_ecc_uncorrected_err_count_array; | ||
182 | |||
180 | void gr_gv11b_create_sysfs(struct device *dev) | 183 | void gr_gv11b_create_sysfs(struct device *dev) |
181 | { | 184 | { |
182 | struct gk20a *g = get_gk20a(dev); | 185 | struct gk20a *g = get_gk20a(dev); |
@@ -251,6 +254,20 @@ void gr_gv11b_create_sysfs(struct device *dev) | |||
251 | dev_attr_gcc_l15_ecc_uncorrected_err_count_array); | 254 | dev_attr_gcc_l15_ecc_uncorrected_err_count_array); |
252 | 255 | ||
253 | error |= gp10b_ecc_stat_create(dev, | 256 | error |= gp10b_ecc_stat_create(dev, |
257 | g->ltc_count, | ||
258 | "ltc", | ||
259 | "l2_cache_uncorrected_err_count", | ||
260 | &g->ecc.ltc.t19x.l2_cache_uncorrected_err_count, | ||
261 | dev_attr_l2_cache_ecc_uncorrected_err_count_array); | ||
262 | |||
263 | error |= gp10b_ecc_stat_create(dev, | ||
264 | g->ltc_count, | ||
265 | "ltc", | ||
266 | "l2_cache_corrected_err_count", | ||
267 | &g->ecc.ltc.t19x.l2_cache_corrected_err_count, | ||
268 | dev_attr_l2_cache_ecc_corrected_err_count_array); | ||
269 | |||
270 | error |= gp10b_ecc_stat_create(dev, | ||
254 | 1, | 271 | 1, |
255 | "gpc", | 272 | "gpc", |
256 | "fecs_ecc_uncorrected_err_count", | 273 | "fecs_ecc_uncorrected_err_count", |
@@ -337,6 +354,16 @@ static void gr_gv11b_remove_sysfs(struct device *dev) | |||
337 | dev_attr_gcc_l15_ecc_uncorrected_err_count_array); | 354 | dev_attr_gcc_l15_ecc_uncorrected_err_count_array); |
338 | 355 | ||
339 | gp10b_ecc_stat_remove(dev, | 356 | gp10b_ecc_stat_remove(dev, |
357 | g->ltc_count, | ||
358 | &g->ecc.ltc.t19x.l2_cache_uncorrected_err_count, | ||
359 | dev_attr_l2_cache_ecc_uncorrected_err_count_array); | ||
360 | |||
361 | gp10b_ecc_stat_remove(dev, | ||
362 | g->ltc_count, | ||
363 | &g->ecc.ltc.t19x.l2_cache_corrected_err_count, | ||
364 | dev_attr_l2_cache_ecc_corrected_err_count_array); | ||
365 | |||
366 | gp10b_ecc_stat_remove(dev, | ||
340 | 1, | 367 | 1, |
341 | &g->ecc.gr.t19x.fecs_uncorrected_err_count, | 368 | &g->ecc.gr.t19x.fecs_uncorrected_err_count, |
342 | dev_attr_fecs_ecc_uncorrected_err_count_array); | 369 | dev_attr_fecs_ecc_uncorrected_err_count_array); |