diff options
author | Lakshmanan M <lm@nvidia.com> | 2017-05-15 06:02:21 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-05-18 12:04:39 -0400 |
commit | d503a234440b0b5912f64314de68689b3211bbcd (patch) | |
tree | b49c43a03d54ace5673945fe9e3664e84e62247b /drivers/gpu/nvgpu/gv11b/gr_gv11b.c | |
parent | ffc37e50fa8e869e9a160b35f3cf414040e8a360 (diff) |
gpu: nvgpu: gv11b: Add LRF + CBU parity support
This CL adds parity (uncorrected error) support for the following units:
1) SM's LRF
2) SM's CBU
Volta Resiliency Id - Volta-637
JIRA GPUT19X-85
JIRA GPUT19X-110
Bug 1775457
Change-Id: I3befb1fe22719d06aa819ef27654aaf97f911a9b
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1481791
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/gr_gv11b.c')
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 187 |
1 files changed, 180 insertions, 7 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index ad34233c..d36aa6ec 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c | |||
@@ -192,24 +192,197 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc, | |||
192 | 192 | ||
193 | } | 193 | } |
194 | 194 | ||
195 | static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc, | ||
196 | bool *post_event, struct channel_gk20a *fault_ch, | ||
197 | u32 *hww_global_esr) | ||
198 | { | ||
199 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); | ||
200 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); | ||
201 | u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; | ||
202 | u32 lrf_ecc_status, lrf_ecc_corrected_err_status = 0; | ||
203 | u32 lrf_ecc_uncorrected_err_status = 0; | ||
204 | u32 lrf_corrected_err_count_delta = 0; | ||
205 | u32 lrf_uncorrected_err_count_delta = 0; | ||
206 | bool is_lrf_ecc_corrected_total_err_overflow = 0; | ||
207 | bool is_lrf_ecc_uncorrected_total_err_overflow = 0; | ||
208 | |||
209 | /* Check for LRF ECC errors. */ | ||
210 | lrf_ecc_status = gk20a_readl(g, | ||
211 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); | ||
212 | lrf_ecc_corrected_err_status = lrf_ecc_status & | ||
213 | (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m() | | ||
214 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m() | | ||
215 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m() | | ||
216 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m() | | ||
217 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp4_m() | | ||
218 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp5_m() | | ||
219 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp6_m() | | ||
220 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp7_m()); | ||
221 | lrf_ecc_uncorrected_err_status = lrf_ecc_status & | ||
222 | (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m() | | ||
223 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m() | | ||
224 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m() | | ||
225 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m() | | ||
226 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp4_m() | | ||
227 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp5_m() | | ||
228 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp6_m() | | ||
229 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp7_m()); | ||
230 | |||
231 | if ((lrf_ecc_corrected_err_status == 0) && (lrf_ecc_uncorrected_err_status == 0)) | ||
232 | return 0; | ||
233 | |||
234 | lrf_corrected_err_count_delta = | ||
235 | gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_v( | ||
236 | gk20a_readl(g, | ||
237 | gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + | ||
238 | offset)); | ||
239 | lrf_uncorrected_err_count_delta = | ||
240 | gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v( | ||
241 | gk20a_readl(g, | ||
242 | gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + | ||
243 | offset)); | ||
244 | is_lrf_ecc_corrected_total_err_overflow = | ||
245 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_total_counter_overflow_v(lrf_ecc_status); | ||
246 | is_lrf_ecc_uncorrected_total_err_overflow = | ||
247 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_total_counter_overflow_v(lrf_ecc_status); | ||
248 | |||
249 | if ((lrf_corrected_err_count_delta > 0) || is_lrf_ecc_corrected_total_err_overflow) { | ||
250 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, | ||
251 | "corrected error (SBE) detected in SM LRF! err_mask [%08x] is_overf [%d]", | ||
252 | lrf_ecc_corrected_err_status, is_lrf_ecc_corrected_total_err_overflow); | ||
253 | |||
254 | /* HW uses 16-bits counter */ | ||
255 | lrf_corrected_err_count_delta += | ||
256 | (is_lrf_ecc_corrected_total_err_overflow << | ||
257 | gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s()); | ||
258 | g->gr.t18x.ecc_stats.sm_lrf_single_err_count.counters[tpc] += | ||
259 | lrf_corrected_err_count_delta; | ||
260 | gk20a_writel(g, | ||
261 | gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset, | ||
262 | 0); | ||
263 | } | ||
264 | if ((lrf_uncorrected_err_count_delta > 0) || is_lrf_ecc_uncorrected_total_err_overflow) { | ||
265 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, | ||
266 | "Uncorrected error (DBE) detected in SM LRF! err_mask [%08x] is_overf [%d]", | ||
267 | lrf_ecc_uncorrected_err_status, is_lrf_ecc_uncorrected_total_err_overflow); | ||
268 | |||
269 | /* HW uses 16-bits counter */ | ||
270 | lrf_uncorrected_err_count_delta += | ||
271 | (is_lrf_ecc_uncorrected_total_err_overflow << | ||
272 | gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s()); | ||
273 | g->gr.t18x.ecc_stats.sm_lrf_double_err_count.counters[tpc] += | ||
274 | lrf_uncorrected_err_count_delta; | ||
275 | gk20a_writel(g, | ||
276 | gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, | ||
277 | 0); | ||
278 | } | ||
279 | |||
280 | gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset, | ||
281 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_reset_task_f()); | ||
282 | |||
283 | return 0; | ||
284 | |||
285 | } | ||
286 | |||
287 | static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc, | ||
288 | bool *post_event, struct channel_gk20a *fault_ch, | ||
289 | u32 *hww_global_esr) | ||
290 | { | ||
291 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); | ||
292 | u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); | ||
293 | u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; | ||
294 | u32 cbu_ecc_status, cbu_ecc_corrected_err_status = 0; | ||
295 | u32 cbu_ecc_uncorrected_err_status = 0; | ||
296 | u32 cbu_corrected_err_count_delta = 0; | ||
297 | u32 cbu_uncorrected_err_count_delta = 0; | ||
298 | bool is_cbu_ecc_corrected_total_err_overflow = 0; | ||
299 | bool is_cbu_ecc_uncorrected_total_err_overflow = 0; | ||
300 | |||
301 | /* Check for CBU ECC errors. */ | ||
302 | cbu_ecc_status = gk20a_readl(g, | ||
303 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + offset); | ||
304 | cbu_ecc_corrected_err_status = cbu_ecc_status & | ||
305 | (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm0_m() | | ||
306 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm1_m() | | ||
307 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm0_m() | | ||
308 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm1_m()); | ||
309 | cbu_ecc_uncorrected_err_status = cbu_ecc_status & | ||
310 | (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm0_m() | | ||
311 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm1_m() | | ||
312 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm0_m() | | ||
313 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm1_m()); | ||
314 | |||
315 | if ((cbu_ecc_corrected_err_status == 0) && (cbu_ecc_uncorrected_err_status == 0)) | ||
316 | return 0; | ||
317 | |||
318 | cbu_corrected_err_count_delta = | ||
319 | gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_v( | ||
320 | gk20a_readl(g, | ||
321 | gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + | ||
322 | offset)); | ||
323 | cbu_uncorrected_err_count_delta = | ||
324 | gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_v( | ||
325 | gk20a_readl(g, | ||
326 | gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + | ||
327 | offset)); | ||
328 | is_cbu_ecc_corrected_total_err_overflow = | ||
329 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_total_counter_overflow_v(cbu_ecc_status); | ||
330 | is_cbu_ecc_uncorrected_total_err_overflow = | ||
331 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_total_counter_overflow_v(cbu_ecc_status); | ||
332 | |||
333 | if ((cbu_corrected_err_count_delta > 0) || is_cbu_ecc_corrected_total_err_overflow) { | ||
334 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, | ||
335 | "corrected error (SBE) detected in SM CBU! err_mask [%08x] is_overf [%d]", | ||
336 | cbu_ecc_corrected_err_status, is_cbu_ecc_corrected_total_err_overflow); | ||
337 | |||
338 | /* HW uses 16-bits counter */ | ||
339 | cbu_corrected_err_count_delta += | ||
340 | (is_cbu_ecc_corrected_total_err_overflow << | ||
341 | gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s()); | ||
342 | g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count.counters[tpc] += | ||
343 | cbu_corrected_err_count_delta; | ||
344 | gk20a_writel(g, | ||
345 | gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset, | ||
346 | 0); | ||
347 | } | ||
348 | if ((cbu_uncorrected_err_count_delta > 0) || is_cbu_ecc_uncorrected_total_err_overflow) { | ||
349 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, | ||
350 | "Uncorrected error (DBE) detected in SM CBU! err_mask [%08x] is_overf [%d]", | ||
351 | cbu_ecc_uncorrected_err_status, is_cbu_ecc_uncorrected_total_err_overflow); | ||
352 | |||
353 | /* HW uses 16-bits counter */ | ||
354 | cbu_uncorrected_err_count_delta += | ||
355 | (is_cbu_ecc_uncorrected_total_err_overflow << | ||
356 | gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s()); | ||
357 | g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count.counters[tpc] += | ||
358 | cbu_uncorrected_err_count_delta; | ||
359 | gk20a_writel(g, | ||
360 | gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, | ||
361 | 0); | ||
362 | } | ||
363 | |||
364 | gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + offset, | ||
365 | gr_pri_gpc0_tpc0_sm_cbu_ecc_status_reset_task_f()); | ||
366 | |||
367 | return 0; | ||
368 | |||
369 | } | ||
370 | |||
195 | static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, | 371 | static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, |
196 | bool *post_event, struct channel_gk20a *fault_ch, | 372 | bool *post_event, struct channel_gk20a *fault_ch, |
197 | u32 *hww_global_esr) | 373 | u32 *hww_global_esr) |
198 | { | 374 | { |
199 | int ret = 0; | 375 | int ret = 0; |
200 | u32 offset = proj_gpc_stride_v() * gpc + | ||
201 | proj_tpc_in_gpc_stride_v() * tpc; | ||
202 | u32 lrf_ecc_status; | ||
203 | 376 | ||
204 | /* Check for L1 tag ECC errors. */ | 377 | /* Check for L1 tag ECC errors. */ |
205 | gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); | 378 | gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); |
206 | 379 | ||
207 | /* Check for LRF ECC errors. */ | 380 | /* Check for LRF ECC errors. */ |
208 | lrf_ecc_status = gk20a_readl(g, | 381 | gr_gv11b_handle_lrf_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); |
209 | gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); | 382 | |
383 | /* Check for CBU ECC errors. */ | ||
384 | gr_gv11b_handle_cbu_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); | ||
210 | 385 | ||
211 | gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset, | ||
212 | lrf_ecc_status); | ||
213 | return ret; | 386 | return ret; |
214 | } | 387 | } |
215 | 388 | ||