summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
diff options
context:
space:
mode:
author: Lakshmanan M <lm@nvidia.com> 2017-05-15 06:02:21 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-05-18 12:04:39 -0400
commit: d503a234440b0b5912f64314de68689b3211bbcd (patch)
tree: b49c43a03d54ace5673945fe9e3664e84e62247b /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent: ffc37e50fa8e869e9a160b35f3cf414040e8a360 (diff)
gpu: nvgpu: gv11b: Add LRF + CBU parity support
This CL covers the following parity support (uncorrected error):
1) SM's LRF
2) SM's CBU

Volta Resiliency Id - Volta-637
JIRA GPUT19X-85
JIRA GPUT19X-110
Bug 1775457

Change-Id: I3befb1fe22719d06aa819ef27654aaf97f911a9b
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: http://git-master/r/1481791
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/gr_gv11b.c')
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.c187
1 files changed, 180 insertions, 7 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index ad34233c..d36aa6ec 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -192,24 +192,197 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
192 192
193} 193}
194 194
195static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
196 bool *post_event, struct channel_gk20a *fault_ch,
197 u32 *hww_global_esr)
198{
199 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
200 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
201 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
202 u32 lrf_ecc_status, lrf_ecc_corrected_err_status = 0;
203 u32 lrf_ecc_uncorrected_err_status = 0;
204 u32 lrf_corrected_err_count_delta = 0;
205 u32 lrf_uncorrected_err_count_delta = 0;
206 bool is_lrf_ecc_corrected_total_err_overflow = 0;
207 bool is_lrf_ecc_uncorrected_total_err_overflow = 0;
208
209 /* Check for LRF ECC errors. */
210 lrf_ecc_status = gk20a_readl(g,
211 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
212 lrf_ecc_corrected_err_status = lrf_ecc_status &
213 (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp0_m() |
214 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp1_m() |
215 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp2_m() |
216 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp3_m() |
217 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp4_m() |
218 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp5_m() |
219 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp6_m() |
220 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_qrfdp7_m());
221 lrf_ecc_uncorrected_err_status = lrf_ecc_status &
222 (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp0_m() |
223 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp1_m() |
224 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp2_m() |
225 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp3_m() |
226 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp4_m() |
227 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp5_m() |
228 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp6_m() |
229 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_qrfdp7_m());
230
231 if ((lrf_ecc_corrected_err_status == 0) && (lrf_ecc_uncorrected_err_status == 0))
232 return 0;
233
234 lrf_corrected_err_count_delta =
235 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_v(
236 gk20a_readl(g,
237 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() +
238 offset));
239 lrf_uncorrected_err_count_delta =
240 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_v(
241 gk20a_readl(g,
242 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() +
243 offset));
244 is_lrf_ecc_corrected_total_err_overflow =
245 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_corrected_err_total_counter_overflow_v(lrf_ecc_status);
246 is_lrf_ecc_uncorrected_total_err_overflow =
247 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_uncorrected_err_total_counter_overflow_v(lrf_ecc_status);
248
249 if ((lrf_corrected_err_count_delta > 0) || is_lrf_ecc_corrected_total_err_overflow) {
250 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
251 "corrected error (SBE) detected in SM LRF! err_mask [%08x] is_overf [%d]",
252 lrf_ecc_corrected_err_status, is_lrf_ecc_corrected_total_err_overflow);
253
254 /* HW uses 16-bits counter */
255 lrf_corrected_err_count_delta +=
256 (is_lrf_ecc_corrected_total_err_overflow <<
257 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s());
258 g->gr.t18x.ecc_stats.sm_lrf_single_err_count.counters[tpc] +=
259 lrf_corrected_err_count_delta;
260 gk20a_writel(g,
261 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset,
262 0);
263 }
264 if ((lrf_uncorrected_err_count_delta > 0) || is_lrf_ecc_uncorrected_total_err_overflow) {
265 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
266 "Uncorrected error (DBE) detected in SM LRF! err_mask [%08x] is_overf [%d]",
267 lrf_ecc_uncorrected_err_status, is_lrf_ecc_uncorrected_total_err_overflow);
268
269 /* HW uses 16-bits counter */
270 lrf_uncorrected_err_count_delta +=
271 (is_lrf_ecc_uncorrected_total_err_overflow <<
272 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s());
273 g->gr.t18x.ecc_stats.sm_lrf_double_err_count.counters[tpc] +=
274 lrf_uncorrected_err_count_delta;
275 gk20a_writel(g,
276 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
277 0);
278 }
279
280 gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
281 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_reset_task_f());
282
283 return 0;
284
285}
286
287static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
288 bool *post_event, struct channel_gk20a *fault_ch,
289 u32 *hww_global_esr)
290{
291 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
292 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
293 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
294 u32 cbu_ecc_status, cbu_ecc_corrected_err_status = 0;
295 u32 cbu_ecc_uncorrected_err_status = 0;
296 u32 cbu_corrected_err_count_delta = 0;
297 u32 cbu_uncorrected_err_count_delta = 0;
298 bool is_cbu_ecc_corrected_total_err_overflow = 0;
299 bool is_cbu_ecc_uncorrected_total_err_overflow = 0;
300
301 /* Check for CBU ECC errors. */
302 cbu_ecc_status = gk20a_readl(g,
303 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + offset);
304 cbu_ecc_corrected_err_status = cbu_ecc_status &
305 (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm0_m() |
306 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_warp_sm1_m() |
307 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm0_m() |
308 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_barrier_sm1_m());
309 cbu_ecc_uncorrected_err_status = cbu_ecc_status &
310 (gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm0_m() |
311 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_warp_sm1_m() |
312 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm0_m() |
313 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_barrier_sm1_m());
314
315 if ((cbu_ecc_corrected_err_status == 0) && (cbu_ecc_uncorrected_err_status == 0))
316 return 0;
317
318 cbu_corrected_err_count_delta =
319 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_v(
320 gk20a_readl(g,
321 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() +
322 offset));
323 cbu_uncorrected_err_count_delta =
324 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_v(
325 gk20a_readl(g,
326 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() +
327 offset));
328 is_cbu_ecc_corrected_total_err_overflow =
329 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_corrected_err_total_counter_overflow_v(cbu_ecc_status);
330 is_cbu_ecc_uncorrected_total_err_overflow =
331 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_uncorrected_err_total_counter_overflow_v(cbu_ecc_status);
332
333 if ((cbu_corrected_err_count_delta > 0) || is_cbu_ecc_corrected_total_err_overflow) {
334 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
335 "corrected error (SBE) detected in SM CBU! err_mask [%08x] is_overf [%d]",
336 cbu_ecc_corrected_err_status, is_cbu_ecc_corrected_total_err_overflow);
337
338 /* HW uses 16-bits counter */
339 cbu_corrected_err_count_delta +=
340 (is_cbu_ecc_corrected_total_err_overflow <<
341 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s());
342 g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count.counters[tpc] +=
343 cbu_corrected_err_count_delta;
344 gk20a_writel(g,
345 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset,
346 0);
347 }
348 if ((cbu_uncorrected_err_count_delta > 0) || is_cbu_ecc_uncorrected_total_err_overflow) {
349 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
350 "Uncorrected error (DBE) detected in SM CBU! err_mask [%08x] is_overf [%d]",
351 cbu_ecc_uncorrected_err_status, is_cbu_ecc_uncorrected_total_err_overflow);
352
353 /* HW uses 16-bits counter */
354 cbu_uncorrected_err_count_delta +=
355 (is_cbu_ecc_uncorrected_total_err_overflow <<
356 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s());
357 g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count.counters[tpc] +=
358 cbu_uncorrected_err_count_delta;
359 gk20a_writel(g,
360 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
361 0);
362 }
363
364 gk20a_writel(g, gr_pri_gpc0_tpc0_sm_cbu_ecc_status_r() + offset,
365 gr_pri_gpc0_tpc0_sm_cbu_ecc_status_reset_task_f());
366
367 return 0;
368
369}
370
195static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, 371static int gr_gv11b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
196 bool *post_event, struct channel_gk20a *fault_ch, 372 bool *post_event, struct channel_gk20a *fault_ch,
197 u32 *hww_global_esr) 373 u32 *hww_global_esr)
198{ 374{
199 int ret = 0; 375 int ret = 0;
200 u32 offset = proj_gpc_stride_v() * gpc +
201 proj_tpc_in_gpc_stride_v() * tpc;
202 u32 lrf_ecc_status;
203 376
204 /* Check for L1 tag ECC errors. */ 377 /* Check for L1 tag ECC errors. */
205 gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); 378 gr_gv11b_handle_l1_tag_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
206 379
207 /* Check for LRF ECC errors. */ 380 /* Check for LRF ECC errors. */
208 lrf_ecc_status = gk20a_readl(g, 381 gr_gv11b_handle_lrf_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
209 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); 382
383 /* Check for CBU ECC errors. */
384 gr_gv11b_handle_cbu_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr);
210 385
211 gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
212 lrf_ecc_status);
213 return ret; 386 return ret;
214} 387}
215 388