summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: David Nieto <dmartineznie@nvidia.com>, 2017-05-18 19:45:40 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com>, 2017-05-24 07:55:59 -0400
commit: c771d0b979cd9f42a21da520d5010873d2a6aa47 (patch)
tree: ff03cd455a5d953d5d06c597af94e819e8793a37
parent: 2173add7ae7210606afdaa56995a61d012b9a2f1 (diff)
gpu: nvgpu: add GPC parity counters
(1) Re-arrange the structure for ECC counter reporting so that multiple units can be managed.
(2) Add counters and handling for additional GPC counters.

JIRA: GPUT19X-84
Change-Id: I74fd474d7daf7590fc7f7ddc9837bb692512d208
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1485277
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
-rw-r--r--  drivers/gpu/nvgpu/ecc_t19x.h  20
-rw-r--r--  drivers/gpu/nvgpu/gv11b/ecc_gv11b.h  36
-rw-r--r--  drivers/gpu/nvgpu/gv11b/gr_gv11b.c  103
-rw-r--r--  drivers/gpu/nvgpu/gv11b/gr_gv11b.h  15
-rw-r--r--  drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c  94
5 files changed, 212 insertions, 56 deletions
diff --git a/drivers/gpu/nvgpu/ecc_t19x.h b/drivers/gpu/nvgpu/ecc_t19x.h
new file mode 100644
index 00000000..27f00c6c
--- /dev/null
+++ b/drivers/gpu/nvgpu/ecc_t19x.h
@@ -0,0 +1,20 @@
1/*
2 * NVIDIA T19x ECC
3 *
4 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15#ifndef _NVGPU_ECC_T19X_H_
16#define _NVGPU_ECC_T19X_H_
17
18#include "gv11b/ecc_gv11b.h"
19
20#endif
diff --git a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h
new file mode 100644
index 00000000..6b471655
--- /dev/null
+++ b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h
@@ -0,0 +1,36 @@
1/*
2 * GV11B GPU ECC
3 *
4 * Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef _NVGPU_ECC_GV11B_H_
17#define _NVGPU_ECC_GV11B_H_
18
19struct ecc_gr_t19x {
20 struct gk20a_ecc_stat sm_l1_tag_corrected_err_count;
21 struct gk20a_ecc_stat sm_l1_tag_uncorrected_err_count;
22 struct gk20a_ecc_stat sm_cbu_corrected_err_count;
23 struct gk20a_ecc_stat sm_cbu_uncorrected_err_count;
24 struct gk20a_ecc_stat sm_l1_data_corrected_err_count;
25 struct gk20a_ecc_stat sm_l1_data_uncorrected_err_count;
26 struct gk20a_ecc_stat sm_icache_corrected_err_count;
27 struct gk20a_ecc_stat sm_icache_uncorrected_err_count;
28 struct gk20a_ecc_stat gcc_l15_corrected_err_count;
29 struct gk20a_ecc_stat gcc_l15_uncorrected_err_count;
30 struct gk20a_ecc_stat fecs_corrected_err_count;
31 struct gk20a_ecc_stat fecs_uncorrected_err_count;
32 struct gk20a_ecc_stat gpccs_corrected_err_count;
33 struct gk20a_ecc_stat gpccs_uncorrected_err_count;
34};
35
36#endif
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 764374cc..8b4471ca 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -163,7 +163,7 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
163 l1_tag_corrected_err_count_delta += 163 l1_tag_corrected_err_count_delta +=
164 (is_l1_tag_ecc_corrected_total_err_overflow << 164 (is_l1_tag_ecc_corrected_total_err_overflow <<
165 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s()); 165 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_total_s());
166 g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters[tpc] += 166 g->ecc.gr.t19x.sm_l1_tag_corrected_err_count.counters[tpc] +=
167 l1_tag_corrected_err_count_delta; 167 l1_tag_corrected_err_count_delta;
168 gk20a_writel(g, 168 gk20a_writel(g,
169 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset, 169 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_corrected_err_count_r() + offset,
@@ -178,7 +178,7 @@ static int gr_gv11b_handle_l1_tag_exception(struct gk20a *g, u32 gpc, u32 tpc,
178 l1_tag_uncorrected_err_count_delta += 178 l1_tag_uncorrected_err_count_delta +=
179 (is_l1_tag_ecc_uncorrected_total_err_overflow << 179 (is_l1_tag_ecc_uncorrected_total_err_overflow <<
180 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s()); 180 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_total_s());
181 g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count.counters[tpc] += 181 g->ecc.gr.t19x.sm_l1_tag_uncorrected_err_count.counters[tpc] +=
182 l1_tag_uncorrected_err_count_delta; 182 l1_tag_uncorrected_err_count_delta;
183 gk20a_writel(g, 183 gk20a_writel(g,
184 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset, 184 gr_pri_gpc0_tpc0_sm_l1_tag_ecc_uncorrected_err_count_r() + offset,
@@ -255,7 +255,7 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
255 lrf_corrected_err_count_delta += 255 lrf_corrected_err_count_delta +=
256 (is_lrf_ecc_corrected_total_err_overflow << 256 (is_lrf_ecc_corrected_total_err_overflow <<
257 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s()); 257 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_total_s());
258 g->gr.t18x.ecc_stats.sm_lrf_single_err_count.counters[tpc] += 258 g->ecc.gr.t18x.sm_lrf_single_err_count.counters[tpc] +=
259 lrf_corrected_err_count_delta; 259 lrf_corrected_err_count_delta;
260 gk20a_writel(g, 260 gk20a_writel(g,
261 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset, 261 gr_pri_gpc0_tpc0_sm_lrf_ecc_corrected_err_count_r() + offset,
@@ -270,7 +270,7 @@ static int gr_gv11b_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc,
270 lrf_uncorrected_err_count_delta += 270 lrf_uncorrected_err_count_delta +=
271 (is_lrf_ecc_uncorrected_total_err_overflow << 271 (is_lrf_ecc_uncorrected_total_err_overflow <<
272 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s()); 272 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_total_s());
273 g->gr.t18x.ecc_stats.sm_lrf_double_err_count.counters[tpc] += 273 g->ecc.gr.t18x.sm_lrf_double_err_count.counters[tpc] +=
274 lrf_uncorrected_err_count_delta; 274 lrf_uncorrected_err_count_delta;
275 gk20a_writel(g, 275 gk20a_writel(g,
276 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset, 276 gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r() + offset,
@@ -339,7 +339,7 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
339 cbu_corrected_err_count_delta += 339 cbu_corrected_err_count_delta +=
340 (is_cbu_ecc_corrected_total_err_overflow << 340 (is_cbu_ecc_corrected_total_err_overflow <<
341 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s()); 341 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_total_s());
342 g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count.counters[tpc] += 342 g->ecc.gr.t19x.sm_cbu_corrected_err_count.counters[tpc] +=
343 cbu_corrected_err_count_delta; 343 cbu_corrected_err_count_delta;
344 gk20a_writel(g, 344 gk20a_writel(g,
345 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset, 345 gr_pri_gpc0_tpc0_sm_cbu_ecc_corrected_err_count_r() + offset,
@@ -354,7 +354,7 @@ static int gr_gv11b_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc,
354 cbu_uncorrected_err_count_delta += 354 cbu_uncorrected_err_count_delta +=
355 (is_cbu_ecc_uncorrected_total_err_overflow << 355 (is_cbu_ecc_uncorrected_total_err_overflow <<
356 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s()); 356 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_total_s());
357 g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count.counters[tpc] += 357 g->ecc.gr.t19x.sm_cbu_uncorrected_err_count.counters[tpc] +=
358 cbu_uncorrected_err_count_delta; 358 cbu_uncorrected_err_count_delta;
359 gk20a_writel(g, 359 gk20a_writel(g,
360 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset, 360 gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r() + offset,
@@ -419,7 +419,7 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
419 l1_data_corrected_err_count_delta += 419 l1_data_corrected_err_count_delta +=
420 (is_l1_data_ecc_corrected_total_err_overflow << 420 (is_l1_data_ecc_corrected_total_err_overflow <<
421 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s()); 421 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_total_s());
422 g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count.counters[tpc] += 422 g->ecc.gr.t19x.sm_l1_data_corrected_err_count.counters[tpc] +=
423 l1_data_corrected_err_count_delta; 423 l1_data_corrected_err_count_delta;
424 gk20a_writel(g, 424 gk20a_writel(g,
425 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset, 425 gr_pri_gpc0_tpc0_sm_l1_data_ecc_corrected_err_count_r() + offset,
@@ -434,7 +434,7 @@ static int gr_gv11b_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32 tpc,
434 l1_data_uncorrected_err_count_delta += 434 l1_data_uncorrected_err_count_delta +=
435 (is_l1_data_ecc_uncorrected_total_err_overflow << 435 (is_l1_data_ecc_uncorrected_total_err_overflow <<
436 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s()); 436 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_total_s());
437 g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count.counters[tpc] += 437 g->ecc.gr.t19x.sm_l1_data_uncorrected_err_count.counters[tpc] +=
438 l1_data_uncorrected_err_count_delta; 438 l1_data_uncorrected_err_count_delta;
439 gk20a_writel(g, 439 gk20a_writel(g,
440 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset, 440 gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r() + offset,
@@ -503,7 +503,7 @@ static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
503 icache_corrected_err_count_delta += 503 icache_corrected_err_count_delta +=
504 (is_icache_ecc_corrected_total_err_overflow << 504 (is_icache_ecc_corrected_total_err_overflow <<
505 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s()); 505 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_total_s());
506 g->gr.t19x.ecc_stats.sm_icache_corrected_err_count.counters[tpc] += 506 g->ecc.gr.t19x.sm_icache_corrected_err_count.counters[tpc] +=
507 icache_corrected_err_count_delta; 507 icache_corrected_err_count_delta;
508 gk20a_writel(g, 508 gk20a_writel(g,
509 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset, 509 gr_pri_gpc0_tpc0_sm_icache_ecc_corrected_err_count_r() + offset,
@@ -518,7 +518,7 @@ static int gr_gv11b_handle_icache_exception(struct gk20a *g, u32 gpc, u32 tpc,
518 icache_uncorrected_err_count_delta += 518 icache_uncorrected_err_count_delta +=
519 (is_icache_ecc_uncorrected_total_err_overflow << 519 (is_icache_ecc_uncorrected_total_err_overflow <<
520 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s()); 520 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_total_s());
521 g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count.counters[tpc] += 521 g->ecc.gr.t19x.sm_icache_uncorrected_err_count.counters[tpc] +=
522 icache_uncorrected_err_count_delta; 522 icache_uncorrected_err_count_delta;
523 gk20a_writel(g, 523 gk20a_writel(g,
524 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset, 524 gr_pri_gpc0_tpc0_sm_icache_ecc_uncorrected_err_count_r() + offset,
@@ -606,7 +606,7 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
606 gcc_l15_corrected_err_count_delta += 606 gcc_l15_corrected_err_count_delta +=
607 (is_gcc_l15_ecc_corrected_total_err_overflow << 607 (is_gcc_l15_ecc_corrected_total_err_overflow <<
608 gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_s()); 608 gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_total_s());
609 g->gr.t19x.ecc_stats.gcc_l15_corrected_err_count.counters[gpc] += 609 g->ecc.gr.t19x.gcc_l15_corrected_err_count.counters[gpc] +=
610 gcc_l15_corrected_err_count_delta; 610 gcc_l15_corrected_err_count_delta;
611 gk20a_writel(g, 611 gk20a_writel(g,
612 gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + offset, 612 gr_pri_gpc0_gcc_l15_ecc_corrected_err_count_r() + offset,
@@ -621,7 +621,7 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
621 gcc_l15_uncorrected_err_count_delta += 621 gcc_l15_uncorrected_err_count_delta +=
622 (is_gcc_l15_ecc_uncorrected_total_err_overflow << 622 (is_gcc_l15_ecc_uncorrected_total_err_overflow <<
623 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_s()); 623 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_total_s());
624 g->gr.t19x.ecc_stats.gcc_l15_uncorrected_err_count.counters[gpc] += 624 g->ecc.gr.t19x.gcc_l15_uncorrected_err_count.counters[gpc] +=
625 gcc_l15_uncorrected_err_count_delta; 625 gcc_l15_uncorrected_err_count_delta;
626 gk20a_writel(g, 626 gk20a_writel(g,
627 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + offset, 627 gr_pri_gpc0_gcc_l15_ecc_uncorrected_err_count_r() + offset,
@@ -639,6 +639,9 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
639{ 639{
640 int ret = 0; 640 int ret = 0;
641 u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; 641 u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
642 u32 corrected_delta, uncorrected_delta;
643 u32 corrected_overflow, uncorrected_overflow;
644
642 int hww_esr; 645 int hww_esr;
643 u32 offset = proj_gpc_stride_v() * gpc; 646 u32 offset = proj_gpc_stride_v() * gpc;
644 647
@@ -657,10 +660,34 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
657 uncorrected_cnt = gk20a_readl(g, 660 uncorrected_cnt = gk20a_readl(g,
658 gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset); 661 gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset);
659 662
663 corrected_delta = gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v(
664 corrected_cnt);
665 uncorrected_delta = gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v(
666 uncorrected_cnt);
667 corrected_overflow = ecc_status &
668 gr_gpc0_gpccs_falcon_ecc_status_corrected_err_total_counter_overflow_m();
669
670 uncorrected_overflow = ecc_status &
671 gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_total_counter_overflow_m();
672
673
660 /* clear the interrupt */ 674 /* clear the interrupt */
675 if ((corrected_delta > 0) || corrected_overflow)
676 gk20a_writel(g,
677 gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() +
678 offset, 0);
679 if ((uncorrected_delta > 0) || uncorrected_overflow)
680 gk20a_writel(g,
681 gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() +
682 offset, 0);
683
661 gk20a_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset, 684 gk20a_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset,
662 gr_gpc0_gpccs_falcon_ecc_status_reset_task_f()); 685 gr_gpc0_gpccs_falcon_ecc_status_reset_task_f());
663 686
687 g->ecc.gr.t19x.gpccs_corrected_err_count.counters[gpc] +=
688 corrected_delta;
689 g->ecc.gr.t19x.gpccs_uncorrected_err_count.counters[gpc] +=
690 uncorrected_delta;
664 nvgpu_log(g, gpu_dbg_intr, 691 nvgpu_log(g, gpu_dbg_intr,
665 "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); 692 "gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
666 693
@@ -675,6 +702,8 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
675 if (ecc_status & 702 if (ecc_status &
676 gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) 703 gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m())
677 nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected"); 704 nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
705 if (corrected_overflow || uncorrected_overflow)
706 nvgpu_info(g, "gpccs ecc counter overflow!");
678 707
679 nvgpu_log(g, gpu_dbg_intr, 708 nvgpu_log(g, gpu_dbg_intr,
680 "ecc error row address: 0x%x", 709 "ecc error row address: 0x%x",
@@ -682,8 +711,8 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
682 711
683 nvgpu_log(g, gpu_dbg_intr, 712 nvgpu_log(g, gpu_dbg_intr,
684 "ecc error count corrected: %d, uncorrected %d", 713 "ecc error count corrected: %d, uncorrected %d",
685 gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v(corrected_cnt), 714 g->ecc.gr.t19x.gpccs_corrected_err_count.counters[gpc],
686 gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v(uncorrected_cnt)); 715 g->ecc.gr.t19x.gpccs_uncorrected_err_count.counters[gpc]);
687 716
688 return ret; 717 return ret;
689} 718}
@@ -710,8 +739,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
710 gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1); 739 gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1);
711 740
712 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), 741 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
713 (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) 742 (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) |
714 gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1)); 743 gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1)));
715} 744}
716 745
717static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, 746static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
@@ -1690,6 +1719,8 @@ static int gr_gv11b_get_cilp_preempt_pending_chid(struct gk20a *g, int *__chid)
1690static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) 1719static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
1691{ 1720{
1692 u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; 1721 u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
1722 u32 corrected_delta, uncorrected_delta;
1723 u32 corrected_overflow, uncorrected_overflow;
1693 1724
1694 if (intr & (gr_fecs_host_int_status_ecc_uncorrected_m() | 1725 if (intr & (gr_fecs_host_int_status_ecc_uncorrected_m() |
1695 gr_fecs_host_int_status_ecc_corrected_m())) { 1726 gr_fecs_host_int_status_ecc_corrected_m())) {
@@ -1701,10 +1732,42 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
1701 uncorrected_cnt = gk20a_readl(g, 1732 uncorrected_cnt = gk20a_readl(g,
1702 gr_fecs_falcon_ecc_uncorrected_err_count_r()); 1733 gr_fecs_falcon_ecc_uncorrected_err_count_r());
1703 1734
1735 corrected_delta =
1736 gr_fecs_falcon_ecc_corrected_err_count_total_v(
1737 corrected_cnt);
1738 uncorrected_delta =
1739 gr_fecs_falcon_ecc_uncorrected_err_count_total_v(
1740 uncorrected_cnt);
1741
1742 corrected_overflow = ecc_status &
1743 gr_fecs_falcon_ecc_status_corrected_err_total_counter_overflow_m();
1744 uncorrected_overflow = ecc_status &
1745 gr_fecs_falcon_ecc_status_uncorrected_err_total_counter_overflow_m();
1746
1747 /* clear the interrupt */
1748 if ((corrected_delta > 0) || corrected_overflow)
1749 gk20a_writel(g,
1750 gr_fecs_falcon_ecc_corrected_err_count_r(), 0);
1751 if ((uncorrected_delta > 0) || uncorrected_overflow)
1752 gk20a_writel(g,
1753 gr_fecs_falcon_ecc_uncorrected_err_count_r(),
1754 0);
1755
1756
1757 /* clear the interrupt */
1758 gk20a_writel(g, gr_fecs_falcon_ecc_uncorrected_err_count_r(),
1759 0);
1760 gk20a_writel(g, gr_fecs_falcon_ecc_corrected_err_count_r(), 0);
1761
1704 /* clear the interrupt */ 1762 /* clear the interrupt */
1705 gk20a_writel(g, gr_fecs_falcon_ecc_status_r(), 1763 gk20a_writel(g, gr_fecs_falcon_ecc_status_r(),
1706 gr_fecs_falcon_ecc_status_reset_task_f()); 1764 gr_fecs_falcon_ecc_status_reset_task_f());
1707 1765
1766 g->ecc.gr.t19x.fecs_corrected_err_count.counters[0] +=
1767 corrected_delta;
1768 g->ecc.gr.t19x.fecs_uncorrected_err_count.counters[0] +=
1769 uncorrected_delta;
1770
1708 nvgpu_log(g, gpu_dbg_intr, 1771 nvgpu_log(g, gpu_dbg_intr,
1709 "fecs ecc interrupt intr: 0x%x", intr); 1772 "fecs ecc interrupt intr: 0x%x", intr);
1710 1773
@@ -1722,6 +1785,8 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
1722 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m()) 1785 gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m())
1723 nvgpu_log(g, gpu_dbg_intr, 1786 nvgpu_log(g, gpu_dbg_intr,
1724 "dmem ecc error uncorrected"); 1787 "dmem ecc error uncorrected");
1788 if (corrected_overflow || uncorrected_overflow)
1789 nvgpu_info(g, "gpccs ecc counter overflow!");
1725 1790
1726 nvgpu_log(g, gpu_dbg_intr, 1791 nvgpu_log(g, gpu_dbg_intr,
1727 "ecc error row address: 0x%x", 1792 "ecc error row address: 0x%x",
@@ -1729,10 +1794,8 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
1729 1794
1730 nvgpu_log(g, gpu_dbg_intr, 1795 nvgpu_log(g, gpu_dbg_intr,
1731 "ecc error count corrected: %d, uncorrected %d", 1796 "ecc error count corrected: %d, uncorrected %d",
1732 gr_fecs_falcon_ecc_corrected_err_count_total_v( 1797 g->ecc.gr.t19x.fecs_corrected_err_count.counters[0],
1733 corrected_cnt), 1798 g->ecc.gr.t19x.fecs_uncorrected_err_count.counters[0]);
1734 gr_fecs_falcon_ecc_uncorrected_err_count_total_v(
1735 uncorrected_cnt));
1736 } 1799 }
1737} 1800}
1738 1801
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
index cf3842b6..9283a597 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -35,21 +35,6 @@ enum {
35 VOLTA_DMA_COPY_A = 0xC3B5, 35 VOLTA_DMA_COPY_A = 0xC3B5,
36}; 36};
37 37
38struct gr_t19x {
39 struct {
40 struct gr_gp10b_ecc_stat sm_l1_tag_corrected_err_count;
41 struct gr_gp10b_ecc_stat sm_l1_tag_uncorrected_err_count;
42 struct gr_gp10b_ecc_stat sm_cbu_corrected_err_count;
43 struct gr_gp10b_ecc_stat sm_cbu_uncorrected_err_count;
44 struct gr_gp10b_ecc_stat sm_l1_data_corrected_err_count;
45 struct gr_gp10b_ecc_stat sm_l1_data_uncorrected_err_count;
46 struct gr_gp10b_ecc_stat sm_icache_corrected_err_count;
47 struct gr_gp10b_ecc_stat sm_icache_uncorrected_err_count;
48 struct gr_gp10b_ecc_stat gcc_l15_corrected_err_count;
49 struct gr_gp10b_ecc_stat gcc_l15_uncorrected_err_count;
50 } ecc_stats;
51};
52
53#define NVC397_SET_SHADER_EXCEPTIONS 0x1528 38#define NVC397_SET_SHADER_EXCEPTIONS 0x1528
54#define NVC397_SET_CIRCULAR_BUFFER_SIZE 0x1280 39#define NVC397_SET_CIRCULAR_BUFFER_SIZE 0x1280
55#define NVC397_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc 40#define NVC397_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c
index 39ae68eb..1cfa2ef2 100644
--- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c
+++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c
@@ -134,6 +134,11 @@ static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_arr
134static struct device_attribute *dev_attr_gcc_l15_ecc_corrected_err_count_array; 134static struct device_attribute *dev_attr_gcc_l15_ecc_corrected_err_count_array;
135static struct device_attribute *dev_attr_gcc_l15_ecc_uncorrected_err_count_array; 135static struct device_attribute *dev_attr_gcc_l15_ecc_uncorrected_err_count_array;
136 136
137static struct device_attribute *dev_attr_fecs_ecc_corrected_err_count_array;
138static struct device_attribute *dev_attr_fecs_ecc_uncorrected_err_count_array;
139static struct device_attribute *dev_attr_gpccs_ecc_corrected_err_count_array;
140static struct device_attribute *dev_attr_gpccs_ecc_uncorrected_err_count_array;
141
137void gr_gv11b_create_sysfs(struct device *dev) 142void gr_gv11b_create_sysfs(struct device *dev)
138{ 143{
139 struct gk20a *g = get_gk20a(dev); 144 struct gk20a *g = get_gk20a(dev);
@@ -142,7 +147,7 @@ void gr_gv11b_create_sysfs(struct device *dev)
142 initialized multiple times but we only need to create the ECC 147 initialized multiple times but we only need to create the ECC
143 stats once. Therefore, add the following check to avoid 148 stats once. Therefore, add the following check to avoid
144 creating duplicate stat sysfs nodes. */ 149 creating duplicate stat sysfs nodes. */
145 if (g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count.counters != NULL) 150 if (g->ecc.gr.t19x.sm_l1_tag_corrected_err_count.counters != NULL)
146 return; 151 return;
147 152
148 gr_gp10b_create_sysfs(dev); 153 gr_gp10b_create_sysfs(dev);
@@ -150,63 +155,91 @@ void gr_gv11b_create_sysfs(struct device *dev)
150 error |= gr_gp10b_ecc_stat_create(dev, 155 error |= gr_gp10b_ecc_stat_create(dev,
151 0, 156 0,
152 "sm_l1_tag_ecc_corrected_err_count", 157 "sm_l1_tag_ecc_corrected_err_count",
153 &g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count, 158 &g->ecc.gr.t19x.sm_l1_tag_corrected_err_count,
154 dev_attr_sm_l1_tag_ecc_corrected_err_count_array); 159 dev_attr_sm_l1_tag_ecc_corrected_err_count_array);
155 160
156 error |= gr_gp10b_ecc_stat_create(dev, 161 error |= gr_gp10b_ecc_stat_create(dev,
157 0, 162 0,
158 "sm_l1_tag_ecc_uncorrected_err_count", 163 "sm_l1_tag_ecc_uncorrected_err_count",
159 &g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count, 164 &g->ecc.gr.t19x.sm_l1_tag_uncorrected_err_count,
160 dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array); 165 dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
161 166
162 error |= gr_gp10b_ecc_stat_create(dev, 167 error |= gr_gp10b_ecc_stat_create(dev,
163 0, 168 0,
164 "sm_cbu_ecc_corrected_err_count", 169 "sm_cbu_ecc_corrected_err_count",
165 &g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count, 170 &g->ecc.gr.t19x.sm_cbu_corrected_err_count,
166 dev_attr_sm_cbu_ecc_corrected_err_count_array); 171 dev_attr_sm_cbu_ecc_corrected_err_count_array);
167 172
168 error |= gr_gp10b_ecc_stat_create(dev, 173 error |= gr_gp10b_ecc_stat_create(dev,
169 0, 174 0,
170 "sm_cbu_ecc_uncorrected_err_count", 175 "sm_cbu_ecc_uncorrected_err_count",
171 &g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count, 176 &g->ecc.gr.t19x.sm_cbu_uncorrected_err_count,
172 dev_attr_sm_cbu_ecc_uncorrected_err_count_array); 177 dev_attr_sm_cbu_ecc_uncorrected_err_count_array);
173 178
174 error |= gr_gp10b_ecc_stat_create(dev, 179 error |= gr_gp10b_ecc_stat_create(dev,
175 0, 180 0,
176 "sm_l1_data_ecc_corrected_err_count", 181 "sm_l1_data_ecc_corrected_err_count",
177 &g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count, 182 &g->ecc.gr.t19x.sm_l1_data_corrected_err_count,
178 dev_attr_sm_l1_data_ecc_corrected_err_count_array); 183 dev_attr_sm_l1_data_ecc_corrected_err_count_array);
179 184
180 error |= gr_gp10b_ecc_stat_create(dev, 185 error |= gr_gp10b_ecc_stat_create(dev,
181 0, 186 0,
182 "sm_l1_data_ecc_uncorrected_err_count", 187 "sm_l1_data_ecc_uncorrected_err_count",
183 &g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count, 188 &g->ecc.gr.t19x.sm_l1_data_uncorrected_err_count,
184 dev_attr_sm_l1_data_ecc_uncorrected_err_count_array); 189 dev_attr_sm_l1_data_ecc_uncorrected_err_count_array);
185 190
186 error |= gr_gp10b_ecc_stat_create(dev, 191 error |= gr_gp10b_ecc_stat_create(dev,
187 0, 192 0,
188 "sm_icache_ecc_corrected_err_count", 193 "sm_icache_ecc_corrected_err_count",
189 &g->gr.t19x.ecc_stats.sm_icache_corrected_err_count, 194 &g->ecc.gr.t19x.sm_icache_corrected_err_count,
190 dev_attr_sm_icache_ecc_corrected_err_count_array); 195 dev_attr_sm_icache_ecc_corrected_err_count_array);
191 196
192 error |= gr_gp10b_ecc_stat_create(dev, 197 error |= gr_gp10b_ecc_stat_create(dev,
193 0, 198 0,
194 "sm_icache_ecc_uncorrected_err_count", 199 "sm_icache_ecc_uncorrected_err_count",
195 &g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count, 200 &g->ecc.gr.t19x.sm_icache_uncorrected_err_count,
196 dev_attr_sm_icache_ecc_uncorrected_err_count_array); 201 dev_attr_sm_icache_ecc_uncorrected_err_count_array);
197 202
198 error |= gr_gp10b_ecc_stat_create(dev, 203 error |= gr_gp10b_ecc_stat_create(dev,
199 0, 204 0,
200 "gcc_l15_ecc_corrected_err_count", 205 "gcc_l15_ecc_corrected_err_count",
201 &g->gr.t19x.ecc_stats.gcc_l15_corrected_err_count, 206 &g->ecc.gr.t19x.gcc_l15_corrected_err_count,
202 dev_attr_gcc_l15_ecc_corrected_err_count_array); 207 dev_attr_gcc_l15_ecc_corrected_err_count_array);
203 208
204 error |= gr_gp10b_ecc_stat_create(dev, 209 error |= gr_gp10b_ecc_stat_create(dev,
205 0, 210 0,
206 "gcc_l15_ecc_uncorrected_err_count", 211 "gcc_l15_ecc_uncorrected_err_count",
207 &g->gr.t19x.ecc_stats.gcc_l15_uncorrected_err_count, 212 &g->ecc.gr.t19x.gcc_l15_uncorrected_err_count,
208 dev_attr_gcc_l15_ecc_uncorrected_err_count_array); 213 dev_attr_gcc_l15_ecc_uncorrected_err_count_array);
209 214
215 error |= gp10b_ecc_stat_create(dev,
216 1,
217 "gpc",
218 "fecs_ecc_uncorrected_err_count",
219 &g->ecc.gr.t19x.fecs_uncorrected_err_count,
220 dev_attr_fecs_ecc_uncorrected_err_count_array);
221
222 error |= gp10b_ecc_stat_create(dev,
223 1,
224 "gpc",
225 "fecs_ecc_corrected_err_count",
226 &g->ecc.gr.t19x.fecs_corrected_err_count,
227 dev_attr_fecs_ecc_corrected_err_count_array);
228
229 error |= gp10b_ecc_stat_create(dev,
230 g->gr.gpc_count,
231 "gpc",
232 "gpccs_ecc_uncorrected_err_count",
233 &g->ecc.gr.t19x.gpccs_uncorrected_err_count,
234 dev_attr_gpccs_ecc_uncorrected_err_count_array);
235
236 error |= gp10b_ecc_stat_create(dev,
237 g->gr.gpc_count,
238 "gpc",
239 "gpccs_ecc_corrected_err_count",
240 &g->ecc.gr.t19x.gpccs_corrected_err_count,
241 dev_attr_gpccs_ecc_corrected_err_count_array);
242
210 if (error) 243 if (error)
211 dev_err(dev, "Failed to create gv11b sysfs attributes!\n"); 244 dev_err(dev, "Failed to create gv11b sysfs attributes!\n");
212} 245}
@@ -217,52 +250,71 @@ static void gr_gv11b_remove_sysfs(struct device *dev)
217 250
218 gr_gp10b_ecc_stat_remove(dev, 251 gr_gp10b_ecc_stat_remove(dev,
219 0, 252 0,
220 &g->gr.t19x.ecc_stats.sm_l1_tag_corrected_err_count, 253 &g->ecc.gr.t19x.sm_l1_tag_corrected_err_count,
221 dev_attr_sm_l1_tag_ecc_corrected_err_count_array); 254 dev_attr_sm_l1_tag_ecc_corrected_err_count_array);
222 255
223 gr_gp10b_ecc_stat_remove(dev, 256 gr_gp10b_ecc_stat_remove(dev,
224 0, 257 0,
225 &g->gr.t19x.ecc_stats.sm_l1_tag_uncorrected_err_count, 258 &g->ecc.gr.t19x.sm_l1_tag_uncorrected_err_count,
226 dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array); 259 dev_attr_sm_l1_tag_ecc_uncorrected_err_count_array);
227 260
228 gr_gp10b_ecc_stat_remove(dev, 261 gr_gp10b_ecc_stat_remove(dev,
229 0, 262 0,
230 &g->gr.t19x.ecc_stats.sm_cbu_corrected_err_count, 263 &g->ecc.gr.t19x.sm_cbu_corrected_err_count,
231 dev_attr_sm_cbu_ecc_corrected_err_count_array); 264 dev_attr_sm_cbu_ecc_corrected_err_count_array);
232 265
233 gr_gp10b_ecc_stat_remove(dev, 266 gr_gp10b_ecc_stat_remove(dev,
234 0, 267 0,
235 &g->gr.t19x.ecc_stats.sm_cbu_uncorrected_err_count, 268 &g->ecc.gr.t19x.sm_cbu_uncorrected_err_count,
236 dev_attr_sm_cbu_ecc_uncorrected_err_count_array); 269 dev_attr_sm_cbu_ecc_uncorrected_err_count_array);
237 270
238 gr_gp10b_ecc_stat_remove(dev, 271 gr_gp10b_ecc_stat_remove(dev,
239 0, 272 0,
240 &g->gr.t19x.ecc_stats.sm_l1_data_corrected_err_count, 273 &g->ecc.gr.t19x.sm_l1_data_corrected_err_count,
241 dev_attr_sm_l1_data_ecc_corrected_err_count_array); 274 dev_attr_sm_l1_data_ecc_corrected_err_count_array);
242 275
243 gr_gp10b_ecc_stat_remove(dev, 276 gr_gp10b_ecc_stat_remove(dev,
244 0, 277 0,
245 &g->gr.t19x.ecc_stats.sm_l1_data_uncorrected_err_count, 278 &g->ecc.gr.t19x.sm_l1_data_uncorrected_err_count,
246 dev_attr_sm_l1_data_ecc_uncorrected_err_count_array); 279 dev_attr_sm_l1_data_ecc_uncorrected_err_count_array);
247 280
248 gr_gp10b_ecc_stat_remove(dev, 281 gr_gp10b_ecc_stat_remove(dev,
249 0, 282 0,
250 &g->gr.t19x.ecc_stats.sm_icache_corrected_err_count, 283 &g->ecc.gr.t19x.sm_icache_corrected_err_count,
251 dev_attr_sm_icache_ecc_corrected_err_count_array); 284 dev_attr_sm_icache_ecc_corrected_err_count_array);
252 285
253 gr_gp10b_ecc_stat_remove(dev, 286 gr_gp10b_ecc_stat_remove(dev,
254 0, 287 0,
255 &g->gr.t19x.ecc_stats.sm_icache_uncorrected_err_count, 288 &g->ecc.gr.t19x.sm_icache_uncorrected_err_count,
256 dev_attr_sm_icache_ecc_uncorrected_err_count_array); 289 dev_attr_sm_icache_ecc_uncorrected_err_count_array);
257 290
258 gr_gp10b_ecc_stat_remove(dev, 291 gr_gp10b_ecc_stat_remove(dev,
259 0, 292 0,
260 &g->gr.t19x.ecc_stats.gcc_l15_corrected_err_count, 293 &g->ecc.gr.t19x.gcc_l15_corrected_err_count,
261 dev_attr_gcc_l15_ecc_corrected_err_count_array); 294 dev_attr_gcc_l15_ecc_corrected_err_count_array);
262 295
263 gr_gp10b_ecc_stat_remove(dev, 296 gr_gp10b_ecc_stat_remove(dev,
264 0, 297 0,
265 &g->gr.t19x.ecc_stats.gcc_l15_uncorrected_err_count, 298 &g->ecc.gr.t19x.gcc_l15_uncorrected_err_count,
266 dev_attr_gcc_l15_ecc_uncorrected_err_count_array); 299 dev_attr_gcc_l15_ecc_uncorrected_err_count_array);
267 300
301 gp10b_ecc_stat_remove(dev,
302 1,
303 &g->ecc.gr.t19x.fecs_uncorrected_err_count,
304 dev_attr_fecs_ecc_uncorrected_err_count_array);
305
306 gp10b_ecc_stat_remove(dev,
307 1,
308 &g->ecc.gr.t19x.fecs_corrected_err_count,
309 dev_attr_fecs_ecc_corrected_err_count_array);
310
311 gp10b_ecc_stat_remove(dev,
312 g->gr.gpc_count,
313 &g->ecc.gr.t19x.gpccs_uncorrected_err_count,
314 dev_attr_gpccs_ecc_uncorrected_err_count_array);
315
316 gp10b_ecc_stat_remove(dev,
317 g->gr.gpc_count,
318 &g->ecc.gr.t19x.gpccs_corrected_err_count,
319 dev_attr_gpccs_ecc_corrected_err_count_array);
268} 320}