gpu: nvgpu: GPC MMU ECC support

Adding support for GPC MMU ECC error handling

JIRA: GPUT19X-112

Change-Id: I62083bf2f144ff628ecd8c0aefc8d227a233ff36
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1490772
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
author: David Nieto <dmartineznie@nvidia.com> 2017-05-26 11:31:46 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-06-04 23:34:57 -0400
commit: 345eaef6a76771da9c3e8a5e375fc9d659fb1b2b (patch)
tree: 21d2d25eae69ced2a39d62a56a4ee6f42e5c0655 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent: 6bc36bded05ee497a474e5a718c49dc33eb235f1 (diff)
1 files changed, 101 insertions, 4 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 8176b807..701b840a 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -658,16 +658,101 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
        return 0;
 }
-static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
+static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
                                                                u32 exception)
 {
        int ret = 0;
+        u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); /* GPC stride litter value, used to build the register offset */
+        u32 offset = gpc_stride * gpc; /* added to every gpc0 register address accessed below */
        u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
        u32 corrected_delta, uncorrected_delta;
        u32 corrected_overflow, uncorrected_overflow;
+        int hww_esr; /* GPCMMU global ESR value; NOTE(review): register read is stored in a signed int */
+        hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);
+        if (!(hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() |
+                         gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())))
+                return ret; /* neither ECC bit is set in the ESR: return 0 without touching the L1TLB ECC registers */
+        ecc_status = gk20a_readl(g,
+                gr_gpc0_mmu_l1tlb_ecc_status_r() + offset);
+        ecc_addr = gk20a_readl(g,
+                gr_gpc0_mmu_l1tlb_ecc_address_r() + offset);
+        corrected_cnt = gk20a_readl(g,
+                gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset);
+        uncorrected_cnt = gk20a_readl(g,
+                gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset);
+        corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v( /* count register is zeroed below when non-zero, so this reading acts as a delta */
+                                                        corrected_cnt);
+        uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(
+                                                        uncorrected_cnt);
+        corrected_overflow = ecc_status & /* non-zero when the corrected-counter overflow bit is set in ecc_status */
+                gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m();
+        uncorrected_overflow = ecc_status & /* non-zero when the uncorrected-counter overflow bit is set in ecc_status */
+                gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m();
+        /* clear the interrupt */
+        if ((corrected_delta > 0) || corrected_overflow)
+                gk20a_writel(g,
+                        gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() +
+                        offset, 0);
+        if ((uncorrected_delta > 0) || uncorrected_overflow)
+                gk20a_writel(g,
+                        gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() +
+                        offset, 0);
+        gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset, /* status register is written with reset_task unconditionally */
+                                gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f());
+        /* Handle overflow */
+        if (corrected_overflow)
+                corrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s()); /* presumably one full counter wrap, assuming _s() is the field width in bits - TODO confirm */
+        if (uncorrected_overflow)
+                uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()); /* same adjustment for the uncorrected counter */
+        g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc] += /* accumulate into the software counter indexed by gpc */
+                                                        corrected_delta;
+        g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc] +=
+                                                        uncorrected_delta;
+        nvgpu_log(g, gpu_dbg_intr,
+                        "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
+        if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) /* one debug log line per error-source bit set in ecc_status */
+                nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
+        if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m())
+                nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
+        if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m())
+                nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
+        if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m())
+                nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
+        if (corrected_overflow || uncorrected_overflow)
+                nvgpu_info(g, "mmu l1tlb ecc counter overflow!"); /* overflow goes through nvgpu_info; everything else uses gpu_dbg_intr logging */
+        nvgpu_log(g, gpu_dbg_intr,
+                "ecc error address: 0x%x", ecc_addr);
+        nvgpu_log(g, gpu_dbg_intr,
+                "ecc error count corrected: %d, uncorrected %d",
+                g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc],
+                g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc]);
+        return ret; /* always 0: ret is never modified, and the 'exception' argument is not used in this function */
+}
+static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
+                                                                u32 exception)
+{
+        int ret = 0;
+        u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
+        u32 offset = gpc_stride * gpc;
+        u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
+        u32 corrected_delta, uncorrected_delta;
+        u32 corrected_overflow, uncorrected_overflow;
        int hww_esr;
-        u32 offset = proj_gpc_stride_v() * gpc;
        hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
@@ -741,6 +826,15 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
        return ret;
 }
+static int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
+                                                        u32 gpc_exception)
+{ /* Forwards to the GPC MMU ECC handler when gpc_exception has the GPCMMU bit set. */
+        if (gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m()) /* GPCMMU bit set in the exception mask */
+                return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc, /* result of the ECC handler is returned as-is */
+                                                                gpc_exception);
+        return 0; /* GPCMMU bit clear: nothing is handled here */
+}
 static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
                                                        u32 gpc_exception)
 {
@@ -764,7 +858,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
        gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
                (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) |
-                            gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1)));
+                            gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1) |
+                            gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1)));
 }
 static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
@@ -1810,7 +1905,7 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
                        nvgpu_log(g, gpu_dbg_intr,
                                                "dmem ecc error uncorrected");
                if (corrected_overflow || uncorrected_overflow)
-                        nvgpu_info(g, "gpccs ecc counter overflow!");
+                        nvgpu_info(g, "fecs ecc counter overflow!");
                nvgpu_log(g, gpu_dbg_intr,
                        "ecc error row address: 0x%x",
@@ -2422,4 +2517,6 @@ void gv11b_init_gr(struct gpu_ops *gops)
        gops->gr.handle_gpc_gpccs_exception =
                        gr_gv11b_handle_gpc_gpccs_exception;
        gops->gr.set_czf_bypass = NULL;
+        gops->gr.handle_gpc_gpcmmu_exception =
+                        gr_gv11b_handle_gpc_gpcmmu_exception;
 }
author	David Nieto <dmartineznie@nvidia.com>	2017-05-26 11:31:46 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-06-04 23:34:57 -0400
commit	345eaef6a76771da9c3e8a5e375fc9d659fb1b2b (patch)
tree	21d2d25eae69ced2a39d62a56a4ee6f42e5c0655 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent	6bc36bded05ee497a474e5a718c49dc33eb235f1 (diff)

diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 8176b807..701b840a 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -658,16 +658,101 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
658	return 0;	658	return 0;
659	}	659	}
660		660
661	static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,	661	static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
662	u32 exception)	662	u32 exception)
663	{	663	{
664	int ret = 0;	664	int ret = 0;
		665	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
		666	u32 offset = gpc_stride * gpc;
665	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;	667	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
666	u32 corrected_delta, uncorrected_delta;	668	u32 corrected_delta, uncorrected_delta;
667	u32 corrected_overflow, uncorrected_overflow;	669	u32 corrected_overflow, uncorrected_overflow;
		670	int hww_esr;
		671
		672	hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);
		673
		674	if (!(hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() \|
		675	gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())))
		676	return ret;
		677
		678	ecc_status = gk20a_readl(g,
		679	gr_gpc0_mmu_l1tlb_ecc_status_r() + offset);
		680	ecc_addr = gk20a_readl(g,
		681	gr_gpc0_mmu_l1tlb_ecc_address_r() + offset);
		682	corrected_cnt = gk20a_readl(g,
		683	gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset);
		684	uncorrected_cnt = gk20a_readl(g,
		685	gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset);
		686
		687	corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(
		688	corrected_cnt);
		689	uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(
		690	uncorrected_cnt);
		691	corrected_overflow = ecc_status &
		692	gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m();
		693
		694	uncorrected_overflow = ecc_status &
		695	gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m();
		696
		697
		698	/* clear the interrupt */
		699	if ((corrected_delta > 0) \|\| corrected_overflow)
		700	gk20a_writel(g,
		701	gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() +
		702	offset, 0);
		703	if ((uncorrected_delta > 0) \|\| uncorrected_overflow)
		704	gk20a_writel(g,
		705	gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() +
		706	offset, 0);
		707
		708	gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset,
		709	gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f());
		710
		711	/* Handle overflow */
		712	if (corrected_overflow)
		713	corrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s());
		714	if (uncorrected_overflow)
		715	uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s());
668		716
		717
		718	g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc] +=
		719	corrected_delta;
		720	g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc] +=
		721	uncorrected_delta;
		722	nvgpu_log(g, gpu_dbg_intr,
		723	"mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
		724
		725	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m())
		726	nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
		727	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m())
		728	nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
		729	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m())
		730	nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
		731	if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m())
		732	nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
		733	if (corrected_overflow \|\| uncorrected_overflow)
		734	nvgpu_info(g, "mmu l1tlb ecc counter overflow!");
		735
		736	nvgpu_log(g, gpu_dbg_intr,
		737	"ecc error address: 0x%x", ecc_addr);
		738	nvgpu_log(g, gpu_dbg_intr,
		739	"ecc error count corrected: %d, uncorrected %d",
		740	g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc],
		741	g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc]);
		742
		743	return ret;
		744	}
		745
		746	static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
		747	u32 exception)
		748	{
		749	int ret = 0;
		750	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
		751	u32 offset = gpc_stride * gpc;
		752	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
		753	u32 corrected_delta, uncorrected_delta;
		754	u32 corrected_overflow, uncorrected_overflow;
669	int hww_esr;	755	int hww_esr;
670	u32 offset = proj_gpc_stride_v() * gpc;
671		756
672	hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);	757	hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
673		758
@@ -741,6 +826,15 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
741	return ret;	826	return ret;
742	}	827	}
743		828
		829	static int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
		830	u32 gpc_exception)
		831	{
		832	if (gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m())
		833	return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc,
		834	gpc_exception);
		835	return 0;
		836	}
		837
744	static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,	838	static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
745	u32 gpc_exception)	839	u32 gpc_exception)
746	{	840	{
@@ -764,7 +858,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
764		858
765	gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),	859	gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
766	(tpc_mask \| gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) \|	860	(tpc_mask \| gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) \|
767	gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1)));	861	gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1) \|
		862	gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1)));
768	}	863	}
769		864
770	static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,	865	static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
@@ -1810,7 +1905,7 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
1810	nvgpu_log(g, gpu_dbg_intr,	1905	nvgpu_log(g, gpu_dbg_intr,
1811	"dmem ecc error uncorrected");	1906	"dmem ecc error uncorrected");
1812	if (corrected_overflow \|\| uncorrected_overflow)	1907	if (corrected_overflow \|\| uncorrected_overflow)
1813	nvgpu_info(g, "gpccs ecc counter overflow!");	1908	nvgpu_info(g, "fecs ecc counter overflow!");
1814		1909
1815	nvgpu_log(g, gpu_dbg_intr,	1910	nvgpu_log(g, gpu_dbg_intr,
1816	"ecc error row address: 0x%x",	1911	"ecc error row address: 0x%x",
@@ -2422,4 +2517,6 @@ void gv11b_init_gr(struct gpu_ops *gops)
2422	gops->gr.handle_gpc_gpccs_exception =	2517	gops->gr.handle_gpc_gpccs_exception =
2423	gr_gv11b_handle_gpc_gpccs_exception;	2518	gr_gv11b_handle_gpc_gpccs_exception;
2424	gops->gr.set_czf_bypass = NULL;	2519	gops->gr.set_czf_bypass = NULL;
		2520	gops->gr.handle_gpc_gpcmmu_exception =
		2521	gr_gv11b_handle_gpc_gpcmmu_exception;
2425	}	2522	}