gpu: nvgpu: per-chip GPCCS exception support

Adding support for ISR handling of GPCCS exceptions and GCC ECC support.

JIRA: GPUT19X-83
Change-Id: Ica749dc678f152d536052cf47f2ea2b205a231d6
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1480997
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
author: David Nieto <dmartineznie@nvidia.com> 2017-05-12 14:07:00 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-05-24 07:55:42 -0400
commit: 2173add7ae7210606afdaa56995a61d012b9a2f1 (patch)
tree: 3e6f637ab0c4f2e28aa63823105764f39c774a85 /drivers/gpu/nvgpu/gv11b
parent: 45ca7cb8c5774cfc15015973b1883faa1d93b9e6 (diff)
1 files changed, 120 insertions, 2 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 014ba537..764374cc 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -634,6 +634,70 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
        return 0;
 }
+/*
+ * Handle a GPCCS falcon ECC exception for a single GPC.
+ *
+ * Reads the GPC's GPCCS HWW ESR register. If neither the ECC-corrected nor
+ * the ECC-uncorrected bit is set, nothing else is touched. Otherwise the
+ * falcon ECC status, address and error-count registers are sampled, the
+ * interrupt is cleared by writing reset_task to the ECC status register, and
+ * the sampled details are logged under gpu_dbg_intr.
+ *
+ * @g:         GPU instance
+ * @gpc:       GPC index; scaled by proj_gpc_stride_v() to form the register
+ *             offset
+ * @exception: GPC exception value passed in by the caller (not used here)
+ *
+ * Always returns 0.
+ */
+static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
+                                                                u32 exception)
+{
+        u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
+        /* register value used only as a bit mask: keep it unsigned */
+        u32 hww_esr;
+        u32 offset = proj_gpc_stride_v() * gpc;
+        hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
+        /* not an ECC exception: leave the falcon ECC registers alone */
+        if (!(hww_esr & (gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() |
+                         gr_gpc0_gpccs_hww_esr_ecc_corrected_m())))
+                return 0;
+        /* sample all ECC state before the status register is reset below */
+        ecc_status = gk20a_readl(g,
+                gr_gpc0_gpccs_falcon_ecc_status_r() + offset);
+        ecc_addr = gk20a_readl(g,
+                gr_gpc0_gpccs_falcon_ecc_address_r() + offset);
+        corrected_cnt = gk20a_readl(g,
+                gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + offset);
+        uncorrected_cnt = gk20a_readl(g,
+                gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset);
+        /* clear the interrupt */
+        gk20a_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset,
+                                gr_gpc0_gpccs_falcon_ecc_status_reset_task_f());
+        nvgpu_log(g, gpu_dbg_intr,
+                        "gpccs gpc:%u ecc interrupt intr: 0x%x", gpc, hww_esr);
+        if (ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m())
+                nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
+        if (ecc_status &
+                gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m())
+                nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
+        if (ecc_status &
+                gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m())
+                nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
+        if (ecc_status &
+                gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m())
+                nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
+        nvgpu_log(g, gpu_dbg_intr,
+                "ecc error row address: 0x%x",
+                gr_gpc0_gpccs_falcon_ecc_address_row_address_v(ecc_addr));
+        /* NOTE(review): %u assumes the *_total_v() accessors return u32 */
+        nvgpu_log(g, gpu_dbg_intr,
+                "ecc error count corrected: %u, uncorrected %u",
+                gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v(
+                                                        corrected_cnt),
+                gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v(
+                                                        uncorrected_cnt));
+        return 0;
+}
+/*
+ * Dispatch the GPCCS part of a GPC exception.
+ *
+ * When the GPCCS bit is set in @gpc_exception the GPCCS ECC handler is run
+ * for @gpc and its result is returned; otherwise 0 is returned and no
+ * register is accessed.
+ */
+static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
+                                                        u32 gpc_exception)
+{
+        /* GPCCS did not raise this exception: nothing to handle */
+        if (!(gpc_exception & gr_gpc0_gpccs_gpc_exception_gpccs_m()))
+                return 0;
+        return gr_gv11b_handle_gpccs_ecc_exception(g, gpc, gpc_exception);
+}
 static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
 {
        struct gr_gk20a *gr = &g->gr;
@@ -646,7 +710,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
                gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1);
        gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
-                (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1)));
+                (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) |
+                            gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1)));
 }
 static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
@@ -1622,6 +1687,55 @@ static int gr_gv11b_get_cilp_preempt_pending_chid(struct gk20a *g, int *__chid)
        return ret;
 }
+/*
+ * Log and clear a FECS falcon ECC error flagged in @intr.
+ *
+ * If @intr has neither the ECC-corrected nor the ECC-uncorrected host
+ * interrupt bit set, the function returns without touching any register.
+ * Otherwise the FECS falcon ECC status, address and error-count registers are
+ * sampled, the interrupt is cleared by writing reset_task to the ECC status
+ * register, and the sampled details are logged under gpu_dbg_intr.
+ *
+ * @g:    GPU instance
+ * @intr: FECS host interrupt status value supplied by the caller
+ */
+static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
+{
+        u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
+        /* no ECC bit pending: nothing to report or clear */
+        if (!(intr & (gr_fecs_host_int_status_ecc_uncorrected_m() |
+                      gr_fecs_host_int_status_ecc_corrected_m())))
+                return;
+        /* sample all ECC state before the status register is reset below */
+        ecc_status = gk20a_readl(g, gr_fecs_falcon_ecc_status_r());
+        ecc_addr = gk20a_readl(g, gr_fecs_falcon_ecc_address_r());
+        corrected_cnt = gk20a_readl(g,
+                gr_fecs_falcon_ecc_corrected_err_count_r());
+        uncorrected_cnt = gk20a_readl(g,
+                gr_fecs_falcon_ecc_uncorrected_err_count_r());
+        /* clear the interrupt */
+        gk20a_writel(g, gr_fecs_falcon_ecc_status_r(),
+                        gr_fecs_falcon_ecc_status_reset_task_f());
+        nvgpu_log(g, gpu_dbg_intr, "fecs ecc interrupt intr: 0x%x", intr);
+        if (ecc_status & gr_fecs_falcon_ecc_status_corrected_err_imem_m())
+                nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
+        if (ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_imem_m())
+                nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
+        if (ecc_status & gr_fecs_falcon_ecc_status_corrected_err_dmem_m())
+                nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
+        if (ecc_status & gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m())
+                nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
+        nvgpu_log(g, gpu_dbg_intr,
+                "ecc error row address: 0x%x",
+                gr_fecs_falcon_ecc_address_row_address_v(ecc_addr));
+        /* NOTE(review): %u assumes the *_total_v() accessors return u32 */
+        nvgpu_log(g, gpu_dbg_intr,
+                "ecc error count corrected: %u, uncorrected %u",
+                gr_fecs_falcon_ecc_corrected_err_count_total_v(
+                                                corrected_cnt),
+                gr_fecs_falcon_ecc_uncorrected_err_count_total_v(
+                                                uncorrected_cnt));
+}
 static int gr_gv11b_handle_fecs_error(struct gk20a *g,
                                struct channel_gk20a *__ch,
                                struct gr_gk20a_isr_data *isr_data)
@@ -1680,6 +1794,9 @@ static int gr_gv11b_handle_fecs_error(struct gk20a *g,
                gk20a_channel_put(ch);
        }
+        /* Handle ECC errors */
+        gr_gv11b_handle_fecs_ecc_error(g, gr_fecs_intr);
 clean_up:
        /* handle any remaining interrupts */
        return gk20a_gr_handle_fecs_error(g, __ch, isr_data);
@@ -2214,5 +2331,6 @@ void gv11b_init_gr(struct gpu_ops *gops)
        gops->gr.write_pm_ptr = gr_gv11b_write_pm_ptr;
        gops->gr.init_elcg_mode = gr_gv11b_init_elcg_mode;
        gops->gr.load_tpc_mask = gr_gv11b_load_tpc_mask;
+        gops->gr.handle_gpc_gpccs_exception =
+                        gr_gv11b_handle_gpc_gpccs_exception;
 }
author	David Nieto <dmartineznie@nvidia.com>	2017-05-12 14:07:00 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-05-24 07:55:42 -0400
commit	2173add7ae7210606afdaa56995a61d012b9a2f1 (patch)
tree	3e6f637ab0c4f2e28aa63823105764f39c774a85 /drivers/gpu/nvgpu/gv11b
parent	45ca7cb8c5774cfc15015973b1883faa1d93b9e6 (diff)

diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 014ba537..764374cc 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -634,6 +634,70 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
634	return 0;	634	return 0;
635	}	635	}
636		636
		637	static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
		638	u32 exception)
		639	{
		640	int ret = 0;
		641	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
		642	int hww_esr;
		643	u32 offset = proj_gpc_stride_v() * gpc;
		644
		645	hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
		646
		647	if (!(hww_esr & (gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() \|
		648	gr_gpc0_gpccs_hww_esr_ecc_corrected_m())))
		649	return ret;
		650
		651	ecc_status = gk20a_readl(g,
		652	gr_gpc0_gpccs_falcon_ecc_status_r() + offset);
		653	ecc_addr = gk20a_readl(g,
		654	gr_gpc0_gpccs_falcon_ecc_address_r() + offset);
		655	corrected_cnt = gk20a_readl(g,
		656	gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + offset);
		657	uncorrected_cnt = gk20a_readl(g,
		658	gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset);
		659
		660	/* clear the interrupt */
		661	gk20a_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset,
		662	gr_gpc0_gpccs_falcon_ecc_status_reset_task_f());
		663
		664	nvgpu_log(g, gpu_dbg_intr,
		665	"gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
		666
		667	if (ecc_status & gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m())
		668	nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
		669	if (ecc_status &
		670	gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m())
		671	nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
		672	if (ecc_status &
		673	gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m())
		674	nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
		675	if (ecc_status &
		676	gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m())
		677	nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
		678
		679	nvgpu_log(g, gpu_dbg_intr,
		680	"ecc error row address: 0x%x",
		681	gr_gpc0_gpccs_falcon_ecc_address_row_address_v(ecc_addr));
		682
		683	nvgpu_log(g, gpu_dbg_intr,
		684	"ecc error count corrected: %d, uncorrected %d",
		685	gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v(corrected_cnt),
		686	gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v(uncorrected_cnt));
		687
		688	return ret;
		689	}
		690
		691	static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
		692	u32 gpc_exception)
		693	{
		694	if (gpc_exception & gr_gpc0_gpccs_gpc_exception_gpccs_m())
		695	return gr_gv11b_handle_gpccs_ecc_exception(g, gpc,
		696	gpc_exception);
		697
		698	return 0;
		699	}
		700
637	static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)	701	static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
638	{	702	{
639	struct gr_gk20a *gr = &g->gr;	703	struct gr_gk20a *gr = &g->gr;
@@ -646,7 +710,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
646	gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1);	710	gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->tpc_count) - 1);
647		711
648	gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),	712	gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
649	(tpc_mask \| gr_gpcs_gpccs_gpc_exception_en_gcc_f(1)));	713	(tpc_mask \| gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) \|
		714	gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1)));
650	}	715	}
651		716
652	static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,	717	static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
@@ -1622,6 +1687,55 @@ static int gr_gv11b_get_cilp_preempt_pending_chid(struct gk20a *g, int *__chid)
1622	return ret;	1687	return ret;
1623	}	1688	}
1624		1689
		1690	static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
		1691	{
		1692	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
		1693
		1694	if (intr & (gr_fecs_host_int_status_ecc_uncorrected_m() \|
		1695	gr_fecs_host_int_status_ecc_corrected_m())) {
		1696	ecc_status = gk20a_readl(g, gr_fecs_falcon_ecc_status_r());
		1697	ecc_addr = gk20a_readl(g,
		1698	gr_fecs_falcon_ecc_address_r());
		1699	corrected_cnt = gk20a_readl(g,
		1700	gr_fecs_falcon_ecc_corrected_err_count_r());
		1701	uncorrected_cnt = gk20a_readl(g,
		1702	gr_fecs_falcon_ecc_uncorrected_err_count_r());
		1703
		1704	/* clear the interrupt */
		1705	gk20a_writel(g, gr_fecs_falcon_ecc_status_r(),
		1706	gr_fecs_falcon_ecc_status_reset_task_f());
		1707
		1708	nvgpu_log(g, gpu_dbg_intr,
		1709	"fecs ecc interrupt intr: 0x%x", intr);
		1710
		1711	if (ecc_status &
		1712	gr_fecs_falcon_ecc_status_corrected_err_imem_m())
		1713	nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
		1714	if (ecc_status &
		1715	gr_fecs_falcon_ecc_status_uncorrected_err_imem_m())
		1716	nvgpu_log(g, gpu_dbg_intr,
		1717	"imem ecc error uncorrected");
		1718	if (ecc_status &
		1719	gr_fecs_falcon_ecc_status_corrected_err_dmem_m())
		1720	nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
		1721	if (ecc_status &
		1722	gr_fecs_falcon_ecc_status_uncorrected_err_dmem_m())
		1723	nvgpu_log(g, gpu_dbg_intr,
		1724	"dmem ecc error uncorrected");
		1725
		1726	nvgpu_log(g, gpu_dbg_intr,
		1727	"ecc error row address: 0x%x",
		1728	gr_fecs_falcon_ecc_address_row_address_v(ecc_addr));
		1729
		1730	nvgpu_log(g, gpu_dbg_intr,
		1731	"ecc error count corrected: %d, uncorrected %d",
		1732	gr_fecs_falcon_ecc_corrected_err_count_total_v(
		1733	corrected_cnt),
		1734	gr_fecs_falcon_ecc_uncorrected_err_count_total_v(
		1735	uncorrected_cnt));
		1736	}
		1737	}
		1738
1625	static int gr_gv11b_handle_fecs_error(struct gk20a *g,	1739	static int gr_gv11b_handle_fecs_error(struct gk20a *g,
1626	struct channel_gk20a *__ch,	1740	struct channel_gk20a *__ch,
1627	struct gr_gk20a_isr_data *isr_data)	1741	struct gr_gk20a_isr_data *isr_data)
@@ -1680,6 +1794,9 @@ static int gr_gv11b_handle_fecs_error(struct gk20a *g,
1680	gk20a_channel_put(ch);	1794	gk20a_channel_put(ch);
1681	}	1795	}
1682		1796
		1797	/* Handle ECC errors */
		1798	gr_gv11b_handle_fecs_ecc_error(g, gr_fecs_intr);
		1799
1683	clean_up:	1800	clean_up:
1684	/* handle any remaining interrupts */	1801	/* handle any remaining interrupts */
1685	return gk20a_gr_handle_fecs_error(g, __ch, isr_data);	1802	return gk20a_gr_handle_fecs_error(g, __ch, isr_data);
@@ -2214,5 +2331,6 @@ void gv11b_init_gr(struct gpu_ops *gops)
2214	gops->gr.write_pm_ptr = gr_gv11b_write_pm_ptr;	2331	gops->gr.write_pm_ptr = gr_gv11b_write_pm_ptr;
2215	gops->gr.init_elcg_mode = gr_gv11b_init_elcg_mode;	2332	gops->gr.init_elcg_mode = gr_gv11b_init_elcg_mode;
2216	gops->gr.load_tpc_mask = gr_gv11b_load_tpc_mask;	2333	gops->gr.load_tpc_mask = gr_gv11b_load_tpc_mask;
2217		2334	gops->gr.handle_gpc_gpccs_exception =
		2335	gr_gv11b_handle_gpc_gpccs_exception;
2218	}	2336	}