author     David Nieto <dmartineznie@nvidia.com>    2017-12-05 18:20:18 -0500
committer  mobile promotions <svcmobile_promotions@nvidia.com>    2017-12-11 19:42:01 -0500
commit     258ae4471296bcee03987778e3b7c79d3a027e53 (patch)
tree       a4890fa3a54b1857ba5c6ff3d770f84733b95154 /drivers/gpu/nvgpu/gv11b
parent     ba69628aafefcf4567f2f3b1459ccc4ebd8e203f (diff)
gpu: nvgpu: gv11b: PMU parity HWW ECC support
Add support for ISR handling of ECC parity errors for the PMU unit, and set
the initial IRQDEST mask so that ECC interrupts are delivered to the host in
the non-stall PMU irq path.

JIRA: GPUT19X-83

Change-Id: I8efae6777811893ecce79d0e32ba81b62c27b1ef
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Signed-off-by: Richard Zhao <rizhao@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1611625
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
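For context, the two new per-chip hooks introduced below are consumed from the common PMU interrupt path. A minimal sketch of the dispatch, assuming the usual nvgpu HAL wiring (g->ops.pmu.*) and the pwr_falcon_irqstat_r() accessor from the generated hw headers; the actual caller is outside this change:

    /* sketch only: non-stall PMU ISR routing the ext ECC parity source
     * to the per-chip handler added by this change */
    u32 intr = gk20a_readl(g, pwr_falcon_irqstat_r());

    if ((intr & pwr_falcon_irqstat_ext_ecc_parity_true_f()) &&
            g->ops.pmu.handle_ext_irq)
        g->ops.pmu.handle_ext_irq(g, intr);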
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b')
-rw-r--r--  drivers/gpu/nvgpu/gv11b/ecc_gv11b.h |   2
-rw-r--r--  drivers/gpu/nvgpu/gv11b/hal_gv11b.c |   2
-rw-r--r--  drivers/gpu/nvgpu/gv11b/pmu_gv11b.c | 117
-rw-r--r--  drivers/gpu/nvgpu/gv11b/pmu_gv11b.h |   2
4 files changed, 123 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h
index 94b25c02..ebce46ce 100644
--- a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h
@@ -59,6 +59,8 @@ struct ecc_eng_t19x {
 	struct gk20a_ecc_stat mmu_fillunit_corrected_err_count;
 	struct gk20a_ecc_stat mmu_fillunit_uncorrected_err_count;
 	/* PMU */
+	struct gk20a_ecc_stat pmu_corrected_err_count;
+	struct gk20a_ecc_stat pmu_uncorrected_err_count;
 };
 
 #endif
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index 6a21eb2d..f6bdf6e5 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -611,6 +611,8 @@ static const struct gpu_ops gv11b_ops = {
 		.pmu_nsbootstrap = gv11b_pmu_bootstrap,
 		.pmu_pg_set_sub_feature_mask = gv11b_pg_set_subfeature_mask,
 		.is_pmu_supported = gv11b_is_pmu_supported,
+		.get_irqdest = gv11b_pmu_get_irqdest,
+		.handle_ext_irq = gv11b_pmu_handle_ext_irq,
 	},
 	.regops = {
 		.get_global_whitelist_ranges =
diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
index 4b244f5a..a972510f 100644
--- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.c
@@ -287,6 +287,123 @@ int gv11b_pmu_bootstrap(struct nvgpu_pmu *pmu)
 	return 0;
 }
 
+void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0)
+{
+	u32 intr1;
+	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
+	u32 corrected_delta, uncorrected_delta;
+	u32 corrected_overflow, uncorrected_overflow;
+
+	/*
+	 * handle the ECC interrupt
+	 */
+	if (intr0 & pwr_falcon_irqstat_ext_ecc_parity_true_f()) {
+		intr1 = gk20a_readl(g, pwr_pmu_ecc_intr_status_r());
+		if (intr1 & (pwr_pmu_ecc_intr_status_corrected_m() |
+			     pwr_pmu_ecc_intr_status_uncorrected_m())) {
+
+			ecc_status = gk20a_readl(g,
+				pwr_pmu_falcon_ecc_status_r());
+			ecc_addr = gk20a_readl(g,
+				pwr_pmu_falcon_ecc_address_r());
+			corrected_cnt = gk20a_readl(g,
+				pwr_pmu_falcon_ecc_corrected_err_count_r());
+			uncorrected_cnt = gk20a_readl(g,
+				pwr_pmu_falcon_ecc_uncorrected_err_count_r());
+
+			corrected_delta =
+				pwr_pmu_falcon_ecc_corrected_err_count_total_v(corrected_cnt);
+			uncorrected_delta =
+				pwr_pmu_falcon_ecc_uncorrected_err_count_total_v(uncorrected_cnt);
+			corrected_overflow = ecc_status &
+				pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m();
+
+			uncorrected_overflow = ecc_status &
+				pwr_pmu_falcon_ecc_status_uncorrected_err_total_counter_overflow_m();
+			corrected_overflow = ecc_status &
+				pwr_pmu_falcon_ecc_status_corrected_err_total_counter_overflow_m();
+
+			/* clear the interrupt */
+			if ((intr1 & pwr_pmu_ecc_intr_status_corrected_m()) ||
+					corrected_overflow) {
+				gk20a_writel(g, pwr_pmu_falcon_ecc_corrected_err_count_r(), 0);
+			}
+			if ((intr1 & pwr_pmu_ecc_intr_status_uncorrected_m()) ||
+					uncorrected_overflow) {
+				gk20a_writel(g,
+					pwr_pmu_falcon_ecc_uncorrected_err_count_r(), 0);
+			}
+
+			gk20a_writel(g, pwr_pmu_falcon_ecc_status_r(),
+				pwr_pmu_falcon_ecc_status_reset_task_f());
+
+			/* update counters per slice */
+			if (corrected_overflow)
+				corrected_delta += (0x1UL << pwr_pmu_falcon_ecc_corrected_err_count_total_s());
+			if (uncorrected_overflow)
+				uncorrected_delta += (0x1UL << pwr_pmu_falcon_ecc_uncorrected_err_count_total_s());
+
+			g->ecc.eng.t19x.pmu_corrected_err_count.counters[0] += corrected_delta;
+			g->ecc.eng.t19x.pmu_uncorrected_err_count.counters[0] += uncorrected_delta;
+
+			nvgpu_log(g, gpu_dbg_intr,
+				"pmu ecc interrupt intr1: 0x%x", intr1);
+
+			if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_imem_m())
+				nvgpu_log(g, gpu_dbg_intr,
+					"imem ecc error corrected");
+			if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_imem_m())
+				nvgpu_log(g, gpu_dbg_intr,
+					"imem ecc error uncorrected");
+			if (ecc_status & pwr_pmu_falcon_ecc_status_corrected_err_dmem_m())
+				nvgpu_log(g, gpu_dbg_intr,
+					"dmem ecc error corrected");
+			if (ecc_status & pwr_pmu_falcon_ecc_status_uncorrected_err_dmem_m())
+				nvgpu_log(g, gpu_dbg_intr,
+					"dmem ecc error uncorrected");
+
+			if (corrected_overflow || uncorrected_overflow)
+				nvgpu_info(g, "ecc counter overflow!");
+
+			nvgpu_log(g, gpu_dbg_intr,
+				"ecc error row address: 0x%x",
+				pwr_pmu_falcon_ecc_address_row_address_v(ecc_addr));
+
+			nvgpu_log(g, gpu_dbg_intr,
+				"ecc error count corrected: %d, uncorrected %d",
+				g->ecc.eng.t19x.pmu_corrected_err_count.counters[0],
+				g->ecc.eng.t19x.pmu_uncorrected_err_count.counters[0]);
+		}
+	}
+}
+
+u32 gv11b_pmu_get_irqdest(struct gk20a *g)
+{
+	u32 intr_dest;
+
+	/* dest 0=falcon, 1=host; level 0=irq0, 1=irq1 */
+	intr_dest = pwr_falcon_irqdest_host_gptmr_f(0) |
+		pwr_falcon_irqdest_host_wdtmr_f(1) |
+		pwr_falcon_irqdest_host_mthd_f(0) |
+		pwr_falcon_irqdest_host_ctxsw_f(0) |
+		pwr_falcon_irqdest_host_halt_f(1) |
+		pwr_falcon_irqdest_host_exterr_f(0) |
+		pwr_falcon_irqdest_host_swgen0_f(1) |
+		pwr_falcon_irqdest_host_swgen1_f(0) |
+		pwr_falcon_irqdest_host_ext_ecc_parity_f(1) |
+		pwr_falcon_irqdest_target_gptmr_f(1) |
+		pwr_falcon_irqdest_target_wdtmr_f(0) |
+		pwr_falcon_irqdest_target_mthd_f(0) |
+		pwr_falcon_irqdest_target_ctxsw_f(0) |
+		pwr_falcon_irqdest_target_halt_f(0) |
+		pwr_falcon_irqdest_target_exterr_f(0) |
+		pwr_falcon_irqdest_target_swgen0_f(0) |
+		pwr_falcon_irqdest_target_swgen1_f(0) |
+		pwr_falcon_irqdest_target_ext_ecc_parity_f(0);
+
+	return intr_dest;
+}
+
 static void pmu_handle_pg_sub_feature_msg(struct gk20a *g, struct pmu_msg *msg,
 		void *param, u32 handle, u32 status)
 {
diff --git a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.h b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.h
index e917188d..dd6db10c 100644
--- a/drivers/gpu/nvgpu/gv11b/pmu_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/pmu_gv11b.h
@@ -35,4 +35,6 @@ bool gv11b_is_lazy_bootstrap(u32 falcon_id);
 bool gv11b_is_priv_load(u32 falcon_id);
 int gv11b_pmu_setup_elpg(struct gk20a *g);
 
+u32 gv11b_pmu_get_irqdest(struct gk20a *g);
+void gv11b_pmu_handle_ext_irq(struct gk20a *g, u32 intr0);
 #endif /*__PMU_GV11B_H_*/