diff options
author | David Nieto <dmartineznie@nvidia.com> | 2017-05-26 11:31:46 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-06-04 23:34:57 -0400 |
commit | 345eaef6a76771da9c3e8a5e375fc9d659fb1b2b (patch) | |
tree | 21d2d25eae69ced2a39d62a56a4ee6f42e5c0655 /drivers/gpu/nvgpu/gv11b | |
parent | 6bc36bded05ee497a474e5a718c49dc33eb235f1 (diff) |
gpu: nvgpu: GPC MMU ECC support
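Add support for GPC MMU ECC error handling: a new per-GPC handler reads and clears the GPC MMU L1 TLB corrected/uncorrected ECC error counters (accounting for counter overflow), the GPC MMU exception is enabled, and the accumulated per-GPC counts are exposed through sysfs. Also correct the FECS ECC overflow log message, which previously said "gpccs".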
JIRA: GPUT19X-112
Change-Id: I62083bf2f144ff628ecd8c0aefc8d227a233ff36
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1490772
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b')
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/ecc_gv11b.h | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 105 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | 25 |
3 files changed, 128 insertions, 4 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h index 4e1696f7..70b1bab8 100644 --- a/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/ecc_gv11b.h | |||
@@ -31,6 +31,8 @@ struct ecc_gr_t19x { | |||
31 | struct gk20a_ecc_stat fecs_uncorrected_err_count; | 31 | struct gk20a_ecc_stat fecs_uncorrected_err_count; |
32 | struct gk20a_ecc_stat gpccs_corrected_err_count; | 32 | struct gk20a_ecc_stat gpccs_corrected_err_count; |
33 | struct gk20a_ecc_stat gpccs_uncorrected_err_count; | 33 | struct gk20a_ecc_stat gpccs_uncorrected_err_count; |
34 | struct gk20a_ecc_stat mmu_l1tlb_corrected_err_count; | ||
35 | struct gk20a_ecc_stat mmu_l1tlb_uncorrected_err_count; | ||
34 | }; | 36 | }; |
35 | 37 | ||
36 | struct ecc_ltc_t19x { | 38 | struct ecc_ltc_t19x { |
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 8176b807..701b840a 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c | |||
@@ -658,16 +658,101 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc, | |||
658 | return 0; | 658 | return 0; |
659 | } | 659 | } |
660 | 660 | ||
661 | static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, | 661 | static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc, |
662 | u32 exception) | 662 | u32 exception) |
663 | { | 663 | { |
664 | int ret = 0; | 664 | int ret = 0; |
665 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); | ||
666 | u32 offset = gpc_stride * gpc; | ||
665 | u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; | 667 | u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; |
666 | u32 corrected_delta, uncorrected_delta; | 668 | u32 corrected_delta, uncorrected_delta; |
667 | u32 corrected_overflow, uncorrected_overflow; | 669 | u32 corrected_overflow, uncorrected_overflow; |
670 | int hww_esr; | ||
671 | |||
672 | hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset); | ||
673 | |||
674 | if (!(hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() | | ||
675 | gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m()))) | ||
676 | return ret; | ||
677 | |||
678 | ecc_status = gk20a_readl(g, | ||
679 | gr_gpc0_mmu_l1tlb_ecc_status_r() + offset); | ||
680 | ecc_addr = gk20a_readl(g, | ||
681 | gr_gpc0_mmu_l1tlb_ecc_address_r() + offset); | ||
682 | corrected_cnt = gk20a_readl(g, | ||
683 | gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset); | ||
684 | uncorrected_cnt = gk20a_readl(g, | ||
685 | gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset); | ||
686 | |||
687 | corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v( | ||
688 | corrected_cnt); | ||
689 | uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v( | ||
690 | uncorrected_cnt); | ||
691 | corrected_overflow = ecc_status & | ||
692 | gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m(); | ||
693 | |||
694 | uncorrected_overflow = ecc_status & | ||
695 | gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m(); | ||
696 | |||
697 | |||
698 | /* clear the interrupt */ | ||
699 | if ((corrected_delta > 0) || corrected_overflow) | ||
700 | gk20a_writel(g, | ||
701 | gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + | ||
702 | offset, 0); | ||
703 | if ((uncorrected_delta > 0) || uncorrected_overflow) | ||
704 | gk20a_writel(g, | ||
705 | gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + | ||
706 | offset, 0); | ||
707 | |||
708 | gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset, | ||
709 | gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f()); | ||
710 | |||
711 | /* Handle overflow */ | ||
712 | if (corrected_overflow) | ||
713 | corrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s()); | ||
714 | if (uncorrected_overflow) | ||
715 | uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s()); | ||
668 | 716 | ||
717 | |||
718 | g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc] += | ||
719 | corrected_delta; | ||
720 | g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc] += | ||
721 | uncorrected_delta; | ||
722 | nvgpu_log(g, gpu_dbg_intr, | ||
723 | "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr); | ||
724 | |||
725 | if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) | ||
726 | nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error"); | ||
727 | if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) | ||
728 | nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error"); | ||
729 | if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) | ||
730 | nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error"); | ||
731 | if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) | ||
732 | nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error"); | ||
733 | if (corrected_overflow || uncorrected_overflow) | ||
734 | nvgpu_info(g, "mmu l1tlb ecc counter overflow!"); | ||
735 | |||
736 | nvgpu_log(g, gpu_dbg_intr, | ||
737 | "ecc error address: 0x%x", ecc_addr); | ||
738 | nvgpu_log(g, gpu_dbg_intr, | ||
739 | "ecc error count corrected: %d, uncorrected %d", | ||
740 | g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc], | ||
741 | g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc]); | ||
742 | |||
743 | return ret; | ||
744 | } | ||
745 | |||
746 | static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, | ||
747 | u32 exception) | ||
748 | { | ||
749 | int ret = 0; | ||
750 | u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); | ||
751 | u32 offset = gpc_stride * gpc; | ||
752 | u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; | ||
753 | u32 corrected_delta, uncorrected_delta; | ||
754 | u32 corrected_overflow, uncorrected_overflow; | ||
669 | int hww_esr; | 755 | int hww_esr; |
670 | u32 offset = proj_gpc_stride_v() * gpc; | ||
671 | 756 | ||
672 | hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset); | 757 | hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset); |
673 | 758 | ||
@@ -741,6 +826,15 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, | |||
741 | return ret; | 826 | return ret; |
742 | } | 827 | } |
743 | 828 | ||
829 | static int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc, | ||
830 | u32 gpc_exception) | ||
831 | { | ||
832 | if (gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m()) | ||
833 | return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc, | ||
834 | gpc_exception); | ||
835 | return 0; | ||
836 | } | ||
837 | |||
744 | static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, | 838 | static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, |
745 | u32 gpc_exception) | 839 | u32 gpc_exception) |
746 | { | 840 | { |
@@ -764,7 +858,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g) | |||
764 | 858 | ||
765 | gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), | 859 | gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), |
766 | (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) | | 860 | (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) | |
767 | gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1))); | 861 | gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1) | |
862 | gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1))); | ||
768 | } | 863 | } |
769 | 864 | ||
770 | static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, | 865 | static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, |
@@ -1810,7 +1905,7 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr) | |||
1810 | nvgpu_log(g, gpu_dbg_intr, | 1905 | nvgpu_log(g, gpu_dbg_intr, |
1811 | "dmem ecc error uncorrected"); | 1906 | "dmem ecc error uncorrected"); |
1812 | if (corrected_overflow || uncorrected_overflow) | 1907 | if (corrected_overflow || uncorrected_overflow) |
1813 | nvgpu_info(g, "gpccs ecc counter overflow!"); | 1908 | nvgpu_info(g, "fecs ecc counter overflow!"); |
1814 | 1909 | ||
1815 | nvgpu_log(g, gpu_dbg_intr, | 1910 | nvgpu_log(g, gpu_dbg_intr, |
1816 | "ecc error row address: 0x%x", | 1911 | "ecc error row address: 0x%x", |
@@ -2422,4 +2517,6 @@ void gv11b_init_gr(struct gpu_ops *gops) | |||
2422 | gops->gr.handle_gpc_gpccs_exception = | 2517 | gops->gr.handle_gpc_gpccs_exception = |
2423 | gr_gv11b_handle_gpc_gpccs_exception; | 2518 | gr_gv11b_handle_gpc_gpccs_exception; |
2424 | gops->gr.set_czf_bypass = NULL; | 2519 | gops->gr.set_czf_bypass = NULL; |
2520 | gops->gr.handle_gpc_gpcmmu_exception = | ||
2521 | gr_gv11b_handle_gpc_gpcmmu_exception; | ||
2425 | } | 2522 | } |
diff --git a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c index 432af7c1..c69e1478 100644 --- a/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/gv11b/platform_gv11b_tegra.c | |||
@@ -171,6 +171,8 @@ static struct device_attribute *dev_attr_sm_icache_ecc_corrected_err_count_array | |||
171 | static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_array; | 171 | static struct device_attribute *dev_attr_sm_icache_ecc_uncorrected_err_count_array; |
172 | static struct device_attribute *dev_attr_gcc_l15_ecc_corrected_err_count_array; | 172 | static struct device_attribute *dev_attr_gcc_l15_ecc_corrected_err_count_array; |
173 | static struct device_attribute *dev_attr_gcc_l15_ecc_uncorrected_err_count_array; | 173 | static struct device_attribute *dev_attr_gcc_l15_ecc_uncorrected_err_count_array; |
174 | static struct device_attribute *dev_attr_mmu_l1tlb_ecc_corrected_err_count_array; | ||
175 | static struct device_attribute *dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array; | ||
174 | 176 | ||
175 | static struct device_attribute *dev_attr_fecs_ecc_corrected_err_count_array; | 177 | static struct device_attribute *dev_attr_fecs_ecc_corrected_err_count_array; |
176 | static struct device_attribute *dev_attr_fecs_ecc_uncorrected_err_count_array; | 178 | static struct device_attribute *dev_attr_fecs_ecc_uncorrected_err_count_array; |
@@ -295,6 +297,19 @@ void gr_gv11b_create_sysfs(struct device *dev) | |||
295 | &g->ecc.gr.t19x.gpccs_corrected_err_count, | 297 | &g->ecc.gr.t19x.gpccs_corrected_err_count, |
296 | dev_attr_gpccs_ecc_corrected_err_count_array); | 298 | dev_attr_gpccs_ecc_corrected_err_count_array); |
297 | 299 | ||
300 | error |= gp10b_ecc_stat_create(dev, | ||
301 | g->gr.gpc_count, | ||
302 | "gpc", | ||
303 | "mmu_l1tlb_ecc_uncorrected_err_count", | ||
304 | &g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count, | ||
305 | dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array); | ||
306 | |||
307 | error |= gp10b_ecc_stat_create(dev, | ||
308 | g->gr.gpc_count, | ||
309 | "gpc", | ||
310 | "mmu_l1tlb_ecc_corrected_err_count", | ||
311 | &g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count, | ||
312 | dev_attr_mmu_l1tlb_ecc_corrected_err_count_array); | ||
298 | if (error) | 313 | if (error) |
299 | dev_err(dev, "Failed to create gv11b sysfs attributes!\n"); | 314 | dev_err(dev, "Failed to create gv11b sysfs attributes!\n"); |
300 | } | 315 | } |
@@ -382,4 +397,14 @@ static void gr_gv11b_remove_sysfs(struct device *dev) | |||
382 | g->gr.gpc_count, | 397 | g->gr.gpc_count, |
383 | &g->ecc.gr.t19x.gpccs_corrected_err_count, | 398 | &g->ecc.gr.t19x.gpccs_corrected_err_count, |
384 | dev_attr_gpccs_ecc_corrected_err_count_array); | 399 | dev_attr_gpccs_ecc_corrected_err_count_array); |
400 | |||
401 | gp10b_ecc_stat_remove(dev, | ||
402 | g->gr.gpc_count, | ||
403 | &g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count, | ||
404 | dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array); | ||
405 | |||
406 | gp10b_ecc_stat_remove(dev, | ||
407 | g->gr.gpc_count, | ||
408 | &g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count, | ||
409 | dev_attr_mmu_l1tlb_ecc_corrected_err_count_array); | ||
385 | } | 410 | } |