summary | refs | log | tree | commit | diff | stats
path: root/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
diff options
context:
space:
mode:
author: David Nieto <dmartineznie@nvidia.com>, 2017-05-26 11:31:46 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com>, 2017-06-04 23:34:57 -0400
commit: 345eaef6a76771da9c3e8a5e375fc9d659fb1b2b (patch)
tree: 21d2d25eae69ced2a39d62a56a4ee6f42e5c0655 /drivers/gpu/nvgpu/gv11b/gr_gv11b.c
parent: 6bc36bded05ee497a474e5a718c49dc33eb235f1 (diff)
gpu: nvgpu: GPC MMU ECC support
Adding support for GPC MMU ECC error handling

JIRA: GPUT19X-112

Change-Id: I62083bf2f144ff628ecd8c0aefc8d227a233ff36
Signed-off-by: David Nieto <dmartineznie@nvidia.com>
Reviewed-on: http://git-master/r/1490772
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/gr_gv11b.c')
-rw-r--r-- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 105
1 files changed, 101 insertions, 4 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 8176b807..701b840a 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -658,16 +658,101 @@ static int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
658 return 0; 658 return 0;
659} 659}
660 660
661static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc, 661static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
662 u32 exception) 662 u32 exception)
663{ 663{
664 int ret = 0; 664 int ret = 0;
665 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
666 u32 offset = gpc_stride * gpc;
665 u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt; 667 u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
666 u32 corrected_delta, uncorrected_delta; 668 u32 corrected_delta, uncorrected_delta;
667 u32 corrected_overflow, uncorrected_overflow; 669 u32 corrected_overflow, uncorrected_overflow;
670 int hww_esr;
671
672 hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);
673
674 if (!(hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() |
675 gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())))
676 return ret;
677
678 ecc_status = gk20a_readl(g,
679 gr_gpc0_mmu_l1tlb_ecc_status_r() + offset);
680 ecc_addr = gk20a_readl(g,
681 gr_gpc0_mmu_l1tlb_ecc_address_r() + offset);
682 corrected_cnt = gk20a_readl(g,
683 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset);
684 uncorrected_cnt = gk20a_readl(g,
685 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset);
686
687 corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(
688 corrected_cnt);
689 uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(
690 uncorrected_cnt);
691 corrected_overflow = ecc_status &
692 gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m();
693
694 uncorrected_overflow = ecc_status &
695 gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m();
696
697
698 /* clear the interrupt */
699 if ((corrected_delta > 0) || corrected_overflow)
700 gk20a_writel(g,
701 gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() +
702 offset, 0);
703 if ((uncorrected_delta > 0) || uncorrected_overflow)
704 gk20a_writel(g,
705 gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() +
706 offset, 0);
707
708 gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset,
709 gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f());
710
711 /* Handle overflow */
712 if (corrected_overflow)
713 corrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s());
714 if (uncorrected_overflow)
715 uncorrected_delta += (0x1UL << gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s());
668 716
717
718 g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc] +=
719 corrected_delta;
720 g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc] +=
721 uncorrected_delta;
722 nvgpu_log(g, gpu_dbg_intr,
723 "mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
724
725 if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m())
726 nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
727 if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m())
728 nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
729 if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m())
730 nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
731 if (ecc_status & gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m())
732 nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
733 if (corrected_overflow || uncorrected_overflow)
734 nvgpu_info(g, "mmu l1tlb ecc counter overflow!");
735
736 nvgpu_log(g, gpu_dbg_intr,
737 "ecc error address: 0x%x", ecc_addr);
738 nvgpu_log(g, gpu_dbg_intr,
739 "ecc error count corrected: %d, uncorrected %d",
740 g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count.counters[gpc],
741 g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count.counters[gpc]);
742
743 return ret;
744}
745
746static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
747 u32 exception)
748{
749 int ret = 0;
750 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
751 u32 offset = gpc_stride * gpc;
752 u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
753 u32 corrected_delta, uncorrected_delta;
754 u32 corrected_overflow, uncorrected_overflow;
669 int hww_esr; 755 int hww_esr;
670 u32 offset = proj_gpc_stride_v() * gpc;
671 756
672 hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset); 757 hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
673 758
@@ -741,6 +826,15 @@ static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
741 return ret; 826 return ret;
742} 827}
743 828
829static int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
830 u32 gpc_exception)
831{
832 if (gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m())
833 return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc,
834 gpc_exception);
835 return 0;
836}
837
744static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc, 838static int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
745 u32 gpc_exception) 839 u32 gpc_exception)
746{ 840{
@@ -764,7 +858,8 @@ static void gr_gv11b_enable_gpc_exceptions(struct gk20a *g)
764 858
765 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), 859 gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(),
766 (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) | 860 (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) |
767 gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1))); 861 gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1) |
862 gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1)));
768} 863}
769 864
770static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, 865static int gr_gv11b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc,
@@ -1810,7 +1905,7 @@ static void gr_gv11b_handle_fecs_ecc_error(struct gk20a *g, u32 intr)
1810 nvgpu_log(g, gpu_dbg_intr, 1905 nvgpu_log(g, gpu_dbg_intr,
1811 "dmem ecc error uncorrected"); 1906 "dmem ecc error uncorrected");
1812 if (corrected_overflow || uncorrected_overflow) 1907 if (corrected_overflow || uncorrected_overflow)
1813 nvgpu_info(g, "gpccs ecc counter overflow!"); 1908 nvgpu_info(g, "fecs ecc counter overflow!");
1814 1909
1815 nvgpu_log(g, gpu_dbg_intr, 1910 nvgpu_log(g, gpu_dbg_intr,
1816 "ecc error row address: 0x%x", 1911 "ecc error row address: 0x%x",
@@ -2422,4 +2517,6 @@ void gv11b_init_gr(struct gpu_ops *gops)
2422 gops->gr.handle_gpc_gpccs_exception = 2517 gops->gr.handle_gpc_gpccs_exception =
2423 gr_gv11b_handle_gpc_gpccs_exception; 2518 gr_gv11b_handle_gpc_gpccs_exception;
2424 gops->gr.set_czf_bypass = NULL; 2519 gops->gr.set_czf_bypass = NULL;
2520 gops->gr.handle_gpc_gpcmmu_exception =
2521 gr_gv11b_handle_gpc_gpcmmu_exception;
2425} 2522}