summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
diff options
context:
space:
mode:
authorAdeel Raza <araza@nvidia.com>2016-05-19 20:24:44 -0400
committerDeepak Nibade <dnibade@nvidia.com>2016-12-27 04:56:16 -0500
commit5bc7b40524e0cd30ae5a601ed685bc2d470b8d78 (patch)
tree6d9b7915c1c9d65c2a9a4e621a20387b1a123aa0 /drivers/gpu/nvgpu/gp10b/gr_gp10b.c
parent140921cdf8c4c27ccf7b9844b2cc23130ba275f9 (diff)
gpu: nvgpu: gp10b: SM LRF ECC overcount WAR
SM LRF ECC HW overcounts errors in certain situations. Implement SW WAR to correct error counts. Bug 1752609 Bug 1761594 Change-Id: I79047d21e2e44e0fca3ece1da80f02faa4cd6c54 Signed-off-by: Adeel Raza <araza@nvidia.com> Reviewed-on: http://git-master/r/1150773 GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gp10b/gr_gp10b.c')
-rw-r--r--drivers/gpu/nvgpu/gp10b/gr_gp10b.c105
1 files changed, 76 insertions, 29 deletions
diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
index 3c04c2e4..86cc0555 100644
--- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
@@ -58,6 +58,41 @@ static bool gr_gp10b_is_valid_class(struct gk20a *g, u32 class_num)
58 return valid; 58 return valid;
59} 59}
60 60
61static void gr_gp10b_sm_lrf_ecc_overcount_war(int single_err,
62 u32 sed_status,
63 u32 ded_status,
64 u32 *count_to_adjust,
65 u32 opposite_count)
66{
67 u32 over_count = 0;
68
69 sed_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_b();
70 ded_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_b();
71
72 /* One overcount for each partition on which a SBE occurred but not a
73 DBE (or vice-versa) */
74 if (single_err) {
75 over_count =
76 hweight32(sed_status & ~ded_status);
77 } else {
78 over_count =
79 hweight32(ded_status & ~sed_status);
80 }
81
82 /* If both a SBE and a DBE occur on the same partition, then we have an
83 overcount for the subpartition if the opposite error counts are
84 zero. */
85 if ((sed_status & ded_status) && (opposite_count == 0)) {
86 over_count +=
87 hweight32(sed_status & ded_status);
88 }
89
90 if (*count_to_adjust > over_count)
91 *count_to_adjust -= over_count;
92 else
93 *count_to_adjust = 0;
94}
95
61static int gr_gp10b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, 96static int gr_gp10b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
62 bool *post_event, struct channel_gk20a *fault_ch) 97 bool *post_event, struct channel_gk20a *fault_ch)
63{ 98{
@@ -65,50 +100,62 @@ static int gr_gp10b_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
65 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); 100 u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
66 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); 101 u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
67 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; 102 u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
68 u32 lrf_ecc_status, shm_ecc_status; 103 u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status;
104 u32 lrf_single_count_delta, lrf_double_count_delta;
105 u32 shm_ecc_status;
69 106
70 gr_gk20a_handle_sm_exception(g, gpc, tpc, post_event, fault_ch); 107 gr_gk20a_handle_sm_exception(g, gpc, tpc, post_event, fault_ch);
71 108
72 /* Check for LRF ECC errors. */ 109 /* Check for LRF ECC errors. */
73 lrf_ecc_status = gk20a_readl(g, 110 lrf_ecc_status = gk20a_readl(g,
74 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset); 111 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
75 if ( (lrf_ecc_status & 112 lrf_ecc_sed_status = lrf_ecc_status &
76 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f()) || 113 (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() |
77 (lrf_ecc_status & 114 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f() |
78 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f()) || 115 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f() |
79 (lrf_ecc_status & 116 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f());
80 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f()) || 117 lrf_ecc_ded_status = lrf_ecc_status &
81 (lrf_ecc_status & 118 (gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f() |
82 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f()) ) { 119 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f() |
83 120 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() |
121 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f());
122 lrf_single_count_delta =
123 gk20a_readl(g,
124 gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() +
125 offset);
126 lrf_double_count_delta =
127 gk20a_readl(g,
128 gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() +
129 offset);
130 gk20a_writel(g,
131 gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset,
132 0);
133 gk20a_writel(g,
134 gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset,
135 0);
136 if (lrf_ecc_sed_status) {
84 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, 137 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
85 "Single bit error detected in SM LRF!"); 138 "Single bit error detected in SM LRF!");
86 139
140 gr_gp10b_sm_lrf_ecc_overcount_war(1,
141 lrf_ecc_sed_status,
142 lrf_ecc_ded_status,
143 &lrf_single_count_delta,
144 lrf_double_count_delta);
87 g->gr.t18x.ecc_stats.sm_lrf_single_err_count.counters[tpc] += 145 g->gr.t18x.ecc_stats.sm_lrf_single_err_count.counters[tpc] +=
88 gk20a_readl(g, 146 lrf_single_count_delta;
89 gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset);
90 gk20a_writel(g,
91 gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset,
92 0);
93 } 147 }
94 if ( (lrf_ecc_status & 148 if (lrf_ecc_ded_status) {
95 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f()) ||
96 (lrf_ecc_status &
97 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f()) ||
98 (lrf_ecc_status &
99 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f()) ||
100 (lrf_ecc_status &
101 gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f()) ) {
102
103 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr, 149 gk20a_dbg(gpu_dbg_fn | gpu_dbg_intr,
104 "Double bit error detected in SM LRF!"); 150 "Double bit error detected in SM LRF!");
105 151
152 gr_gp10b_sm_lrf_ecc_overcount_war(0,
153 lrf_ecc_sed_status,
154 lrf_ecc_ded_status,
155 &lrf_double_count_delta,
156 lrf_single_count_delta);
106 g->gr.t18x.ecc_stats.sm_lrf_double_err_count.counters[tpc] += 157 g->gr.t18x.ecc_stats.sm_lrf_double_err_count.counters[tpc] +=
107 gk20a_readl(g, 158 lrf_double_count_delta;
108 gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset);
109 gk20a_writel(g,
110 gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset,
111 0);
112 } 159 }
113 gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset, 160 gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
114 lrf_ecc_status); 161 lrf_ecc_status);