path: root/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
author	Terje Bergstrom <tbergstrom@nvidia.com>	2015-02-12 13:53:26 -0500
committer	Dan Willemsen <dwillemsen@nvidia.com>	2015-04-04 21:06:37 -0400
commit	226c671f8e99e7ed274c5c630090c6190a1367a5 (patch)
tree	e4dfc9a123ec5831210d0b1ea4a3044e987d6c33 /drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
parent	a3b26f25a226ea56e84dbda1bb510c1a7bc11054 (diff)
gpu: nvgpu: More robust recovery
Make recovery a more straightforward process. When we detect a fault,
trigger an MMU fault, wait for it to be raised, and then complete
recovery. Also reset the engines before aborting the channel to ensure
no stray syncpoint increments can happen.

Change-Id: Iac685db6534cb64fe62d9fb452391f43100f2999
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/709060
(cherry picked from commit 95c62ffd9ac30a0d2eb88d033dcc6e6ff25efd6f)
Reviewed-on: http://git-master/r/707443
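In essence, the patch moves the interrupt masking and the MMU fault handling
inline into gk20a_fifo_recover(), so the fault raised by trigger_mmu_fault()
is serviced synchronously before interrupts are re-enabled. Below is a
minimal sketch of that reworked flow, condensed from the diff that follows;
gk20a_fifo_recover_sketch() is a hypothetical stand-in for the real function,
which also computes the engine mask and handles runlist/TSG bookkeeping.

/* Hypothetical condensation of the new gk20a_fifo_recover() flow;
 * register accessors and helpers are the ones used in the diff below. */
static void gk20a_fifo_recover_sketch(struct gk20a *g,
		unsigned long engine_ids)
{
	u32 val;

	/* Mask sched error and MMU fault interrupts so the fault we are
	 * about to trigger is handled here rather than by the ISR, and
	 * clear any pending sched error. */
	val = gk20a_readl(g, fifo_intr_en_0_r());
	val &= ~(fifo_intr_en_0_sched_error_m() | fifo_intr_en_0_mmu_fault_m());
	gk20a_writel(g, fifo_intr_en_0_r(), val);
	gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());

	/* Trigger MMU faults on the bad engines and wait for them to hit. */
	g->ops.fifo.trigger_mmu_fault(g, engine_ids);

	/* Handle the fault synchronously: reset the engines (unless the
	 * reset is deferred), abort the channel/TSG, clear the fault and
	 * resume the scheduler. */
	gk20a_fifo_handle_mmu_fault(g);

	/* Recovery is complete; unmask the interrupts again. */
	val = gk20a_readl(g, fifo_intr_en_0_r());
	val |= fifo_intr_en_0_mmu_fault_f(1) | fifo_intr_en_0_sched_error_f(1);
	gk20a_writel(g, fifo_intr_en_0_r(), val);
}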
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fifo_gk20a.c')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/fifo_gk20a.c	81
1 file changed, 36 insertions(+), 45 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index a872e304..18928142 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -827,7 +827,7 @@ static inline void get_exception_mmu_fault_info(
 	f->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v();
 }
 
-static void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
+void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
 {
 	gk20a_dbg_fn("");
 
@@ -877,34 +877,6 @@ static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
 	return true;
 }
 
-void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
-		unsigned long fault_id) {
-	u32 engine_mmu_id;
-
-	/* reset engines */
-	for_each_set_bit(engine_mmu_id, &fault_id, 32) {
-		u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id);
-		if (engine_id != ~0)
-			gk20a_fifo_reset_engine(g, engine_id);
-	}
-
-	/* clear interrupt */
-	gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id);
-
-	/* resume scheduler */
-	gk20a_writel(g, fifo_error_sched_disable_r(),
-		gk20a_readl(g, fifo_error_sched_disable_r()));
-
-	/* Re-enable fifo access */
-	gk20a_writel(g, gr_gpfifo_ctl_r(),
-		gr_gpfifo_ctl_access_enabled_f() |
-		gr_gpfifo_ctl_semaphore_access_enabled_f());
-
-	/* It is safe to enable ELPG again. */
-	if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
-		gk20a_pmu_enable_elpg(g);
-}
-
 static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
 		struct channel_gk20a *ch)
 {
@@ -1083,10 +1055,12 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 
 			/* handled during channel free */
 			g->fifo.deferred_reset_pending = true;
-		}
+		} else if (engine_id != ~0)
+			gk20a_fifo_reset_engine(g, engine_id);
 
 		/* disable the channel/TSG from hw and increment
 		 * syncpoints */
+
 		if (tsg) {
 			struct channel_gk20a *ch = NULL;
 			if (!g->fifo.deferred_reset_pending)
@@ -1119,9 +1093,21 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
 		return verbose;
 	}
 
-	/* resetting the engines and clearing the runlists is done in
-	   a separate function to allow deferred reset. */
-	fifo_gk20a_finish_mmu_fault_handling(g, fault_id);
+	/* clear interrupt */
+	gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id);
+
+	/* resume scheduler */
+	gk20a_writel(g, fifo_error_sched_disable_r(),
+		gk20a_readl(g, fifo_error_sched_disable_r()));
+
+	/* Re-enable fifo access */
+	gk20a_writel(g, gr_gpfifo_ctl_r(),
+		gr_gpfifo_ctl_access_enabled_f() |
+		gr_gpfifo_ctl_semaphore_access_enabled_f());
+
+	/* It is safe to enable ELPG again. */
+	if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
+		gk20a_pmu_enable_elpg(g);
 	return verbose;
 }
 
@@ -1152,15 +1138,6 @@ static void gk20a_fifo_trigger_mmu_fault(struct gk20a *g,
 	unsigned long engine_id;
 	int ret;
 
-	/*
-	 * sched error prevents recovery, and ctxsw error will retrigger
-	 * every 100ms. Disable the sched error to allow recovery.
-	 */
-	gk20a_writel(g, fifo_intr_en_0_r(),
-		0x7FFFFFFF & ~fifo_intr_en_0_sched_error_m());
-	gk20a_writel(g, fifo_intr_0_r(),
-		fifo_intr_0_sched_error_reset_f());
-
 	/* trigger faults for all bad engines */
 	for_each_set_bit(engine_id, &engine_ids, 32) {
 		if (engine_id > g->fifo.max_engines) {
@@ -1194,9 +1171,6 @@ static void gk20a_fifo_trigger_mmu_fault(struct gk20a *g,
 	/* release mmu fault trigger */
 	for_each_set_bit(engine_id, &engine_ids, 32)
 		gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id), 0);
-
-	/* Re-enable sched error */
-	gk20a_writel(g, fifo_intr_en_0_r(), 0x7FFFFFFF);
 }
 
 static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg)
@@ -1272,6 +1246,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 	unsigned long engine_id, i;
 	unsigned long _engine_ids = __engine_ids;
 	unsigned long engine_ids = 0;
+	u32 val;
 
 	if (verbose)
 		gk20a_debug_dump(g->dev);
@@ -1302,7 +1277,23 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 
 	}
 
+	/*
+	 * sched error prevents recovery, and ctxsw error will retrigger
+	 * every 100ms. Disable the sched error to allow recovery.
+	 */
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val &= ~(fifo_intr_en_0_sched_error_m() | fifo_intr_en_0_mmu_fault_m());
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
+	gk20a_writel(g, fifo_intr_0_r(),
+		fifo_intr_0_sched_error_reset_f());
+
 	g->ops.fifo.trigger_mmu_fault(g, engine_ids);
+	gk20a_fifo_handle_mmu_fault(g);
+
+	val = gk20a_readl(g, fifo_intr_en_0_r());
+	val |= fifo_intr_en_0_mmu_fault_f(1)
+		| fifo_intr_en_0_sched_error_f(1);
+	gk20a_writel(g, fifo_intr_en_0_r(), val);
 }
 
 int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose)