diff options
author | Terje Bergstrom <tbergstrom@nvidia.com> | 2015-02-12 13:53:26 -0500 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-04-04 21:06:37 -0400 |
commit | 226c671f8e99e7ed274c5c630090c6190a1367a5 (patch) | |
tree | e4dfc9a123ec5831210d0b1ea4a3044e987d6c33 /drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |
parent | a3b26f25a226ea56e84dbda1bb510c1a7bc11054 (diff) |
gpu: nvgpu: More robust recovery
Make recovery a more straightforward process. When we detect a fault,
we trigger an MMU fault, wait for it to be raised, and then complete recovery.
Also reset engines before aborting channel to ensure no stray sync
point increments can happen.
Change-Id: Iac685db6534cb64fe62d9fb452391f43100f2999
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/709060
(cherry picked from commit 95c62ffd9ac30a0d2eb88d033dcc6e6ff25efd6f)
Reviewed-on: http://git-master/r/707443
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fifo_gk20a.c')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 81 |
1 file changed, 36 insertions, 45 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index a872e304..18928142 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -827,7 +827,7 @@ static inline void get_exception_mmu_fault_info( | |||
827 | f->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v(); | 827 | f->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v(); |
828 | } | 828 | } |
829 | 829 | ||
830 | static void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id) | 830 | void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id) |
831 | { | 831 | { |
832 | gk20a_dbg_fn(""); | 832 | gk20a_dbg_fn(""); |
833 | 833 | ||
@@ -877,34 +877,6 @@ static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id, | |||
877 | return true; | 877 | return true; |
878 | } | 878 | } |
879 | 879 | ||
880 | void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g, | ||
881 | unsigned long fault_id) { | ||
882 | u32 engine_mmu_id; | ||
883 | |||
884 | /* reset engines */ | ||
885 | for_each_set_bit(engine_mmu_id, &fault_id, 32) { | ||
886 | u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id); | ||
887 | if (engine_id != ~0) | ||
888 | gk20a_fifo_reset_engine(g, engine_id); | ||
889 | } | ||
890 | |||
891 | /* clear interrupt */ | ||
892 | gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id); | ||
893 | |||
894 | /* resume scheduler */ | ||
895 | gk20a_writel(g, fifo_error_sched_disable_r(), | ||
896 | gk20a_readl(g, fifo_error_sched_disable_r())); | ||
897 | |||
898 | /* Re-enable fifo access */ | ||
899 | gk20a_writel(g, gr_gpfifo_ctl_r(), | ||
900 | gr_gpfifo_ctl_access_enabled_f() | | ||
901 | gr_gpfifo_ctl_semaphore_access_enabled_f()); | ||
902 | |||
903 | /* It is safe to enable ELPG again. */ | ||
904 | if (support_gk20a_pmu(g->dev) && g->elpg_enabled) | ||
905 | gk20a_pmu_enable_elpg(g); | ||
906 | } | ||
907 | |||
908 | static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g, | 880 | static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g, |
909 | struct channel_gk20a *ch) | 881 | struct channel_gk20a *ch) |
910 | { | 882 | { |
@@ -1083,10 +1055,12 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) | |||
1083 | 1055 | ||
1084 | /* handled during channel free */ | 1056 | /* handled during channel free */ |
1085 | g->fifo.deferred_reset_pending = true; | 1057 | g->fifo.deferred_reset_pending = true; |
1086 | } | 1058 | } else if (engine_id != ~0) |
1059 | gk20a_fifo_reset_engine(g, engine_id); | ||
1087 | 1060 | ||
1088 | /* disable the channel/TSG from hw and increment | 1061 | /* disable the channel/TSG from hw and increment |
1089 | * syncpoints */ | 1062 | * syncpoints */ |
1063 | |||
1090 | if (tsg) { | 1064 | if (tsg) { |
1091 | struct channel_gk20a *ch = NULL; | 1065 | struct channel_gk20a *ch = NULL; |
1092 | if (!g->fifo.deferred_reset_pending) | 1066 | if (!g->fifo.deferred_reset_pending) |
@@ -1119,9 +1093,21 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) | |||
1119 | return verbose; | 1093 | return verbose; |
1120 | } | 1094 | } |
1121 | 1095 | ||
1122 | /* resetting the engines and clearing the runlists is done in | 1096 | /* clear interrupt */ |
1123 | a separate function to allow deferred reset. */ | 1097 | gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id); |
1124 | fifo_gk20a_finish_mmu_fault_handling(g, fault_id); | 1098 | |
1099 | /* resume scheduler */ | ||
1100 | gk20a_writel(g, fifo_error_sched_disable_r(), | ||
1101 | gk20a_readl(g, fifo_error_sched_disable_r())); | ||
1102 | |||
1103 | /* Re-enable fifo access */ | ||
1104 | gk20a_writel(g, gr_gpfifo_ctl_r(), | ||
1105 | gr_gpfifo_ctl_access_enabled_f() | | ||
1106 | gr_gpfifo_ctl_semaphore_access_enabled_f()); | ||
1107 | |||
1108 | /* It is safe to enable ELPG again. */ | ||
1109 | if (support_gk20a_pmu(g->dev) && g->elpg_enabled) | ||
1110 | gk20a_pmu_enable_elpg(g); | ||
1125 | return verbose; | 1111 | return verbose; |
1126 | } | 1112 | } |
1127 | 1113 | ||
@@ -1152,15 +1138,6 @@ static void gk20a_fifo_trigger_mmu_fault(struct gk20a *g, | |||
1152 | unsigned long engine_id; | 1138 | unsigned long engine_id; |
1153 | int ret; | 1139 | int ret; |
1154 | 1140 | ||
1155 | /* | ||
1156 | * sched error prevents recovery, and ctxsw error will retrigger | ||
1157 | * every 100ms. Disable the sched error to allow recovery. | ||
1158 | */ | ||
1159 | gk20a_writel(g, fifo_intr_en_0_r(), | ||
1160 | 0x7FFFFFFF & ~fifo_intr_en_0_sched_error_m()); | ||
1161 | gk20a_writel(g, fifo_intr_0_r(), | ||
1162 | fifo_intr_0_sched_error_reset_f()); | ||
1163 | |||
1164 | /* trigger faults for all bad engines */ | 1141 | /* trigger faults for all bad engines */ |
1165 | for_each_set_bit(engine_id, &engine_ids, 32) { | 1142 | for_each_set_bit(engine_id, &engine_ids, 32) { |
1166 | if (engine_id > g->fifo.max_engines) { | 1143 | if (engine_id > g->fifo.max_engines) { |
@@ -1194,9 +1171,6 @@ static void gk20a_fifo_trigger_mmu_fault(struct gk20a *g, | |||
1194 | /* release mmu fault trigger */ | 1171 | /* release mmu fault trigger */ |
1195 | for_each_set_bit(engine_id, &engine_ids, 32) | 1172 | for_each_set_bit(engine_id, &engine_ids, 32) |
1196 | gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id), 0); | 1173 | gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id), 0); |
1197 | |||
1198 | /* Re-enable sched error */ | ||
1199 | gk20a_writel(g, fifo_intr_en_0_r(), 0x7FFFFFFF); | ||
1200 | } | 1174 | } |
1201 | 1175 | ||
1202 | static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg) | 1176 | static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg) |
@@ -1272,6 +1246,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, | |||
1272 | unsigned long engine_id, i; | 1246 | unsigned long engine_id, i; |
1273 | unsigned long _engine_ids = __engine_ids; | 1247 | unsigned long _engine_ids = __engine_ids; |
1274 | unsigned long engine_ids = 0; | 1248 | unsigned long engine_ids = 0; |
1249 | u32 val; | ||
1275 | 1250 | ||
1276 | if (verbose) | 1251 | if (verbose) |
1277 | gk20a_debug_dump(g->dev); | 1252 | gk20a_debug_dump(g->dev); |
@@ -1302,7 +1277,23 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, | |||
1302 | 1277 | ||
1303 | } | 1278 | } |
1304 | 1279 | ||
1280 | /* | ||
1281 | * sched error prevents recovery, and ctxsw error will retrigger | ||
1282 | * every 100ms. Disable the sched error to allow recovery. | ||
1283 | */ | ||
1284 | val = gk20a_readl(g, fifo_intr_en_0_r()); | ||
1285 | val &= ~(fifo_intr_en_0_sched_error_m() | fifo_intr_en_0_mmu_fault_m()); | ||
1286 | gk20a_writel(g, fifo_intr_en_0_r(), val); | ||
1287 | gk20a_writel(g, fifo_intr_0_r(), | ||
1288 | fifo_intr_0_sched_error_reset_f()); | ||
1289 | |||
1305 | g->ops.fifo.trigger_mmu_fault(g, engine_ids); | 1290 | g->ops.fifo.trigger_mmu_fault(g, engine_ids); |
1291 | gk20a_fifo_handle_mmu_fault(g); | ||
1292 | |||
1293 | val = gk20a_readl(g, fifo_intr_en_0_r()); | ||
1294 | val |= fifo_intr_en_0_mmu_fault_f(1) | ||
1295 | | fifo_intr_en_0_sched_error_f(1); | ||
1296 | gk20a_writel(g, fifo_intr_en_0_r(), val); | ||
1306 | } | 1297 | } |
1307 | 1298 | ||
1308 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) | 1299 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) |