diff options
author | Terje Bergstrom <tbergstrom@nvidia.com> | 2015-02-12 13:53:26 -0500 |
---|---|---|
committer | Dan Willemsen <dwillemsen@nvidia.com> | 2015-04-04 21:06:37 -0400 |
commit | 226c671f8e99e7ed274c5c630090c6190a1367a5 (patch) | |
tree | e4dfc9a123ec5831210d0b1ea4a3044e987d6c33 | |
parent | a3b26f25a226ea56e84dbda1bb510c1a7bc11054 (diff) |
gpu: nvgpu: More robust recovery
Make recovery a more straightforward process. When we detect a fault,
trigger an MMU fault, wait for it to be raised, and then complete recovery.
Also reset engines before aborting channel to ensure no stray sync
point increments can happen.
Change-Id: Iac685db6534cb64fe62d9fb452391f43100f2999
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/709060
(cherry picked from commit 95c62ffd9ac30a0d2eb88d033dcc6e6ff25efd6f)
Reviewed-on: http://git-master/r/707443
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 81 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | 1 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h | 14 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gm20b/hw_fifo_gm20b.h | 14 |
5 files changed, 64 insertions, 48 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index a32496a2..4e68fe67 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -570,7 +570,7 @@ void gk20a_free_channel(struct channel_gk20a *ch, bool finish) | |||
570 | if (g->fifo.deferred_reset_pending) { | 570 | if (g->fifo.deferred_reset_pending) { |
571 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was" | 571 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was" |
572 | " deferred, running now"); | 572 | " deferred, running now"); |
573 | fifo_gk20a_finish_mmu_fault_handling(g, g->fifo.mmu_fault_engines); | 573 | gk20a_fifo_reset_engine(g, g->fifo.mmu_fault_engines); |
574 | g->fifo.mmu_fault_engines = 0; | 574 | g->fifo.mmu_fault_engines = 0; |
575 | g->fifo.deferred_reset_pending = false; | 575 | g->fifo.deferred_reset_pending = false; |
576 | } | 576 | } |
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index a872e304..18928142 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -827,7 +827,7 @@ static inline void get_exception_mmu_fault_info( | |||
827 | f->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v(); | 827 | f->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v(); |
828 | } | 828 | } |
829 | 829 | ||
830 | static void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id) | 830 | void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id) |
831 | { | 831 | { |
832 | gk20a_dbg_fn(""); | 832 | gk20a_dbg_fn(""); |
833 | 833 | ||
@@ -877,34 +877,6 @@ static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id, | |||
877 | return true; | 877 | return true; |
878 | } | 878 | } |
879 | 879 | ||
880 | void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g, | ||
881 | unsigned long fault_id) { | ||
882 | u32 engine_mmu_id; | ||
883 | |||
884 | /* reset engines */ | ||
885 | for_each_set_bit(engine_mmu_id, &fault_id, 32) { | ||
886 | u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id); | ||
887 | if (engine_id != ~0) | ||
888 | gk20a_fifo_reset_engine(g, engine_id); | ||
889 | } | ||
890 | |||
891 | /* clear interrupt */ | ||
892 | gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id); | ||
893 | |||
894 | /* resume scheduler */ | ||
895 | gk20a_writel(g, fifo_error_sched_disable_r(), | ||
896 | gk20a_readl(g, fifo_error_sched_disable_r())); | ||
897 | |||
898 | /* Re-enable fifo access */ | ||
899 | gk20a_writel(g, gr_gpfifo_ctl_r(), | ||
900 | gr_gpfifo_ctl_access_enabled_f() | | ||
901 | gr_gpfifo_ctl_semaphore_access_enabled_f()); | ||
902 | |||
903 | /* It is safe to enable ELPG again. */ | ||
904 | if (support_gk20a_pmu(g->dev) && g->elpg_enabled) | ||
905 | gk20a_pmu_enable_elpg(g); | ||
906 | } | ||
907 | |||
908 | static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g, | 880 | static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g, |
909 | struct channel_gk20a *ch) | 881 | struct channel_gk20a *ch) |
910 | { | 882 | { |
@@ -1083,10 +1055,12 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) | |||
1083 | 1055 | ||
1084 | /* handled during channel free */ | 1056 | /* handled during channel free */ |
1085 | g->fifo.deferred_reset_pending = true; | 1057 | g->fifo.deferred_reset_pending = true; |
1086 | } | 1058 | } else if (engine_id != ~0) |
1059 | gk20a_fifo_reset_engine(g, engine_id); | ||
1087 | 1060 | ||
1088 | /* disable the channel/TSG from hw and increment | 1061 | /* disable the channel/TSG from hw and increment |
1089 | * syncpoints */ | 1062 | * syncpoints */ |
1063 | |||
1090 | if (tsg) { | 1064 | if (tsg) { |
1091 | struct channel_gk20a *ch = NULL; | 1065 | struct channel_gk20a *ch = NULL; |
1092 | if (!g->fifo.deferred_reset_pending) | 1066 | if (!g->fifo.deferred_reset_pending) |
@@ -1119,9 +1093,21 @@ static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g) | |||
1119 | return verbose; | 1093 | return verbose; |
1120 | } | 1094 | } |
1121 | 1095 | ||
1122 | /* resetting the engines and clearing the runlists is done in | 1096 | /* clear interrupt */ |
1123 | a separate function to allow deferred reset. */ | 1097 | gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id); |
1124 | fifo_gk20a_finish_mmu_fault_handling(g, fault_id); | 1098 | |
1099 | /* resume scheduler */ | ||
1100 | gk20a_writel(g, fifo_error_sched_disable_r(), | ||
1101 | gk20a_readl(g, fifo_error_sched_disable_r())); | ||
1102 | |||
1103 | /* Re-enable fifo access */ | ||
1104 | gk20a_writel(g, gr_gpfifo_ctl_r(), | ||
1105 | gr_gpfifo_ctl_access_enabled_f() | | ||
1106 | gr_gpfifo_ctl_semaphore_access_enabled_f()); | ||
1107 | |||
1108 | /* It is safe to enable ELPG again. */ | ||
1109 | if (support_gk20a_pmu(g->dev) && g->elpg_enabled) | ||
1110 | gk20a_pmu_enable_elpg(g); | ||
1125 | return verbose; | 1111 | return verbose; |
1126 | } | 1112 | } |
1127 | 1113 | ||
@@ -1152,15 +1138,6 @@ static void gk20a_fifo_trigger_mmu_fault(struct gk20a *g, | |||
1152 | unsigned long engine_id; | 1138 | unsigned long engine_id; |
1153 | int ret; | 1139 | int ret; |
1154 | 1140 | ||
1155 | /* | ||
1156 | * sched error prevents recovery, and ctxsw error will retrigger | ||
1157 | * every 100ms. Disable the sched error to allow recovery. | ||
1158 | */ | ||
1159 | gk20a_writel(g, fifo_intr_en_0_r(), | ||
1160 | 0x7FFFFFFF & ~fifo_intr_en_0_sched_error_m()); | ||
1161 | gk20a_writel(g, fifo_intr_0_r(), | ||
1162 | fifo_intr_0_sched_error_reset_f()); | ||
1163 | |||
1164 | /* trigger faults for all bad engines */ | 1141 | /* trigger faults for all bad engines */ |
1165 | for_each_set_bit(engine_id, &engine_ids, 32) { | 1142 | for_each_set_bit(engine_id, &engine_ids, 32) { |
1166 | if (engine_id > g->fifo.max_engines) { | 1143 | if (engine_id > g->fifo.max_engines) { |
@@ -1194,9 +1171,6 @@ static void gk20a_fifo_trigger_mmu_fault(struct gk20a *g, | |||
1194 | /* release mmu fault trigger */ | 1171 | /* release mmu fault trigger */ |
1195 | for_each_set_bit(engine_id, &engine_ids, 32) | 1172 | for_each_set_bit(engine_id, &engine_ids, 32) |
1196 | gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id), 0); | 1173 | gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id), 0); |
1197 | |||
1198 | /* Re-enable sched error */ | ||
1199 | gk20a_writel(g, fifo_intr_en_0_r(), 0x7FFFFFFF); | ||
1200 | } | 1174 | } |
1201 | 1175 | ||
1202 | static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg) | 1176 | static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg) |
@@ -1272,6 +1246,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, | |||
1272 | unsigned long engine_id, i; | 1246 | unsigned long engine_id, i; |
1273 | unsigned long _engine_ids = __engine_ids; | 1247 | unsigned long _engine_ids = __engine_ids; |
1274 | unsigned long engine_ids = 0; | 1248 | unsigned long engine_ids = 0; |
1249 | u32 val; | ||
1275 | 1250 | ||
1276 | if (verbose) | 1251 | if (verbose) |
1277 | gk20a_debug_dump(g->dev); | 1252 | gk20a_debug_dump(g->dev); |
@@ -1302,7 +1277,23 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, | |||
1302 | 1277 | ||
1303 | } | 1278 | } |
1304 | 1279 | ||
1280 | /* | ||
1281 | * sched error prevents recovery, and ctxsw error will retrigger | ||
1282 | * every 100ms. Disable the sched error to allow recovery. | ||
1283 | */ | ||
1284 | val = gk20a_readl(g, fifo_intr_en_0_r()); | ||
1285 | val &= ~(fifo_intr_en_0_sched_error_m() | fifo_intr_en_0_mmu_fault_m()); | ||
1286 | gk20a_writel(g, fifo_intr_en_0_r(), val); | ||
1287 | gk20a_writel(g, fifo_intr_0_r(), | ||
1288 | fifo_intr_0_sched_error_reset_f()); | ||
1289 | |||
1305 | g->ops.fifo.trigger_mmu_fault(g, engine_ids); | 1290 | g->ops.fifo.trigger_mmu_fault(g, engine_ids); |
1291 | gk20a_fifo_handle_mmu_fault(g); | ||
1292 | |||
1293 | val = gk20a_readl(g, fifo_intr_en_0_r()); | ||
1294 | val |= fifo_intr_en_0_mmu_fault_f(1) | ||
1295 | | fifo_intr_en_0_sched_error_f(1); | ||
1296 | gk20a_writel(g, fifo_intr_en_0_r(), val); | ||
1306 | } | 1297 | } |
1307 | 1298 | ||
1308 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) | 1299 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose) |
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index ecae970f..8fda38f5 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | |||
@@ -162,6 +162,7 @@ void gk20a_fifo_recover(struct gk20a *g, u32 engine_ids, bool verbose); | |||
162 | void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose); | 162 | void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose); |
163 | void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); | 163 | void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose); |
164 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose); | 164 | int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose); |
165 | void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id); | ||
165 | int gk20a_init_fifo_reset_enable_hw(struct gk20a *g); | 166 | int gk20a_init_fifo_reset_enable_hw(struct gk20a *g); |
166 | void gk20a_init_fifo(struct gpu_ops *gops); | 167 | void gk20a_init_fifo(struct gpu_ops *gops); |
167 | 168 | ||
diff --git a/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h index 757ae3f0..a131972e 100644 --- a/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2012-2015, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify it | 4 | * This program is free software; you can redistribute it and/or modify it |
5 | * under the terms and conditions of the GNU General Public License, | 5 | * under the terms and conditions of the GNU General Public License, |
@@ -246,10 +246,22 @@ static inline u32 fifo_intr_en_0_r(void) | |||
246 | { | 246 | { |
247 | return 0x00002140; | 247 | return 0x00002140; |
248 | } | 248 | } |
249 | static inline u32 fifo_intr_en_0_sched_error_f(u32 v) | ||
250 | { | ||
251 | return (v & 0x1) << 8; | ||
252 | } | ||
249 | static inline u32 fifo_intr_en_0_sched_error_m(void) | 253 | static inline u32 fifo_intr_en_0_sched_error_m(void) |
250 | { | 254 | { |
251 | return 0x1 << 8; | 255 | return 0x1 << 8; |
252 | } | 256 | } |
257 | static inline u32 fifo_intr_en_0_mmu_fault_f(u32 v) | ||
258 | { | ||
259 | return (v & 0x1) << 28; | ||
260 | } | ||
261 | static inline u32 fifo_intr_en_0_mmu_fault_m(void) | ||
262 | { | ||
263 | return 0x1 << 28; | ||
264 | } | ||
253 | static inline u32 fifo_intr_en_1_r(void) | 265 | static inline u32 fifo_intr_en_1_r(void) |
254 | { | 266 | { |
255 | return 0x00002528; | 267 | return 0x00002528; |
diff --git a/drivers/gpu/nvgpu/gm20b/hw_fifo_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_fifo_gm20b.h index acbe6a4e..f3a24f61 100644 --- a/drivers/gpu/nvgpu/gm20b/hw_fifo_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/hw_fifo_gm20b.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. | 2 | * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify it | 4 | * This program is free software; you can redistribute it and/or modify it |
5 | * under the terms and conditions of the GNU General Public License, | 5 | * under the terms and conditions of the GNU General Public License, |
@@ -206,10 +206,22 @@ static inline u32 fifo_intr_en_0_r(void) | |||
206 | { | 206 | { |
207 | return 0x00002140; | 207 | return 0x00002140; |
208 | } | 208 | } |
209 | static inline u32 fifo_intr_en_0_sched_error_f(u32 v) | ||
210 | { | ||
211 | return (v & 0x1) << 8; | ||
212 | } | ||
209 | static inline u32 fifo_intr_en_0_sched_error_m(void) | 213 | static inline u32 fifo_intr_en_0_sched_error_m(void) |
210 | { | 214 | { |
211 | return 0x1 << 8; | 215 | return 0x1 << 8; |
212 | } | 216 | } |
217 | static inline u32 fifo_intr_en_0_mmu_fault_f(u32 v) | ||
218 | { | ||
219 | return (v & 0x1) << 28; | ||
220 | } | ||
221 | static inline u32 fifo_intr_en_0_mmu_fault_m(void) | ||
222 | { | ||
223 | return 0x1 << 28; | ||
224 | } | ||
213 | static inline u32 fifo_intr_en_1_r(void) | 225 | static inline u32 fifo_intr_en_1_r(void) |
214 | { | 226 | { |
215 | return 0x00002528; | 227 | return 0x00002528; |