author     Vijayakumar <vsubbu@nvidia.com>          2015-06-29 05:12:56 -0400
committer  Vijayakumar Subbu <vsubbu@nvidia.com>    2015-07-17 04:16:48 -0400
commit     55c85cfa7bc297b525a3b099d469eee0b71b155a (patch)
tree       9835cc4b39d45fa1dac2e4b9dc477a6d39569232 /drivers/gpu/nvgpu/gk20a
parent     37869170e4f3c42fa31faa1bcda1e6c0a188179c (diff)
gpu: nvgpu: improve sched err handling
bug 200114561
1) When handling a sched error, if the CTXSW status reads "switch",
   check the FECS mailbox register to find out whether the next or
   the current channel caused the error (condensed in the sketch below).
2) Update the recovery function to use the channel id passed to it.
3) The recovery function now passes mmu_engine_id to the MMU fault
   handler instead of fifo_engine_id.
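The id selection described in item 1 can be condensed as follows. This is only
a sketch of the logic added to gk20a_fifo_handle_sched_error() in the diff
below: the helper name and its parameters are illustrative, and the register
reads are passed in as plain values instead of using gk20a_readl() and the
fifo_engine_status_* accessors the driver actually uses.

/* Sketch: pick the faulting channel id from the engine CTXSW status. */
#define FECS_METHOD_WFI_RESTORE 0x80000

static unsigned int sched_err_pick_id(unsigned int ctx_status,
				      unsigned int ctxsw_load_v,
				      unsigned int ctxsw_switch_v,
				      unsigned int mailbox2,
				      unsigned int next_id,
				      unsigned int current_id)
{
	if (ctx_status == ctxsw_load_v)
		/* load: the incoming (next) channel is at fault */
		return next_id;
	if (ctx_status == ctxsw_switch_v)
		/* switch: FECS mailbox(2) tells us whether the restore
		 * (next) or the save (current) side caused the error */
		return (mailbox2 & FECS_METHOD_WFI_RESTORE) ?
			next_id : current_id;
	/* save or valid: the current channel is at fault */
	return current_id;
}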
Change-Id: I3576cc4a90408b2f76b2c42cce19c27344531b1c
Signed-off-by: Vijayakumar <vsubbu@nvidia.com>
Reviewed-on: http://git-master/r/763538
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: Sachin Nikam <snikam@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c  132
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h    2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c      2
3 files changed, 85 insertions, 51 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 034d060a..b195cf88 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -34,6 +34,7 @@
 #include "hw_top_gk20a.h"
 #include "hw_mc_gk20a.h"
 #include "hw_gr_gk20a.h"
+#define FECS_METHOD_WFI_RESTORE 0x80000
 
 static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
 					    u32 hw_chid, bool add,
@@ -1177,7 +1178,6 @@ static u32 gk20a_fifo_engines_on_id(struct gk20a *g, u32 id, bool is_tsg)
 			fifo_engine_status_id_type_v(status);
 		bool busy = fifo_engine_status_engine_v(status) ==
 			fifo_engine_status_engine_busy_v();
-
 		if (busy && ctx_id == id) {
 			if ((is_tsg && type ==
 					fifo_engine_status_id_type_tsgid_v()) ||
@@ -1202,7 +1202,7 @@ void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose)
 	engines = gk20a_fifo_engines_on_id(g, hw_chid, false);
 
 	if (engines)
-		gk20a_fifo_recover(g, engines, hw_chid, false, verbose);
+		gk20a_fifo_recover(g, engines, hw_chid, false, true, verbose);
 	else {
 		struct channel_gk20a *ch = &g->fifo.channel[hw_chid];
 
@@ -1232,7 +1232,7 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 	engines = gk20a_fifo_engines_on_id(g, tsgid, true);
 
 	if (engines)
-		gk20a_fifo_recover(g, engines, tsgid, true, verbose);
+		gk20a_fifo_recover(g, engines, tsgid, true, true, verbose);
 	else {
 		struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
 
@@ -1248,13 +1248,16 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose)
 
 void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 			u32 hw_id, bool id_is_tsg,
-			bool verbose)
+			bool id_is_known, bool verbose)
 {
 	unsigned long engine_id, i;
 	unsigned long _engine_ids = __engine_ids;
 	unsigned long engine_ids = 0;
 	u32 val;
 	u32 mmu_fault_engines = 0;
+	u32 ref_type;
+	u32 ref_id;
+	u32 ref_id_is_tsg = false;
 
 	if (verbose)
 		gk20a_debug_dump(g->dev);
@@ -1262,44 +1265,65 @@ void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
 	if (g->ops.ltc.flush)
 		g->ops.ltc.flush(g);
 
-	/* store faulted engines in advance */
-	for_each_set_bit(engine_id, &_engine_ids, 32) {
-		u32 ref_type;
-		u32 ref_id;
-		gk20a_fifo_get_faulty_id_type(g, engine_id, &ref_id,
-					      &ref_type);
-
-		/* Reset *all* engines that use the
-		 * same channel as faulty engine */
-		for (i = 0; i < g->fifo.max_engines; i++) {
-			u32 type;
-			u32 id;
-			gk20a_fifo_get_faulty_id_type(g, i, &id, &type);
-			if (ref_type == type && ref_id == id) {
-				engine_ids |= BIT(i);
-				mmu_fault_engines |=
+	if (id_is_known) {
+		engine_ids = gk20a_fifo_engines_on_id(g, hw_id, id_is_tsg);
+		ref_id = hw_id;
+		ref_type = id_is_tsg ?
+			fifo_engine_status_id_type_tsgid_v() :
+			fifo_engine_status_id_type_chid_v();
+		ref_id_is_tsg = id_is_tsg;
+		/* atleast one engine will get passed during sched err*/
+		engine_ids |= __engine_ids;
+		for_each_set_bit(engine_id, &engine_ids, 32) {
+			mmu_fault_engines |=
+				BIT(gk20a_engine_id_to_mmu_id(engine_id));
+		}
+	} else {
+		/* store faulted engines in advance */
+		for_each_set_bit(engine_id, &_engine_ids, 32) {
+			gk20a_fifo_get_faulty_id_type(g, engine_id, &ref_id,
+						      &ref_type);
+			if (ref_type == fifo_engine_status_id_type_tsgid_v())
+				ref_id_is_tsg = true;
+			else
+				ref_id_is_tsg = false;
+			/* Reset *all* engines that use the
+			 * same channel as faulty engine */
+			for (i = 0; i < g->fifo.max_engines; i++) {
+				u32 type;
+				u32 id;
+
+				gk20a_fifo_get_faulty_id_type(g, i, &id, &type);
+				if (ref_type == type && ref_id == id) {
+					engine_ids |= BIT(i);
+					mmu_fault_engines |=
 					BIT(gk20a_engine_id_to_mmu_id(i));
+				}
 			}
 		}
 	}
 
-	/*
-	 * sched error prevents recovery, and ctxsw error will retrigger
-	 * every 100ms. Disable the sched error to allow recovery.
-	 */
-	val = gk20a_readl(g, fifo_intr_en_0_r());
-	val &= ~(fifo_intr_en_0_sched_error_m() | fifo_intr_en_0_mmu_fault_m());
-	gk20a_writel(g, fifo_intr_en_0_r(), val);
-	gk20a_writel(g, fifo_intr_0_r(),
-		fifo_intr_0_sched_error_reset_f());
-
-	g->ops.fifo.trigger_mmu_fault(g, engine_ids);
-	gk20a_fifo_handle_mmu_fault(g, engine_ids, hw_id, id_is_tsg);
-
-	val = gk20a_readl(g, fifo_intr_en_0_r());
-	val |= fifo_intr_en_0_mmu_fault_f(1)
-		| fifo_intr_en_0_sched_error_f(1);
-	gk20a_writel(g, fifo_intr_en_0_r(), val);
+	if (mmu_fault_engines) {
+		/*
+		 * sched error prevents recovery, and ctxsw error will retrigger
+		 * every 100ms. Disable the sched error to allow recovery.
+		 */
+		val = gk20a_readl(g, fifo_intr_en_0_r());
+		val &= ~(fifo_intr_en_0_sched_error_m() |
+			fifo_intr_en_0_mmu_fault_m());
+		gk20a_writel(g, fifo_intr_en_0_r(), val);
+		gk20a_writel(g, fifo_intr_0_r(),
+			fifo_intr_0_sched_error_reset_f());
+
+		g->ops.fifo.trigger_mmu_fault(g, engine_ids);
+		gk20a_fifo_handle_mmu_fault(g, mmu_fault_engines, ref_id,
+				ref_id_is_tsg);
+
+		val = gk20a_readl(g, fifo_intr_en_0_r());
+		val |= fifo_intr_en_0_mmu_fault_f(1)
+			| fifo_intr_en_0_sched_error_f(1);
+		gk20a_writel(g, fifo_intr_en_0_r(), val);
+	}
 }
 
 /* force reset channel and tsg (if it's part of one) */
@@ -1340,7 +1364,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 	int id = -1;
 	bool non_chid = false;
 	bool ret = false;
-
+	u32 mailbox2;
 	/* read the scheduler error register */
 	sched_error = gk20a_readl(g, fifo_intr_sched_error_r());
 
@@ -1362,15 +1386,24 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 			|| ctx_status ==
 			fifo_engine_status_ctx_status_ctxsw_load_v());
 
-		if (failing_engine) {
-			id = (ctx_status ==
-				fifo_engine_status_ctx_status_ctxsw_load_v()) ?
-				fifo_engine_status_next_id_v(status) :
-				fifo_engine_status_id_v(status);
-			non_chid = fifo_pbdma_status_id_type_v(status) !=
-				fifo_pbdma_status_id_type_chid_v();
-			break;
+		if (!failing_engine)
+			continue;
+		if (ctx_status ==
+				fifo_engine_status_ctx_status_ctxsw_load_v()) {
+			id = fifo_engine_status_next_id_v(status);
+			non_chid = fifo_pbdma_status_id_type_v(status)
+				!= fifo_pbdma_status_id_type_chid_v();
+		} else if (ctx_status ==
+				fifo_engine_status_ctx_status_ctxsw_switch_v()) {
+			mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2));
+			if (mailbox2 & FECS_METHOD_WFI_RESTORE)
+				id = fifo_engine_status_next_id_v(status);
+			else
+				id = fifo_engine_status_id_v(status);
+		} else {
+			id = fifo_engine_status_id_v(status);
 		}
+		break;
 	}
 
 	/* could not find the engine - should never happen */
@@ -1387,7 +1420,8 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 		struct channel_gk20a *ch = &f->channel[id];
 
 		if (non_chid) {
-			gk20a_fifo_recover(g, BIT(engine_id), id, true, true);
+			gk20a_fifo_recover(g, BIT(engine_id), id, true,
+					true, true);
 			ret = true;
 			goto err;
 		}
@@ -1404,7 +1438,7 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
 				"engine = %u, ch = %d", engine_id, id);
 			gk20a_gr_debug_dump(g->dev);
 			gk20a_fifo_recover(g, BIT(engine_id), id, false,
-				ch->timeout_debug_dump);
+				true, ch->timeout_debug_dump);
 			ret = true;
 		} else {
 			gk20a_dbg_info(
@@ -1899,7 +1933,7 @@ static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id)
 	}
 
 	if (engines)
-		gk20a_fifo_recover(g, engines, ~(u32)0, false, true);
+		gk20a_fifo_recover(g, engines, ~(u32)0, false, false, true);
 }
 
 static int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id)
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index fdf843d2..bc9315d2 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -164,7 +164,7 @@ void gk20a_fifo_recover(struct gk20a *g,
 		u32 engine_ids, /* if zero, will be queried from HW */
 		u32 hw_id, /* if ~0, will be queried from HW */
 		bool hw_id_is_tsg, /* ignored if hw_id == ~0 */
-		bool verbose);
+		bool id_is_known, bool verbose);
 void gk20a_fifo_recover_ch(struct gk20a *g, u32 hw_chid, bool verbose);
 void gk20a_fifo_recover_tsg(struct gk20a *g, u32 tsgid, bool verbose);
 int gk20a_fifo_force_reset_ch(struct channel_gk20a *ch, bool verbose);
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index cf9cab0e..501a7deb 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -5581,7 +5581,7 @@ int gk20a_gr_isr(struct gk20a *g)
 
 	if (need_reset)
 		gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A),
-				~(u32)0, false, true);
+				~(u32)0, false, false, true);
 
 clean_up:
 	if (gr_intr && !ch) {