diff options
author | Seema Khowala <seemaj@nvidia.com> | 2017-03-09 01:34:49 -0500 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-04-12 18:33:50 -0400 |
commit | 457f176785af5c8821889d00d89db05bbaf8f772 (patch) | |
tree | d4b7913ffc728c1bda19a70746d758df3ff2f7a0 /drivers/gpu/nvgpu/gv11b | |
parent | fbce374aa0f6101d27ca5b3de97905d2798c6f04 (diff) |
gpu: nvgpu: gv11b: init handle sched_error & ctxsw_timeout ops
- detect and decode sched_error type. Any sched error starting with xxx_* is
not supported in h/w and should never be seen by s/w
- for bad_tsg sched error, preempt all runlists to recover as faulted ch/tsg
is unknown. For other errors, just report error.
- ctxsw timeout is not part of sched error fifo interrupt. A new
fifo interrupt, ctxsw timeout is added in gv11b. Add s/w handling.
Bug 1856152
JIRA GPUT19X-74
Change-Id: I474e1a3cda29a450691fe2ea1dc1e239ce57df1a
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: http://git-master/r/1317615
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b')
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 237 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/fifo_gv11b.h | 14 |
2 files changed, 248 insertions, 3 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c index 3c1982fe..6883d867 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | |||
@@ -15,13 +15,15 @@ | |||
15 | #include <linux/delay.h> | 15 | #include <linux/delay.h> |
16 | #include <linux/types.h> | 16 | #include <linux/types.h> |
17 | 17 | ||
18 | #include "nvgpu/semaphore.h" | 18 | #include <nvgpu/semaphore.h> |
19 | #include <nvgpu/timers.h> | 19 | #include <nvgpu/timers.h> |
20 | #include <nvgpu/log.h> | ||
20 | 21 | ||
21 | 22 | ||
22 | #include "gk20a/gk20a.h" | 23 | #include "gk20a/gk20a.h" |
23 | #include "gk20a/fifo_gk20a.h" | 24 | #include "gk20a/fifo_gk20a.h" |
24 | #include "gk20a/ctxsw_trace_gk20a.h" | 25 | #include "gk20a/ctxsw_trace_gk20a.h" |
26 | #include "gk20a/channel_gk20a.h" | ||
25 | 27 | ||
26 | #include "gp10b/fifo_gp10b.h" | 28 | #include "gp10b/fifo_gp10b.h" |
27 | 29 | ||
@@ -862,7 +864,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
862 | gk20a_dbg_info("hw id =%d", id); | 864 | gk20a_dbg_info("hw id =%d", id); |
863 | gk20a_dbg_info("id_type =%d", id_type); | 865 | gk20a_dbg_info("id_type =%d", id_type); |
864 | gk20a_dbg_info("rc_type =%d", rc_type); | 866 | gk20a_dbg_info("rc_type =%d", rc_type); |
865 | gk20a_dbg_info("mmu_fault =%p", mmfault); | 867 | gk20a_dbg_info("mmu_fault =0x%p", mmfault); |
866 | 868 | ||
867 | runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id, | 869 | runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id, |
868 | id_type, rc_type, mmfault); | 870 | id_type, rc_type, mmfault); |
@@ -1060,7 +1062,8 @@ static u32 gv11b_fifo_intr_0_en_mask(struct gk20a *g) | |||
1060 | intr_0_en_mask = g->ops.fifo.intr_0_error_mask(g); | 1062 | intr_0_en_mask = g->ops.fifo.intr_0_error_mask(g); |
1061 | 1063 | ||
1062 | intr_0_en_mask |= fifo_intr_0_runlist_event_pending_f() | | 1064 | intr_0_en_mask |= fifo_intr_0_runlist_event_pending_f() | |
1063 | fifo_intr_0_pbdma_intr_pending_f(); | 1065 | fifo_intr_0_pbdma_intr_pending_f() | |
1066 | fifo_intr_0_ctxsw_timeout_pending_f(); | ||
1064 | 1067 | ||
1065 | return intr_0_en_mask; | 1068 | return intr_0_en_mask; |
1066 | } | 1069 | } |
@@ -1072,6 +1075,7 @@ int gv11b_init_fifo_reset_enable_hw(struct gk20a *g) | |||
1072 | u32 timeout; | 1075 | u32 timeout; |
1073 | unsigned int i; | 1076 | unsigned int i; |
1074 | u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA); | 1077 | u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA); |
1078 | struct gk20a_platform *platform = dev_get_drvdata(g->dev); | ||
1075 | 1079 | ||
1076 | gk20a_dbg_fn(""); | 1080 | gk20a_dbg_fn(""); |
1077 | 1081 | ||
@@ -1123,6 +1127,16 @@ int gv11b_init_fifo_reset_enable_hw(struct gk20a *g) | |||
1123 | gk20a_writel(g, pbdma_intr_en_1_r(i), intr_stall); | 1127 | gk20a_writel(g, pbdma_intr_en_1_r(i), intr_stall); |
1124 | } | 1128 | } |
1125 | 1129 | ||
1130 | /* clear ctxsw timeout interrupts */ | ||
1131 | gk20a_writel(g, fifo_intr_ctxsw_timeout_r(), ~0); | ||
1132 | |||
1133 | /* enable ctxsw timeout */ | ||
1134 | timeout = GRFIFO_TIMEOUT_CHECK_PERIOD_US; | ||
1135 | timeout = scale_ptimer(timeout, | ||
1136 | ptimer_scalingfactor10x(platform->ptimer_src_freq)); | ||
1137 | timeout |= fifo_eng_ctxsw_timeout_detection_enabled_f(); | ||
1138 | gk20a_writel(g, fifo_eng_ctxsw_timeout_r(), timeout); | ||
1139 | |||
1126 | /* clear runlist interrupts */ | 1140 | /* clear runlist interrupts */ |
1127 | gk20a_writel(g, fifo_intr_runlist_r(), ~0); | 1141 | gk20a_writel(g, fifo_intr_runlist_r(), ~0); |
1128 | 1142 | ||
@@ -1139,6 +1153,221 @@ int gv11b_init_fifo_reset_enable_hw(struct gk20a *g) | |||
1139 | return 0; | 1153 | return 0; |
1140 | } | 1154 | } |
1141 | 1155 | ||
1156 | static const char *const gv11b_sched_error_str[] = { | ||
1157 | "xxx-0", | ||
1158 | "xxx-1", | ||
1159 | "xxx-2", | ||
1160 | "xxx-3", | ||
1161 | "xxx-4", | ||
1162 | "engine_reset", | ||
1163 | "rl_ack_timeout", | ||
1164 | "rl_ack_extra", | ||
1165 | "rl_rdat_timeout", | ||
1166 | "rl_rdat_extra", | ||
1167 | "xxx-a", | ||
1168 | "xxx-b", | ||
1169 | "rl_req_timeout", | ||
1170 | "new_runlist", | ||
1171 | "code_config_while_busy", | ||
1172 | "xxx-f", | ||
1173 | "xxx-0x10", | ||
1174 | "xxx-0x11", | ||
1175 | "xxx-0x12", | ||
1176 | "xxx-0x13", | ||
1177 | "xxx-0x14", | ||
1178 | "xxx-0x15", | ||
1179 | "xxx-0x16", | ||
1180 | "xxx-0x17", | ||
1181 | "xxx-0x18", | ||
1182 | "xxx-0x19", | ||
1183 | "xxx-0x1a", | ||
1184 | "xxx-0x1b", | ||
1185 | "xxx-0x1c", | ||
1186 | "xxx-0x1d", | ||
1187 | "xxx-0x1e", | ||
1188 | "xxx-0x1f", | ||
1189 | "bad_tsg", | ||
1190 | }; | ||
1191 | |||
1192 | static bool gv11b_fifo_handle_sched_error(struct gk20a *g) | ||
1193 | { | ||
1194 | u32 sched_error; | ||
1195 | |||
1196 | sched_error = gk20a_readl(g, fifo_intr_sched_error_r()); | ||
1197 | |||
1198 | if (sched_error < ARRAY_SIZE(gv11b_sched_error_str)) | ||
1199 | nvgpu_err(g, "fifo sched error :%s", | ||
1200 | gv11b_sched_error_str[sched_error]); | ||
1201 | else | ||
1202 | nvgpu_err(g, "fifo sched error code not supported"); | ||
1203 | |||
1204 | if (sched_error == SCHED_ERROR_CODE_BAD_TSG ) { | ||
1205 | /* id is unknown, preempt all runlists and do recovery */ | ||
1206 | gk20a_fifo_recover(g, 0, 0, false, false, false); | ||
1207 | } | ||
1208 | |||
1209 | return false; | ||
1210 | } | ||
1211 | |||
1212 | static u32 gv11b_fifo_ctxsw_timeout_info(struct gk20a *g, u32 active_eng_id) | ||
1213 | { | ||
1214 | u32 tsgid = FIFO_INVAL_TSG_ID; | ||
1215 | u32 timeout_info; | ||
1216 | u32 ctx_status, info_status; | ||
1217 | |||
1218 | timeout_info = gk20a_readl(g, | ||
1219 | fifo_intr_ctxsw_timeout_info_r(active_eng_id)); | ||
1220 | |||
1221 | /* | ||
1222 | * ctxsw_state and tsgid are snapped at the point of the timeout and | ||
1223 | * will not change while the corresponding INTR_CTXSW_TIMEOUT_ENGINE bit | ||
1224 | * is PENDING. | ||
1225 | */ | ||
1226 | ctx_status = fifo_intr_ctxsw_timeout_info_ctxsw_state_v(timeout_info); | ||
1227 | if (ctx_status == | ||
1228 | fifo_intr_ctxsw_timeout_info_ctxsw_state_load_v()) { | ||
1229 | |||
1230 | tsgid = fifo_intr_ctxsw_timeout_info_next_tsgid_v(timeout_info); | ||
1231 | |||
1232 | } else if (ctx_status == | ||
1233 | fifo_intr_ctxsw_timeout_info_ctxsw_state_switch_v() || | ||
1234 | ctx_status == | ||
1235 | fifo_intr_ctxsw_timeout_info_ctxsw_state_save_v()) { | ||
1236 | |||
1237 | tsgid = fifo_intr_ctxsw_timeout_info_prev_tsgid_v(timeout_info); | ||
1238 | } | ||
1239 | gk20a_dbg_info("ctxsw timeout info: tsgid = %d", tsgid); | ||
1240 | |||
1241 | /* | ||
1242 | * STATUS indicates whether the context request ack was eventually | ||
1243 | * received and whether a subsequent request timed out. This field is | ||
1244 | * updated live while the corresponding INTR_CTXSW_TIMEOUT_ENGINE bit | ||
1245 | * is PENDING. STATUS starts in AWAITING_ACK, and progresses to | ||
1246 | * ACK_RECEIVED and finally ends with DROPPED_TIMEOUT. | ||
1247 | * | ||
1248 | * AWAITING_ACK - context request ack still not returned from engine. | ||
1249 | * ENG_WAS_RESET - The engine was reset via a PRI write to NV_PMC_ENABLE | ||
1250 | * or NV_PMC_ELPG_ENABLE prior to receiving the ack. Host will not | ||
1251 | * expect ctx ack to return, but if it is already in flight, STATUS will | ||
1252 | * transition shortly to ACK_RECEIVED unless the interrupt is cleared | ||
1253 | * first. Once the engine is reset, additional context switches can | ||
1254 | * occur; if one times out, STATUS will transition to DROPPED_TIMEOUT | ||
1255 | * if the interrupt isn't cleared first. | ||
1256 | * ACK_RECEIVED - The ack for the timed-out context request was | ||
1257 | * received between the point of the timeout and this register being | ||
1258 | * read. Note this STATUS can be reported during the load stage of the | ||
1259 | * same context switch that timed out if the timeout occurred during the | ||
1260 | * save half of a context switch. Additional context requests may have | ||
1261 | * completed or may be outstanding, but no further context timeout has | ||
1262 | * occurred. This simplifies checking for spurious context switch | ||
1263 | * timeouts. | ||
1264 | * DROPPED_TIMEOUT - The originally timed-out context request acked, | ||
1265 | * but a subsequent context request then timed out. | ||
1266 | * Information about the subsequent timeout is not stored; in fact, that | ||
1267 | * context request may also have already been acked by the time | ||
1268 | * SW reads this register. If not, there is a chance SW can get the | ||
1269 | * dropped information by clearing the corresponding | ||
1270 | * INTR_CTXSW_TIMEOUT_ENGINE bit and waiting for the timeout to occur | ||
1271 | * again. Note, however, that if the engine does time out again, | ||
1272 | * it may not be from the original request that caused the | ||
1273 | * DROPPED_TIMEOUT state, as that request may | ||
1274 | * be acked in the interim. | ||
1275 | */ | ||
1276 | info_status = fifo_intr_ctxsw_timeout_info_status_v(timeout_info); | ||
1277 | if (info_status == | ||
1278 | fifo_intr_ctxsw_timeout_info_status_awaiting_ack_v()) { | ||
1279 | |||
1280 | gk20a_dbg_info("ctxsw timeout info : awaiting ack"); | ||
1281 | |||
1282 | } else if (info_status == | ||
1283 | fifo_intr_ctxsw_timeout_info_status_eng_was_reset_v()) { | ||
1284 | |||
1285 | gk20a_dbg_info("ctxsw timeout info : eng was reset"); | ||
1286 | |||
1287 | } else if (info_status == | ||
1288 | fifo_intr_ctxsw_timeout_info_status_ack_received_v()) { | ||
1289 | |||
1290 | gk20a_dbg_info("ctxsw timeout info : ack received"); | ||
1291 | /* no need to recover */ | ||
1292 | tsgid = FIFO_INVAL_TSG_ID; | ||
1293 | |||
1294 | } else if (info_status == | ||
1295 | fifo_intr_ctxsw_timeout_info_status_dropped_timeout_v()) { | ||
1296 | |||
1297 | gk20a_dbg_info("ctxsw timeout info : dropped timeout"); | ||
1298 | /* no need to recover */ | ||
1299 | tsgid = FIFO_INVAL_TSG_ID; | ||
1300 | |||
1301 | } else { | ||
1302 | gk20a_dbg_info("ctxsw timeout info status = %u", info_status); | ||
1303 | } | ||
1304 | |||
1305 | return tsgid; | ||
1306 | } | ||
1307 | |||
1308 | static bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g, u32 fifo_intr) | ||
1309 | { | ||
1310 | bool ret = false; | ||
1311 | u32 tsgid = FIFO_INVAL_TSG_ID; | ||
1312 | u32 engine_id, active_eng_id; | ||
1313 | u32 timeout_val, ctxsw_timeout_engines; | ||
1314 | |||
1315 | |||
1316 | if (!(fifo_intr & fifo_intr_0_ctxsw_timeout_pending_f())) | ||
1317 | return ret; | ||
1318 | |||
1319 | /* get ctxsw timedout engines */ | ||
1320 | ctxsw_timeout_engines = gk20a_readl(g, fifo_intr_ctxsw_timeout_r()); | ||
1321 | if (ctxsw_timeout_engines == 0) { | ||
1322 | nvgpu_err(g, "no eng ctxsw timeout pending"); | ||
1323 | return ret; | ||
1324 | } | ||
1325 | |||
1326 | timeout_val = gk20a_readl(g, fifo_eng_ctxsw_timeout_r()); | ||
1327 | timeout_val = fifo_eng_ctxsw_timeout_period_v(timeout_val); | ||
1328 | |||
1329 | gk20a_dbg_info("eng ctxsw timeout period = 0x%x", timeout_val); | ||
1330 | |||
1331 | for (engine_id = 0; engine_id < g->fifo.num_engines; engine_id++) { | ||
1332 | active_eng_id = g->fifo.active_engines_list[engine_id]; | ||
1333 | |||
1334 | if (ctxsw_timeout_engines & | ||
1335 | fifo_intr_ctxsw_timeout_engine_pending_f( | ||
1336 | active_eng_id)) { | ||
1337 | |||
1338 | struct fifo_gk20a *f = &g->fifo; | ||
1339 | u32 ms = 0; | ||
1340 | bool verbose = false; | ||
1341 | |||
1342 | tsgid = gv11b_fifo_ctxsw_timeout_info(g, active_eng_id); | ||
1343 | |||
1344 | if (tsgid == FIFO_INVAL_TSG_ID) | ||
1345 | continue; | ||
1346 | |||
1347 | if (gk20a_fifo_check_tsg_ctxsw_timeout( | ||
1348 | &f->tsg[tsgid], &verbose, &ms)) { | ||
1349 | ret = true; | ||
1350 | nvgpu_err(g, | ||
1351 | "ctxsw timeout error:" | ||
1352 | "active engine id =%u, %s=%d, ms=%u", | ||
1353 | active_eng_id, "tsg", tsgid, ms); | ||
1354 | |||
1355 | /* Cancel all channels' timeout */ | ||
1356 | gk20a_channel_timeout_restart_all_channels(g); | ||
1357 | gk20a_fifo_recover(g, BIT(active_eng_id), tsgid, | ||
1358 | true, true, verbose); | ||
1359 | } else { | ||
1360 | gk20a_dbg_info( | ||
1361 | "fifo is waiting for ctx switch: " | ||
1362 | "for %d ms, %s=%d", ms, "tsg", tsgid); | ||
1363 | } | ||
1364 | } | ||
1365 | } | ||
1366 | /* clear interrupt */ | ||
1367 | gk20a_writel(g, fifo_intr_ctxsw_timeout_r(), ctxsw_timeout_engines); | ||
1368 | return ret; | ||
1369 | } | ||
1370 | |||
1142 | void gv11b_init_fifo(struct gpu_ops *gops) | 1371 | void gv11b_init_fifo(struct gpu_ops *gops) |
1143 | { | 1372 | { |
1144 | gp10b_init_fifo(gops); | 1373 | gp10b_init_fifo(gops); |
@@ -1169,4 +1398,6 @@ void gv11b_init_fifo(struct gpu_ops *gops) | |||
1169 | gops->fifo.init_pbdma_intr_descs = gv11b_fifo_init_pbdma_intr_descs; | 1398 | gops->fifo.init_pbdma_intr_descs = gv11b_fifo_init_pbdma_intr_descs; |
1170 | gops->fifo.reset_enable_hw = gv11b_init_fifo_reset_enable_hw; | 1399 | gops->fifo.reset_enable_hw = gv11b_init_fifo_reset_enable_hw; |
1171 | gops->fifo.teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg; | 1400 | gops->fifo.teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg; |
1401 | gops->fifo.handle_sched_error = gv11b_fifo_handle_sched_error; | ||
1402 | gops->fifo.handle_ctxsw_timeout = gv11b_fifo_handle_ctxsw_timeout; | ||
1172 | } | 1403 | } |
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h index 07a39da0..ab56b876 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.h | |||
@@ -18,6 +18,20 @@ | |||
18 | 18 | ||
19 | #define FIFO_INVAL_PBDMA_ID ((u32)~0) | 19 | #define FIFO_INVAL_PBDMA_ID ((u32)~0) |
20 | 20 | ||
21 | /* engine context-switch request occurred while the engine was in reset */ | ||
22 | #define SCHED_ERROR_CODE_ENGINE_RESET 0x00000005 | ||
23 | |||
24 | /* | ||
25 | * ERROR_CODE_BAD_TSG indicates that Host encountered a badly formed TSG header | ||
26 | * or a badly formed channel type runlist entry in the runlist. This is typically | ||
27 | * caused by encountering a new TSG entry in the middle of a TSG definition. | ||
28 | * A channel type entry having wrong runqueue selector can also cause this. | ||
29 | * Additionally this error code can indicate when a channel is encountered on | ||
30 | * the runlist which is outside of a TSG. | ||
31 | */ | ||
32 | #define SCHED_ERROR_CODE_BAD_TSG 0x00000020 | ||
33 | |||
34 | |||
21 | struct gpu_ops; | 35 | struct gpu_ops; |
22 | void gv11b_init_fifo(struct gpu_ops *gops); | 36 | void gv11b_init_fifo(struct gpu_ops *gops); |
23 | #endif | 37 | #endif |