From 457f176785af5c8821889d00d89db05bbaf8f772 Mon Sep 17 00:00:00 2001
From: Seema Khowala
Date: Wed, 8 Mar 2017 22:34:49 -0800
Subject: gpu: nvgpu: gv11b: init handle sched_error & ctxsw_timeout ops

- detect and decode the sched_error type. Any sched error whose name
  starts with xxx_* is not supported in h/w and should never be seen
  by s/w.
- for the bad_tsg sched error, preempt all runlists to recover, since
  the faulting ch/tsg is unknown. For other errors, just report the
  error.
- ctxsw timeout is not part of the sched_error fifo interrupt. A new
  fifo interrupt, ctxsw timeout, is added in gv11b. Add s/w handling
  for it.

Bug 1856152

JIRA GPUT19X-74

Change-Id: I474e1a3cda29a450691fe2ea1dc1e239ce57df1a
Signed-off-by: Seema Khowala
Reviewed-on: http://git-master/r/1317615
Reviewed-by: svccoveritychecker
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 237 ++++++++++++++++++++++++++++++++++-
 1 file changed, 234 insertions(+), 3 deletions(-)

(limited to 'drivers/gpu/nvgpu/gv11b/fifo_gv11b.c')

diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 3c1982fe..6883d867 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -15,13 +15,15 @@
 #include <linux/delay.h>
 #include <linux/types.h>
 
-#include "nvgpu/semaphore.h"
+#include <nvgpu/semaphore.h>
 #include <nvgpu/timers.h>
+#include <nvgpu/log.h>
 
 #include "gk20a/gk20a.h"
 #include "gk20a/fifo_gk20a.h"
 #include "gk20a/ctxsw_trace_gk20a.h"
+#include "gk20a/channel_gk20a.h"
 
 #include "gp10b/fifo_gp10b.h"
 
@@ -862,7 +864,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask,
 	gk20a_dbg_info("hw id =%d", id);
 	gk20a_dbg_info("id_type =%d", id_type);
 	gk20a_dbg_info("rc_type =%d", rc_type);
-	gk20a_dbg_info("mmu_fault =%p", mmfault);
+	gk20a_dbg_info("mmu_fault =0x%p", mmfault);
 
 	runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id,
 			id_type, rc_type, mmfault);
@@ -1060,7 +1062,8 @@ static u32 gv11b_fifo_intr_0_en_mask(struct gk20a *g)
 	intr_0_en_mask = g->ops.fifo.intr_0_error_mask(g);
 
 	intr_0_en_mask |= fifo_intr_0_runlist_event_pending_f() |
-			fifo_intr_0_pbdma_intr_pending_f();
+			fifo_intr_0_pbdma_intr_pending_f() |
+			fifo_intr_0_ctxsw_timeout_pending_f();
 
 	return intr_0_en_mask;
 }
@@ -1072,6 +1075,7 @@ int gv11b_init_fifo_reset_enable_hw(struct gk20a *g)
 	u32 timeout;
 	unsigned int i;
 	u32 host_num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
+	struct gk20a_platform *platform = dev_get_drvdata(g->dev);
 
 	gk20a_dbg_fn("");
 
@@ -1123,6 +1127,16 @@ int gv11b_init_fifo_reset_enable_hw(struct gk20a *g)
 		gk20a_writel(g, pbdma_intr_en_1_r(i), intr_stall);
 	}
 
+	/* clear ctxsw timeout interrupts */
+	gk20a_writel(g, fifo_intr_ctxsw_timeout_r(), ~0);
+
+	/* enable ctxsw timeout */
+	timeout = GRFIFO_TIMEOUT_CHECK_PERIOD_US;
+	timeout = scale_ptimer(timeout,
+			ptimer_scalingfactor10x(platform->ptimer_src_freq));
+	timeout |= fifo_eng_ctxsw_timeout_detection_enabled_f();
+	gk20a_writel(g, fifo_eng_ctxsw_timeout_r(), timeout);
+
 	/* clear runlist interrupts */
 	gk20a_writel(g, fifo_intr_runlist_r(), ~0);
 
@@ -1139,6 +1153,221 @@ int gv11b_init_fifo_reset_enable_hw(struct gk20a *g)
 	return 0;
 }
 
+static const char *const gv11b_sched_error_str[] = {
+	"xxx-0",
+	"xxx-1",
+	"xxx-2",
+	"xxx-3",
+	"xxx-4",
+	"engine_reset",
+	"rl_ack_timeout",
+	"rl_ack_extra",
+	"rl_rdat_timeout",
+	"rl_rdat_extra",
+	"xxx-a",
+	"xxx-b",
+	"rl_req_timeout",
+	"new_runlist",
+	"code_config_while_busy",
+	"xxx-f",
+	"xxx-0x10",
+	"xxx-0x11",
+	"xxx-0x12",
+	"xxx-0x13",
+	"xxx-0x14",
+	"xxx-0x15",
+	"xxx-0x16",
+	"xxx-0x17",
+	"xxx-0x18",
+	"xxx-0x19",
+	"xxx-0x1a",
+	"xxx-0x1b",
+	"xxx-0x1c",
+	"xxx-0x1d",
+	"xxx-0x1e",
+	"xxx-0x1f",
+	"bad_tsg",
+};
+
+static bool gv11b_fifo_handle_sched_error(struct gk20a *g)
+{
+	u32 sched_error;
+
+	sched_error = gk20a_readl(g, fifo_intr_sched_error_r());
+
+	if (sched_error < ARRAY_SIZE(gv11b_sched_error_str))
+		nvgpu_err(g, "fifo sched error: %s",
+			gv11b_sched_error_str[sched_error]);
+	else
+		nvgpu_err(g, "fifo sched error code not supported");
+
+	if (sched_error == SCHED_ERROR_CODE_BAD_TSG) {
+		/* id is unknown, preempt all runlists and do recovery */
+		gk20a_fifo_recover(g, 0, 0, false, false, false);
+	}
+
+	return false;
+}
+
+static u32 gv11b_fifo_ctxsw_timeout_info(struct gk20a *g, u32 active_eng_id)
+{
+	u32 tsgid = FIFO_INVAL_TSG_ID;
+	u32 timeout_info;
+	u32 ctx_status, info_status;
+
+	timeout_info = gk20a_readl(g,
+			fifo_intr_ctxsw_timeout_info_r(active_eng_id));
+
+	/*
+	 * ctxsw_state and tsgid are snapped at the point of the timeout and
+	 * will not change while the corresponding INTR_CTXSW_TIMEOUT_ENGINE
+	 * bit is PENDING.
+	 */
+	ctx_status = fifo_intr_ctxsw_timeout_info_ctxsw_state_v(timeout_info);
+	if (ctx_status ==
+		fifo_intr_ctxsw_timeout_info_ctxsw_state_load_v()) {
+
+		tsgid = fifo_intr_ctxsw_timeout_info_next_tsgid_v(timeout_info);
+
+	} else if (ctx_status ==
+		fifo_intr_ctxsw_timeout_info_ctxsw_state_switch_v() ||
+		ctx_status ==
+		fifo_intr_ctxsw_timeout_info_ctxsw_state_save_v()) {
+
+		tsgid = fifo_intr_ctxsw_timeout_info_prev_tsgid_v(timeout_info);
+	}
+	gk20a_dbg_info("ctxsw timeout info: tsgid = %d", tsgid);
+
+	/*
+	 * STATUS indicates whether the context request ack was eventually
+	 * received and whether a subsequent request timed out. This field is
+	 * updated live while the corresponding INTR_CTXSW_TIMEOUT_ENGINE bit
+	 * is PENDING. STATUS starts in AWAITING_ACK, and progresses to
+	 * ACK_RECEIVED and finally ends with DROPPED_TIMEOUT.
+	 *
+	 * AWAITING_ACK - context request ack still not returned from engine.
+	 * ENG_WAS_RESET - The engine was reset via a PRI write to NV_PMC_ENABLE
+	 * or NV_PMC_ELPG_ENABLE prior to receiving the ack. Host will not
+	 * expect the ctx ack to return, but if it is already in flight, STATUS
+	 * will transition shortly to ACK_RECEIVED unless the interrupt is
+	 * cleared first. Once the engine is reset, additional context switches
+	 * can occur; if one times out, STATUS will transition to
+	 * DROPPED_TIMEOUT if the interrupt isn't cleared first.
+	 * ACK_RECEIVED - The ack for the timed-out context request was
+	 * received between the point of the timeout and this register being
+	 * read. Note this STATUS can be reported during the load stage of the
+	 * same context switch that timed out if the timeout occurred during
+	 * the save half of a context switch. Additional context requests may
+	 * have completed or may be outstanding, but no further context timeout
+	 * has occurred. This simplifies checking for spurious context switch
+	 * timeouts.
+	 * DROPPED_TIMEOUT - The originally timed-out context request acked,
+	 * but a subsequent context request then timed out. Information about
+	 * the subsequent timeout is not stored; in fact, that context request
+	 * may also have already been acked by the time SW
+	 * reads this register. If not, there is a chance SW can get the
+	 * dropped information by clearing the corresponding
+	 * INTR_CTXSW_TIMEOUT_ENGINE bit and waiting for the timeout to occur
+	 * again. Note, however, that if the engine does time out again,
+	 * it may not be from the original request that caused the
+	 * DROPPED_TIMEOUT state, as that request may
+	 * be acked in the interim.
+	 */
+	info_status = fifo_intr_ctxsw_timeout_info_status_v(timeout_info);
+	if (info_status ==
+		fifo_intr_ctxsw_timeout_info_status_awaiting_ack_v()) {
+
+		gk20a_dbg_info("ctxsw timeout info: awaiting ack");
+
+	} else if (info_status ==
+		fifo_intr_ctxsw_timeout_info_status_eng_was_reset_v()) {
+
+		gk20a_dbg_info("ctxsw timeout info: eng was reset");
+
+	} else if (info_status ==
+		fifo_intr_ctxsw_timeout_info_status_ack_received_v()) {
+
+		gk20a_dbg_info("ctxsw timeout info: ack received");
+		/* no need to recover */
+		tsgid = FIFO_INVAL_TSG_ID;
+
+	} else if (info_status ==
+		fifo_intr_ctxsw_timeout_info_status_dropped_timeout_v()) {
+
+		gk20a_dbg_info("ctxsw timeout info: dropped timeout");
+		/* no need to recover */
+		tsgid = FIFO_INVAL_TSG_ID;
+
+	} else {
+		gk20a_dbg_info("ctxsw timeout info status = %u", info_status);
+	}
+
+	return tsgid;
+}
+
+static bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g, u32 fifo_intr)
+{
+	bool ret = false;
+	u32 tsgid = FIFO_INVAL_TSG_ID;
+	u32 engine_id, active_eng_id;
+	u32 timeout_val, ctxsw_timeout_engines;
+
+
+	if (!(fifo_intr & fifo_intr_0_ctxsw_timeout_pending_f()))
+		return ret;
+
+	/* get ctxsw timed-out engines */
+	ctxsw_timeout_engines = gk20a_readl(g, fifo_intr_ctxsw_timeout_r());
+	if (ctxsw_timeout_engines == 0) {
+		nvgpu_err(g, "no eng ctxsw timeout pending");
+		return ret;
+	}
+
+	timeout_val = gk20a_readl(g, fifo_eng_ctxsw_timeout_r());
+	timeout_val = fifo_eng_ctxsw_timeout_period_v(timeout_val);
+
+	gk20a_dbg_info("eng ctxsw timeout period = 0x%x", timeout_val);
+
+	for (engine_id = 0; engine_id < g->fifo.num_engines; engine_id++) {
+		active_eng_id = g->fifo.active_engines_list[engine_id];
+
+		if (ctxsw_timeout_engines &
+			fifo_intr_ctxsw_timeout_engine_pending_f(
+				active_eng_id)) {
+
+			struct fifo_gk20a *f = &g->fifo;
+			u32 ms = 0;
+			bool verbose = false;
+
+			tsgid = gv11b_fifo_ctxsw_timeout_info(g, active_eng_id);
+
+			if (tsgid == FIFO_INVAL_TSG_ID)
+				continue;
+
+			if (gk20a_fifo_check_tsg_ctxsw_timeout(
+				&f->tsg[tsgid], &verbose, &ms)) {
+				ret = true;
+				nvgpu_err(g,
+					"ctxsw timeout error: "
+					"active engine id =%u, %s=%d, ms=%u",
+					active_eng_id, "tsg", tsgid, ms);
+
+				/* Cancel all channels' timeout */
+				gk20a_channel_timeout_restart_all_channels(g);
+				gk20a_fifo_recover(g, BIT(active_eng_id), tsgid,
+						true, true, verbose);
+			} else {
+				gk20a_dbg_info(
+					"fifo is waiting for ctx switch: "
+					"for %d ms, %s=%d", ms, "tsg", tsgid);
+			}
+		}
+	}
+	/* clear interrupt */
+	gk20a_writel(g, fifo_intr_ctxsw_timeout_r(), ctxsw_timeout_engines);
+	return ret;
+}
+
 void gv11b_init_fifo(struct gpu_ops *gops)
 {
 	gp10b_init_fifo(gops);
@@ -1169,4 +1398,6 @@ void gv11b_init_fifo(struct gpu_ops *gops)
 	gops->fifo.init_pbdma_intr_descs = gv11b_fifo_init_pbdma_intr_descs;
 	gops->fifo.reset_enable_hw = gv11b_init_fifo_reset_enable_hw;
 	gops->fifo.teardown_ch_tsg = gv11b_fifo_teardown_ch_tsg;
+	gops->fifo.handle_sched_error = gv11b_fifo_handle_sched_error;
+	gops->fifo.handle_ctxsw_timeout = gv11b_fifo_handle_ctxsw_timeout;
 }
-- 
cgit v1.2.2
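For readers tracing the interrupt path: the two ops registered in gv11b_init_fifo() are invoked from the common fifo ISR when the corresponding fifo_intr_0_r() bits are pending. Below is a minimal, illustrative sketch of such a dispatch, not code from this commit: the surrounding function example_fifo_isr() is invented for illustration, and while fifo_intr_0_sched_error_pending_f() follows the hw-header naming convention used throughout the diff, only fifo_intr_0_ctxsw_timeout_pending_f() actually appears in this patch.

/*
 * Illustrative sketch only (not part of this commit): how a fifo ISR
 * might dispatch the two ops wired up in gv11b_init_fifo(). The op
 * signatures match the static handlers added above; example_fifo_isr()
 * and the sched-error pending accessor are assumptions.
 */
static u32 example_fifo_isr(struct gk20a *g)
{
	u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
	u32 handled = 0;

	if (fifo_intr & fifo_intr_0_sched_error_pending_f()) {
		/* decodes the error code; recovers all runlists for bad_tsg */
		g->ops.fifo.handle_sched_error(g);
		handled |= fifo_intr_0_sched_error_pending_f();
	}

	if (fifo_intr & fifo_intr_0_ctxsw_timeout_pending_f()) {
		/* reads per-engine timeout info, recovers the timed-out TSG */
		g->ops.fifo.handle_ctxsw_timeout(g, fifo_intr);
		handled |= fifo_intr_0_ctxsw_timeout_pending_f();
	}

	return handled;
}

Note the bad_tsg case inside gv11b_fifo_handle_sched_error(): because the faulting channel/TSG id is not reported by hardware, the handler calls gk20a_fifo_recover() with no specific engine mask or id, which, per the in-line comment, preempts all runlists rather than a single one.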