diff options
author | Seema Khowala <seemaj@nvidia.com> | 2017-11-16 16:46:11 -0500 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-11-28 12:46:46 -0500 |
commit | 3fbb44d7576238d42635e2ca6501a17cdc7306f7 (patch) | |
tree | e0182e16b6dd13838503d06b111c266c2f1d1b52 /drivers/gpu/nvgpu | |
parent | 87f42744e0821d54eff7bf0bb863c7e53063e5c1 (diff) |
gpu: nvgpu: gv11b: channel/tsg recovery reorged
Context TSG teardown procedure:
1. Disable scheduling for the engine's runlist via PFIFO_SCHED_DISABLE.
This enables SW to determine whether a context has hung later in the
process: otherwise, ongoing work on the runlist may keep ENG_STATUS from
reaching a steady state.
2. Disable all channels in the TSG being torn down or submit a new runlist
that does not contain the TSG. This is to prevent the TSG from being
rescheduled once scheduling is re-enabled in step 6.
3.
a) Initiate a preempt of the TSG by writing NV_PFIFO_PREEMPT
with the TSG's ID and the TYPE set to TSG, if the TSG id is known;
otherwise do 3b.
b) Initiate a preempt of the engine by writing the bit associated with its
runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to begin the preempt
process prior to doing the slow register reads needed to determine
whether the context has hit any interrupts or is hung. Do not poll
NV_PFIFO_RUNLIST_PREEMPT for the preempt to complete.
4. Check for preempt done
5. If a reset is needed as determined by step 4:
a. Halt the memory interface for the engine (as per the relevant engine
procedure).
b. Reset the engine via PMC_ENABLE.
c. Take the engine out of reset and reinit the engine (as per the relevant
engine procedure)
6. Re-enable scheduling for the engine's runlist via PFIFO_SCHED_ENABLE.
Bug 200277163
Change-Id: I1e945a2c6b9845f365d6952109f6803309aa2270
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1599841
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- | drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | 111 |
1 files changed, 60 insertions, 51 deletions
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c index 0238ae6c..ae2b6cfc 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c | |||
@@ -660,7 +660,7 @@ static u32 gv11b_fifo_get_runlists_mask(struct gk20a *g, u32 act_eng_bitmask, | |||
660 | } | 660 | } |
661 | } | 661 | } |
662 | } | 662 | } |
663 | gk20a_dbg_info("runlists_mask = %08x", runlists_mask); | 663 | nvgpu_log(g, gpu_dbg_info, "runlists_mask = 0x%08x", runlists_mask); |
664 | return runlists_mask; | 664 | return runlists_mask; |
665 | } | 665 | } |
666 | 666 | ||
@@ -873,9 +873,14 @@ static int gv11b_fifo_preempt_runlists(struct gk20a *g, u32 runlists_mask) | |||
873 | nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token); | 873 | nvgpu_pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token); |
874 | 874 | ||
875 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { | 875 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { |
876 | if (runlists_mask & fifo_runlist_preempt_runlist_m(runlist_id)) | 876 | if (runlists_mask & |
877 | fifo_runlist_preempt_runlist_m(runlist_id)) { | ||
878 | /* during recovery reset engs served by this runlist */ | ||
879 | g->fifo.runlist_info[runlist_id].reset_eng_bitmask = | ||
880 | g->fifo.runlist_info[runlist_id].eng_bitmask; | ||
877 | nvgpu_mutex_release(&g->fifo. | 881 | nvgpu_mutex_release(&g->fifo. |
878 | runlist_info[runlist_id].mutex); | 882 | runlist_info[runlist_id].mutex); |
883 | } | ||
879 | } | 884 | } |
880 | 885 | ||
881 | return ret; | 886 | return ret; |
@@ -955,11 +960,9 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
955 | struct fifo_runlist_info_gk20a *runlist = NULL; | 960 | struct fifo_runlist_info_gk20a *runlist = NULL; |
956 | u32 engine_id, client_type = ~0; | 961 | u32 engine_id, client_type = ~0; |
957 | 962 | ||
958 | gk20a_dbg_info("active engine ids bitmask =0x%x", act_eng_bitmask); | 963 | nvgpu_log(g, gpu_dbg_info, "id = %d, id_type = %d, rc_type = %d, " |
959 | gk20a_dbg_info("hw id =%d", id); | 964 | "act_eng_bitmask = 0x%x, mmfault ptr = 0x%p", |
960 | gk20a_dbg_info("id_type =%d", id_type); | 965 | id, id_type, rc_type, act_eng_bitmask, mmfault); |
961 | gk20a_dbg_info("rc_type =%d", rc_type); | ||
962 | gk20a_dbg_info("mmu_fault =0x%p", mmfault); | ||
963 | 966 | ||
964 | runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id, | 967 | runlists_mask = gv11b_fifo_get_runlists_mask(g, act_eng_bitmask, id, |
965 | id_type, rc_type, mmfault); | 968 | id_type, rc_type, mmfault); |
@@ -986,25 +989,29 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
986 | 989 | ||
987 | gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN); | 990 | gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN); |
988 | 991 | ||
989 | if (rc_type == RC_TYPE_MMU_FAULT) | 992 | /* Get tsg/ch */ |
993 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
990 | gk20a_debug_dump(g); | 994 | gk20a_debug_dump(g); |
991 | |||
992 | /* get the channel/TSG */ | ||
993 | if (rc_type == RC_TYPE_MMU_FAULT && mmfault && mmfault->refch) { | ||
994 | refch = mmfault->refch; | 995 | refch = mmfault->refch; |
995 | client_type = mmfault->client_type; | 996 | client_type = mmfault->client_type; |
996 | if (gk20a_is_channel_marked_as_tsg(refch)) | ||
997 | tsg = &g->fifo.tsg[refch->tsgid]; | ||
998 | gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch, | 997 | gv11b_fifo_reset_pbdma_and_eng_faulted(g, refch, |
999 | mmfault->faulted_pbdma, | 998 | mmfault->faulted_pbdma, |
1000 | mmfault->faulted_engine); | 999 | mmfault->faulted_engine); |
1001 | } else { | 1000 | } |
1002 | if (id_type == ID_TYPE_TSG) | 1001 | |
1003 | tsg = &g->fifo.tsg[id]; | 1002 | if (id_type == ID_TYPE_TSG) { |
1004 | else if (id_type == ID_TYPE_CHANNEL) | 1003 | tsg = &g->fifo.tsg[id]; |
1004 | } else if (id_type == ID_TYPE_CHANNEL) { | ||
1005 | if (refch == NULL) | ||
1005 | refch = gk20a_channel_get(&g->fifo.channel[id]); | 1006 | refch = gk20a_channel_get(&g->fifo.channel[id]); |
1006 | } | 1007 | } |
1008 | /* Disable tsg/ch */ | ||
1009 | if (tsg) | ||
1010 | gk20a_disable_tsg(tsg); | ||
1011 | else if (refch) | ||
1012 | g->ops.fifo.disable_channel(refch); | ||
1007 | 1013 | ||
1014 | /* Preempt tsg/ch */ | ||
1008 | if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) { | 1015 | if (id_type == ID_TYPE_TSG || id_type == ID_TYPE_CHANNEL) { |
1009 | g->ops.fifo.preempt_ch_tsg(g, id, id_type, | 1016 | g->ops.fifo.preempt_ch_tsg(g, id, id_type, |
1010 | PREEMPT_TIMEOUT_NORC); | 1017 | PREEMPT_TIMEOUT_NORC); |
@@ -1012,35 +1019,6 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1012 | gv11b_fifo_preempt_runlists(g, runlists_mask); | 1019 | gv11b_fifo_preempt_runlists(g, runlists_mask); |
1013 | } | 1020 | } |
1014 | 1021 | ||
1015 | if (tsg) { | ||
1016 | if (!g->fifo.deferred_reset_pending) { | ||
1017 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1018 | gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); | ||
1019 | verbose = gk20a_fifo_error_tsg(g, tsg); | ||
1020 | } | ||
1021 | } | ||
1022 | gk20a_fifo_abort_tsg(g, tsg->tsgid, false); | ||
1023 | if (refch) | ||
1024 | gk20a_channel_put(refch); | ||
1025 | } else if (refch) { | ||
1026 | if (!g->fifo.deferred_reset_pending) { | ||
1027 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1028 | gk20a_fifo_set_ctx_mmu_error_ch(g, refch); | ||
1029 | verbose = gk20a_fifo_error_ch(g, refch); | ||
1030 | } | ||
1031 | } | ||
1032 | gk20a_channel_abort(refch, false); | ||
1033 | gk20a_channel_put(refch); | ||
1034 | } else { | ||
1035 | nvgpu_err(g, "id unknown, abort runlist"); | ||
1036 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; | ||
1037 | runlist_id++) { | ||
1038 | if (runlists_mask & BIT(runlist_id)) | ||
1039 | g->ops.fifo.update_runlist(g, runlist_id, | ||
1040 | FIFO_INVAL_CHANNEL_ID, false, true); | ||
1041 | } | ||
1042 | } | ||
1043 | |||
1044 | /* check if engine reset should be deferred */ | 1022 | /* check if engine reset should be deferred */ |
1045 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { | 1023 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; runlist_id++) { |
1046 | 1024 | ||
@@ -1051,7 +1029,8 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1051 | unsigned long __reset_eng_bitmask = | 1029 | unsigned long __reset_eng_bitmask = |
1052 | runlist->reset_eng_bitmask; | 1030 | runlist->reset_eng_bitmask; |
1053 | 1031 | ||
1054 | for_each_set_bit(engine_id, &__reset_eng_bitmask, 32) { | 1032 | for_each_set_bit(engine_id, &__reset_eng_bitmask, |
1033 | g->fifo.max_engines) { | ||
1055 | if ((refch || tsg) && | 1034 | if ((refch || tsg) && |
1056 | gk20a_fifo_should_defer_engine_reset(g, | 1035 | gk20a_fifo_should_defer_engine_reset(g, |
1057 | engine_id, client_type, false)) { | 1036 | engine_id, client_type, false)) { |
@@ -1061,7 +1040,7 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1061 | 1040 | ||
1062 | /* handled during channel free */ | 1041 | /* handled during channel free */ |
1063 | g->fifo.deferred_reset_pending = true; | 1042 | g->fifo.deferred_reset_pending = true; |
1064 | gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, | 1043 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, |
1065 | "sm debugger attached," | 1044 | "sm debugger attached," |
1066 | " deferring channel recovery to channel free"); | 1045 | " deferring channel recovery to channel free"); |
1067 | } else { | 1046 | } else { |
@@ -1084,12 +1063,42 @@ void gv11b_fifo_teardown_ch_tsg(struct gk20a *g, u32 act_eng_bitmask, | |||
1084 | } | 1063 | } |
1085 | 1064 | ||
1086 | #ifdef CONFIG_GK20A_CTXSW_TRACE | 1065 | #ifdef CONFIG_GK20A_CTXSW_TRACE |
1087 | if (refch) | 1066 | /* tsg and refch both could be valid for mmu fault. Check tsg first */ |
1088 | gk20a_ctxsw_trace_channel_reset(g, refch); | 1067 | if (tsg) |
1089 | else if (tsg) | ||
1090 | gk20a_ctxsw_trace_tsg_reset(g, tsg); | 1068 | gk20a_ctxsw_trace_tsg_reset(g, tsg); |
1069 | else if (refch) | ||
1070 | gk20a_ctxsw_trace_channel_reset(g, refch); | ||
1091 | #endif | 1071 | #endif |
1092 | 1072 | ||
1073 | if (tsg) { | ||
1074 | if (!g->fifo.deferred_reset_pending) { | ||
1075 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1076 | gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg); | ||
1077 | verbose = gk20a_fifo_error_tsg(g, tsg); | ||
1078 | } | ||
1079 | } | ||
1080 | gk20a_fifo_abort_tsg(g, tsg->tsgid, false); | ||
1081 | if (refch) | ||
1082 | gk20a_channel_put(refch); | ||
1083 | } else if (refch) { | ||
1084 | if (!g->fifo.deferred_reset_pending) { | ||
1085 | if (rc_type == RC_TYPE_MMU_FAULT) { | ||
1086 | gk20a_fifo_set_ctx_mmu_error_ch(g, refch); | ||
1087 | verbose = gk20a_fifo_error_ch(g, refch); | ||
1088 | } | ||
1089 | } | ||
1090 | gk20a_channel_abort(refch, false); | ||
1091 | gk20a_channel_put(refch); | ||
1092 | } else { | ||
1093 | nvgpu_err(g, "id unknown, abort runlist"); | ||
1094 | for (runlist_id = 0; runlist_id < g->fifo.max_runlists; | ||
1095 | runlist_id++) { | ||
1096 | if (runlists_mask & BIT(runlist_id)) | ||
1097 | g->ops.fifo.update_runlist(g, runlist_id, | ||
1098 | FIFO_INVAL_CHANNEL_ID, false, true); | ||
1099 | } | ||
1100 | } | ||
1101 | |||
1093 | gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED, | 1102 | gk20a_fifo_set_runlist_state(g, runlists_mask, RUNLIST_ENABLED, |
1094 | !RUNLIST_INFO_MUTEX_LOCKED); | 1103 | !RUNLIST_INFO_MUTEX_LOCKED); |
1095 | 1104 | ||