summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAparna Das <aparnad@nvidia.com>2018-06-04 22:40:56 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2018-06-15 20:47:06 -0400
commit98d996f4ffb0137d119b5849cae46d7b7e5693e1 (patch)
tree4e3243b91e8e559dd14b8bc30fc2468713182027
parent1f51620fda57443c77506c354af837a60883d78b (diff)
gpu: nvgpu: recover on first interrupt reported for mmu nack
In case of mmu nack error, the interrupt is received twice — through the SM-reported mmu nack interrupt and the mmu fault — in undetermined order. Recover on the first received interrupt to avoid semaphore release, and skip doing a second recovery. Also fix a NULL pointer dereference in function gv11b_fifo_reset_pbdma_and_eng_faulted when the channel reference is invalid in the teardown path. Bug 200382235 Change-Id: I361a5725d7b6355ebf02b2870727f647fbd7a37e Signed-off-by: Aparna Das <aparnad@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1739804 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.c2
-rw-r--r--drivers/gpu/nvgpu/gk20a/channel_gk20a.h2
-rw-r--r--drivers/gpu/nvgpu/gv11b/fb_gv11b.c25
-rw-r--r--drivers/gpu/nvgpu/gv11b/fifo_gv11b.c3
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.c35
5 files changed, 53 insertions, 14 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 551e8b04..4e6837ef 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -684,6 +684,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
684 ch->subctx_id = 0; 684 ch->subctx_id = 0;
685 ch->runqueue_sel = 0; 685 ch->runqueue_sel = 0;
686 686
687 ch->mmu_nack_handled = false;
688
687 /* The channel is *not* runnable at this point. It still needs to have 689 /* The channel is *not* runnable at this point. It still needs to have
688 * an address space bound and allocate a gpfifo and grctx. */ 690 * an address space bound and allocate a gpfifo and grctx. */
689 691
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index f95184be..aa37db62 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -295,6 +295,8 @@ struct channel_gk20a {
295 295
296 /* Any operating system specific data. */ 296 /* Any operating system specific data. */
297 void *os_priv; 297 void *os_priv;
298
299 bool mmu_nack_handled;
298}; 300};
299 301
300static inline struct channel_gk20a * 302static inline struct channel_gk20a *
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index ce8f5669..bba7e66c 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -935,12 +935,37 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
935 id = mmfault->chid; 935 id = mmfault->chid;
936 id_type = ID_TYPE_CHANNEL; 936 id_type = ID_TYPE_CHANNEL;
937 } 937 }
938 if (mmfault->refch->mmu_nack_handled) {
939 /* We have already recovered for the same
940 * context, skip doing another recovery.
941 */
942 mmfault->refch->mmu_nack_handled = false;
943 /*
944 * Recovery path can be entered twice for the
945 * same error in case of mmu nack. If mmu
946 * nack interrupt is handled before mmu fault
947 * then channel reference is increased to avoid
948 * closing the channel by userspace. Decrement
949 * channel reference.
950 */
951 gk20a_channel_put(mmfault->refch);
952 /* refch in mmfault is assigned at the time
953 * of copying fault info from snap reg or bar2
954 * fault buf.
955 */
956 gk20a_channel_put(mmfault->refch);
957 return;
958 }
938 } else { 959 } else {
939 id_type = ID_TYPE_UNKNOWN; 960 id_type = ID_TYPE_UNKNOWN;
940 } 961 }
941 if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID) 962 if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
942 act_eng_bitmask = BIT(mmfault->faulted_engine); 963 act_eng_bitmask = BIT(mmfault->faulted_engine);
943 964
965 /* Indicate recovery is handled if mmu fault is a result of
966 * mmu nack.
967 */
968 mmfault->refch->mmu_nack_handled = true;
944 g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask, 969 g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
945 id, id_type, RC_TYPE_MMU_FAULT, mmfault); 970 id, id_type, RC_TYPE_MMU_FAULT, mmfault);
946 } else { 971 } else {
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 84d63b91..9c64675f 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -596,6 +596,9 @@ void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
596 nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x", 596 nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
597 faulted_pbdma, faulted_engine); 597 faulted_pbdma, faulted_engine);
598 598
599 if (!refch)
600 return;
601
599 if (gk20a_is_channel_marked_as_tsg(refch)) { 602 if (gk20a_is_channel_marked_as_tsg(refch)) {
600 tsg = &g->fifo.tsg[refch->tsgid]; 603 tsg = &g->fifo.tsg[refch->tsgid];
601 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID) 604 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 694ff8ad..aed45ceb 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2093,23 +2093,30 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2093 u32 warp_esr_error, 2093 u32 warp_esr_error,
2094 struct channel_gk20a *fault_ch) 2094 struct channel_gk20a *fault_ch)
2095{ 2095{
2096 struct tsg_gk20a *tsg;
2097 u32 offset; 2096 u32 offset;
2097 int err = 0;
2098 2098
2099 fault_ch = gk20a_channel_get(fault_ch);
2099 if (fault_ch) { 2100 if (fault_ch) {
2100 tsg = &g->fifo.tsg[fault_ch->tsgid]; 2101 if (!fault_ch->mmu_nack_handled) {
2101 2102 /* recovery is not done for the channel implying mmu
2102 /* 2103 * nack interrupt is serviced before mmu fault. Force
2103 * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK 2104 * recovery by returning an error. Also indicate we
2104 * to SM. So MMU_FAULT handling path will take care of 2105 * should skip a second recovery.
2105 * triggering RC recovery 2106 */
2106 * 2107 fault_ch->mmu_nack_handled = true;
2107 * In MMU_NACK handling path, we just set the error notifier 2108 err = -EFAULT;
2108 * and clear the interrupt so that the User Space sees the error 2109 }
2109 * as soon as semaphores are released by SM
2110 */
2111 gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
2112 } 2110 }
2111 /* else mmu fault is serviced first and channel is closed */
2112
2113 /* do not release reference to ch as we do not want userspace to close
2114 * this channel on recovery. Otherwise mmu fault handler will enter
2115 * recovery path even if channel is invalid. We want to explicitly check
2116 * for teardown value in mmu fault handler.
2117 */
2118 if (!err)
2119 gk20a_channel_put(fault_ch);
2113 2120
2114 /* clear interrupt */ 2121 /* clear interrupt */
2115 offset = gk20a_gr_gpc_offset(g, gpc) + 2122 offset = gk20a_gr_gpc_offset(g, gpc) +
@@ -2122,7 +2129,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2122 "ESR %s(0x%x)", 2129 "ESR %s(0x%x)",
2123 "MMU NACK ERROR", 2130 "MMU NACK ERROR",
2124 warp_esr_error); 2131 warp_esr_error);
2125 return 0; 2132 return err;
2126} 2133}
2127 2134
2128static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error) 2135static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)