 drivers/gpu/nvgpu/gk20a/channel_gk20a.c |  2 ++
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h |  2 ++
 drivers/gpu/nvgpu/gv11b/fb_gv11b.c      | 25 +++++++++++++++++++++++++
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c    |  3 +++
 drivers/gpu/nvgpu/gv11b/gr_gv11b.c      | 35 +++++++++++++++++++++--------------
 5 files changed, 53 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 551e8b04..4e6837ef 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -684,6 +684,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
         ch->subctx_id = 0;
         ch->runqueue_sel = 0;
 
+        ch->mmu_nack_handled = false;
+
         /* The channel is *not* runnable at this point. It still needs to have
          * an address space bound and allocate a gpfifo and grctx. */
 
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index f95184be..aa37db62 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -295,6 +295,8 @@ struct channel_gk20a {
 
         /* Any operating system specific data. */
         void *os_priv;
+
+        bool mmu_nack_handled;
 };
 
 static inline struct channel_gk20a *
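
Background for the new flag: an MMU NACK delivered to the SM and the corresponding MMU fault report describe the same underlying error, and the two interrupts can be serviced in either order, so each handler needs to know whether the other has already run recovery. Below is a minimal sketch of the handshake the flag enables; struct channel is a mock standing in for struct channel_gk20a (only mmu_nack_handled is a real field), the handlers are hypothetical simplifications of the real ones, and reference counting is left out (see the notes after the fb_gv11b.c hunk).

#include <stdbool.h>
#include <stdio.h>

/* Mock of struct channel_gk20a; only mmu_nack_handled is real. */
struct channel {
        bool mmu_nack_handled;
};

/* Simplified nack path: claim the recovery only if the fault path
 * has not already done it. */
static void nack_handler(struct channel *ch)
{
        if (!ch->mmu_nack_handled) {
                ch->mmu_nack_handled = true;
                printf("nack: force recovery\n");
        } else {
                printf("nack: fault path already recovered, skip\n");
        }
}

/* Simplified fault path: consume a claim left by the nack path,
 * otherwise recover and leave a claim for a possible late nack. */
static void fault_handler(struct channel *ch)
{
        if (ch->mmu_nack_handled) {
                ch->mmu_nack_handled = false;
                printf("fault: already recovered, skip\n");
                return;
        }
        ch->mmu_nack_handled = true;
        printf("fault: run teardown\n");
}

int main(void)
{
        struct channel a = { false }, b = { false };

        /* Ordering 1: nack interrupt serviced before the mmu fault. */
        nack_handler(&a);
        fault_handler(&a);

        /* Ordering 2: mmu fault serviced first. */
        fault_handler(&b);
        nack_handler(&b);
        return 0;
}

Either ordering ends with exactly one recovery: nack-first leaves the flag set for the fault handler to consume, and fault-first leaves it set so a late nack handler skips.
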
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index ce8f5669..bba7e66c 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -935,12 +935,37 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
                         id = mmfault->chid;
                         id_type = ID_TYPE_CHANNEL;
                 }
+                if (mmfault->refch->mmu_nack_handled) {
+                        /* We have already recovered for the same
+                         * context, skip doing another recovery.
+                         */
+                        mmfault->refch->mmu_nack_handled = false;
+                        /*
+                         * Recovery path can be entered twice for the
+                         * same error in case of mmu nack. If mmu
+                         * nack interrupt is handled before mmu fault
+                         * then channel reference is increased to avoid
+                         * closing the channel by userspace. Decrement
+                         * channel reference.
+                         */
+                        gk20a_channel_put(mmfault->refch);
+                        /* refch in mmfault is assigned at the time
+                         * of copying fault info from snap reg or bar2
+                         * fault buf.
+                         */
+                        gk20a_channel_put(mmfault->refch);
+                        return;
+                }
         } else {
                 id_type = ID_TYPE_UNKNOWN;
         }
         if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
                 act_eng_bitmask = BIT(mmfault->faulted_engine);
 
+        /* Indicate recovery is handled if mmu fault is a result of
+         * mmu nack.
+         */
+        mmfault->refch->mmu_nack_handled = true;
         g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
                         id, id_type, RC_TYPE_MMU_FAULT, mmfault);
 } else {
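
The two gk20a_channel_put() calls in the skip path above are deliberate: they drop two distinct references, one taken by the nack handler (see the gr_gv11b.c hunk below) so userspace cannot close the channel mid-recovery, and one taken when the fault information was copied out of the snap registers or the BAR2 fault buffer. A minimal sketch of that balance, with a mock refcount standing in for the real channel reference machinery:

#include <assert.h>
#include <stdbool.h>

/* Mock channel; channel_get()/channel_put() stand in for
 * gk20a_channel_get()/gk20a_channel_put(). */
struct channel {
        int refs;
        bool mmu_nack_handled;
};

static void channel_get(struct channel *ch) { ch->refs++; }
static void channel_put(struct channel *ch) { ch->refs--; }

int main(void)
{
        struct channel ch = { .refs = 1 };      /* userspace's reference */

        /* Nack serviced first: it takes a reference and sets the flag. */
        channel_get(&ch);
        ch.mmu_nack_handled = true;

        /* Fault path copies fault info from snap reg or the bar2 fault
         * buffer, taking its own reference on the faulting channel. */
        channel_get(&ch);

        /* Skip path of the fault handler: both extra references must be
         * dropped, hence the two gk20a_channel_put() calls above. */
        if (ch.mmu_nack_handled) {
                ch.mmu_nack_handled = false;
                channel_put(&ch);               /* nack handler's ref */
                channel_put(&ch);               /* fault-info ref */
        }

        assert(ch.refs == 1);   /* only userspace's reference remains */
        return 0;
}

If only one put ran in the skip path, the channel would leak a reference and never be freed after the error.
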
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 84d63b91..9c64675f 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -596,6 +596,9 @@ void gv11b_fifo_reset_pbdma_and_eng_faulted(struct gk20a *g,
         nvgpu_log(g, gpu_dbg_intr, "reset faulted pbdma:0x%x eng:0x%x",
                         faulted_pbdma, faulted_engine);
 
+        if (!refch)
+                return;
+
         if (gk20a_is_channel_marked_as_tsg(refch)) {
                 tsg = &g->fifo.tsg[refch->tsgid];
                 if (faulted_pbdma != FIFO_INVAL_PBDMA_ID)
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 694ff8ad..aed45ceb 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2093,23 +2093,30 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
                 u32 warp_esr_error,
                 struct channel_gk20a *fault_ch)
 {
-        struct tsg_gk20a *tsg;
         u32 offset;
+        int err = 0;
 
+        fault_ch = gk20a_channel_get(fault_ch);
         if (fault_ch) {
-                tsg = &g->fifo.tsg[fault_ch->tsgid];
-
-                /*
-                 * Upon receiving MMU_FAULT error, MMU will forward MMU_NACK
-                 * to SM. So MMU_FAULT handling path will take care of
-                 * triggering RC recovery
-                 *
-                 * In MMU_NACK handling path, we just set the error notifier
-                 * and clear the interrupt so that the User Space sees the error
-                 * as soon as semaphores are released by SM
-                 */
-                gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+                if (!fault_ch->mmu_nack_handled) {
+                        /* recovery is not done for the channel implying mmu
+                         * nack interrupt is serviced before mmu fault. Force
+                         * recovery by returning an error. Also indicate we
+                         * should skip a second recovery.
+                         */
+                        fault_ch->mmu_nack_handled = true;
+                        err = -EFAULT;
+                }
         }
+        /* else mmu fault is serviced first and channel is closed */
+
+        /* do not release reference to ch as we do not want userspace to close
+         * this channel on recovery. Otherwise mmu fault handler will enter
+         * recovery path even if channel is invalid. We want to explicitly check
+         * for teardown value in mmu fault handler.
+         */
+        if (!err)
+                gk20a_channel_put(fault_ch);
 
         /* clear interrupt */
         offset = gk20a_gr_gpc_offset(g, gpc) +
@@ -2122,7 +2129,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2122 "ESR %s(0x%x)", 2129 "ESR %s(0x%x)",
2123 "MMU NACK ERROR", 2130 "MMU NACK ERROR",
2124 warp_esr_error); 2131 warp_esr_error);
2125 return 0; 2132 return err;
2126} 2133}
2127 2134
2128static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error) 2135static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
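
The changed return value is what drives the forced recovery: the warp ESR path that calls gr_gv11b_handle_warp_esr_error_mmu_nack() treats a nonzero return as a fatal error and kicks off RC recovery, which later re-enters gv11b_fb_handle_mmu_fault_common(), where the mmu_nack_handled check short-circuits the second pass. A compact sketch of that contract; the caller and the mock channel type are hypothetical, while the flag, -EFAULT, and the conditional gk20a_channel_put() mirror the hunk above:

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct channel {
        int refs;
        bool mmu_nack_handled;
};

/* Simplified model of gr_gv11b_handle_warp_esr_error_mmu_nack(). */
static int handle_mmu_nack(struct channel *ch)
{
        int err = 0;

        if (ch == NULL)         /* gk20a_channel_get() failed: the mmu
                                 * fault was serviced first and the
                                 * channel is already closed */
                return 0;

        ch->refs++;             /* gk20a_channel_get() */
        if (!ch->mmu_nack_handled) {
                ch->mmu_nack_handled = true;    /* skip second recovery */
                err = -EFAULT;                  /* force recovery in caller */
        }
        if (!err)
                ch->refs--;     /* gk20a_channel_put(): nothing to recover */
        /* on error the reference is kept; the fault handler's skip
         * path drops it later */
        return err;
}

int main(void)
{
        struct channel ch = { .refs = 1, .mmu_nack_handled = false };

        /* Hypothetical caller, standing in for the warp ESR handler. */
        if (handle_mmu_nack(&ch) != 0)
                printf("nonzero return: trigger RC recovery (refs=%d)\n",
                                ch.refs);
        return 0;
}

Keeping the reference on the error path is the key design choice: the channel must stay valid until the mmu fault handler's skip path runs its two gk20a_channel_put() calls, even if userspace tries to close it in the meantime.
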