From 66fb130bfdf12175c117f36737503b1b5f33d42e Mon Sep 17 00:00:00 2001
From: Seema Khowala
Date: Mon, 15 May 2017 16:33:41 -0700
Subject: gpu: nvgpu: gv11b: recover from replay and ce mmu fault

Fix pte valid bit for replayable fault and ce fault

JIRA GPUT19X-12

Change-Id: I77a7a452d9b5b304f182e120e8d75959d46d4422
Signed-off-by: Seema Khowala
Reviewed-on: https://git-master.nvidia.com/r/1515538
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu
---
 drivers/gpu/nvgpu/gv11b/ce_gv11b.c |  13 ++-
 drivers/gpu/nvgpu/gv11b/ce_gv11b.h |   3 +-
 drivers/gpu/nvgpu/gv11b/fb_gv11b.c | 218 +++++++++++++++++++++++++++++++++----
 3 files changed, 210 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/nvgpu/gv11b/ce_gv11b.c b/drivers/gpu/nvgpu/gv11b/ce_gv11b.c
index af87f990..9716c6d6 100644
--- a/drivers/gpu/nvgpu/gv11b/ce_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/ce_gv11b.c
@@ -75,14 +75,23 @@ static void gv11b_ce_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
 	gp10b_ce_isr(g, inst_id, pri_base);
 }
 
-void gv11b_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g)
+u32 gv11b_ce_get_num_lce(struct gk20a *g)
 {
-	u32 reg_val, num_lce, lce, clear_intr;
+	u32 reg_val, num_lce;
 
 	reg_val = gk20a_readl(g, top_num_ces_r());
 	num_lce = top_num_ces_value_v(reg_val);
 	nvgpu_log_info(g, "num LCE: %d", num_lce);
+	return num_lce;
+}
+
+void gv11b_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g)
+{
+	u32 reg_val, num_lce, lce, clear_intr;
+
+	num_lce = gv11b_ce_get_num_lce(g);
+
 	for (lce = 0; lce < num_lce; lce++) {
 		reg_val = gk20a_readl(g, ce_intr_status_r(lce));
 		if (reg_val & ce_intr_status_mthd_buffer_fault_pending_f()) {
diff --git a/drivers/gpu/nvgpu/gv11b/ce_gv11b.h b/drivers/gpu/nvgpu/gv11b/ce_gv11b.h
index 09d18058..23053199 100644
--- a/drivers/gpu/nvgpu/gv11b/ce_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/ce_gv11b.h
@@ -2,7 +2,7 @@
  *
  * Volta GPU series copy engine
  *
- * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2016-2017, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -23,5 +23,6 @@ struct gpu_ops;
 
 void gv11b_init_ce(struct gpu_ops *gops);
 void gv11b_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g);
+u32 gv11b_ce_get_num_lce(struct gk20a *g);
 
 #endif /*__CE2_GV11B_H__*/
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index f9532d66..0ec6c9dd 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -17,9 +17,12 @@
 #include
 
 #include
+#include
+#include
 
 #include "gk20a/gk20a.h"
 #include "gk20a/kind_gk20a.h"
+#include "gk20a/mm_gk20a.h"
 
 #include "gp10b/fb_gp10b.h"
@@ -33,8 +36,12 @@
 #include
 #include
 
-#include
-#include
+
+static int gv11b_fb_fix_page_fault(struct gk20a *g,
+		struct mmu_fault_info *mmfault);
+
+static int gv11b_fb_mmu_invalidate_replay(struct gk20a *g,
+		u32 invalidate_replay_val);
 
 static void gv11b_init_nvlink_soc_credits(struct gk20a *g)
 {
@@ -964,17 +971,43 @@ static void gv11b_fb_copy_from_hw_fault_buf(struct gk20a *g,
 }
 
 static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
-		struct mmu_fault_info *mmfault)
+		struct mmu_fault_info *mmfault, u32 *invalidate_replay_val)
 {
 	unsigned int id_type;
-	u32 act_eng_bitmask = 0;
+	u32 num_lce, act_eng_bitmask = 0;
+	int err = 0;
 
 	if (!mmfault->valid)
 		return;
 
 	gv11b_fb_print_fault_info(g, mmfault);
 
-	if (mmfault->fault_type == gmmu_fault_type_unbound_inst_block_v()) {
+	num_lce = gv11b_ce_get_num_lce(g);
+	if ((mmfault->mmu_engine_id >=
+			gmmu_fault_mmu_eng_id_ce0_v()) &&
+	    (mmfault->mmu_engine_id <
+			gmmu_fault_mmu_eng_id_ce0_v() + num_lce)) {
+		/* CE page faults are not reported as replayable */
+		nvgpu_log(g, gpu_dbg_intr, "CE Faulted");
+		err = gv11b_fb_fix_page_fault(g, mmfault);
+		gv11b_fifo_reset_pbdma_and_eng_faulted(g, mmfault->refch,
+			mmfault->faulted_pbdma, mmfault->faulted_engine);
+		if (!err) {
+			nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Fixed");
+			*invalidate_replay_val = 0;
+			/* refch in mmfault is assigned at the time of copying
+			 * fault info from snap reg or bar2 fault buf
+			 */
+			gk20a_channel_put(mmfault->refch);
+			return;
+		}
+		/* Do recovery. Channel recovery needs refch */
+		nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Not Fixed");
+	}
+
+	if (!mmfault->replayable_fault) {
+		if (mmfault->fault_type ==
+				gmmu_fault_type_unbound_inst_block_v()) {
 		/*
 		 * Bug 1847172: When an engine faults due to an unbound
 		 * instance block, the fault cannot be isolated to a
@@ -983,20 +1016,56 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		id_type = ID_TYPE_UNKNOWN;
 		nvgpu_log(g, gpu_dbg_intr, "UNBOUND INST BLOCK MMU FAULT");
 
-	} else if (mmfault->refch) {
-		if (gk20a_is_channel_marked_as_tsg(mmfault->refch))
-			id_type = ID_TYPE_TSG;
-		else
-			id_type = ID_TYPE_CHANNEL;
+		} else if (mmfault->refch) {
+			if (gk20a_is_channel_marked_as_tsg(mmfault->refch))
+				id_type = ID_TYPE_TSG;
+			else
+				id_type = ID_TYPE_CHANNEL;
+		} else {
+			id_type = ID_TYPE_UNKNOWN;
+		}
+		if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
+			act_eng_bitmask = BIT(mmfault->faulted_engine);
+
+		g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
+			mmfault->chid, id_type, RC_TYPE_MMU_FAULT, mmfault);
 	} else {
-		id_type = ID_TYPE_UNKNOWN;
+		err = gv11b_fb_fix_page_fault(g, mmfault);
+		if (err) {
+			*invalidate_replay_val |=
+				fb_mmu_invalidate_replay_cancel_global_f();
+		} else {
+			*invalidate_replay_val |=
+				fb_mmu_invalidate_replay_start_ack_all_f();
+		}
+		/* refch in mmfault is assigned at the time of copying
+		 * fault info from snap reg or bar2 fault buf
+		 */
+		gk20a_channel_put(mmfault->refch);
 	}
+}
 
-	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
-		act_eng_bitmask = BIT(mmfault->faulted_engine);
+static void gv11b_fb_replay_or_cancel_faults(struct gk20a *g,
+		u32 invalidate_replay_val)
+{
+	int err = 0;
 
-	g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask, mmfault->chid,
-			id_type, RC_TYPE_MMU_FAULT, mmfault);
+	nvgpu_log_fn(g, " ");
+
+	if (invalidate_replay_val &
+			fb_mmu_invalidate_replay_cancel_global_f()) {
+		/*
+		 * cancel faults so that next time it faults as
+		 * replayable faults and channel recovery can be done
+		 */
+		err = gv11b_fb_mmu_invalidate_replay(g,
+			fb_mmu_invalidate_replay_cancel_global_f());
+	} else if (invalidate_replay_val &
+			fb_mmu_invalidate_replay_start_ack_all_f()) {
+		/* pte valid is fixed. replay faulting request */
+		err = gv11b_fb_mmu_invalidate_replay(g,
+			fb_mmu_invalidate_replay_start_ack_all_f());
+	}
 }
 
 static void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
@@ -1005,10 +1074,13 @@ static void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
 	u32 get_indx, offset, rd32_val, entries;
 	struct nvgpu_mem *mem;
 	struct mmu_fault_info *mmfault;
+	u32 invalidate_replay_val = 0;
+	u64 prev_fault_addr = 0;
+	u64 next_fault_addr = 0;
 
-	if (gv11b_fb_is_fault_buffer_empty(g, index,
-		&get_indx)) {
-		nvgpu_log(g, gpu_dbg_intr, "SPURIOUS fault");
+	if (gv11b_fb_is_fault_buffer_empty(g, index, &get_indx)) {
+		nvgpu_log(g, gpu_dbg_intr,
+			"SPURIOUS mmu fault: reg index:%d", index);
 		return;
 	}
 	nvgpu_log(g, gpu_dbg_intr, "get ptr = %d", get_indx);
@@ -1041,14 +1113,29 @@ static void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
 
 		gv11b_fb_fault_buffer_get_ptr_update(g, index, get_indx);
 
-		gv11b_fb_handle_mmu_fault_common(g, mmfault);
-
 		offset = (get_indx * gmmu_fault_buf_size_v()) / sizeof(u32);
 		nvgpu_log(g, gpu_dbg_intr, "next word offset = 0x%x", offset);
 
 		rd32_val = nvgpu_mem_rd32(g, mem,
			offset + gmmu_fault_buf_entry_valid_w());
+
+		if (index == REPLAY_REG_INDEX) {
+			prev_fault_addr = next_fault_addr;
+			next_fault_addr = mmfault->fault_addr;
+			if (prev_fault_addr == next_fault_addr) {
+				if (mmfault->refch)
+					gk20a_channel_put(mmfault->refch);
+				/* pte already fixed for this addr */
+				continue;
+			}
+		}
+
+		gv11b_fb_handle_mmu_fault_common(g, mmfault,
+				&invalidate_replay_val);
+	}
+	if (index == REPLAY_REG_INDEX && invalidate_replay_val)
+		gv11b_fb_replay_or_cancel_faults(g, invalidate_replay_val);
 }
 
 static void gv11b_mm_copy_from_fault_snap_reg(struct gk20a *g,
@@ -1228,6 +1315,7 @@ static void gv11b_fb_handle_other_fault_notify(struct gk20a *g,
 			u32 fault_status)
 {
 	struct mmu_fault_info *mmfault;
+	u32 invalidate_replay_val = 0;
 
 	mmfault = g->mm.fault_info[FAULT_TYPE_OTHER_AND_NONREPLAY];
 
@@ -1244,7 +1332,12 @@ static void gv11b_fb_handle_other_fault_notify(struct gk20a *g,
 
 		nvgpu_err(g, "PHYSICAL MMU FAULT");
 
 	} else {
-		gv11b_fb_handle_mmu_fault_common(g, mmfault);
+		gv11b_fb_handle_mmu_fault_common(g, mmfault,
+				&invalidate_replay_val);
+
+		if (invalidate_replay_val)
+			gv11b_fb_replay_or_cancel_faults(g,
+					invalidate_replay_val);
 	}
 }
 
@@ -1398,6 +1491,89 @@ bool gv11b_fb_mmu_fault_pending(struct gk20a *g)
 	return false;
 }
 
+static int gv11b_fb_mmu_invalidate_replay(struct gk20a *g,
+	 u32 invalidate_replay_val)
+{
+	int err = -ETIMEDOUT;
+	u32 reg_val;
+	struct nvgpu_timeout timeout;
+
+	gk20a_dbg_fn("");
+
+	nvgpu_mutex_acquire(&g->mm.tlb_lock);
+
+	reg_val = gk20a_readl(g, fb_mmu_invalidate_r());
+
+	reg_val |= fb_mmu_invalidate_all_va_true_f() |
+		fb_mmu_invalidate_all_pdb_true_f() |
+		invalidate_replay_val |
+		fb_mmu_invalidate_trigger_true_f();
+
+	gk20a_writel(g, fb_mmu_invalidate_r(), reg_val);
+
+	/* retry 200 times */
+	nvgpu_timeout_init(g, &timeout, 200, NVGPU_TIMER_RETRY_TIMER);
+	do {
+		reg_val = gk20a_readl(g, fb_mmu_ctrl_r());
+		if (fb_mmu_ctrl_pri_fifo_empty_v(reg_val) !=
+			fb_mmu_ctrl_pri_fifo_empty_false_f()) {
+			err = 0;
+			break;
+		}
+		nvgpu_udelay(5);
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+			"invalidate replay failed on 0x%llx"));
+	if (err)
+		nvgpu_err(g, "invalidate replay timedout");
+
+	nvgpu_mutex_release(&g->mm.tlb_lock);
+
+	return err;
+}
+
+static int gv11b_fb_fix_page_fault(struct gk20a *g,
+		struct mmu_fault_info *mmfault)
+{
+	int err = 0;
+	u32 pte[2];
+
+	if (mmfault->refch == NULL) {
+		nvgpu_log(g, gpu_dbg_intr, "refch from mmu_fault_info is NULL");
+		return -EINVAL;
+	}
+
+	err = __nvgpu_get_pte(g,
+			mmfault->refch->vm, mmfault->fault_addr, &pte[0]);
+	if (err) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte, "pte not found");
+		return err;
+	}
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte,
+			"pte: %#08x %#08x", pte[1], pte[0]);
+
+	pte[0] |= gmmu_new_pte_valid_true_f();
+	if (pte[0] & gmmu_new_pte_read_only_true_f())
+		pte[0] &= ~(gmmu_new_pte_read_only_true_f());
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte,
+			"new pte: %#08x %#08x", pte[1], pte[0]);
+
+	err = __nvgpu_set_pte(g,
+			mmfault->refch->vm, mmfault->fault_addr, &pte[0]);
+	if (err) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte, "pte not fixed");
+		return err;
+	}
+	/* invalidate tlb so that GMMU does not use old cached translation */
+	g->ops.fb.tlb_invalidate(g, mmfault->refch->vm->pdb.mem);
+
+	err = __nvgpu_get_pte(g,
+			mmfault->refch->vm, mmfault->fault_addr, &pte[0]);
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte,
+			"pte after tlb invalidate: %#08x %#08x",
+			pte[1], pte[0]);
+	return err;
+}
+
 void gv11b_init_fb(struct gpu_ops *gops)
 {
 	gp10b_init_fb(gops);
-- 
cgit v1.2.2