author     Seema Khowala <seemaj@nvidia.com>                    2017-05-15 19:33:41 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>  2017-07-12 10:44:49 -0400
commit     66fb130bfdf12175c117f36737503b1b5f33d42e (patch)
tree       d6a26dee697fc5e9f16e43328a448705a39ff5d4 /drivers/gpu/nvgpu/gv11b/fb_gv11b.c
parent     971c90e3b6b7f1e3f5dff67ccd701c99c1b0f7b5 (diff)
gpu: nvgpu: gv11b: recover from replay and ce mmu fault
Fix pte valid bit for replayable fault and ce fault

JIRA GPUT19X-12

Change-Id: I77a7a452d9b5b304f182e120e8d75959d46d4422
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1515538
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gv11b/fb_gv11b.c')
-rw-r--r--  drivers/gpu/nvgpu/gv11b/fb_gv11b.c | 218
1 file changed, 197 insertions(+), 21 deletions(-)
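
What the change does, in brief: instead of unconditionally tearing down the channel on a replayable or CE mmu fault, the handler now tries to repair the faulting PTE in place (set the valid bit, clear read-only), invalidates the TLB, and asks the MMU to replay the faulting request; only if the repair fails does it cancel the fault and fall back to channel recovery. The decision reduces to the condensed sketch below. It is illustrative only: gv11b_fb_fix_page_fault() and the fb_mmu_invalidate_replay_*_f() helpers are the real ones from the diff that follows, but this wrapper itself is not driver code.

/*
 * Condensed sketch of the replayable-fault decision this patch adds.
 * Illustrative only; see gv11b_fb_handle_mmu_fault_common() in the
 * diff below for the real logic.
 */
static void sketch_handle_replayable_fault(struct gk20a *g,
		struct mmu_fault_info *mmfault, u32 *invalidate_replay_val)
{
	if (gv11b_fb_fix_page_fault(g, mmfault) == 0) {
		/* pte repaired: have the MMU replay the faulting request */
		*invalidate_replay_val |=
			fb_mmu_invalidate_replay_start_ack_all_f();
	} else {
		/* repair failed: cancel so channel recovery can run */
		*invalidate_replay_val |=
			fb_mmu_invalidate_replay_cancel_global_f();
	}
	/* the accumulated value is written to fb_mmu_invalidate_r() by
	 * gv11b_fb_replay_or_cancel_faults() once the buffer is drained
	 */
}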
diff --git a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
index f9532d66..0ec6c9dd 100644
--- a/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fb_gv11b.c
@@ -17,9 +17,12 @@
 
 #include <nvgpu/dma.h>
 #include <nvgpu/log.h>
+#include <nvgpu/enabled.h>
+#include <nvgpu/gmmu.h>
 
 #include "gk20a/gk20a.h"
 #include "gk20a/kind_gk20a.h"
+#include "gk20a/mm_gk20a.h"
 
 #include "gp10b/fb_gp10b.h"
 
@@ -33,8 +36,12 @@
 #include <nvgpu/hw/gv11b/hw_fifo_gv11b.h>
 #include <nvgpu/hw/gv11b/hw_ram_gv11b.h>
 
-#include <nvgpu/log.h>
-#include <nvgpu/enabled.h>
+
+static int gv11b_fb_fix_page_fault(struct gk20a *g,
+		struct mmu_fault_info *mmfault);
+
+static int gv11b_fb_mmu_invalidate_replay(struct gk20a *g,
+		u32 invalidate_replay_val);
 
 static void gv11b_init_nvlink_soc_credits(struct gk20a *g)
 {
@@ -964,17 +971,43 @@ static void gv11b_fb_copy_from_hw_fault_buf(struct gk20a *g,
 }
 
 static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
-		struct mmu_fault_info *mmfault)
+		struct mmu_fault_info *mmfault, u32 *invalidate_replay_val)
 {
 	unsigned int id_type;
-	u32 act_eng_bitmask = 0;
+	u32 num_lce, act_eng_bitmask = 0;
+	int err = 0;
 
 	if (!mmfault->valid)
 		return;
 
 	gv11b_fb_print_fault_info(g, mmfault);
 
-	if (mmfault->fault_type == gmmu_fault_type_unbound_inst_block_v()) {
+	num_lce = gv11b_ce_get_num_lce(g);
+	if ((mmfault->mmu_engine_id >=
+			gmmu_fault_mmu_eng_id_ce0_v()) &&
+	    (mmfault->mmu_engine_id <
+			gmmu_fault_mmu_eng_id_ce0_v() + num_lce)) {
+		/* CE page faults are not reported as replayable */
+		nvgpu_log(g, gpu_dbg_intr, "CE Faulted");
+		err = gv11b_fb_fix_page_fault(g, mmfault);
+		gv11b_fifo_reset_pbdma_and_eng_faulted(g, mmfault->refch,
+				mmfault->faulted_pbdma, mmfault->faulted_engine);
+		if (!err) {
+			nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Fixed");
+			*invalidate_replay_val = 0;
+			/* refch in mmfault is assigned at the time of copying
+			 * fault info from snap reg or bar2 fault buf
+			 */
+			gk20a_channel_put(mmfault->refch);
+			return;
+		}
+		/* Do recovery. Channel recovery needs refch */
+		nvgpu_log(g, gpu_dbg_intr, "CE Page Fault Not Fixed");
+	}
+
+	if (!mmfault->replayable_fault) {
+		if (mmfault->fault_type ==
+				gmmu_fault_type_unbound_inst_block_v()) {
 		/*
 		 * Bug 1847172: When an engine faults due to an unbound
 		 * instance block, the fault cannot be isolated to a
@@ -983,20 +1016,56 @@ static void gv11b_fb_handle_mmu_fault_common(struct gk20a *g,
 		id_type = ID_TYPE_UNKNOWN;
 		nvgpu_log(g, gpu_dbg_intr, "UNBOUND INST BLOCK MMU FAULT");
 
 	} else if (mmfault->refch) {
 		if (gk20a_is_channel_marked_as_tsg(mmfault->refch))
 			id_type = ID_TYPE_TSG;
 		else
 			id_type = ID_TYPE_CHANNEL;
+		} else {
+			id_type = ID_TYPE_UNKNOWN;
+		}
+		if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
+			act_eng_bitmask = BIT(mmfault->faulted_engine);
+
+		g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask,
+			mmfault->chid, id_type, RC_TYPE_MMU_FAULT, mmfault);
 	} else {
-		id_type = ID_TYPE_UNKNOWN;
+		err = gv11b_fb_fix_page_fault(g, mmfault);
+		if (err) {
+			*invalidate_replay_val |=
+				fb_mmu_invalidate_replay_cancel_global_f();
+		} else {
+			*invalidate_replay_val |=
+				fb_mmu_invalidate_replay_start_ack_all_f();
+		}
+		/* refch in mmfault is assigned at the time of copying
+		 * fault info from snap reg or bar2 fault buf
+		 */
+		gk20a_channel_put(mmfault->refch);
 	}
+}
 
-	if (mmfault->faulted_engine != FIFO_INVAL_ENGINE_ID)
-		act_eng_bitmask = BIT(mmfault->faulted_engine);
+static void gv11b_fb_replay_or_cancel_faults(struct gk20a *g,
+		u32 invalidate_replay_val)
+{
+	int err = 0;
 
-	g->ops.fifo.teardown_ch_tsg(g, act_eng_bitmask, mmfault->chid,
-		id_type, RC_TYPE_MMU_FAULT, mmfault);
+	nvgpu_log_fn(g, " ");
+
+	if (invalidate_replay_val &
+			fb_mmu_invalidate_replay_cancel_global_f()) {
+		/*
+		 * cancel faults so that next time it faults as
+		 * replayable faults and channel recovery can be done
+		 */
+		err = gv11b_fb_mmu_invalidate_replay(g,
+			fb_mmu_invalidate_replay_cancel_global_f());
+	} else if (invalidate_replay_val &
+			fb_mmu_invalidate_replay_start_ack_all_f()) {
+		/* pte valid is fixed. replay faulting request */
+		err = gv11b_fb_mmu_invalidate_replay(g,
+			fb_mmu_invalidate_replay_start_ack_all_f());
+	}
 }
 
 static void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
@@ -1005,10 +1074,13 @@ static void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
 	u32 get_indx, offset, rd32_val, entries;
 	struct nvgpu_mem *mem;
 	struct mmu_fault_info *mmfault;
+	u32 invalidate_replay_val = 0;
+	u64 prev_fault_addr = 0;
+	u64 next_fault_addr = 0;
 
-	if (gv11b_fb_is_fault_buffer_empty(g, index,
-		&get_indx)) {
-		nvgpu_log(g, gpu_dbg_intr, "SPURIOUS fault");
+	if (gv11b_fb_is_fault_buffer_empty(g, index, &get_indx)) {
+		nvgpu_log(g, gpu_dbg_intr,
+			"SPURIOUS mmu fault: reg index:%d", index);
 		return;
 	}
 	nvgpu_log(g, gpu_dbg_intr, "get ptr = %d", get_indx);
@@ -1041,14 +1113,29 @@ static void gv11b_fb_handle_mmu_nonreplay_replay_fault(struct gk20a *g,
 
 		gv11b_fb_fault_buffer_get_ptr_update(g, index, get_indx);
 
-		gv11b_fb_handle_mmu_fault_common(g, mmfault);
-
 		offset = (get_indx * gmmu_fault_buf_size_v()) / sizeof(u32);
 		nvgpu_log(g, gpu_dbg_intr, "next word offset = 0x%x", offset);
 
 		rd32_val = nvgpu_mem_rd32(g, mem,
			 offset + gmmu_fault_buf_entry_valid_w());
+
+		if (index == REPLAY_REG_INDEX) {
+			prev_fault_addr = next_fault_addr;
+			next_fault_addr = mmfault->fault_addr;
+			if (prev_fault_addr == next_fault_addr) {
+				if (mmfault->refch)
+					gk20a_channel_put(mmfault->refch);
+				/* pte already fixed for this addr */
+				continue;
+			}
+		}
+
+		gv11b_fb_handle_mmu_fault_common(g, mmfault,
+				&invalidate_replay_val);
+
 	}
+	if (index == REPLAY_REG_INDEX && invalidate_replay_val)
+		gv11b_fb_replay_or_cancel_faults(g, invalidate_replay_val);
 }
 
 static void gv11b_mm_copy_from_fault_snap_reg(struct gk20a *g,
@@ -1228,6 +1315,7 @@ static void gv11b_fb_handle_other_fault_notify(struct gk20a *g,
 		u32 fault_status)
 {
 	struct mmu_fault_info *mmfault;
+	u32 invalidate_replay_val = 0;
 
 	mmfault = g->mm.fault_info[FAULT_TYPE_OTHER_AND_NONREPLAY];
 
@@ -1244,7 +1332,12 @@ static void gv11b_fb_handle_other_fault_notify(struct gk20a *g,
 		nvgpu_err(g, "PHYSICAL MMU FAULT");
 
 	} else {
-		gv11b_fb_handle_mmu_fault_common(g, mmfault);
+		gv11b_fb_handle_mmu_fault_common(g, mmfault,
+				&invalidate_replay_val);
+
+		if (invalidate_replay_val)
+			gv11b_fb_replay_or_cancel_faults(g,
+					invalidate_replay_val);
 	}
 }
 
@@ -1398,6 +1491,89 @@ bool gv11b_fb_mmu_fault_pending(struct gk20a *g)
 	return false;
 }
 
+static int gv11b_fb_mmu_invalidate_replay(struct gk20a *g,
+		u32 invalidate_replay_val)
+{
+	int err = -ETIMEDOUT;
+	u32 reg_val;
+	struct nvgpu_timeout timeout;
+
+	gk20a_dbg_fn("");
+
+	nvgpu_mutex_acquire(&g->mm.tlb_lock);
+
+	reg_val = gk20a_readl(g, fb_mmu_invalidate_r());
+
+	reg_val |= fb_mmu_invalidate_all_va_true_f() |
+		fb_mmu_invalidate_all_pdb_true_f() |
+		invalidate_replay_val |
+		fb_mmu_invalidate_trigger_true_f();
+
+	gk20a_writel(g, fb_mmu_invalidate_r(), reg_val);
+
+	/* retry 200 times */
+	nvgpu_timeout_init(g, &timeout, 200, NVGPU_TIMER_RETRY_TIMER);
+	do {
+		reg_val = gk20a_readl(g, fb_mmu_ctrl_r());
+		if (fb_mmu_ctrl_pri_fifo_empty_v(reg_val) !=
+			fb_mmu_ctrl_pri_fifo_empty_false_f()) {
+			err = 0;
+			break;
+		}
+		nvgpu_udelay(5);
+	} while (!nvgpu_timeout_expired_msg(&timeout,
+			"invalidate replay failed on 0x%llx"));
+	if (err)
+		nvgpu_err(g, "invalidate replay timedout");
+
+	nvgpu_mutex_release(&g->mm.tlb_lock);
+
+	return err;
+}
+
+static int gv11b_fb_fix_page_fault(struct gk20a *g,
+		struct mmu_fault_info *mmfault)
+{
+	int err = 0;
+	u32 pte[2];
+
+	if (mmfault->refch == NULL) {
+		nvgpu_log(g, gpu_dbg_intr, "refch from mmu_fault_info is NULL");
+		return -EINVAL;
+	}
+
+	err = __nvgpu_get_pte(g,
+			mmfault->refch->vm, mmfault->fault_addr, &pte[0]);
+	if (err) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte, "pte not found");
+		return err;
+	}
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte,
+			"pte: %#08x %#08x", pte[1], pte[0]);
+
+	pte[0] |= gmmu_new_pte_valid_true_f();
+	if (pte[0] & gmmu_new_pte_read_only_true_f())
+		pte[0] &= ~(gmmu_new_pte_read_only_true_f());
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte,
+			"new pte: %#08x %#08x", pte[1], pte[0]);
+
+	err = __nvgpu_set_pte(g,
+			mmfault->refch->vm, mmfault->fault_addr, &pte[0]);
+	if (err) {
+		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte, "pte not fixed");
+		return err;
+	}
+	/* invalidate tlb so that GMMU does not use old cached translation */
+	g->ops.fb.tlb_invalidate(g, mmfault->refch->vm->pdb.mem);
+
+	err = __nvgpu_get_pte(g,
+			mmfault->refch->vm, mmfault->fault_addr, &pte[0]);
+	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_pte,
+			"pte after tlb invalidate: %#08x %#08x",
+			pte[1], pte[0]);
+	return err;
+}
+
 void gv11b_init_fb(struct gpu_ops *gops)
 {
 	gp10b_init_fb(gops);
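
Design note on the new helpers: gv11b_fb_mmu_invalidate_replay() serializes on g->mm.tlb_lock, ORs the requested replay action into an all-VA, all-PDB invalidate trigger, then polls fb_mmu_ctrl_r() until the MMU pri FIFO reports empty, budgeting 200 retries of 5 us each. The wait reduces to the skeleton below; pri_fifo_empty() is a hypothetical stand-in for the fb_mmu_ctrl_pri_fifo_empty_v() check in the patch.

	/* skeleton of the poll-with-retry idiom used above;
	 * pri_fifo_empty() is a hypothetical stand-in for the
	 * fb_mmu_ctrl_pri_fifo_empty_v() test in the patch
	 */
	int err = -ETIMEDOUT;
	struct nvgpu_timeout timeout;

	/* retry budget: 200 polls, 5 us apart, as in the patch */
	nvgpu_timeout_init(g, &timeout, 200, NVGPU_TIMER_RETRY_TIMER);
	do {
		if (pri_fifo_empty(g)) {
			/* invalidate accepted by the MMU */
			err = 0;
			break;
		}
		nvgpu_udelay(5);
	} while (!nvgpu_timeout_expired(&timeout));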