summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Deepak Nibade <dnibade@nvidia.com>, 2018-04-02 09:40:42 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com>, 2018-04-04 14:49:46 -0400
commit: 89e0745fa024891b988508c3baa20c453230a80b (patch)
tree: b59efd3831570c624c9f8b611abb81489346862d
parent: a108d3f0368c72f2d553cac1470531677b5a7b88 (diff)
gpu: nvgpu: handle misaligned_addr SM exception
Right now we do not handle the misaligned_addr SM exception explicitly, and hence we incorrectly initiate CILP on this exception.

Handle this exception explicitly in this sequence:
- set error notifier first
- clear the interrupt
- return error from gr_gv11b_handle_warp_esr_error_misaligned_addr() so that RC recovery is triggered by gk20a_gr_isr()

Ensure that the error value is propagated back to gk20a_gr_isr() correctly.

Use nvgpu_set_error_notifier_if_empty() to set the error notifier, since this will prevent overwriting of the error notifier value in case gk20a_gr_isr() also tries to write to some error notifier value.

Bug 200388475
Jira NVGPU-554

Change-Id: I84c4d202a8068e738567ccd344e05d9d5f6ad2f0
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1686781
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 14
-rw-r--r-- drivers/gpu/nvgpu/gv100/hal_gv100.c | 2
-rw-r--r-- drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 40
-rw-r--r-- drivers/gpu/nvgpu/gv11b/hal_gv11b.c | 2
-rw-r--r-- drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h | 4
-rw-r--r-- drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h | 4
6 files changed, 57 insertions, 9 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index c6a58fec..680b1637 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -5740,7 +5740,7 @@ static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
5740 "GPC%d TPC%d: SM%d exception pending", 5740 "GPC%d TPC%d: SM%d exception pending",
5741 gpc, tpc, sm); 5741 gpc, tpc, sm);
5742 5742
5743 ret = g->ops.gr.handle_sm_exception(g, 5743 ret |= g->ops.gr.handle_sm_exception(g,
5744 gpc, tpc, sm, post_event, fault_ch, 5744 gpc, tpc, sm, post_event, fault_ch,
5745 hww_global_esr); 5745 hww_global_esr);
5746 /* clear the hwws, also causes tpc and gpc 5746 /* clear the hwws, also causes tpc and gpc
@@ -5759,11 +5759,11 @@ static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
5759 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) { 5759 gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) {
5760 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, 5760 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5761 "GPC%d TPC%d: TEX exception pending", gpc, tpc); 5761 "GPC%d TPC%d: TEX exception pending", gpc, tpc);
5762 ret = g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event); 5762 ret |= g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event);
5763 } 5763 }
5764 5764
5765 if (g->ops.gr.handle_tpc_mpc_exception) 5765 if (g->ops.gr.handle_tpc_mpc_exception)
5766 ret = g->ops.gr.handle_tpc_mpc_exception(g, 5766 ret |= g->ops.gr.handle_tpc_mpc_exception(g,
5767 gpc, tpc, post_event); 5767 gpc, tpc, post_event);
5768 5768
5769 return ret; 5769 return ret;
@@ -5801,7 +5801,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5801 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, 5801 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5802 "GPC%d: TPC%d exception pending", gpc, tpc); 5802 "GPC%d: TPC%d exception pending", gpc, tpc);
5803 5803
5804 ret = gk20a_gr_handle_tpc_exception(g, gpc, tpc, 5804 ret |= gk20a_gr_handle_tpc_exception(g, gpc, tpc,
5805 post_event, fault_ch, hww_global_esr); 5805 post_event, fault_ch, hww_global_esr);
5806 5806
5807 } 5807 }
@@ -5812,7 +5812,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5812 int gcc_ret = 0; 5812 int gcc_ret = 0;
5813 gcc_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc, 5813 gcc_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc,
5814 post_event, fault_ch, hww_global_esr); 5814 post_event, fault_ch, hww_global_esr);
5815 ret = ret ? ret : gcc_ret; 5815 ret |= ret ? ret : gcc_ret;
5816 } 5816 }
5817 5817
5818 /* Handle GPCCS exceptions */ 5818 /* Handle GPCCS exceptions */
@@ -5820,7 +5820,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5820 int ret_ecc = 0; 5820 int ret_ecc = 0;
5821 ret_ecc = g->ops.gr.handle_gpc_gpccs_exception(g, gpc, 5821 ret_ecc = g->ops.gr.handle_gpc_gpccs_exception(g, gpc,
5822 gpc_exception); 5822 gpc_exception);
5823 ret = ret ? ret : ret_ecc; 5823 ret |= ret ? ret : ret_ecc;
5824 } 5824 }
5825 5825
5826 /* Handle GPCMMU exceptions */ 5826 /* Handle GPCMMU exceptions */
@@ -5829,7 +5829,7 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
5829 5829
5830 ret_mmu = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc, 5830 ret_mmu = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc,
5831 gpc_exception); 5831 gpc_exception);
5832 ret = ret ? ret : ret_mmu; 5832 ret |= ret ? ret : ret_mmu;
5833 } 5833 }
5834 5834
5835 } 5835 }
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index f0187dab..b38260a5 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -517,7 +517,7 @@ static const struct gpu_ops gv100_ops = {
517 .check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout, 517 .check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
518 .channel_suspend = gk20a_channel_suspend, 518 .channel_suspend = gk20a_channel_suspend,
519 .channel_resume = gk20a_channel_resume, 519 .channel_resume = gk20a_channel_resume,
520 .set_error_notifier = nvgpu_set_error_notifier, 520 .set_error_notifier = nvgpu_set_error_notifier_if_empty,
521 .setup_sw = gk20a_init_fifo_setup_sw, 521 .setup_sw = gk20a_init_fifo_setup_sw,
522#ifdef CONFIG_TEGRA_GK20A_NVHOST 522#ifdef CONFIG_TEGRA_GK20A_NVHOST
523 .alloc_syncpt_buf = gv11b_fifo_alloc_syncpt_buf, 523 .alloc_syncpt_buf = gv11b_fifo_alloc_syncpt_buf,
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 7f6d1906..c43c6e83 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -30,6 +30,7 @@
30#include <nvgpu/enabled.h> 30#include <nvgpu/enabled.h>
31#include <nvgpu/fuse.h> 31#include <nvgpu/fuse.h>
32#include <nvgpu/bug.h> 32#include <nvgpu/bug.h>
33#include <nvgpu/error_notifier.h>
33 34
34#include "gk20a/gk20a.h" 35#include "gk20a/gk20a.h"
35#include "gk20a/gr_gk20a.h" 36#include "gk20a/gr_gk20a.h"
@@ -2090,6 +2091,41 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
2090 return 0; 2091 return 0;
2091} 2092}
2092 2093
2094static int gr_gv11b_handle_warp_esr_error_misaligned_addr(struct gk20a *g,
2095 u32 gpc, u32 tpc, u32 sm,
2096 u32 warp_esr,
2097 struct channel_gk20a *fault_ch)
2098{
2099 struct tsg_gk20a *tsg;
2100 u32 offset;
2101 struct channel_gk20a *ch_tsg;
2102
2103 if (fault_ch) {
2104 tsg = &g->fifo.tsg[fault_ch->tsgid];
2105
2106 nvgpu_rwsem_down_read(&tsg->ch_list_lock);
2107 nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
2108 channel_gk20a, ch_entry) {
2109 if (gk20a_channel_get(ch_tsg)) {
2110 g->ops.fifo.set_error_notifier(ch_tsg,
2111 NVGPU_ERR_NOTIFIER_GR_EXCEPTION);
2112 gk20a_channel_put(ch_tsg);
2113 }
2114 }
2115 nvgpu_rwsem_up_read(&tsg->ch_list_lock);
2116 }
2117
2118 /* clear interrupt */
2119 offset = gk20a_gr_gpc_offset(g, gpc) +
2120 gk20a_gr_tpc_offset(g, tpc) +
2121 gv11b_gr_sm_offset(g, sm);
2122 nvgpu_writel(g,
2123 gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset, 0);
2124
2125 /* return error so that recovery is triggered by gk20a_gr_isr() */
2126 return -EFAULT;
2127}
2128
2093/* @brief pre-process work on the SM exceptions to determine if we clear them or not. 2129/* @brief pre-process work on the SM exceptions to determine if we clear them or not.
2094 * 2130 *
2095 * On Pascal, if we are in CILP preemtion mode, preempt the channel and handle errors with special processing 2131 * On Pascal, if we are in CILP preemtion mode, preempt the channel and handle errors with special processing
@@ -2118,6 +2154,10 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g,
2118 return gr_gv11b_handle_warp_esr_error_mmu_nack(g, gpc, tpc, sm, 2154 return gr_gv11b_handle_warp_esr_error_mmu_nack(g, gpc, tpc, sm,
2119 warp_esr, fault_ch); 2155 warp_esr, fault_ch);
2120 2156
2157 if (warp_esr & gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f())
2158 return gr_gv11b_handle_warp_esr_error_misaligned_addr(g, gpc, tpc, sm,
2159 warp_esr, fault_ch);
2160
2121 if (fault_ch) { 2161 if (fault_ch) {
2122 tsg = tsg_gk20a_from_ch(fault_ch); 2162 tsg = tsg_gk20a_from_ch(fault_ch);
2123 if (!tsg) 2163 if (!tsg)
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index 2d6dc9b0..dd4bd55a 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -534,7 +534,7 @@ static const struct gpu_ops gv11b_ops = {
534 .check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout, 534 .check_ch_ctxsw_timeout = gk20a_fifo_check_ch_ctxsw_timeout,
535 .channel_suspend = gk20a_channel_suspend, 535 .channel_suspend = gk20a_channel_suspend,
536 .channel_resume = gk20a_channel_resume, 536 .channel_resume = gk20a_channel_resume,
537 .set_error_notifier = nvgpu_set_error_notifier, 537 .set_error_notifier = nvgpu_set_error_notifier_if_empty,
538 .setup_sw = gk20a_init_fifo_setup_sw, 538 .setup_sw = gk20a_init_fifo_setup_sw,
539#ifdef CONFIG_TEGRA_GK20A_NVHOST 539#ifdef CONFIG_TEGRA_GK20A_NVHOST
540 .alloc_syncpt_buf = gv11b_fifo_alloc_syncpt_buf, 540 .alloc_syncpt_buf = gv11b_fifo_alloc_syncpt_buf,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h
index 8e475895..f5f09cdf 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv100/hw_gr_gv100.h
@@ -3632,6 +3632,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_v(void)
3632{ 3632{
3633 return 0x00000000U; 3633 return 0x00000000U;
3634} 3634}
3635static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f(void)
3636{
3637 return 0xfU;
3638}
3635static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void) 3639static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
3636{ 3640{
3637 return 0x20U; 3641 return 0x20U;
diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
index 4458265d..f7968089 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/gv11b/hw_gr_gv11b.h
@@ -4392,6 +4392,10 @@ static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f(void)
4392{ 4392{
4393 return 0x0U; 4393 return 0x0U;
4394} 4394}
4395static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_addr_f(void)
4396{
4397 return 0xfU;
4398}
4395static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void) 4399static inline u32 gr_gpc0_tpc0_sm0_hww_warp_esr_error_mmu_nack_f(void)
4396{ 4400{
4397 return 0x20U; 4401 return 0x20U;