summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Richard Zhao <rizhao@nvidia.com> 2018-06-06 23:46:03 -0400
committer Tejal Kudav <tkudav@nvidia.com> 2018-06-14 09:44:08 -0400
commit 6a46965eb3b7b657c089142579ab20d6efefc0fc (patch)
tree 60aa4098d4b50af9db21e316098cfbe35c9a4797
parent 7a5d498a711833990a9d8fc3f5d3f3e26bee301c (diff)
gpu: nvgpu: correct calculation of sm_id for .record_sm_error_state
Starting with Volta, one TPC can have more than one SM, so .record_sm_error_state needs to take the SM number as a parameter. The logical TPC id should be read from gr_gpc0_gpm_pd_sm_id_r. Let the function return the logical sm_id; the RM server will need it to notify the client. Jira EVLR-2643 Bug 200405202 Change-Id: Iffaff05b89b1c5058616b8a6bf50dd73bd4e52f6 Signed-off-by: Richard Zhao <rizhao@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1742165 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- drivers/gpu/nvgpu/gk20a/gk20a.h 4
-rw-r--r-- drivers/gpu/nvgpu/gk20a/gr_gk20a.c 2
-rw-r--r-- drivers/gpu/nvgpu/gm20b/gr_gm20b.c 4
-rw-r--r-- drivers/gpu/nvgpu/gm20b/gr_gm20b.h 2
-rw-r--r-- drivers/gpu/nvgpu/gv11b/gr_gv11b.c 18
-rw-r--r-- drivers/gpu/nvgpu/gv11b/gr_gv11b.h 2
6 files changed, 15 insertions, 17 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index a2b2e53f..49f2a34a 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -368,8 +368,8 @@ struct gpu_ops {
368 void (*enable_exceptions)(struct gk20a *g); 368 void (*enable_exceptions)(struct gk20a *g);
369 void (*create_gr_sysfs)(struct gk20a *g); 369 void (*create_gr_sysfs)(struct gk20a *g);
370 u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g); 370 u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
371 int (*record_sm_error_state)(struct gk20a *g, u32 gpc, 371 int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc,
372 u32 tpc, struct channel_gk20a *fault_ch); 372 u32 sm, struct channel_gk20a *fault_ch);
373 int (*update_sm_error_state)(struct gk20a *g, 373 int (*update_sm_error_state)(struct gk20a *g,
374 struct channel_gk20a *ch, u32 sm_id, 374 struct channel_gk20a *ch, u32 sm_id,
375 struct nvgpu_gr_sm_error_state *sm_error_state); 375 struct nvgpu_gr_sm_error_state *sm_error_state);
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 37ac8748..a082cd92 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -5608,7 +5608,7 @@ int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
5608 "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr); 5608 "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
5609 5609
5610 gr_gk20a_elpg_protected_call(g, 5610 gr_gk20a_elpg_protected_call(g,
5611 g->ops.gr.record_sm_error_state(g, gpc, tpc, fault_ch)); 5611 g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
5612 5612
5613 if (g->ops.gr.pre_process_sm_exception) { 5613 if (g->ops.gr.pre_process_sm_exception) {
5614 ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm, 5614 ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index 331c3af9..261c3054 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1278,7 +1278,7 @@ void gr_gm20b_get_access_map(struct gk20a *g,
1278 *num_entries = ARRAY_SIZE(wl_addr_gm20b); 1278 *num_entries = ARRAY_SIZE(wl_addr_gm20b);
1279} 1279}
1280 1280
1281int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, 1281int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
1282 struct channel_gk20a *fault_ch) 1282 struct channel_gk20a *fault_ch)
1283{ 1283{
1284 int sm_id; 1284 int sm_id;
@@ -1306,7 +1306,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc,
1306 1306
1307 nvgpu_mutex_release(&g->dbg_sessions_lock); 1307 nvgpu_mutex_release(&g->dbg_sessions_lock);
1308 1308
1309 return 0; 1309 return sm_id;
1310} 1310}
1311 1311
1312int gm20b_gr_update_sm_error_state(struct gk20a *g, 1312int gm20b_gr_update_sm_error_state(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
index ff32d8ff..5c82fd65 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
@@ -116,7 +116,7 @@ void gr_gm20b_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state);
116void gr_gm20b_get_access_map(struct gk20a *g, 116void gr_gm20b_get_access_map(struct gk20a *g,
117 u32 **whitelist, int *num_entries); 117 u32 **whitelist, int *num_entries);
118int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, 118int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc,
119 u32 tpc, struct channel_gk20a *fault_ch); 119 u32 tpc, u32 sm, struct channel_gk20a *fault_ch);
120int gm20b_gr_update_sm_error_state(struct gk20a *g, 120int gm20b_gr_update_sm_error_state(struct gk20a *g,
121 struct channel_gk20a *ch, u32 sm_id, 121 struct channel_gk20a *ch, u32 sm_id,
122 struct nvgpu_gr_sm_error_state *sm_error_state); 122 struct nvgpu_gr_sm_error_state *sm_error_state);
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 9bd48fdc..f57be9dd 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -3263,24 +3263,22 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
3263 return err; 3263 return err;
3264} 3264}
3265 3265
3266int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, 3266int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
3267 struct channel_gk20a *fault_ch) 3267 struct channel_gk20a *fault_ch)
3268{ 3268{
3269 int sm_id; 3269 int sm_id;
3270 struct gr_gk20a *gr = &g->gr; 3270 struct gr_gk20a *gr = &g->gr;
3271 u32 offset, sm, sm_per_tpc; 3271 u32 offset, sm_per_tpc, tpc_id;
3272 u32 gpc_tpc_offset; 3272 u32 gpc_offset, gpc_tpc_offset;
3273 3273
3274 nvgpu_mutex_acquire(&g->dbg_sessions_lock); 3274 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
3275 3275
3276 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); 3276 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
3277 gpc_tpc_offset = gk20a_gr_gpc_offset(g, gpc) + 3277 gpc_offset = gk20a_gr_gpc_offset(g, gpc);
3278 gk20a_gr_tpc_offset(g, tpc); 3278 gpc_tpc_offset = gpc_offset + gk20a_gr_tpc_offset(g, tpc);
3279 3279
3280 sm_id = gr_gpc0_tpc0_sm_cfg_tpc_id_v(gk20a_readl(g, 3280 tpc_id = gk20a_readl(g, gr_gpc0_gpm_pd_sm_id_r(tpc) + gpc_offset);
3281 gr_gpc0_tpc0_sm_cfg_r() + gpc_tpc_offset)); 3281 sm_id = tpc_id * sm_per_tpc + sm;
3282
3283 sm = sm_id % sm_per_tpc;
3284 3282
3285 offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm); 3283 offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm);
3286 3284
@@ -3301,7 +3299,7 @@ int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc,
3301 3299
3302 nvgpu_mutex_release(&g->dbg_sessions_lock); 3300 nvgpu_mutex_release(&g->dbg_sessions_lock);
3303 3301
3304 return 0; 3302 return sm_id;
3305} 3303}
3306 3304
3307void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g) 3305void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
index b4a7e411..f6f05a3b 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -171,7 +171,7 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
171 struct nvgpu_gr_sm_error_state *sm_error_state); 171 struct nvgpu_gr_sm_error_state *sm_error_state);
172int gv11b_gr_set_sm_debug_mode(struct gk20a *g, 172int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
173 struct channel_gk20a *ch, u64 sms, bool enable); 173 struct channel_gk20a *ch, u64 sms, bool enable);
174int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, 174int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
175 struct channel_gk20a *fault_ch); 175 struct channel_gk20a *fault_ch);
176void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g); 176void gv11b_gr_set_hww_esr_report_mask(struct gk20a *g);
177bool gv11b_gr_sm_debugger_attached(struct gk20a *g); 177bool gv11b_gr_sm_debugger_attached(struct gk20a *g);