summary | refs | log | tree | commit | diff | stats
path: root/drivers/gpu
diff options
context:
space:
mode:
author: Terje Bergstrom <tbergstrom@nvidia.com> 2017-10-25 17:17:30 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-10-26 16:26:25 -0400
commit: 9eebb7831facaa16b2975f50a716d2986c67b699 (patch)
tree: 8c0f5ba76e76c10762a04ea7fd7b681960f8ed5b /drivers/gpu
parent: 34ce21a588ad3e6d11a8fa6bc5b9e7282dca8f61 (diff)
gpu: nvgpu: Linux specific sm_error_state_record
Create an nvgpu internal nvgpu_gr_sm_error_state to store and propagate
SM error state within the driver. Use nvgpu_dbg_gpu_sm_error_state_record
only in Linux code.

JIRA NVGPU-259

Change-Id: I7365cdf5a1a42cbcdb418dfcef3e0020e02a960f
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1585645
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- drivers/gpu/nvgpu/common/linux/ioctl_dbg.c 50
-rw-r--r-- drivers/gpu/nvgpu/gk20a/gk20a.h 3
-rw-r--r-- drivers/gpu/nvgpu/gk20a/gr_gk20a.c 4
-rw-r--r-- drivers/gpu/nvgpu/gk20a/gr_gk20a.h 10
-rw-r--r-- drivers/gpu/nvgpu/gm20b/gr_gm20b.c 2
-rw-r--r-- drivers/gpu/nvgpu/gm20b/gr_gm20b.h 2
-rw-r--r-- drivers/gpu/nvgpu/vgpu/gr_vgpu.c 4
7 files changed, 48 insertions, 27 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c
index 7e62bb5c..403d9261 100644
--- a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c
@@ -239,7 +239,8 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
239 struct gr_gk20a *gr = &g->gr; 239 struct gr_gk20a *gr = &g->gr;
240 u32 sm_id; 240 u32 sm_id;
241 struct channel_gk20a *ch; 241 struct channel_gk20a *ch;
242 struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state; 242 struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
243 struct nvgpu_gr_sm_error_state sm_error_state;
243 int err = 0; 244 int err = 0;
244 245
245 ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); 246 ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
@@ -250,41 +251,43 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
250 if (sm_id >= gr->no_of_sm) 251 if (sm_id >= gr->no_of_sm)
251 return -EINVAL; 252 return -EINVAL;
252 253
253 sm_error_state = nvgpu_kzalloc(g, sizeof(*sm_error_state));
254 if (!sm_error_state)
255 return -ENOMEM;
256
257 if (args->sm_error_state_record_size > 0) { 254 if (args->sm_error_state_record_size > 0) {
258 size_t read_size = sizeof(*sm_error_state); 255 size_t read_size = sizeof(sm_error_state_record);
259 256
260 if (read_size > args->sm_error_state_record_size) 257 if (read_size > args->sm_error_state_record_size)
261 read_size = args->sm_error_state_record_size; 258 read_size = args->sm_error_state_record_size;
262 259
263 nvgpu_mutex_acquire(&g->dbg_sessions_lock); 260 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
264 err = copy_from_user(sm_error_state, 261 err = copy_from_user(&sm_error_state_record,
265 (void __user *)(uintptr_t) 262 (void __user *)(uintptr_t)
266 args->sm_error_state_record_mem, 263 args->sm_error_state_record_mem,
267 read_size); 264 read_size);
268 nvgpu_mutex_release(&g->dbg_sessions_lock); 265 nvgpu_mutex_release(&g->dbg_sessions_lock);
269 if (err) { 266 if (err)
270 err = -ENOMEM; 267 return -ENOMEM;
271 goto err_free;
272 }
273 } 268 }
274 269
275 err = gk20a_busy(g); 270 err = gk20a_busy(g);
276 if (err) 271 if (err)
277 goto err_free; 272 return err;
273
274 sm_error_state.hww_global_esr =
275 sm_error_state_record.hww_global_esr;
276 sm_error_state.hww_warp_esr =
277 sm_error_state_record.hww_warp_esr;
278 sm_error_state.hww_warp_esr_pc =
279 sm_error_state_record.hww_warp_esr_pc;
280 sm_error_state.hww_global_esr_report_mask =
281 sm_error_state_record.hww_global_esr_report_mask;
282 sm_error_state.hww_warp_esr_report_mask =
283 sm_error_state_record.hww_warp_esr_report_mask;
278 284
279 err = gr_gk20a_elpg_protected_call(g, 285 err = gr_gk20a_elpg_protected_call(g,
280 g->ops.gr.update_sm_error_state(g, ch, 286 g->ops.gr.update_sm_error_state(g, ch,
281 sm_id, sm_error_state)); 287 sm_id, &sm_error_state));
282 288
283 gk20a_idle(g); 289 gk20a_idle(g);
284 290
285err_free:
286 nvgpu_kfree(g, sm_error_state);
287
288 return err; 291 return err;
289} 292}
290 293
@@ -295,7 +298,8 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
295{ 298{
296 struct gk20a *g = dbg_s->g; 299 struct gk20a *g = dbg_s->g;
297 struct gr_gk20a *gr = &g->gr; 300 struct gr_gk20a *gr = &g->gr;
298 struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state; 301 struct nvgpu_gr_sm_error_state *sm_error_state;
302 struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
299 u32 sm_id; 303 u32 sm_id;
300 int err = 0; 304 int err = 0;
301 305
@@ -304,6 +308,16 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
304 return -EINVAL; 308 return -EINVAL;
305 309
306 sm_error_state = gr->sm_error_states + sm_id; 310 sm_error_state = gr->sm_error_states + sm_id;
311 sm_error_state_record.hww_global_esr =
312 sm_error_state->hww_global_esr;
313 sm_error_state_record.hww_warp_esr =
314 sm_error_state->hww_warp_esr;
315 sm_error_state_record.hww_warp_esr_pc =
316 sm_error_state->hww_warp_esr_pc;
317 sm_error_state_record.hww_global_esr_report_mask =
318 sm_error_state->hww_global_esr_report_mask;
319 sm_error_state_record.hww_warp_esr_report_mask =
320 sm_error_state->hww_warp_esr_report_mask;
307 321
308 if (args->sm_error_state_record_size > 0) { 322 if (args->sm_error_state_record_size > 0) {
309 size_t write_size = sizeof(*sm_error_state); 323 size_t write_size = sizeof(*sm_error_state);
@@ -314,7 +328,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
314 nvgpu_mutex_acquire(&g->dbg_sessions_lock); 328 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
315 err = copy_to_user((void __user *)(uintptr_t) 329 err = copy_to_user((void __user *)(uintptr_t)
316 args->sm_error_state_record_mem, 330 args->sm_error_state_record_mem,
317 sm_error_state, 331 &sm_error_state_record,
318 write_size); 332 write_size);
319 nvgpu_mutex_release(&g->dbg_sessions_lock); 333 nvgpu_mutex_release(&g->dbg_sessions_lock);
320 if (err) { 334 if (err) {
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 13d534c4..80d85d65 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -361,8 +361,7 @@ struct gpu_ops {
361 u32 gpc, u32 tpc); 361 u32 gpc, u32 tpc);
362 int (*update_sm_error_state)(struct gk20a *g, 362 int (*update_sm_error_state)(struct gk20a *g,
363 struct channel_gk20a *ch, u32 sm_id, 363 struct channel_gk20a *ch, u32 sm_id,
364 struct nvgpu_dbg_gpu_sm_error_state_record * 364 struct nvgpu_gr_sm_error_state *sm_error_state);
365 sm_error_state);
366 int (*clear_sm_error_state)(struct gk20a *g, 365 int (*clear_sm_error_state)(struct gk20a *g,
367 struct channel_gk20a *ch, u32 sm_id); 366 struct channel_gk20a *ch, u32 sm_id);
368 int (*suspend_contexts)(struct gk20a *g, 367 int (*suspend_contexts)(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 5910c7d9..2fd6f72c 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -1543,7 +1543,7 @@ restore_fe_go_idle:
1543 * we initialize gr->no_of_sm in this function 1543 * we initialize gr->no_of_sm in this function
1544 */ 1544 */
1545 gr->sm_error_states = nvgpu_kzalloc(g, 1545 gr->sm_error_states = nvgpu_kzalloc(g,
1546 sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) 1546 sizeof(struct nvgpu_gr_sm_error_state)
1547 * gr->no_of_sm); 1547 * gr->no_of_sm);
1548 if (!gr->sm_error_states) { 1548 if (!gr->sm_error_states) {
1549 err = -ENOMEM; 1549 err = -ENOMEM;
@@ -4566,7 +4566,7 @@ restore_fe_go_idle:
4566 * we initialize gr->no_of_sm in this function 4566 * we initialize gr->no_of_sm in this function
4567 */ 4567 */
4568 gr->sm_error_states = nvgpu_kzalloc(g, 4568 gr->sm_error_states = nvgpu_kzalloc(g,
4569 sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) * 4569 sizeof(struct nvgpu_gr_sm_error_state) *
4570 gr->no_of_sm); 4570 gr->no_of_sm);
4571 if (!gr->sm_error_states) { 4571 if (!gr->sm_error_states) {
4572 err = -ENOMEM; 4572 err = -ENOMEM;
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 6b422138..22fc40d1 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -231,6 +231,14 @@ struct nvgpu_preemption_modes_rec {
231 u32 default_compute_preempt_mode; /* default mode */ 231 u32 default_compute_preempt_mode; /* default mode */
232}; 232};
233 233
234struct nvgpu_gr_sm_error_state {
235 u32 hww_global_esr;
236 u32 hww_warp_esr;
237 u64 hww_warp_esr_pc;
238 u32 hww_global_esr_report_mask;
239 u32 hww_warp_esr_report_mask;
240};
241
234struct gr_gk20a { 242struct gr_gk20a {
235 struct gk20a *g; 243 struct gk20a *g;
236 struct { 244 struct {
@@ -387,7 +395,7 @@ struct gr_gk20a {
387 u32 *fbp_rop_l2_en_mask; 395 u32 *fbp_rop_l2_en_mask;
388 u32 no_of_sm; 396 u32 no_of_sm;
389 struct sm_info *sm_to_cluster; 397 struct sm_info *sm_to_cluster;
390 struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states; 398 struct nvgpu_gr_sm_error_state *sm_error_states;
391#if defined(CONFIG_GK20A_CYCLE_STATS) 399#if defined(CONFIG_GK20A_CYCLE_STATS)
392 struct nvgpu_mutex cs_lock; 400 struct nvgpu_mutex cs_lock;
393 struct gk20a_cs_snapshot *cs_data; 401 struct gk20a_cs_snapshot *cs_data;
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index a1078b10..c10517b7 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1297,7 +1297,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
1297 1297
1298int gm20b_gr_update_sm_error_state(struct gk20a *g, 1298int gm20b_gr_update_sm_error_state(struct gk20a *g,
1299 struct channel_gk20a *ch, u32 sm_id, 1299 struct channel_gk20a *ch, u32 sm_id,
1300 struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state) 1300 struct nvgpu_gr_sm_error_state *sm_error_state)
1301{ 1301{
1302 u32 gpc, tpc, offset; 1302 u32 gpc, tpc, offset;
1303 struct gr_gk20a *gr = &g->gr; 1303 struct gr_gk20a *gr = &g->gr;
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
index 67f1ea29..15deaa0d 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
@@ -119,7 +119,7 @@ void gr_gm20b_get_access_map(struct gk20a *g,
119int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc); 119int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc);
120int gm20b_gr_update_sm_error_state(struct gk20a *g, 120int gm20b_gr_update_sm_error_state(struct gk20a *g,
121 struct channel_gk20a *ch, u32 sm_id, 121 struct channel_gk20a *ch, u32 sm_id,
122 struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state); 122 struct nvgpu_gr_sm_error_state *sm_error_state);
123int gm20b_gr_clear_sm_error_state(struct gk20a *g, 123int gm20b_gr_clear_sm_error_state(struct gk20a *g,
124 struct channel_gk20a *ch, u32 sm_id); 124 struct channel_gk20a *ch, u32 sm_id);
125int gr_gm20b_get_preemption_mode_flags(struct gk20a *g, 125int gr_gm20b_get_preemption_mode_flags(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index 2d6beda6..d400f08e 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -899,7 +899,7 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g)
899 nvgpu_mutex_init(&gr->ctx_mutex); 899 nvgpu_mutex_init(&gr->ctx_mutex);
900 900
901 gr->sm_error_states = nvgpu_kzalloc(g, 901 gr->sm_error_states = nvgpu_kzalloc(g,
902 sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) * 902 sizeof(struct nvgpu_gr_sm_error_state) *
903 gr->no_of_sm); 903 gr->no_of_sm);
904 if (!gr->sm_error_states) { 904 if (!gr->sm_error_states) {
905 err = -ENOMEM; 905 err = -ENOMEM;
@@ -1195,7 +1195,7 @@ int vgpu_gr_resume_contexts(struct gk20a *g,
1195void vgpu_gr_handle_sm_esr_event(struct gk20a *g, 1195void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
1196 struct tegra_vgpu_sm_esr_info *info) 1196 struct tegra_vgpu_sm_esr_info *info)
1197{ 1197{
1198 struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states; 1198 struct nvgpu_gr_sm_error_state *sm_error_states;
1199 1199
1200 if (info->sm_id >= g->gr.no_of_sm) { 1200 if (info->sm_id >= g->gr.no_of_sm) {
1201 nvgpu_err(g, "invalid smd_id %d / %d", 1201 nvgpu_err(g, "invalid smd_id %d / %d",