diff options
author | Terje Bergstrom <tbergstrom@nvidia.com> | 2017-10-25 17:17:30 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-10-26 16:26:25 -0400 |
commit | 9eebb7831facaa16b2975f50a716d2986c67b699 (patch) | |
tree | 8c0f5ba76e76c10762a04ea7fd7b681960f8ed5b | |
parent | 34ce21a588ad3e6d11a8fa6bc5b9e7282dca8f61 (diff) |
gpu: nvgpu: Linux-specific sm_error_state_record
Create an nvgpu-internal struct nvgpu_gr_sm_error_state to store and
propagate SM error state within the driver. Use
nvgpu_dbg_gpu_sm_error_state_record only in Linux code.
JIRA NVGPU-259
Change-Id: I7365cdf5a1a42cbcdb418dfcef3e0020e02a960f
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1585645
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- | drivers/gpu/nvgpu/common/linux/ioctl_dbg.c | 50 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.h | 3 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 10 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gm20b/gr_gm20b.c | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gm20b/gr_gm20b.h | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/vgpu/gr_vgpu.c | 4 |
7 files changed, 48 insertions, 27 deletions
diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c index 7e62bb5c..403d9261 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_dbg.c | |||
@@ -239,7 +239,8 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state( | |||
239 | struct gr_gk20a *gr = &g->gr; | 239 | struct gr_gk20a *gr = &g->gr; |
240 | u32 sm_id; | 240 | u32 sm_id; |
241 | struct channel_gk20a *ch; | 241 | struct channel_gk20a *ch; |
242 | struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state; | 242 | struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; |
243 | struct nvgpu_gr_sm_error_state sm_error_state; | ||
243 | int err = 0; | 244 | int err = 0; |
244 | 245 | ||
245 | ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); | 246 | ch = nvgpu_dbg_gpu_get_session_channel(dbg_s); |
@@ -250,41 +251,43 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state( | |||
250 | if (sm_id >= gr->no_of_sm) | 251 | if (sm_id >= gr->no_of_sm) |
251 | return -EINVAL; | 252 | return -EINVAL; |
252 | 253 | ||
253 | sm_error_state = nvgpu_kzalloc(g, sizeof(*sm_error_state)); | ||
254 | if (!sm_error_state) | ||
255 | return -ENOMEM; | ||
256 | |||
257 | if (args->sm_error_state_record_size > 0) { | 254 | if (args->sm_error_state_record_size > 0) { |
258 | size_t read_size = sizeof(*sm_error_state); | 255 | size_t read_size = sizeof(sm_error_state_record); |
259 | 256 | ||
260 | if (read_size > args->sm_error_state_record_size) | 257 | if (read_size > args->sm_error_state_record_size) |
261 | read_size = args->sm_error_state_record_size; | 258 | read_size = args->sm_error_state_record_size; |
262 | 259 | ||
263 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 260 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
264 | err = copy_from_user(sm_error_state, | 261 | err = copy_from_user(&sm_error_state_record, |
265 | (void __user *)(uintptr_t) | 262 | (void __user *)(uintptr_t) |
266 | args->sm_error_state_record_mem, | 263 | args->sm_error_state_record_mem, |
267 | read_size); | 264 | read_size); |
268 | nvgpu_mutex_release(&g->dbg_sessions_lock); | 265 | nvgpu_mutex_release(&g->dbg_sessions_lock); |
269 | if (err) { | 266 | if (err) |
270 | err = -ENOMEM; | 267 | return -ENOMEM; |
271 | goto err_free; | ||
272 | } | ||
273 | } | 268 | } |
274 | 269 | ||
275 | err = gk20a_busy(g); | 270 | err = gk20a_busy(g); |
276 | if (err) | 271 | if (err) |
277 | goto err_free; | 272 | return err; |
273 | |||
274 | sm_error_state.hww_global_esr = | ||
275 | sm_error_state_record.hww_global_esr; | ||
276 | sm_error_state.hww_warp_esr = | ||
277 | sm_error_state_record.hww_warp_esr; | ||
278 | sm_error_state.hww_warp_esr_pc = | ||
279 | sm_error_state_record.hww_warp_esr_pc; | ||
280 | sm_error_state.hww_global_esr_report_mask = | ||
281 | sm_error_state_record.hww_global_esr_report_mask; | ||
282 | sm_error_state.hww_warp_esr_report_mask = | ||
283 | sm_error_state_record.hww_warp_esr_report_mask; | ||
278 | 284 | ||
279 | err = gr_gk20a_elpg_protected_call(g, | 285 | err = gr_gk20a_elpg_protected_call(g, |
280 | g->ops.gr.update_sm_error_state(g, ch, | 286 | g->ops.gr.update_sm_error_state(g, ch, |
281 | sm_id, sm_error_state)); | 287 | sm_id, &sm_error_state)); |
282 | 288 | ||
283 | gk20a_idle(g); | 289 | gk20a_idle(g); |
284 | 290 | ||
285 | err_free: | ||
286 | nvgpu_kfree(g, sm_error_state); | ||
287 | |||
288 | return err; | 291 | return err; |
289 | } | 292 | } |
290 | 293 | ||
@@ -295,7 +298,8 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( | |||
295 | { | 298 | { |
296 | struct gk20a *g = dbg_s->g; | 299 | struct gk20a *g = dbg_s->g; |
297 | struct gr_gk20a *gr = &g->gr; | 300 | struct gr_gk20a *gr = &g->gr; |
298 | struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state; | 301 | struct nvgpu_gr_sm_error_state *sm_error_state; |
302 | struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record; | ||
299 | u32 sm_id; | 303 | u32 sm_id; |
300 | int err = 0; | 304 | int err = 0; |
301 | 305 | ||
@@ -304,6 +308,16 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( | |||
304 | return -EINVAL; | 308 | return -EINVAL; |
305 | 309 | ||
306 | sm_error_state = gr->sm_error_states + sm_id; | 310 | sm_error_state = gr->sm_error_states + sm_id; |
311 | sm_error_state_record.hww_global_esr = | ||
312 | sm_error_state->hww_global_esr; | ||
313 | sm_error_state_record.hww_warp_esr = | ||
314 | sm_error_state->hww_warp_esr; | ||
315 | sm_error_state_record.hww_warp_esr_pc = | ||
316 | sm_error_state->hww_warp_esr_pc; | ||
317 | sm_error_state_record.hww_global_esr_report_mask = | ||
318 | sm_error_state->hww_global_esr_report_mask; | ||
319 | sm_error_state_record.hww_warp_esr_report_mask = | ||
320 | sm_error_state->hww_warp_esr_report_mask; | ||
307 | 321 | ||
308 | if (args->sm_error_state_record_size > 0) { | 322 | if (args->sm_error_state_record_size > 0) { |
309 | size_t write_size = sizeof(*sm_error_state); | 323 | size_t write_size = sizeof(*sm_error_state); |
@@ -314,7 +328,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( | |||
314 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 328 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
315 | err = copy_to_user((void __user *)(uintptr_t) | 329 | err = copy_to_user((void __user *)(uintptr_t) |
316 | args->sm_error_state_record_mem, | 330 | args->sm_error_state_record_mem, |
317 | sm_error_state, | 331 | &sm_error_state_record, |
318 | write_size); | 332 | write_size); |
319 | nvgpu_mutex_release(&g->dbg_sessions_lock); | 333 | nvgpu_mutex_release(&g->dbg_sessions_lock); |
320 | if (err) { | 334 | if (err) { |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 13d534c4..80d85d65 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h | |||
@@ -361,8 +361,7 @@ struct gpu_ops { | |||
361 | u32 gpc, u32 tpc); | 361 | u32 gpc, u32 tpc); |
362 | int (*update_sm_error_state)(struct gk20a *g, | 362 | int (*update_sm_error_state)(struct gk20a *g, |
363 | struct channel_gk20a *ch, u32 sm_id, | 363 | struct channel_gk20a *ch, u32 sm_id, |
364 | struct nvgpu_dbg_gpu_sm_error_state_record * | 364 | struct nvgpu_gr_sm_error_state *sm_error_state); |
365 | sm_error_state); | ||
366 | int (*clear_sm_error_state)(struct gk20a *g, | 365 | int (*clear_sm_error_state)(struct gk20a *g, |
367 | struct channel_gk20a *ch, u32 sm_id); | 366 | struct channel_gk20a *ch, u32 sm_id); |
368 | int (*suspend_contexts)(struct gk20a *g, | 367 | int (*suspend_contexts)(struct gk20a *g, |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 5910c7d9..2fd6f72c 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c | |||
@@ -1543,7 +1543,7 @@ restore_fe_go_idle: | |||
1543 | * we initialize gr->no_of_sm in this function | 1543 | * we initialize gr->no_of_sm in this function |
1544 | */ | 1544 | */ |
1545 | gr->sm_error_states = nvgpu_kzalloc(g, | 1545 | gr->sm_error_states = nvgpu_kzalloc(g, |
1546 | sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) | 1546 | sizeof(struct nvgpu_gr_sm_error_state) |
1547 | * gr->no_of_sm); | 1547 | * gr->no_of_sm); |
1548 | if (!gr->sm_error_states) { | 1548 | if (!gr->sm_error_states) { |
1549 | err = -ENOMEM; | 1549 | err = -ENOMEM; |
@@ -4566,7 +4566,7 @@ restore_fe_go_idle: | |||
4566 | * we initialize gr->no_of_sm in this function | 4566 | * we initialize gr->no_of_sm in this function |
4567 | */ | 4567 | */ |
4568 | gr->sm_error_states = nvgpu_kzalloc(g, | 4568 | gr->sm_error_states = nvgpu_kzalloc(g, |
4569 | sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) * | 4569 | sizeof(struct nvgpu_gr_sm_error_state) * |
4570 | gr->no_of_sm); | 4570 | gr->no_of_sm); |
4571 | if (!gr->sm_error_states) { | 4571 | if (!gr->sm_error_states) { |
4572 | err = -ENOMEM; | 4572 | err = -ENOMEM; |
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 6b422138..22fc40d1 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h | |||
@@ -231,6 +231,14 @@ struct nvgpu_preemption_modes_rec { | |||
231 | u32 default_compute_preempt_mode; /* default mode */ | 231 | u32 default_compute_preempt_mode; /* default mode */ |
232 | }; | 232 | }; |
233 | 233 | ||
234 | struct nvgpu_gr_sm_error_state { | ||
235 | u32 hww_global_esr; | ||
236 | u32 hww_warp_esr; | ||
237 | u64 hww_warp_esr_pc; | ||
238 | u32 hww_global_esr_report_mask; | ||
239 | u32 hww_warp_esr_report_mask; | ||
240 | }; | ||
241 | |||
234 | struct gr_gk20a { | 242 | struct gr_gk20a { |
235 | struct gk20a *g; | 243 | struct gk20a *g; |
236 | struct { | 244 | struct { |
@@ -387,7 +395,7 @@ struct gr_gk20a { | |||
387 | u32 *fbp_rop_l2_en_mask; | 395 | u32 *fbp_rop_l2_en_mask; |
388 | u32 no_of_sm; | 396 | u32 no_of_sm; |
389 | struct sm_info *sm_to_cluster; | 397 | struct sm_info *sm_to_cluster; |
390 | struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states; | 398 | struct nvgpu_gr_sm_error_state *sm_error_states; |
391 | #if defined(CONFIG_GK20A_CYCLE_STATS) | 399 | #if defined(CONFIG_GK20A_CYCLE_STATS) |
392 | struct nvgpu_mutex cs_lock; | 400 | struct nvgpu_mutex cs_lock; |
393 | struct gk20a_cs_snapshot *cs_data; | 401 | struct gk20a_cs_snapshot *cs_data; |
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c index a1078b10..c10517b7 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c | |||
@@ -1297,7 +1297,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc) | |||
1297 | 1297 | ||
1298 | int gm20b_gr_update_sm_error_state(struct gk20a *g, | 1298 | int gm20b_gr_update_sm_error_state(struct gk20a *g, |
1299 | struct channel_gk20a *ch, u32 sm_id, | 1299 | struct channel_gk20a *ch, u32 sm_id, |
1300 | struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state) | 1300 | struct nvgpu_gr_sm_error_state *sm_error_state) |
1301 | { | 1301 | { |
1302 | u32 gpc, tpc, offset; | 1302 | u32 gpc, tpc, offset; |
1303 | struct gr_gk20a *gr = &g->gr; | 1303 | struct gr_gk20a *gr = &g->gr; |
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h index 67f1ea29..15deaa0d 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h | |||
@@ -119,7 +119,7 @@ void gr_gm20b_get_access_map(struct gk20a *g, | |||
119 | int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc); | 119 | int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc); |
120 | int gm20b_gr_update_sm_error_state(struct gk20a *g, | 120 | int gm20b_gr_update_sm_error_state(struct gk20a *g, |
121 | struct channel_gk20a *ch, u32 sm_id, | 121 | struct channel_gk20a *ch, u32 sm_id, |
122 | struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state); | 122 | struct nvgpu_gr_sm_error_state *sm_error_state); |
123 | int gm20b_gr_clear_sm_error_state(struct gk20a *g, | 123 | int gm20b_gr_clear_sm_error_state(struct gk20a *g, |
124 | struct channel_gk20a *ch, u32 sm_id); | 124 | struct channel_gk20a *ch, u32 sm_id); |
125 | int gr_gm20b_get_preemption_mode_flags(struct gk20a *g, | 125 | int gr_gm20b_get_preemption_mode_flags(struct gk20a *g, |
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c index 2d6beda6..d400f08e 100644 --- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c | |||
@@ -899,7 +899,7 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g) | |||
899 | nvgpu_mutex_init(&gr->ctx_mutex); | 899 | nvgpu_mutex_init(&gr->ctx_mutex); |
900 | 900 | ||
901 | gr->sm_error_states = nvgpu_kzalloc(g, | 901 | gr->sm_error_states = nvgpu_kzalloc(g, |
902 | sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) * | 902 | sizeof(struct nvgpu_gr_sm_error_state) * |
903 | gr->no_of_sm); | 903 | gr->no_of_sm); |
904 | if (!gr->sm_error_states) { | 904 | if (!gr->sm_error_states) { |
905 | err = -ENOMEM; | 905 | err = -ENOMEM; |
@@ -1195,7 +1195,7 @@ int vgpu_gr_resume_contexts(struct gk20a *g, | |||
1195 | void vgpu_gr_handle_sm_esr_event(struct gk20a *g, | 1195 | void vgpu_gr_handle_sm_esr_event(struct gk20a *g, |
1196 | struct tegra_vgpu_sm_esr_info *info) | 1196 | struct tegra_vgpu_sm_esr_info *info) |
1197 | { | 1197 | { |
1198 | struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states; | 1198 | struct nvgpu_gr_sm_error_state *sm_error_states; |
1199 | 1199 | ||
1200 | if (info->sm_id >= g->gr.no_of_sm) { | 1200 | if (info->sm_id >= g->gr.no_of_sm) { |
1201 | nvgpu_err(g, "invalid smd_id %d / %d", | 1201 | nvgpu_err(g, "invalid smd_id %d / %d", |