author    Vinod G <vinodg@nvidia.com>                          2018-08-08 02:09:30 -0400
committer mobile promotions <svcmobile_promotions@nvidia.com>  2018-08-25 05:10:43 -0400
commit    bfe65407bde2b5d0776724301e215c6553c989f3 (patch)
tree      f68a01361052afe1c30a0c6dcd5d359b762e647a /drivers
parent    3bd47da0954d3486d9ccd3c396f84445918f82b4 (diff)
gpu: nvgpu: Read sm error ioctl support for tsg
Add READ_SM_ERROR IOCTL support at the TSG level.

Moved the struct that saves the sm_error details from gr to tsg,
as the sm_error support is context based, not global.

Also corrected a MISRA 21.1 error in the header file.

The nvgpu_dbg_gpu_ioctl_write_single_sm_error_state and
nvgpu_dbg_gpu_ioctl_read_single_sm_error_state functions are
modified to use the tsg struct nvgpu_tsg_sm_error_state.

Bug 200412642

Change-Id: I9e334b059078a4bb0e360b945444cc4bf1cc56ec
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1794856
Reviewed-by: svc-misra-checker <svc-misra-checker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
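Editor's note, not part of the patch: a minimal user-space sketch of how the new TSG-level ioctl might be exercised. The ioctl name, the args struct (sm_id/record_size/record_mem) and the record field names are taken from the ioctl_tsg.c hunk below; the uapi header name, the field widths and the way the TSG file descriptor is obtained are assumptions.

/*
 * Hedged illustration only. Assumes <linux/nvgpu.h> exposes the nvgpu TSG
 * ioctls and that tsg_fd is an already-open nvgpu TSG file descriptor.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>        /* assumed uapi header */

static int read_sm_error_state(int tsg_fd, uint32_t sm_id)
{
        struct nvgpu_tsg_read_single_sm_error_state_args args;
        struct nvgpu_tsg_sm_error_state_record record;

        memset(&args, 0, sizeof(args));
        memset(&record, 0, sizeof(record));

        args.sm_id = sm_id;
        args.record_mem = (uint64_t)(uintptr_t)&record; /* kernel copy_to_user() target */
        args.record_size = sizeof(record);

        if (ioctl(tsg_fd, NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE, &args) != 0) {
                perror("NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE");
                return -1;
        }

        printf("SM %u: global_esr=0x%llx warp_esr=0x%llx warp_esr_pc=0x%llx\n",
               sm_id,
               (unsigned long long)record.global_esr,
               (unsigned long long)record.warp_esr,
               (unsigned long long)record.warp_esr_pc);
        return 0;
}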
Diffstat (limited to 'drivers')
-rw-r--r--drivers/gpu/nvgpu/gk20a/gk20a.h2
-rw-r--r--drivers/gpu/nvgpu/gk20a/gr_gk20a.c30
-rw-r--r--drivers/gpu/nvgpu/gk20a/gr_gk20a.h9
-rw-r--r--drivers/gpu/nvgpu/gk20a/tsg_gk20a.c82
-rw-r--r--drivers/gpu/nvgpu/gk20a/tsg_gk20a.h21
-rw-r--r--drivers/gpu/nvgpu/gm20b/gr_gm20b.c106
-rw-r--r--drivers/gpu/nvgpu/gm20b/gr_gm20b.h2
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.c109
-rw-r--r--drivers/gpu/nvgpu/gv11b/gr_gv11b.h4
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h5
-rw-r--r--drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c55
-rw-r--r--drivers/gpu/nvgpu/os/linux/ioctl_dbg.c50
-rw-r--r--drivers/gpu/nvgpu/os/linux/ioctl_tsg.c58
-rw-r--r--drivers/gpu/nvgpu/vgpu/gr_vgpu.c36
14 files changed, 349 insertions, 220 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index cf202f14..192f4c3e 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -396,7 +396,7 @@ struct gpu_ops {
                         u32 sm, struct channel_gk20a *fault_ch);
         int (*update_sm_error_state)(struct gk20a *g,
                         struct channel_gk20a *ch, u32 sm_id,
-                        struct nvgpu_gr_sm_error_state *sm_error_state);
+                        struct nvgpu_tsg_sm_error_state *sm_error_state);
         int (*clear_sm_error_state)(struct gk20a *g,
                         struct channel_gk20a *ch, u32 sm_id);
         int (*suspend_contexts)(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index f2b083d7..cdc00bbd 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -1561,19 +1561,6 @@ restore_fe_go_idle:
         if (err)
                 goto clean_up;
 
-        nvgpu_kfree(g, gr->sm_error_states);
-
-        /* we need to allocate this after g->ops.gr.init_fs_state() since
-         * we initialize gr->no_of_sm in this function
-         */
-        gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_gr_sm_error_state)
-                        * gr->no_of_sm);
-        if (!gr->sm_error_states) {
-                err = -ENOMEM;
-                goto restore_fe_go_idle;
-        }
-
         ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
         ctx_header_words >>= 2;
 
@@ -3072,7 +3059,6 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr)
 
         memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
 
-        nvgpu_kfree(g, gr->sm_error_states);
         nvgpu_kfree(g, gr->gpc_tpc_count);
         nvgpu_kfree(g, gr->gpc_zcb_count);
         nvgpu_kfree(g, gr->gpc_ppc_count);
@@ -4545,22 +4531,6 @@ restore_fe_go_idle:
 
         err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
                         GR_IDLE_CHECK_DEFAULT);
-        if (err)
-                goto out;
-
-        nvgpu_kfree(g, gr->sm_error_states);
-
-        /* we need to allocate this after g->ops.gr.init_fs_state() since
-         * we initialize gr->no_of_sm in this function
-         */
-        gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_gr_sm_error_state) *
-                        gr->no_of_sm);
-        if (!gr->sm_error_states) {
-                err = -ENOMEM;
-                goto restore_fe_go_idle;
-        }
-
 out:
         nvgpu_log_fn(g, "done");
         return err;
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 3fc7e55f..bd5e625d 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -254,14 +254,6 @@ struct nvgpu_preemption_modes_rec {
         u32 default_compute_preempt_mode; /* default mode */
 };
 
-struct nvgpu_gr_sm_error_state {
-        u32 hww_global_esr;
-        u32 hww_warp_esr;
-        u64 hww_warp_esr_pc;
-        u32 hww_global_esr_report_mask;
-        u32 hww_warp_esr_report_mask;
-};
-
 struct gr_gk20a {
         struct gk20a *g;
         struct {
@@ -427,7 +419,6 @@ struct gr_gk20a {
         u32 *fbp_rop_l2_en_mask;
         u32 no_of_sm;
         struct sm_info *sm_to_cluster;
-        struct nvgpu_gr_sm_error_state *sm_error_states;
 
#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U)
#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0)
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index 62763da3..624ee1d7 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -275,8 +275,23 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
         int err;
 
         tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo);
-        if (!tsg)
+        if (tsg == NULL) {
                 return NULL;
+        }
+
+        /* we need to allocate this after g->ops.gr.init_fs_state() since
+         * we initialize gr->no_of_sm in this function
+         */
+        if (g->gr.no_of_sm == 0U) {
+                nvgpu_err(g, "no_of_sm %d not set, failed allocation",
+                          g->gr.no_of_sm);
+                return NULL;
+        }
+
+        err = gk20a_tsg_alloc_sm_error_states_mem(g, tsg, g->gr.no_of_sm);
+        if (err != 0) {
+                return NULL;
+        }
 
         tsg->g = g;
         tsg->num_active_channels = 0;
@@ -295,7 +310,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 
         if (g->ops.fifo.tsg_open) {
                 err = g->ops.fifo.tsg_open(tsg);
-                if (err) {
+                if (err != 0) {
                         nvgpu_err(g, "tsg %d fifo open failed %d",
                                 tsg->tsgid, err);
                         goto clean_up;
@@ -307,6 +322,12 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
         return tsg;
 
 clean_up:
+
+        if(tsg->sm_error_states != NULL) {
+                nvgpu_kfree(g, tsg->sm_error_states);
+                tsg->sm_error_states = NULL;
+        }
+
         nvgpu_ref_put(&tsg->refcount, gk20a_tsg_release);
         return NULL;
 }
@@ -317,20 +338,28 @@ void gk20a_tsg_release(struct nvgpu_ref *ref)
         struct gk20a *g = tsg->g;
         struct gk20a_event_id_data *event_id_data, *event_id_data_temp;
 
-        if (g->ops.fifo.tsg_release)
+        if (g->ops.fifo.tsg_release != NULL) {
                 g->ops.fifo.tsg_release(tsg);
+        }
 
-        if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem))
+        if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) {
                 gr_gk20a_free_tsg_gr_ctx(tsg);
+        }
 
-        if (g->ops.fifo.deinit_eng_method_buffers)
+        if (g->ops.fifo.deinit_eng_method_buffers != NULL) {
                 g->ops.fifo.deinit_eng_method_buffers(g, tsg);
+        }
 
-        if (tsg->vm) {
+        if (tsg->vm != NULL) {
                 nvgpu_vm_put(tsg->vm);
                 tsg->vm = NULL;
         }
 
+        if(tsg->sm_error_states != NULL) {
+                nvgpu_kfree(g, tsg->sm_error_states);
+                tsg->sm_error_states = NULL;
+        }
+
         /* unhook all events created on this TSG */
         nvgpu_mutex_acquire(&tsg->event_id_list_lock);
         nvgpu_list_for_each_entry_safe(event_id_data, event_id_data_temp,
@@ -360,3 +389,44 @@ struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch)
 
         return tsg;
 }
+
+int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
+                struct tsg_gk20a *tsg,
+                u32 num_sm)
+{
+        int err = 0;
+
+        if (tsg->sm_error_states != NULL) {
+                return err;
+        }
+
+        tsg->sm_error_states = nvgpu_kzalloc(g,
+                        sizeof(struct nvgpu_tsg_sm_error_state)
+                        * num_sm);
+        if (tsg->sm_error_states == NULL) {
+                nvgpu_err(g, "sm_error_states mem allocation failed");
+                err = -ENOMEM;
+        }
+
+        return err;
+}
+
+void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
+                u32 sm_id,
+                struct nvgpu_tsg_sm_error_state *sm_error_state)
+{
+        struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
+
+        tsg_sm_error_states = tsg->sm_error_states + sm_id;
+
+        tsg_sm_error_states->hww_global_esr =
+                sm_error_state->hww_global_esr;
+        tsg_sm_error_states->hww_warp_esr =
+                sm_error_state->hww_warp_esr;
+        tsg_sm_error_states->hww_warp_esr_pc =
+                sm_error_state->hww_warp_esr_pc;
+        tsg_sm_error_states->hww_global_esr_report_mask =
+                sm_error_state->hww_global_esr_report_mask;
+        tsg_sm_error_states->hww_warp_esr_report_mask =
+                sm_error_state->hww_warp_esr_report_mask;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
index 552c3bb3..67ccb9f5 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
@@ -19,8 +19,8 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
-#ifndef __TSG_GK20A_H_
-#define __TSG_GK20A_H_
+#ifndef TSG_GK20A_H
+#define TSG_GK20A_H
 
 #include <nvgpu/lock.h>
 #include <nvgpu/kref.h>
@@ -39,6 +39,14 @@ void gk20a_tsg_release(struct nvgpu_ref *ref);
 int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid);
 struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch);
 
+struct nvgpu_tsg_sm_error_state {
+        u32 hww_global_esr;
+        u32 hww_warp_esr;
+        u64 hww_warp_esr_pc;
+        u32 hww_global_esr_report_mask;
+        u32 hww_warp_esr_report_mask;
+};
+
 struct tsg_gk20a {
         struct gk20a *g;
 
@@ -69,6 +77,7 @@ struct tsg_gk20a {
         bool tpc_num_initialized;
         bool in_use;
 
+        struct nvgpu_tsg_sm_error_state *sm_error_states;
 };
 
 int gk20a_enable_tsg(struct tsg_gk20a *tsg);
@@ -84,6 +93,12 @@ int gk20a_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice);
 u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg);
 int gk20a_tsg_set_priority(struct gk20a *g, struct tsg_gk20a *tsg,
                 u32 priority);
+int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
+                struct tsg_gk20a *tsg,
+                u32 num_sm);
+void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
+                u32 sm_id,
+                struct nvgpu_tsg_sm_error_state *sm_error_state);
 
 struct gk20a_event_id_data {
         struct gk20a *g;
@@ -106,4 +121,4 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node)
                 ((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node));
 };
 
-#endif /* __TSG_GK20A_H_ */
+#endif /* TSG_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index 68ae91e8..fc4ab3dd 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -1268,32 +1268,68 @@ void gr_gm20b_get_access_map(struct gk20a *g,
         *num_entries = ARRAY_SIZE(wl_addr_gm20b);
 }
 
+static void gm20b_gr_read_sm_error_state(struct gk20a *g,
+                u32 offset,
+                struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+        sm_error_states->hww_global_esr = gk20a_readl(g,
+                gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
+        sm_error_states->hww_warp_esr = gk20a_readl(g,
+                gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
+        sm_error_states->hww_warp_esr_pc = (u64)(gk20a_readl(g,
+                gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset));
+        sm_error_states->hww_global_esr_report_mask = gk20a_readl(g,
+                gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset);
+        sm_error_states->hww_warp_esr_report_mask = gk20a_readl(g,
+                gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset);
+
+}
+
+static void gm20b_gr_write_sm_error_state(struct gk20a *g,
+                u32 offset,
+                struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+        gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
+                sm_error_states->hww_global_esr);
+        gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
+                sm_error_states->hww_warp_esr);
+        gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
+                u64_lo32(sm_error_states->hww_warp_esr_pc));
+        gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
+                sm_error_states->hww_global_esr_report_mask);
+        gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
+                sm_error_states->hww_warp_esr_report_mask);
+}
+
 int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
                 struct channel_gk20a *fault_ch)
 {
         int sm_id;
-        struct gr_gk20a *gr = &g->gr;
         u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
         u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
                                         GPU_LIT_TPC_IN_GPC_STRIDE);
         u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
+        struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
+        struct tsg_gk20a *tsg = NULL;
 
         nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
         sm_id = gr_gpc0_tpc0_sm_cfg_sm_id_v(gk20a_readl(g,
                         gr_gpc0_tpc0_sm_cfg_r() + offset));
 
-        gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g,
-                gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
-        gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g,
-                gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
-        gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g,
-                gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset);
-        gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g,
-                gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset);
-        gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g,
-                gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset);
+        if (fault_ch != NULL) {
+                tsg = tsg_gk20a_from_ch(fault_ch);
+        }
+
+        if (tsg == NULL) {
+                nvgpu_err(g, "no valid tsg");
+                goto record_fail;
+        }
+
+        sm_error_states = tsg->sm_error_states + sm_id;
+        gm20b_gr_read_sm_error_state(g, offset, sm_error_states);
 
+record_fail:
         nvgpu_mutex_release(&g->dbg_sessions_lock);
 
         return sm_id;
@@ -1301,12 +1337,12 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_gr_sm_error_state *sm_error_state)
+                struct nvgpu_tsg_sm_error_state *sm_error_state)
 {
         u32 gpc, tpc, offset;
-        struct gr_gk20a *gr = &g->gr;
         struct tsg_gk20a *tsg;
         struct nvgpu_gr_ctx *ch_ctx;
+        struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
         u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
         u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
                                         GPU_LIT_TPC_IN_GPC_STRIDE);
@@ -1320,16 +1356,8 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g,
 
         nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-        gr->sm_error_states[sm_id].hww_global_esr =
-                sm_error_state->hww_global_esr;
-        gr->sm_error_states[sm_id].hww_warp_esr =
-                sm_error_state->hww_warp_esr;
-        gr->sm_error_states[sm_id].hww_warp_esr_pc =
-                sm_error_state->hww_warp_esr_pc;
-        gr->sm_error_states[sm_id].hww_global_esr_report_mask =
-                sm_error_state->hww_global_esr_report_mask;
-        gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
-                sm_error_state->hww_warp_esr_report_mask;
+        tsg_sm_error_states = tsg->sm_error_states + sm_id;
+        gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state);
 
         err = gr_gk20a_disable_ctxsw(g);
         if (err) {
@@ -1343,29 +1371,20 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g,
         offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
 
         if (gk20a_is_channel_ctx_resident(ch)) {
-                gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
-                        gr->sm_error_states[sm_id].hww_global_esr);
-                gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
-                        gr->sm_error_states[sm_id].hww_warp_esr);
-                gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
-                        gr->sm_error_states[sm_id].hww_warp_esr_pc);
-                gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
-                        gr->sm_error_states[sm_id].hww_global_esr_report_mask);
-                gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
-                        gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
+                gm20b_gr_write_sm_error_state(g, offset, tsg_sm_error_states);
         } else {
                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
                 if (err)
                         goto enable_ctxsw;
 
                 gr_gk20a_ctx_patch_write(g, ch_ctx,
                                 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
-                                gr->sm_error_states[sm_id].hww_global_esr_report_mask,
+                                tsg_sm_error_states->hww_global_esr_report_mask,
                                 true);
                 gr_gk20a_ctx_patch_write(g, ch_ctx,
                                 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
-                                gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
+                                tsg_sm_error_states->hww_warp_esr_report_mask,
                                 true);
 
                 gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
         }
@@ -1383,15 +1402,20 @@ int gm20b_gr_clear_sm_error_state(struct gk20a *g,
 {
         u32 gpc, tpc, offset;
         u32 val;
-        struct gr_gk20a *gr = &g->gr;
+        struct tsg_gk20a *tsg;
         u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
         u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
                                         GPU_LIT_TPC_IN_GPC_STRIDE);
         int err = 0;
 
+        tsg = tsg_gk20a_from_ch(ch);
+        if (tsg == NULL) {
+                return -EINVAL;
+        }
+
         nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-        memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
+        memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states));
 
         err = gr_gk20a_disable_ctxsw(g);
         if (err) {
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
index 9d8e5cdf..7c3baa59 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h
@@ -119,7 +119,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc,
                 u32 tpc, u32 sm, struct channel_gk20a *fault_ch);
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_gr_sm_error_state *sm_error_state);
+                struct nvgpu_tsg_sm_error_state *sm_error_state);
 int gm20b_gr_clear_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id);
 int gr_gm20b_get_preemption_mode_flags(struct gk20a *g,
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 1e001824..bc659a7b 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -3212,18 +3212,42 @@ void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state)
         }
 }
 
+static void gv11b_gr_write_sm_error_state(struct gk20a *g,
+                u32 offset,
+                struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+        nvgpu_writel(g,
+                gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset,
+                sm_error_states->hww_global_esr);
+        nvgpu_writel(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset,
+                sm_error_states->hww_warp_esr);
+        nvgpu_writel(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset,
+                u64_lo32(sm_error_states->hww_warp_esr_pc));
+        nvgpu_writel(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset,
+                u64_hi32(sm_error_states->hww_warp_esr_pc));
+        nvgpu_writel(g,
+                gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset,
+                sm_error_states->hww_global_esr_report_mask);
+        nvgpu_writel(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset,
+                sm_error_states->hww_warp_esr_report_mask);
+}
+
 int gv11b_gr_update_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_gr_sm_error_state *sm_error_state)
+                struct nvgpu_tsg_sm_error_state *sm_error_state)
 {
         struct tsg_gk20a *tsg;
         u32 gpc, tpc, sm, offset;
-        struct gr_gk20a *gr = &g->gr;
         struct nvgpu_gr_ctx *ch_ctx;
         int err = 0;
+        struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
 
         tsg = tsg_gk20a_from_ch(ch);
-        if (!tsg) {
+        if (tsg == NULL) {
                 return -EINVAL;
         }
 
@@ -3231,16 +3255,8 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
 
         nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-        gr->sm_error_states[sm_id].hww_global_esr =
-                sm_error_state->hww_global_esr;
-        gr->sm_error_states[sm_id].hww_warp_esr =
-                sm_error_state->hww_warp_esr;
-        gr->sm_error_states[sm_id].hww_warp_esr_pc =
-                sm_error_state->hww_warp_esr_pc;
-        gr->sm_error_states[sm_id].hww_global_esr_report_mask =
-                sm_error_state->hww_global_esr_report_mask;
-        gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
-                sm_error_state->hww_warp_esr_report_mask;
+        tsg_sm_error_states = tsg->sm_error_states + sm_id;
+        gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state);
 
         err = gr_gk20a_disable_ctxsw(g);
         if (err) {
@@ -3257,21 +3273,7 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
                         gv11b_gr_sm_offset(g, sm);
 
         if (gk20a_is_channel_ctx_resident(ch)) {
-                gk20a_writel(g,
-                        gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset,
-                        gr->sm_error_states[sm_id].hww_global_esr);
-                gk20a_writel(g,
-                        gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset,
-                        gr->sm_error_states[sm_id].hww_warp_esr);
-                gk20a_writel(g,
-                        gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset,
-                        gr->sm_error_states[sm_id].hww_warp_esr_pc);
-                gk20a_writel(g,
-                        gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset,
-                        gr->sm_error_states[sm_id].hww_global_esr_report_mask);
-                gk20a_writel(g,
-                        gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset,
-                        gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
+                gv11b_gr_write_sm_error_state(g, offset, tsg_sm_error_states);
         } else {
                 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
                 if (err) {
@@ -3281,12 +3283,12 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
                 gr_gk20a_ctx_patch_write(g, ch_ctx,
                                 gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r() +
                                 offset,
-                                gr->sm_error_states[sm_id].hww_global_esr_report_mask,
+                                tsg_sm_error_states->hww_global_esr_report_mask,
                                 true);
                 gr_gk20a_ctx_patch_write(g, ch_ctx,
                                 gr_gpcs_tpcs_sms_hww_warp_esr_report_mask_r() +
                                 offset,
-                                gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
+                                tsg_sm_error_states->hww_warp_esr_report_mask,
                                 true);
 
                 gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
@@ -3362,13 +3364,36 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
         return err;
 }
 
+static void gv11b_gr_read_sm_error_state(struct gk20a *g,
+                u32 offset,
+                struct nvgpu_tsg_sm_error_state *sm_error_states)
+{
+        sm_error_states->hww_global_esr = nvgpu_readl(g,
+                gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
+
+        sm_error_states->hww_warp_esr = nvgpu_readl(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset);
+
+        sm_error_states->hww_warp_esr_pc = hi32_lo32_to_u64((nvgpu_readl(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset)),
+                (nvgpu_readl(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset)));
+
+        sm_error_states->hww_global_esr_report_mask = nvgpu_readl(g,
+                gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset);
+
+        sm_error_states->hww_warp_esr_report_mask = nvgpu_readl(g,
+                gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset);
+}
+
 int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
                 struct channel_gk20a *fault_ch)
 {
         int sm_id;
-        struct gr_gk20a *gr = &g->gr;
         u32 offset, sm_per_tpc, tpc_id;
         u32 gpc_offset, gpc_tpc_offset;
+        struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
+        struct tsg_gk20a *tsg = NULL;
 
         nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
@@ -3381,21 +3406,19 @@ int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 
         offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm);
 
-        gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g,
-                gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
-
-        gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g,
-                gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset);
-
-        gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g,
-                gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset);
+        if (fault_ch != NULL) {
+                tsg = tsg_gk20a_from_ch(fault_ch);
+        }
 
-        gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g,
-                gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset);
+        if (tsg == NULL) {
+                nvgpu_err(g, "no valid tsg");
+                goto record_fail;
+        }
 
-        gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g,
-                gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset);
+        sm_error_states = tsg->sm_error_states + sm_id;
+        gv11b_gr_read_sm_error_state(g, offset, sm_error_states);
 
+record_fail:
         nvgpu_mutex_release(&g->dbg_sessions_lock);
 
         return sm_id;
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
index 0f29ea24..30cc7f0a 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h
@@ -43,7 +43,7 @@ struct zbc_entry;
 struct zbc_query_params;
 struct nvgpu_gr_ctx;
 struct nvgpu_warpstate;
-struct nvgpu_gr_sm_error_state;
+struct nvgpu_tsg_sm_error_state;
 struct gr_ctx_desc;
 struct gr_gk20a_isr_data;
 struct gk20a_debug_output;
@@ -168,7 +168,7 @@ int gv11b_gr_sm_trigger_suspend(struct gk20a *g);
 void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state);
 int gv11b_gr_update_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_gr_sm_error_state *sm_error_state);
+                struct nvgpu_tsg_sm_error_state *sm_error_state);
 int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
                 struct channel_gk20a *ch, u64 sms, bool enable);
 int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 39d68dd1..f7a58c87 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -22,8 +22,8 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#ifndef __TEGRA_VGPU_H
-#define __TEGRA_VGPU_H
+#ifndef TEGRA_VGPU_H
+#define TEGRA_VGPU_H
 
 #include <nvgpu/types.h>
 #include <nvgpu/ecc.h> /* For NVGPU_ECC_STAT_NAME_MAX_SIZE */
@@ -737,6 +737,7 @@ struct tegra_vgpu_channel_event_info {
 };
 
 struct tegra_vgpu_sm_esr_info {
+        u32 tsg_id;
         u32 sm_id;
         u32 hww_global_esr;
         u32 hww_warp_esr;
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
index fc1f7011..2f013029 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
@@ -1567,56 +1567,6 @@ out:
         return err;
 }
 
-static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g,
-                struct nvgpu_gpu_read_single_sm_error_state_args *args)
-{
-        struct gr_gk20a *gr = &g->gr;
-        struct nvgpu_gr_sm_error_state *sm_error_state;
-        struct nvgpu_gpu_sm_error_state_record sm_error_state_record;
-        u32 sm_id;
-        int err = 0;
-
-        sm_id = args->sm_id;
-        if (sm_id >= gr->no_of_sm)
-                return -EINVAL;
-
-        nvgpu_speculation_barrier();
-
-        sm_error_state = gr->sm_error_states + sm_id;
-        sm_error_state_record.global_esr =
-                sm_error_state->hww_global_esr;
-        sm_error_state_record.warp_esr =
-                sm_error_state->hww_warp_esr;
-        sm_error_state_record.warp_esr_pc =
-                sm_error_state->hww_warp_esr_pc;
-        sm_error_state_record.global_esr_report_mask =
-                sm_error_state->hww_global_esr_report_mask;
-        sm_error_state_record.warp_esr_report_mask =
-                sm_error_state->hww_warp_esr_report_mask;
-
-        if (args->record_size > 0) {
-                size_t write_size = sizeof(*sm_error_state);
-
-                if (write_size > args->record_size)
-                        write_size = args->record_size;
-
-                nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-                err = copy_to_user((void __user *)(uintptr_t)
-                                args->record_mem,
-                                &sm_error_state_record,
-                                write_size);
-                nvgpu_mutex_release(&g->dbg_sessions_lock);
-                if (err) {
-                        nvgpu_err(g, "copy_to_user failed!");
-                        return err;
-                }
-
-                args->record_size = write_size;
-        }
-
-        return 0;
-}
-
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
         struct gk20a_ctrl_priv *priv = filp->private_data;
@@ -1925,11 +1875,6 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
                         (struct nvgpu_gpu_set_deterministic_opts_args *)buf);
                 break;
 
-        case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE:
-                err = nvgpu_gpu_read_single_sm_error_state(g,
-                        (struct nvgpu_gpu_read_single_sm_error_state_args *)buf);
-                break;
-
         default:
                 nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd);
                 err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index ff4fcdca..4ac4fb62 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -35,6 +35,7 @@
 
 #include "gk20a/gk20a.h"
 #include "gk20a/gr_gk20a.h"
+#include "gk20a/tsg_gk20a.h"
 #include "gk20a/regops_gk20a.h"
 #include "gk20a/dbg_gpu_gk20a.h"
 #include "os_linux.h"
@@ -271,20 +272,23 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
         u32 sm_id;
         struct channel_gk20a *ch;
         struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
-        struct nvgpu_gr_sm_error_state sm_error_state;
+        struct nvgpu_tsg_sm_error_state sm_error_state;
         int err = 0;
 
         /* Not currently supported in the virtual case */
-        if (g->is_virtual)
+        if (g->is_virtual) {
                 return -ENOSYS;
+        }
 
         ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
-        if (!ch)
+        if (ch == NULL) {
                 return -EINVAL;
+        }
 
         sm_id = args->sm_id;
-        if (sm_id >= gr->no_of_sm)
+        if (sm_id >= gr->no_of_sm) {
                 return -EINVAL;
+        }
 
         nvgpu_speculation_barrier();
 
@@ -300,13 +304,15 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
                                 args->sm_error_state_record_mem,
                                 read_size);
                 nvgpu_mutex_release(&g->dbg_sessions_lock);
-                if (err)
+                if (err != 0) {
                         return -ENOMEM;
+                }
         }
 
         err = gk20a_busy(g);
-        if (err)
+        if (err != 0) {
                 return err;
+        }
 
         sm_error_state.hww_global_esr =
                 sm_error_state_record.hww_global_esr;
@@ -335,18 +341,36 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 {
         struct gk20a *g = dbg_s->g;
         struct gr_gk20a *gr = &g->gr;
-        struct nvgpu_gr_sm_error_state *sm_error_state;
+        struct nvgpu_tsg_sm_error_state *sm_error_state;
         struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
+        struct channel_gk20a *ch;
+        struct tsg_gk20a *tsg;
         u32 sm_id;
         int err = 0;
 
+        ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
+        if (ch == NULL) {
+                return -EINVAL;
+        }
+
+        tsg = tsg_gk20a_from_ch(ch);
+        if (tsg == NULL) {
+                nvgpu_err(g, "no valid tsg from ch");
+                return -EINVAL;
+        }
+
         sm_id = args->sm_id;
-        if (sm_id >= gr->no_of_sm)
+        if (sm_id >= gr->no_of_sm) {
                 return -EINVAL;
+        }
+
+        if (tsg->sm_error_states == NULL) {
+                return -EINVAL;
+        }
 
         nvgpu_speculation_barrier();
 
-        sm_error_state = gr->sm_error_states + sm_id;
+        sm_error_state = tsg->sm_error_states + sm_id;
         sm_error_state_record.hww_global_esr =
                 sm_error_state->hww_global_esr;
         sm_error_state_record.hww_warp_esr =
@@ -370,7 +394,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
                                 &sm_error_state_record,
                                 write_size);
                 nvgpu_mutex_release(&g->dbg_sessions_lock);
-                if (err) {
+                if (err != 0) {
                         nvgpu_err(g, "copy_to_user failed!");
                         return err;
                 }
@@ -1500,8 +1524,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
         int err = 0;
 
         ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
-        if (!ch)
+        if (ch == NULL) {
                 return -EINVAL;
+        }
 
         sm_id = args->sm_id;
         if (sm_id >= gr->no_of_sm)
@@ -1510,8 +1535,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
         nvgpu_speculation_barrier();
 
         err = gk20a_busy(g);
-        if (err)
+        if (err != 0) {
                 return err;
+        }
 
         err = gr_gk20a_elpg_protected_call(g,
                 g->ops.gr.clear_sm_error_state(g, ch, sm_id));
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
index f7d20f34..6c68ca58 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
@@ -536,6 +536,57 @@ static int gk20a_tsg_ioctl_get_timeslice(struct gk20a *g,
         return 0;
 }
 
+static int gk20a_tsg_ioctl_read_single_sm_error_state(struct gk20a *g,
+                struct tsg_gk20a *tsg,
+                struct nvgpu_tsg_read_single_sm_error_state_args *args)
+{
+        struct gr_gk20a *gr = &g->gr;
+        struct nvgpu_tsg_sm_error_state *sm_error_state;
+        struct nvgpu_tsg_sm_error_state_record sm_error_state_record;
+        u32 sm_id;
+        int err = 0;
+
+        sm_id = args->sm_id;
+        if (sm_id >= gr->no_of_sm)
+                return -EINVAL;
+
+        nvgpu_speculation_barrier();
+
+        sm_error_state = tsg->sm_error_states + sm_id;
+        sm_error_state_record.global_esr =
+                sm_error_state->hww_global_esr;
+        sm_error_state_record.warp_esr =
+                sm_error_state->hww_warp_esr;
+        sm_error_state_record.warp_esr_pc =
+                sm_error_state->hww_warp_esr_pc;
+        sm_error_state_record.global_esr_report_mask =
+                sm_error_state->hww_global_esr_report_mask;
+        sm_error_state_record.warp_esr_report_mask =
+                sm_error_state->hww_warp_esr_report_mask;
+
+        if (args->record_size > 0) {
+                size_t write_size = sizeof(*sm_error_state);
+
+                if (write_size > args->record_size)
+                        write_size = args->record_size;
+
+                nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+                err = copy_to_user((void __user *)(uintptr_t)
+                                args->record_mem,
+                                &sm_error_state_record,
+                                write_size);
+                nvgpu_mutex_release(&g->dbg_sessions_lock);
+                if (err) {
+                        nvgpu_err(g, "copy_to_user failed!");
+                        return err;
+                }
+
+                args->record_size = write_size;
+        }
+
+        return 0;
+}
+
 long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd,
                 unsigned long arg)
 {
@@ -670,6 +721,13 @@ long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd,
                 break;
                 }
 
+        case NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE:
+                {
+                err = gk20a_tsg_ioctl_read_single_sm_error_state(g, tsg,
+                        (struct nvgpu_tsg_read_single_sm_error_state_args *)buf);
+                break;
+                }
+
         default:
                 nvgpu_err(g, "unrecognized tsg gpu ioctl cmd: 0x%x",
                         cmd);
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index fa64cb82..9ee57fb4 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -882,9 +882,6 @@ static void vgpu_remove_gr_support(struct gr_gk20a *gr)
 
         gk20a_comptag_allocator_destroy(gr->g, &gr->comp_tags);
 
-        nvgpu_kfree(gr->g, gr->sm_error_states);
-        gr->sm_error_states = NULL;
-
         nvgpu_kfree(gr->g, gr->gpc_tpc_mask);
         gr->gpc_tpc_mask = NULL;
 
@@ -935,14 +932,6 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g)
         nvgpu_mutex_init(&gr->ctx_mutex);
         nvgpu_spinlock_init(&gr->ch_tlb_lock);
 
-        gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_gr_sm_error_state) *
-                        gr->no_of_sm);
-        if (!gr->sm_error_states) {
-                err = -ENOMEM;
-                goto clean_up;
-        }
-
         gr->remove_support = vgpu_remove_gr_support;
         gr->sw_ready = true;
 
@@ -1152,12 +1141,17 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g,
 int vgpu_gr_clear_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id)
 {
-        struct gr_gk20a *gr = &g->gr;
         struct tegra_vgpu_cmd_msg msg;
         struct tegra_vgpu_clear_sm_error_state *p =
                         &msg.params.clear_sm_error_state;
+        struct tsg_gk20a *tsg;
         int err;
 
+        tsg = tsg_gk20a_from_ch(ch);
+        if (!tsg) {
+                return -EINVAL;
+        }
+
         nvgpu_mutex_acquire(&g->dbg_sessions_lock);
         msg.cmd = TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE;
         msg.handle = vgpu_get_handle(g);
@@ -1167,7 +1161,7 @@ int vgpu_gr_clear_sm_error_state(struct gk20a *g,
         err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
         WARN_ON(err || msg.ret);
 
-        memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
+        memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states));
         nvgpu_mutex_release(&g->dbg_sessions_lock);
 
         return err ? err : msg.ret;
@@ -1264,7 +1258,8 @@ int vgpu_gr_resume_contexts(struct gk20a *g,
 void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
                 struct tegra_vgpu_sm_esr_info *info)
 {
-        struct nvgpu_gr_sm_error_state *sm_error_states;
+        struct nvgpu_tsg_sm_error_state *sm_error_states;
+        struct tsg_gk20a *tsg;
 
         if (info->sm_id >= g->gr.no_of_sm) {
                 nvgpu_err(g, "invalid smd_id %d / %d",
@@ -1272,9 +1267,20 @@ void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
                 return;
         }
 
+        if (info->tsg_id >= g->fifo.num_channels) {
+                nvgpu_err(g, "invalid tsg_id in sm esr event");
+                return;
+        }
+
+        tsg = &g->fifo.tsg[info->tsg_id];
+        if (tsg == NULL) {
+                nvgpu_err(g, "invalid tsg");
+                return;
+        }
+
         nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-        sm_error_states = &g->gr.sm_error_states[info->sm_id];
+        sm_error_states = &tsg->sm_error_states[info->sm_id];
 
         sm_error_states->hww_global_esr = info->hww_global_esr;
         sm_error_states->hww_warp_esr = info->hww_warp_esr;