summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Thomas Fleury <tfleury@nvidia.com> 2017-01-30 20:48:02 -0500
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-03-09 13:44:56 -0500
commit: 2caea7576a42c5f6593c58229d51f74517e0c60c (patch)
tree: 26d652848baede8f822afa8009cae7faa29945ac
parent: 6c35cebdcb2d14741385cfe051577882a806cdb8 (diff)
gpu: nvgpu: vgpu: add clear single SM error state
Add support for clearing single SM error state for CUDA debugger.

In addition to clearing the local copy of the SM error state, vgpu_gr_clear_sm_error_state now sends a command to the RM server (TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE) to clear the global ESR and warp ESR.

Bug 1791111

Change-Id: I3a1f0644787fd900ec59a0e7974037d46a603487
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1296311
(cherry picked from commit fd07e03c3d086f396e4d65575c576a4dd68c920a)
Reviewed-on: http://git-master/r/1299060
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: svccoveritychecker <svccoveritychecker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Cory Perry <cperry@nvidia.com>
Tested-by: Cory Perry <cperry@nvidia.com>
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
-rw-r--r-- drivers/gpu/nvgpu/vgpu/gr_vgpu.c 23
-rw-r--r-- include/linux/tegra_vgpu.h 7
2 files changed, 26 insertions, 4 deletions
diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index 7ffe96fe..a98c9d38 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -1077,11 +1077,26 @@ static int vgpu_gr_clear_sm_error_state(struct gk20a *g,
1077 struct channel_gk20a *ch, u32 sm_id) 1077 struct channel_gk20a *ch, u32 sm_id)
1078{ 1078{
1079 struct gr_gk20a *gr = &g->gr; 1079 struct gr_gk20a *gr = &g->gr;
1080 struct tegra_vgpu_cmd_msg msg;
1081 struct tegra_vgpu_clear_sm_error_state *p =
1082 &msg.params.clear_sm_error_state;
1083 int err;
1080 1084
1081 nvgpu_mutex_acquire(&g->dbg_sessions_lock); 1085 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
1086 msg.cmd = TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE;
1087 msg.handle = vgpu_get_handle(g);
1088 p->handle = ch->virt_ctx;
1089 p->sm_id = sm_id;
1090
1091 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
1092 WARN_ON(err || msg.ret);
1093
1082 memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states)); 1094 memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
1083 nvgpu_mutex_release(&g->dbg_sessions_lock); 1095 nvgpu_mutex_release(&g->dbg_sessions_lock);
1084 1096
1097 return err ? err : msg.ret;
1098
1099
1085 return 0; 1100 return 0;
1086} 1101}
1087 1102
@@ -1099,8 +1114,8 @@ static int vgpu_gr_suspend_resume_contexts(struct gk20a *g,
1099 int channel_fd = -1; 1114 int channel_fd = -1;
1100 int err = 0; 1115 int err = 0;
1101 1116
1102 mutex_lock(&g->dbg_sessions_lock); 1117 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
1103 mutex_lock(&dbg_s->ch_list_lock); 1118 nvgpu_mutex_acquire(&dbg_s->ch_list_lock);
1104 1119
1105 n = 0; 1120 n = 0;
1106 list_for_each_entry(ch_data, &dbg_s->ch_list, ch_entry) 1121 list_for_each_entry(ch_data, &dbg_s->ch_list, ch_entry)
@@ -1137,8 +1152,8 @@ static int vgpu_gr_suspend_resume_contexts(struct gk20a *g,
1137 } 1152 }
1138 1153
1139fail: 1154fail:
1140 mutex_unlock(&dbg_s->ch_list_lock); 1155 nvgpu_mutex_release(&dbg_s->ch_list_lock);
1141 mutex_unlock(&g->dbg_sessions_lock); 1156 nvgpu_mutex_release(&g->dbg_sessions_lock);
1142 1157
1143 *ctx_resident_ch_fd = channel_fd; 1158 *ctx_resident_ch_fd = channel_fd;
1144 kfree(msg); 1159 kfree(msg);
diff --git a/include/linux/tegra_vgpu.h b/include/linux/tegra_vgpu.h
index 9ecc44a7..3e3bbf58 100644
--- a/include/linux/tegra_vgpu.h
+++ b/include/linux/tegra_vgpu.h
@@ -101,6 +101,7 @@ enum {
101 TEGRA_VGPU_CMD_GET_GPU_LOAD = 65, 101 TEGRA_VGPU_CMD_GET_GPU_LOAD = 65,
102 TEGRA_VGPU_CMD_SUSPEND_CONTEXTS = 66, 102 TEGRA_VGPU_CMD_SUSPEND_CONTEXTS = 66,
103 TEGRA_VGPU_CMD_RESUME_CONTEXTS = 67, 103 TEGRA_VGPU_CMD_RESUME_CONTEXTS = 67,
104 TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE = 68,
104}; 105};
105 106
106struct tegra_vgpu_connect_params { 107struct tegra_vgpu_connect_params {
@@ -462,6 +463,11 @@ struct tegra_vgpu_suspend_resume_contexts {
462 u16 chids[]; 463 u16 chids[];
463}; 464};
464 465
466struct tegra_vgpu_clear_sm_error_state {
467 u64 handle;
468 u32 sm_id;
469};
470
465struct tegra_vgpu_cmd_msg { 471struct tegra_vgpu_cmd_msg {
466 u32 cmd; 472 u32 cmd;
467 int ret; 473 int ret;
@@ -510,6 +516,7 @@ struct tegra_vgpu_cmd_msg {
510 struct tegra_vgpu_gpu_load_params gpu_load; 516 struct tegra_vgpu_gpu_load_params gpu_load;
511 struct tegra_vgpu_suspend_resume_contexts suspend_contexts; 517 struct tegra_vgpu_suspend_resume_contexts suspend_contexts;
512 struct tegra_vgpu_suspend_resume_contexts resume_contexts; 518 struct tegra_vgpu_suspend_resume_contexts resume_contexts;
519 struct tegra_vgpu_clear_sm_error_state clear_sm_error_state;
513 char padding[192]; 520 char padding[192];
514 } params; 521 } params;
515}; 522};