From 2caea7576a42c5f6593c58229d51f74517e0c60c Mon Sep 17 00:00:00 2001
From: Thomas Fleury
Date: Mon, 30 Jan 2017 17:48:02 -0800
Subject: gpu: nvgpu: vgpu: add clear single SM error state

Add support for clearing single SM error state for CUDA debugger.

In addition to clearing local copy of SM error state,
vgpu_gr_clear_sm_error_state now sends a command to RM server
(TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE), to clear global ESR and warp ESR.

Bug 1791111

Change-Id: I3a1f0644787fd900ec59a0e7974037d46a603487
Signed-off-by: Thomas Fleury
Reviewed-on: http://git-master/r/1296311
(cherry picked from commit fd07e03c3d086f396e4d65575c576a4dd68c920a)
Reviewed-on: http://git-master/r/1299060
Reviewed-by: Automatic_Commit_Validation_User
Reviewed-by: svccoveritychecker
GVS: Gerrit_Virtual_Submit
Reviewed-by: Cory Perry
Tested-by: Cory Perry
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/vgpu/gr_vgpu.c | 23 +++++++++++++++++++----
 include/linux/tegra_vgpu.h       |  7 +++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
index 7ffe96fe..a98c9d38 100644
--- a/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/gr_vgpu.c
@@ -1077,11 +1077,26 @@ static int vgpu_gr_clear_sm_error_state(struct gk20a *g,
 		struct channel_gk20a *ch, u32 sm_id)
 {
 	struct gr_gk20a *gr = &g->gr;
+	struct tegra_vgpu_cmd_msg msg;
+	struct tegra_vgpu_clear_sm_error_state *p =
+			&msg.params.clear_sm_error_state;
+	int err;
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+	msg.cmd = TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE;
+	msg.handle = vgpu_get_handle(g);
+	p->handle = ch->virt_ctx;
+	p->sm_id = sm_id;
+
+	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
+	WARN_ON(err || msg.ret);
+
 	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
+	return err ? err : msg.ret;
+
+
 	return 0;
 }
 
@@ -1099,8 +1114,8 @@ static int vgpu_gr_suspend_resume_contexts(struct gk20a *g,
 	int channel_fd = -1;
 	int err = 0;
 
-	mutex_lock(&g->dbg_sessions_lock);
-	mutex_lock(&dbg_s->ch_list_lock);
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+	nvgpu_mutex_acquire(&dbg_s->ch_list_lock);
 
 	n = 0;
 	list_for_each_entry(ch_data, &dbg_s->ch_list, ch_entry)
@@ -1137,8 +1152,8 @@ static int vgpu_gr_suspend_resume_contexts(struct gk20a *g,
 	}
 
 fail:
-	mutex_unlock(&dbg_s->ch_list_lock);
-	mutex_unlock(&g->dbg_sessions_lock);
+	nvgpu_mutex_release(&dbg_s->ch_list_lock);
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	*ctx_resident_ch_fd = channel_fd;
 	kfree(msg);
diff --git a/include/linux/tegra_vgpu.h b/include/linux/tegra_vgpu.h
index 9ecc44a7..3e3bbf58 100644
--- a/include/linux/tegra_vgpu.h
+++ b/include/linux/tegra_vgpu.h
@@ -101,6 +101,7 @@ enum {
 	TEGRA_VGPU_CMD_GET_GPU_LOAD = 65,
 	TEGRA_VGPU_CMD_SUSPEND_CONTEXTS = 66,
 	TEGRA_VGPU_CMD_RESUME_CONTEXTS = 67,
+	TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE = 68,
 };
 
 struct tegra_vgpu_connect_params {
@@ -462,6 +463,11 @@ struct tegra_vgpu_suspend_resume_contexts {
 	u16 chids[];
 };
 
+struct tegra_vgpu_clear_sm_error_state {
+	u64 handle;
+	u32 sm_id;
+};
+
 struct tegra_vgpu_cmd_msg {
 	u32 cmd;
 	int ret;
@@ -510,6 +516,7 @@ struct tegra_vgpu_cmd_msg {
 		struct tegra_vgpu_gpu_load_params gpu_load;
 		struct tegra_vgpu_suspend_resume_contexts suspend_contexts;
 		struct tegra_vgpu_suspend_resume_contexts resume_contexts;
+		struct tegra_vgpu_clear_sm_error_state clear_sm_error_state;
 		char padding[192];
 	} params;
 };
-- 
cgit v1.2.2