From 2f6698b863c9cc1db6455637b7c72e812b470b93 Mon Sep 17 00:00:00 2001
From: Terje Bergstrom
Date: Fri, 15 Dec 2017 09:04:15 -0800
Subject: gpu: nvgpu: Make graphics context property of TSG

Move graphics context ownership to TSG instead of channel. Combine
channel_ctx_gk20a and gr_ctx_desc into one structure, because the split
between them was arbitrary. Move the context header to be a property of
the channel.

Bug 1842197

Change-Id: I410e3262f80b318d8528bcbec270b63a2d8d2ff9
Signed-off-by: Terje Bergstrom
Reviewed-on: https://git-master.nvidia.com/r/1639532
Reviewed-by: Seshendra Gadagottu
Tested-by: Seshendra Gadagottu
Reviewed-by: svc-mobile-coverity
GVS: Gerrit_Virtual_Submit
Reviewed-by: Konsta Holtta
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c | 343 +++++++++++++-------------
 1 file changed, 165 insertions(+), 178 deletions(-)
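The diff is easier to follow with the new ownership model in mind: before this
change each channel carried its own channel_ctx_gk20a, which in turn pointed at
a separately allocated gr_ctx_desc; after it, a single nvgpu_gr_ctx is embedded
in the TSG and every channel reaches it through its TSG. A minimal C model of
the resulting layout; names are simplified and gr_ctx_of() is a hypothetical
stand-in for the tsg_gk20a_from_ch() lookup the patch uses:

	/* Simplified model of the new ownership; illustrative only, not nvgpu source. */
	struct nvgpu_gr_ctx {                /* merger of channel_ctx_gk20a + gr_ctx_desc */
		unsigned long long gpu_va;   /* graphics context buffer VA */
		int global_ctx_buffer_mapped;
	};

	struct tsg_gk20a {
		struct nvgpu_gr_ctx gr_ctx;  /* embedded: the TSG owns the context */
	};

	struct channel_gk20a {
		struct tsg_gk20a *tsg;       /* a channel only points at its TSG */
	};

	/* A channel that is not in a TSG has no graphics context at all,
	 * which is why the patch returns -EINVAL on failed TSG lookups. */
	static struct nvgpu_gr_ctx *gr_ctx_of(struct channel_gk20a *ch)
	{
		return ch->tsg ? &ch->tsg->gr_ctx : (struct nvgpu_gr_ctx *)0;
	}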
diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c
index e8790587..8f1c5d78 100644
--- a/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c
@@ -20,14 +20,18 @@
 #include
 #include
+#include
 #include

 #include "vgpu.h"
 #include "gr_vgpu.h"
 #include "gk20a/dbg_gpu_gk20a.h"
+#include "gk20a/channel_gk20a.h"
+#include "gk20a/tsg_gk20a.h"

 #include
+#include

 void vgpu_gr_detect_sm_arch(struct gk20a *g)
 {
@@ -152,8 +156,9 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g,
 	struct tegra_vgpu_cmd_msg msg;
 	struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx;
 	struct vm_gk20a *ch_vm = c->vm;
-	u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
-	u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
+	struct tsg_gk20a *tsg;
+	u64 *g_bfr_va;
+	u64 *g_bfr_size;
 	struct gr_gk20a *gr = &g->gr;
 	u64 gpu_va;
 	u32 i;
@@ -161,7 +166,12 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g,

 	gk20a_dbg_fn("");

-	/* FIXME: add VPR support */
+	tsg = tsg_gk20a_from_ch(c);
+	if (!tsg)
+		return -EINVAL;
+
+	g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va;
+	g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size;

 	/* Circular Buffer */
 	gpu_va = __nvgpu_vm_alloc_va(ch_vm,
@@ -213,7 +223,7 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g,
 	if (err || msg.ret)
 		goto clean_up;

-	c->ch_ctx.global_ctx_buffer_mapped = true;
+	tsg->gr_ctx.global_ctx_buffer_mapped = true;
 	return 0;

 clean_up:
@@ -227,40 +237,33 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g,
 	return -ENOMEM;
 }

-static void vgpu_gr_unmap_global_ctx_buffers(struct channel_gk20a *c)
+static void vgpu_gr_unmap_global_ctx_buffers(struct tsg_gk20a *tsg)
 {
-	struct vm_gk20a *ch_vm = c->vm;
-	u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
-	u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
+	struct vm_gk20a *ch_vm = tsg->vm;
+	u64 *g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va;
+	u64 *g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size;
 	u32 i;

 	gk20a_dbg_fn("");

-	if (c->ch_ctx.global_ctx_buffer_mapped) {
-		struct tegra_vgpu_cmd_msg msg;
-		struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx;
-		int err;
+	if (tsg->gr_ctx.global_ctx_buffer_mapped) {
+		/* server will unmap on channel close */

-		msg.cmd = TEGRA_VGPU_CMD_CHANNEL_UNMAP_GR_GLOBAL_CTX;
-		msg.handle = vgpu_get_handle(c->g);
-		p->handle = c->virt_ctx;
-		err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
-		WARN_ON(err || msg.ret);
-	}
-
-	for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
-		if (g_bfr_va[i]) {
-			__nvgpu_vm_free_va(ch_vm, g_bfr_va[i],
-					gmmu_page_size_kernel);
-			g_bfr_va[i] = 0;
-			g_bfr_size[i] = 0;
+		for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
+			if (g_bfr_va[i]) {
+				__nvgpu_vm_free_va(ch_vm, g_bfr_va[i],
+						gmmu_page_size_kernel);
+				g_bfr_va[i] = 0;
+				g_bfr_size[i] = 0;
+			}
 		}
+
+		tsg->gr_ctx.global_ctx_buffer_mapped = false;
 	}
-	c->ch_ctx.global_ctx_buffer_mapped = false;
 }

 int vgpu_gr_alloc_gr_ctx(struct gk20a *g,
-			 struct gr_ctx_desc **__gr_ctx,
+			 struct nvgpu_gr_ctx *gr_ctx,
 			 struct vm_gk20a *vm,
 			 u32 class,
 			 u32 flags)
@@ -268,7 +271,6 @@ int vgpu_gr_alloc_gr_ctx(struct gk20a *g,
 	struct tegra_vgpu_cmd_msg msg = {0};
 	struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx;
 	struct gr_gk20a *gr = &g->gr;
-	struct gr_ctx_desc *gr_ctx;
 	int err;

 	gk20a_dbg_fn("");
@@ -280,19 +282,14 @@ int vgpu_gr_alloc_gr_ctx(struct gk20a *g,
 	gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
 	gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;

-	gr_ctx = nvgpu_kzalloc(g, sizeof(*gr_ctx));
-	if (!gr_ctx)
-		return -ENOMEM;
-
-	gr_ctx->mem.size = gr->ctx_vars.buffer_total_size;
 	gr_ctx->mem.gpu_va = __nvgpu_vm_alloc_va(vm,
-						gr_ctx->mem.size,
+						gr->ctx_vars.buffer_total_size,
 						gmmu_page_size_kernel);

-	if (!gr_ctx->mem.gpu_va) {
-		nvgpu_kfree(g, gr_ctx);
+	if (!gr_ctx->mem.gpu_va)
 		return -ENOMEM;
-	}
+	gr_ctx->mem.size = gr->ctx_vars.buffer_total_size;
+	gr_ctx->mem.aperture = APERTURE_SYSMEM;

 	msg.cmd = TEGRA_VGPU_CMD_GR_CTX_ALLOC;
 	msg.handle = vgpu_get_handle(g);
@@ -306,57 +303,19 @@ int vgpu_gr_alloc_gr_ctx(struct gk20a *g,
 		nvgpu_err(g, "fail to alloc gr_ctx");
 		__nvgpu_vm_free_va(vm, gr_ctx->mem.gpu_va,
 				   gmmu_page_size_kernel);
-		nvgpu_kfree(g, gr_ctx);
+		gr_ctx->mem.aperture = APERTURE_INVALID;
 	} else {
 		gr_ctx->virt_ctx = p->gr_ctx_handle;
-		*__gr_ctx = gr_ctx;
 	}

 	return err;
 }
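Note the changed allocation contract in vgpu_gr_alloc_gr_ctx() above: instead
of kzalloc-ing a gr_ctx_desc and returning it through a double pointer, the
function now fills in the caller-owned context embedded in the TSG, and
validity is recorded in mem.aperture (APERTURE_SYSMEM on success,
APERTURE_INVALID on failure) rather than in pointer nullness; that is the
state nvgpu_mem_is_valid() tests later in the patch. A compilable sketch of
the idiom, with stand-in names:

	#include <errno.h>
	#include <stddef.h>

	enum aperture { APERTURE_INVALID = 0, APERTURE_SYSMEM };

	struct mem_desc {
		unsigned long long gpu_va;
		size_t size;
		enum aperture aperture;
	};

	/* stand-in for __nvgpu_vm_alloc_va(); always succeeds here */
	static unsigned long long alloc_va(size_t size) { (void)size; return 0x10000ull; }

	static int ctx_alloc(struct mem_desc *mem, size_t size)
	{
		mem->gpu_va = alloc_va(size);
		if (!mem->gpu_va)
			return -ENOMEM;          /* caller's struct stays invalid */
		mem->size = size;
		mem->aperture = APERTURE_SYSMEM; /* marks the embedded ctx as live */
		return 0;
	}

	static int ctx_is_valid(const struct mem_desc *mem)
	{
		return mem->aperture != APERTURE_INVALID; /* ~ nvgpu_mem_is_valid() */
	}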
-void vgpu_gr_free_gr_ctx(struct gk20a *g, struct vm_gk20a *vm,
-			 struct gr_ctx_desc *gr_ctx)
-{
-	struct tegra_vgpu_cmd_msg msg;
-	struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx;
-	int err;
-
-	gk20a_dbg_fn("");
-
-	if (!gr_ctx || !gr_ctx->mem.gpu_va)
-		return;
-
-
-	msg.cmd = TEGRA_VGPU_CMD_GR_CTX_FREE;
-	msg.handle = vgpu_get_handle(g);
-	p->gr_ctx_handle = gr_ctx->virt_ctx;
-	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
-	WARN_ON(err || msg.ret);
-
-	__nvgpu_vm_free_va(vm, gr_ctx->mem.gpu_va,
-			   gmmu_page_size_kernel);
-
-	nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer);
-	nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer);
-	nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer);
-	nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer);
-
-	nvgpu_kfree(g, gr_ctx);
-}
-
-static void vgpu_gr_free_channel_gr_ctx(struct channel_gk20a *c)
-{
-	gk20a_dbg_fn("");
-
-	c->g->ops.gr.free_gr_ctx(c->g, c->vm, c->ch_ctx.gr_ctx);
-	c->ch_ctx.gr_ctx = NULL;
-}
-
 static int vgpu_gr_alloc_channel_patch_ctx(struct gk20a *g,
 					   struct channel_gk20a *c)
 {
-	struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
+	struct tsg_gk20a *tsg;
+	struct patch_desc *patch_ctx;
 	struct vm_gk20a *ch_vm = c->vm;
 	struct tegra_vgpu_cmd_msg msg;
 	struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx;
@@ -364,6 +323,11 @@ static int vgpu_gr_alloc_channel_patch_ctx(struct gk20a *g,

 	gk20a_dbg_fn("");

+	tsg = tsg_gk20a_from_ch(c);
+	if (!tsg)
+		return -EINVAL;
+
+	patch_ctx = &tsg->gr_ctx.patch_ctx;
 	patch_ctx->mem.size = 128 * sizeof(u32);
 	patch_ctx->mem.gpu_va = __nvgpu_vm_alloc_va(ch_vm,
 						    patch_ctx->mem.size,
@@ -385,37 +349,25 @@ static int vgpu_gr_alloc_channel_patch_ctx(struct gk20a *g,
 	return err;
 }

-static void vgpu_gr_free_channel_patch_ctx(struct channel_gk20a *c)
+static void vgpu_gr_free_channel_patch_ctx(struct tsg_gk20a *tsg)
 {
-	struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
-	struct vm_gk20a *ch_vm = c->vm;
+	struct patch_desc *patch_ctx = &tsg->gr_ctx.patch_ctx;

 	gk20a_dbg_fn("");

 	if (patch_ctx->mem.gpu_va) {
-		struct tegra_vgpu_cmd_msg msg;
-		struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx;
-		int err;
+		/* server will free on channel close */

-		msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_GR_PATCH_CTX;
-		msg.handle = vgpu_get_handle(c->g);
-		p->handle = c->virt_ctx;
-		err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
-		WARN_ON(err || msg.ret);
-
-		__nvgpu_vm_free_va(ch_vm, patch_ctx->mem.gpu_va,
+		__nvgpu_vm_free_va(tsg->vm, patch_ctx->mem.gpu_va,
 				   gmmu_page_size_kernel);
 		patch_ctx->mem.gpu_va = 0;
 	}
 }

-static void vgpu_gr_free_channel_pm_ctx(struct channel_gk20a *c)
+static void vgpu_gr_free_channel_pm_ctx(struct tsg_gk20a *tsg)
 {
-	struct tegra_vgpu_cmd_msg msg;
-	struct tegra_vgpu_channel_free_hwpm_ctx *p = &msg.params.free_hwpm_ctx;
-	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+	struct nvgpu_gr_ctx *ch_ctx = &tsg->gr_ctx;
 	struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx;
-	int err;

 	gk20a_dbg_fn("");

@@ -423,44 +375,63 @@ static void vgpu_gr_free_channel_pm_ctx(struct channel_gk20a *c)
 	if (pm_ctx->mem.gpu_va == 0)
 		return;

-	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_HWPM_CTX;
-	msg.handle = vgpu_get_handle(c->g);
-	p->handle = c->virt_ctx;
-	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
-	WARN_ON(err || msg.ret);
+	/* server will free on channel close */

-	__nvgpu_vm_free_va(c->vm, pm_ctx->mem.gpu_va,
+	__nvgpu_vm_free_va(tsg->vm, pm_ctx->mem.gpu_va,
 			   gmmu_page_size_kernel);
 	pm_ctx->mem.gpu_va = 0;
 }

-void vgpu_gr_free_channel_ctx(struct channel_gk20a *c, bool is_tsg)
+void vgpu_gr_free_gr_ctx(struct gk20a *g,
+		struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx)
 {
+	struct tsg_gk20a *tsg;
+
 	gk20a_dbg_fn("");

-	if (c->g->ops.fifo.free_channel_ctx_header)
-		c->g->ops.fifo.free_channel_ctx_header(c);
-	vgpu_gr_unmap_global_ctx_buffers(c);
-	vgpu_gr_free_channel_patch_ctx(c);
-	vgpu_gr_free_channel_pm_ctx(c);
-	if (!is_tsg)
-		vgpu_gr_free_channel_gr_ctx(c);
+	if (gr_ctx->mem.gpu_va) {
+		struct tegra_vgpu_cmd_msg msg;
+		struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx;
+		int err;

-	/* zcull_ctx, pm_ctx */
+		msg.cmd = TEGRA_VGPU_CMD_GR_CTX_FREE;
+		msg.handle = vgpu_get_handle(g);
+		p->gr_ctx_handle = gr_ctx->virt_ctx;
+		err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
+		WARN_ON(err || msg.ret);

-	memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
+		__nvgpu_vm_free_va(vm, gr_ctx->mem.gpu_va,
+				   gmmu_page_size_kernel);
+
+		tsg = &g->fifo.tsg[gr_ctx->tsgid];
+		vgpu_gr_unmap_global_ctx_buffers(tsg);
+		vgpu_gr_free_channel_patch_ctx(tsg);
+		vgpu_gr_free_channel_pm_ctx(tsg);
+
+		nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer);
+		nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer);
+		nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer);
+		nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer);

-	c->first_init = false;
+		memset(gr_ctx, 0, sizeof(*gr_ctx));
+	}
 }
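With the context owned by the TSG, the per-channel free paths above lose their
RPCs: the vgpu server now tears down patch, PM, and global-buffer state itself
when the channel closes, so the kernel side only returns the GPU VAs it handed
out and clears its bookkeeping, and the rewritten vgpu_gr_free_gr_ctx()
becomes the single teardown point. A small model of that deferred-free
pattern, with hypothetical names:

	/* The kernel-side record keeps only the VA; the backing memory is the
	 * server's to release when the channel closes. */
	struct va_record {
		unsigned long long gpu_va;
	};

	static void free_va(unsigned long long va) { (void)va; /* stand-in */ }

	static void release_record(struct va_record *r)
	{
		if (r->gpu_va) {
			/* no RPC here any more: server frees on channel close */
			free_va(r->gpu_va);
			r->gpu_va = 0;  /* idempotent, and ready for reuse */
		}
	}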
 static int vgpu_gr_ch_bind_gr_ctx(struct channel_gk20a *c)
 {
-	struct gr_ctx_desc *gr_ctx = c->ch_ctx.gr_ctx;
+	struct tsg_gk20a *tsg;
+	struct nvgpu_gr_ctx *gr_ctx;
 	struct tegra_vgpu_cmd_msg msg = {0};
 	struct tegra_vgpu_channel_bind_gr_ctx_params *p =
 				&msg.params.ch_bind_gr_ctx;
 	int err;

+	tsg = tsg_gk20a_from_ch(c);
+	if (!tsg)
+		return -EINVAL;
+
+	gr_ctx = &tsg->gr_ctx;
+
 	msg.cmd = TEGRA_VGPU_CMD_CHANNEL_BIND_GR_CTX;
 	msg.handle = vgpu_get_handle(c->g);
 	p->ch_handle = c->virt_ctx;
@@ -474,7 +445,7 @@

 static int vgpu_gr_tsg_bind_gr_ctx(struct tsg_gk20a *tsg)
 {
-	struct gr_ctx_desc *gr_ctx = tsg->tsg_gr_ctx;
+	struct nvgpu_gr_ctx *gr_ctx = &tsg->gr_ctx;
 	struct tegra_vgpu_cmd_msg msg = {0};
 	struct tegra_vgpu_tsg_bind_gr_ctx_params *p =
 					&msg.params.tsg_bind_gr_ctx;
@@ -495,7 +466,7 @@
 int vgpu_gr_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
 {
 	struct gk20a *g = c->g;
 	struct fifo_gk20a *f = &g->fifo;
-	struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+	struct nvgpu_gr_ctx *gr_ctx = NULL;
 	struct tsg_gk20a *tsg = NULL;
 	int err = 0;
@@ -515,95 +486,87 @@ int vgpu_gr_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
 	}
 	c->obj_class = class_num;

-	if (gk20a_is_channel_marked_as_tsg(c))
-		tsg = &f->tsg[c->tsgid];
-
-	if (!tsg) {
-		/* allocate gr ctx buffer */
-		if (!ch_ctx->gr_ctx) {
-			err = g->ops.gr.alloc_gr_ctx(g, &c->ch_ctx.gr_ctx,
-						c->vm,
-						class_num,
-						flags);
-			if (!err)
-				err = vgpu_gr_ch_bind_gr_ctx(c);
-			if (err) {
-				nvgpu_err(g, "fail to allocate gr ctx buffer");
-				goto out;
-			}
-		} else {
-			/*TBD: needs to be more subtle about which is
-			 * being allocated as some are allowed to be
-			 * allocated along same channel */
+	if (!gk20a_is_channel_marked_as_tsg(c))
+		return -EINVAL;
+
+	tsg = &f->tsg[c->tsgid];
+	gr_ctx = &tsg->gr_ctx;
+
+	if (!nvgpu_mem_is_valid(&gr_ctx->mem)) {
+		tsg->vm = c->vm;
+		nvgpu_vm_get(tsg->vm);
+		err = g->ops.gr.alloc_gr_ctx(g, gr_ctx,
					c->vm,
					class_num,
					flags);
+		if (!err)
+			err = vgpu_gr_tsg_bind_gr_ctx(tsg);
+		if (err) {
 			nvgpu_err(g,
-				"too many classes alloc'd on same channel");
-			err = -EINVAL;
+				"fail to allocate TSG gr ctx buffer, err=%d", err);
+			nvgpu_vm_put(tsg->vm);
+			tsg->vm = NULL;
 			goto out;
 		}
-	} else {
-		if (!tsg->tsg_gr_ctx) {
-			tsg->vm = c->vm;
-			nvgpu_vm_get(tsg->vm);
-			err = g->ops.gr.alloc_gr_ctx(g, &tsg->tsg_gr_ctx,
-						c->vm,
-						class_num,
-						flags);
-			if (!err)
-				err = vgpu_gr_tsg_bind_gr_ctx(tsg);
-			if (err) {
-				nvgpu_err(g,
-					"fail to allocate TSG gr ctx buffer, err=%d", err);
-				nvgpu_vm_put(tsg->vm);
-				tsg->vm = NULL;
-				goto out;
-			}
-		}

-		ch_ctx->gr_ctx = tsg->tsg_gr_ctx;
 		err = vgpu_gr_ch_bind_gr_ctx(c);
 		if (err) {
 			nvgpu_err(g, "fail to bind gr ctx buffer");
 			goto out;
 		}
-	}

-	/* commit gr ctx buffer */
-	err = g->ops.gr.commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va);
-	if (err) {
-		nvgpu_err(g, "fail to commit gr ctx buffer");
-		goto out;
-	}
+		/* commit gr ctx buffer */
+		err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
+		if (err) {
+			nvgpu_err(g, "fail to commit gr ctx buffer");
+			goto out;
+		}

-	/* allocate patch buffer */
-	if (ch_ctx->patch_ctx.mem.priv.pages == NULL) {
+		/* allocate patch buffer */
 		err = vgpu_gr_alloc_channel_patch_ctx(g, c);
 		if (err) {
 			nvgpu_err(g, "fail to allocate patch buffer");
 			goto out;
 		}
-	}

-	/* map global buffer to channel gpu_va and commit */
-	if (!ch_ctx->global_ctx_buffer_mapped) {
+		/* map global buffer to channel gpu_va and commit */
 		err = vgpu_gr_map_global_ctx_buffers(g, c);
 		if (err) {
 			nvgpu_err(g, "fail to map global ctx buffer");
 			goto out;
 		}

-		vgpu_gr_commit_global_ctx_buffers(g, c, true);
-	}
+		err = vgpu_gr_commit_global_ctx_buffers(g, c, true);
+		if (err) {
+			nvgpu_err(g, "fail to commit global ctx buffers");
+			goto out;
+		}

-	/* load golden image */
-	if (!c->first_init) {
+		/* load golden image */
 		err = gr_gk20a_elpg_protected_call(g,
 				vgpu_gr_load_golden_ctx_image(g, c));
 		if (err) {
 			nvgpu_err(g, "fail to load golden ctx image");
 			goto out;
 		}
-		c->first_init = true;
+	} else {
+		err = vgpu_gr_ch_bind_gr_ctx(c);
+		if (err) {
+			nvgpu_err(g, "fail to bind gr ctx buffer");
+			goto out;
+		}
+
+		/* commit gr ctx buffer */
+		err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
+		if (err) {
+			nvgpu_err(g, "fail to commit gr ctx buffer");
+			goto out;
+		}
 	}

+	/* PM ctxt switch is off by default */
+	gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
+
 	gk20a_dbg_fn("done");
 	return 0;
 out:
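The rewritten vgpu_gr_alloc_obj_ctx() above now rejects bare channels and
splits into two paths: the first channel in a TSG allocates and binds the
shared context, commits its instance block, sets up the patch and global
buffers, and loads the golden image, while every later channel only binds
itself and commits the already-valid context. A compilable outline of that
control flow; helper names are stand-ins and error handling is trimmed:

	#include <errno.h>
	#include <stdbool.h>

	struct gr_ctx { bool valid; unsigned pm_mode; };
	struct tsg { struct gr_ctx gr_ctx; };
	struct channel { struct tsg *tsg; };

	/* stand-ins for the RPC/commit steps in the real function */
	static void alloc_and_bind_tsg_ctx(struct tsg *t) { t->gr_ctx.valid = true; }
	static void bind_and_commit_channel(struct channel *c) { (void)c; }
	static void first_channel_init(struct channel *c)
	{ (void)c; /* patch ctx, global buffers, golden image */ }

	static int alloc_obj_ctx(struct channel *c)
	{
		struct tsg *t = c->tsg;

		if (!t)
			return -EINVAL;          /* bare channels are rejected now */

		if (!t->gr_ctx.valid) {          /* first channel in the TSG */
			alloc_and_bind_tsg_ctx(t);
			bind_and_commit_channel(c);
			first_channel_init(c);
		} else {                         /* later channels just attach */
			bind_and_commit_channel(c);
		}

		t->gr_ctx.pm_mode = 0;           /* PM ctxsw off by default */
		return 0;
	}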
nvgpu_err(g, "fail to load golden ctx image"); goto out; } - c->first_init = true; + } else { + err = vgpu_gr_ch_bind_gr_ctx(c); + if (err) { + nvgpu_err(g, "fail to bind gr ctx buffer"); + goto out; + } + + /* commit gr ctx buffer */ + err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va); + if (err) { + nvgpu_err(g, "fail to commit gr ctx buffer"); + goto out; + } } + /* PM ctxt switch is off by default */ + gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); + gk20a_dbg_fn("done"); return 0; out: @@ -1055,15 +1018,30 @@ int vgpu_gr_update_smpc_ctxsw_mode(struct gk20a *g, int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g, struct channel_gk20a *ch, bool enable) { - struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; - struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *ch_ctx; + struct pm_ctx_desc *pm_ctx; struct tegra_vgpu_cmd_msg msg; struct tegra_vgpu_channel_set_ctxsw_mode *p = &msg.params.set_ctxsw_mode; int err; gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + ch_ctx = &tsg->gr_ctx; + pm_ctx = &ch_ctx->pm_ctx; + if (enable) { + /* + * send command to enable HWPM only once - otherwise server + * will return an error due to using the same GPU VA twice. + */ + if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) + return 0; + p->mode = TEGRA_VGPU_CTXSW_MODE_CTXSW; /* Allocate buffer if necessary */ @@ -1076,8 +1054,12 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g, return -ENOMEM; pm_ctx->mem.size = g->gr.ctx_vars.pm_ctxsw_image_size; } - } else + } else { + if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) + return 0; + p->mode = TEGRA_VGPU_CTXSW_MODE_NO_CTXSW; + } msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SET_HWPM_CTXSW_MODE; msg.handle = vgpu_get_handle(g); @@ -1086,8 +1068,13 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g, err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); WARN_ON(err || msg.ret); + err = err ? err : msg.ret; + if (!err) + pm_ctx->pm_mode = enable ? + ctxsw_prog_main_image_pm_mode_ctxsw_f() : + ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); - return err ? err : msg.ret; + return err; } int vgpu_gr_clear_sm_error_state(struct gk20a *g, -- cgit v1.2.2