From 2f6698b863c9cc1db6455637b7c72e812b470b93 Mon Sep 17 00:00:00 2001 From: Terje Bergstrom Date: Fri, 15 Dec 2017 09:04:15 -0800 Subject: gpu: nvgpu: Make graphics context property of TSG Move graphics context ownership to TSG instead of channel. Combine channel_ctx_gk20a and gr_ctx_desc to one structure, because the split between them was arbitrary. Move context header to be property of channel. Bug 1842197 Change-Id: I410e3262f80b318d8528bcbec270b63a2d8d2ff9 Signed-off-by: Terje Bergstrom Reviewed-on: https://git-master.nvidia.com/r/1639532 Reviewed-by: Seshendra Gadagottu Tested-by: Seshendra Gadagottu Reviewed-by: svc-mobile-coverity GVS: Gerrit_Virtual_Submit Reviewed-by: Konsta Holtta Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/linux/debug_fifo.c | 4 +- drivers/gpu/nvgpu/common/linux/ioctl_channel.c | 8 +- drivers/gpu/nvgpu/common/linux/sched.c | 13 +- .../nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.c | 25 +- .../nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.h | 4 +- .../nvgpu/common/linux/vgpu/gp10b/vgpu_hal_gp10b.c | 1 - drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c | 343 ++++++----- drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.h | 1 + .../nvgpu/common/linux/vgpu/gv11b/vgpu_hal_gv11b.c | 1 - .../common/linux/vgpu/gv11b/vgpu_subctx_gv11b.c | 4 +- drivers/gpu/nvgpu/common/linux/vgpu/vgpu.h | 8 +- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 6 +- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 23 +- drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c | 12 +- drivers/gpu/nvgpu/gk20a/gk20a.h | 20 +- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 641 ++++++++++----------- drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 40 +- drivers/gpu/nvgpu/gk20a/tsg_gk20a.c | 5 +- drivers/gpu/nvgpu/gk20a/tsg_gk20a.h | 6 +- drivers/gpu/nvgpu/gm20b/gr_gm20b.c | 50 +- drivers/gpu/nvgpu/gm20b/gr_gm20b.h | 10 +- drivers/gpu/nvgpu/gm20b/hal_gm20b.c | 1 - drivers/gpu/nvgpu/gp106/gr_gp106.c | 2 +- drivers/gpu/nvgpu/gp106/gr_gp106.h | 2 +- drivers/gpu/nvgpu/gp106/hal_gp106.c | 1 - drivers/gpu/nvgpu/gp10b/gr_gp10b.c | 202 ++++--- drivers/gpu/nvgpu/gp10b/gr_gp10b.h | 19 +- drivers/gpu/nvgpu/gp10b/hal_gp10b.c | 1 - drivers/gpu/nvgpu/gv100/hal_gv100.c | 1 - drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 66 ++- drivers/gpu/nvgpu/gv11b/gr_gv11b.h | 9 +- drivers/gpu/nvgpu/gv11b/hal_gv11b.c | 1 - drivers/gpu/nvgpu/gv11b/subctx_gv11b.c | 42 +- 33 files changed, 833 insertions(+), 739 deletions(-) diff --git a/drivers/gpu/nvgpu/common/linux/debug_fifo.c b/drivers/gpu/nvgpu/common/linux/debug_fifo.c index ad157ee7..aeab0c92 100644 --- a/drivers/gpu/nvgpu/common/linux/debug_fifo.c +++ b/drivers/gpu/nvgpu/common/linux/debug_fifo.c @@ -91,8 +91,8 @@ static int gk20a_fifo_sched_debugfs_seq_show( tsg->timeslice_us, ch->timeout_ms_max, tsg->interleave_level, - ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->graphics_preempt_mode : U32_MAX, - ch->ch_ctx.gr_ctx ? ch->ch_ctx.gr_ctx->compute_preempt_mode : U32_MAX); + tsg->gr_ctx.graphics_preempt_mode, + tsg->gr_ctx.compute_preempt_mode); gk20a_channel_put(ch); } return 0; diff --git a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c index 94501a89..e8f4c14b 100644 --- a/drivers/gpu/nvgpu/common/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/common/linux/ioctl_channel.c @@ -85,10 +85,10 @@ static void gk20a_channel_trace_sched_param( tsg_gk20a_from_ch(ch)->timeslice_us, ch->timeout_ms_max, gk20a_fifo_interleave_level_name(tsg->interleave_level), - gr_gk20a_graphics_preempt_mode_name(ch->ch_ctx.gr_ctx ? 
- ch->ch_ctx.gr_ctx->graphics_preempt_mode : 0), - gr_gk20a_compute_preempt_mode_name(ch->ch_ctx.gr_ctx ? - ch->ch_ctx.gr_ctx->compute_preempt_mode : 0)); + gr_gk20a_graphics_preempt_mode_name( + tsg->gr_ctx.graphics_preempt_mode), + gr_gk20a_compute_preempt_mode_name( + tsg->gr_ctx.compute_preempt_mode)); } /* diff --git a/drivers/gpu/nvgpu/common/linux/sched.c b/drivers/gpu/nvgpu/common/linux/sched.c index fc3f6ed8..e6211790 100644 --- a/drivers/gpu/nvgpu/common/linux/sched.c +++ b/drivers/gpu/nvgpu/common/linux/sched.c @@ -198,15 +198,10 @@ static int gk20a_sched_dev_ioctl_get_params(struct gk20a_sched_ctrl *sched, arg->runlist_interleave = tsg->interleave_level; arg->timeslice = tsg->timeslice_us; - if (tsg->tsg_gr_ctx) { - arg->graphics_preempt_mode = - tsg->tsg_gr_ctx->graphics_preempt_mode; - arg->compute_preempt_mode = - tsg->tsg_gr_ctx->compute_preempt_mode; - } else { - arg->graphics_preempt_mode = 0; - arg->compute_preempt_mode = 0; - } + arg->graphics_preempt_mode = + tsg->gr_ctx.graphics_preempt_mode; + arg->compute_preempt_mode = + tsg->gr_ctx.compute_preempt_mode; nvgpu_ref_put(&tsg->refcount, nvgpu_ioctl_tsg_release); diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.c b/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.c index ed61f16b..9adf20d1 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.c +++ b/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.c @@ -27,12 +27,11 @@ #include int vgpu_gr_gp10b_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags) { - struct gr_ctx_desc *gr_ctx; u32 graphics_preempt_mode = 0; u32 compute_preempt_mode = 0; struct vgpu_priv_data *priv = vgpu_get_priv_data(g); @@ -40,12 +39,10 @@ int vgpu_gr_gp10b_alloc_gr_ctx(struct gk20a *g, gk20a_dbg_fn(""); - err = vgpu_gr_alloc_gr_ctx(g, __gr_ctx, vm, class, flags); + err = vgpu_gr_alloc_gr_ctx(g, gr_ctx, vm, class, flags); if (err) return err; - gr_ctx = *__gr_ctx; - if (flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP; if (flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP) @@ -84,7 +81,7 @@ fail: } int vgpu_gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode) @@ -240,7 +237,7 @@ int vgpu_gr_gp10b_set_preemption_mode(struct channel_gk20a *ch, u32 graphics_preempt_mode, u32 compute_preempt_mode) { - struct gr_ctx_desc *gr_ctx = ch->ch_ctx.gr_ctx; + struct nvgpu_gr_ctx *gr_ctx; struct gk20a *g = ch->g; struct tsg_gk20a *tsg; struct vm_gk20a *vm; @@ -251,6 +248,13 @@ int vgpu_gr_gp10b_set_preemption_mode(struct channel_gk20a *ch, if (!class) return -EINVAL; + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + vm = tsg->vm; + gr_ctx = &tsg->gr_ctx; + /* skip setting anything if both modes are already set */ if (graphics_preempt_mode && (graphics_preempt_mode == gr_ctx->graphics_preempt_mode)) @@ -263,13 +267,6 @@ int vgpu_gr_gp10b_set_preemption_mode(struct channel_gk20a *ch, if (graphics_preempt_mode == 0 && compute_preempt_mode == 0) return 0; - if (gk20a_is_channel_marked_as_tsg(ch)) { - tsg = &g->fifo.tsg[ch->tsgid]; - vm = tsg->vm; - } else { - vm = ch->vm; - } - if (g->ops.gr.set_ctxsw_preemption_mode) { err = g->ops.gr.set_ctxsw_preemption_mode(g, gr_ctx, vm, class, graphics_preempt_mode, diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.h 
b/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.h index 31b88d19..559bd227 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.h +++ b/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_gr_gp10b.h @@ -20,12 +20,12 @@ #include "gk20a/gk20a.h" int vgpu_gr_gp10b_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags); int vgpu_gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode); diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_hal_gp10b.c index e8cb96b4..d5fd5102 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_hal_gp10b.c +++ b/drivers/gpu/nvgpu/common/linux/vgpu/gp10b/vgpu_hal_gp10b.c @@ -112,7 +112,6 @@ static const struct gpu_ops vgpu_gp10b_ops = { .load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode, .set_gpc_tpc_mask = gr_gp10b_set_gpc_tpc_mask, .get_gpc_tpc_mask = vgpu_gr_get_gpc_tpc_mask, - .free_channel_ctx = vgpu_gr_free_channel_ctx, .alloc_obj_ctx = vgpu_gr_alloc_obj_ctx, .bind_ctxsw_zcull = vgpu_gr_bind_ctxsw_zcull, .get_zcull_info = vgpu_gr_get_zcull_info, diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c b/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c index e8790587..8f1c5d78 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c +++ b/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.c @@ -20,14 +20,18 @@ #include #include +#include #include #include #include "vgpu.h" #include "gr_vgpu.h" #include "gk20a/dbg_gpu_gk20a.h" +#include "gk20a/channel_gk20a.h" +#include "gk20a/tsg_gk20a.h" #include +#include void vgpu_gr_detect_sm_arch(struct gk20a *g) { @@ -152,8 +156,9 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g, struct tegra_vgpu_cmd_msg msg; struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx; struct vm_gk20a *ch_vm = c->vm; - u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va; - u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size; + struct tsg_gk20a *tsg; + u64 *g_bfr_va; + u64 *g_bfr_size; struct gr_gk20a *gr = &g->gr; u64 gpu_va; u32 i; @@ -161,7 +166,12 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g, gk20a_dbg_fn(""); - /* FIXME: add VPR support */ + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va; + g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size; /* Circular Buffer */ gpu_va = __nvgpu_vm_alloc_va(ch_vm, @@ -213,7 +223,7 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g, if (err || msg.ret) goto clean_up; - c->ch_ctx.global_ctx_buffer_mapped = true; + tsg->gr_ctx.global_ctx_buffer_mapped = true; return 0; clean_up: @@ -227,40 +237,33 @@ static int vgpu_gr_map_global_ctx_buffers(struct gk20a *g, return -ENOMEM; } -static void vgpu_gr_unmap_global_ctx_buffers(struct channel_gk20a *c) +static void vgpu_gr_unmap_global_ctx_buffers(struct tsg_gk20a *tsg) { - struct vm_gk20a *ch_vm = c->vm; - u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va; - u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size; + struct vm_gk20a *ch_vm = tsg->vm; + u64 *g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va; + u64 *g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size; u32 i; gk20a_dbg_fn(""); - if (c->ch_ctx.global_ctx_buffer_mapped) { - struct tegra_vgpu_cmd_msg msg; - struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx; - int err; + if (tsg->gr_ctx.global_ctx_buffer_mapped) { + /* 
server will unmap on channel close */ - msg.cmd = TEGRA_VGPU_CMD_CHANNEL_UNMAP_GR_GLOBAL_CTX; - msg.handle = vgpu_get_handle(c->g); - p->handle = c->virt_ctx; - err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); - WARN_ON(err || msg.ret); - } - - for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) { - if (g_bfr_va[i]) { - __nvgpu_vm_free_va(ch_vm, g_bfr_va[i], - gmmu_page_size_kernel); - g_bfr_va[i] = 0; - g_bfr_size[i] = 0; + for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) { + if (g_bfr_va[i]) { + __nvgpu_vm_free_va(ch_vm, g_bfr_va[i], + gmmu_page_size_kernel); + g_bfr_va[i] = 0; + g_bfr_size[i] = 0; + } } + + tsg->gr_ctx.global_ctx_buffer_mapped = false; } - c->ch_ctx.global_ctx_buffer_mapped = false; } int vgpu_gr_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags) @@ -268,7 +271,6 @@ int vgpu_gr_alloc_gr_ctx(struct gk20a *g, struct tegra_vgpu_cmd_msg msg = {0}; struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; struct gr_gk20a *gr = &g->gr; - struct gr_ctx_desc *gr_ctx; int err; gk20a_dbg_fn(""); @@ -280,19 +282,14 @@ int vgpu_gr_alloc_gr_ctx(struct gk20a *g, gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size; gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size; - gr_ctx = nvgpu_kzalloc(g, sizeof(*gr_ctx)); - if (!gr_ctx) - return -ENOMEM; - - gr_ctx->mem.size = gr->ctx_vars.buffer_total_size; gr_ctx->mem.gpu_va = __nvgpu_vm_alloc_va(vm, - gr_ctx->mem.size, + gr->ctx_vars.buffer_total_size, gmmu_page_size_kernel); - if (!gr_ctx->mem.gpu_va) { - nvgpu_kfree(g, gr_ctx); + if (!gr_ctx->mem.gpu_va) return -ENOMEM; - } + gr_ctx->mem.size = gr->ctx_vars.buffer_total_size; + gr_ctx->mem.aperture = APERTURE_SYSMEM; msg.cmd = TEGRA_VGPU_CMD_GR_CTX_ALLOC; msg.handle = vgpu_get_handle(g); @@ -306,57 +303,19 @@ int vgpu_gr_alloc_gr_ctx(struct gk20a *g, nvgpu_err(g, "fail to alloc gr_ctx"); __nvgpu_vm_free_va(vm, gr_ctx->mem.gpu_va, gmmu_page_size_kernel); - nvgpu_kfree(g, gr_ctx); + gr_ctx->mem.aperture = APERTURE_INVALID; } else { gr_ctx->virt_ctx = p->gr_ctx_handle; - *__gr_ctx = gr_ctx; } return err; } -void vgpu_gr_free_gr_ctx(struct gk20a *g, struct vm_gk20a *vm, - struct gr_ctx_desc *gr_ctx) -{ - struct tegra_vgpu_cmd_msg msg; - struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; - int err; - - gk20a_dbg_fn(""); - - if (!gr_ctx || !gr_ctx->mem.gpu_va) - return; - - - msg.cmd = TEGRA_VGPU_CMD_GR_CTX_FREE; - msg.handle = vgpu_get_handle(g); - p->gr_ctx_handle = gr_ctx->virt_ctx; - err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); - WARN_ON(err || msg.ret); - - __nvgpu_vm_free_va(vm, gr_ctx->mem.gpu_va, - gmmu_page_size_kernel); - - nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer); - nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer); - nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer); - nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer); - - nvgpu_kfree(g, gr_ctx); -} - -static void vgpu_gr_free_channel_gr_ctx(struct channel_gk20a *c) -{ - gk20a_dbg_fn(""); - - c->g->ops.gr.free_gr_ctx(c->g, c->vm, c->ch_ctx.gr_ctx); - c->ch_ctx.gr_ctx = NULL; -} - static int vgpu_gr_alloc_channel_patch_ctx(struct gk20a *g, struct channel_gk20a *c) { - struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx; + struct tsg_gk20a *tsg; + struct patch_desc *patch_ctx; struct vm_gk20a *ch_vm = c->vm; struct tegra_vgpu_cmd_msg msg; struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx; @@ -364,6 +323,11 @@ static int vgpu_gr_alloc_channel_patch_ctx(struct gk20a *g, gk20a_dbg_fn(""); 
+ tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + patch_ctx = &tsg->gr_ctx.patch_ctx; patch_ctx->mem.size = 128 * sizeof(u32); patch_ctx->mem.gpu_va = __nvgpu_vm_alloc_va(ch_vm, patch_ctx->mem.size, @@ -385,37 +349,25 @@ static int vgpu_gr_alloc_channel_patch_ctx(struct gk20a *g, return err; } -static void vgpu_gr_free_channel_patch_ctx(struct channel_gk20a *c) +static void vgpu_gr_free_channel_patch_ctx(struct tsg_gk20a *tsg) { - struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx; - struct vm_gk20a *ch_vm = c->vm; + struct patch_desc *patch_ctx = &tsg->gr_ctx.patch_ctx; gk20a_dbg_fn(""); if (patch_ctx->mem.gpu_va) { - struct tegra_vgpu_cmd_msg msg; - struct tegra_vgpu_ch_ctx_params *p = &msg.params.ch_ctx; - int err; + /* server will free on channel close */ - msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_GR_PATCH_CTX; - msg.handle = vgpu_get_handle(c->g); - p->handle = c->virt_ctx; - err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); - WARN_ON(err || msg.ret); - - __nvgpu_vm_free_va(ch_vm, patch_ctx->mem.gpu_va, + __nvgpu_vm_free_va(tsg->vm, patch_ctx->mem.gpu_va, gmmu_page_size_kernel); patch_ctx->mem.gpu_va = 0; } } -static void vgpu_gr_free_channel_pm_ctx(struct channel_gk20a *c) +static void vgpu_gr_free_channel_pm_ctx(struct tsg_gk20a *tsg) { - struct tegra_vgpu_cmd_msg msg; - struct tegra_vgpu_channel_free_hwpm_ctx *p = &msg.params.free_hwpm_ctx; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct nvgpu_gr_ctx *ch_ctx = &tsg->gr_ctx; struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; - int err; gk20a_dbg_fn(""); @@ -423,44 +375,63 @@ static void vgpu_gr_free_channel_pm_ctx(struct channel_gk20a *c) if (pm_ctx->mem.gpu_va == 0) return; - msg.cmd = TEGRA_VGPU_CMD_CHANNEL_FREE_HWPM_CTX; - msg.handle = vgpu_get_handle(c->g); - p->handle = c->virt_ctx; - err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); - WARN_ON(err || msg.ret); + /* server will free on channel close */ - __nvgpu_vm_free_va(c->vm, pm_ctx->mem.gpu_va, + __nvgpu_vm_free_va(tsg->vm, pm_ctx->mem.gpu_va, gmmu_page_size_kernel); pm_ctx->mem.gpu_va = 0; } -void vgpu_gr_free_channel_ctx(struct channel_gk20a *c, bool is_tsg) +void vgpu_gr_free_gr_ctx(struct gk20a *g, + struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx) { + struct tsg_gk20a *tsg; + gk20a_dbg_fn(""); - if (c->g->ops.fifo.free_channel_ctx_header) - c->g->ops.fifo.free_channel_ctx_header(c); - vgpu_gr_unmap_global_ctx_buffers(c); - vgpu_gr_free_channel_patch_ctx(c); - vgpu_gr_free_channel_pm_ctx(c); - if (!is_tsg) - vgpu_gr_free_channel_gr_ctx(c); + if (gr_ctx->mem.gpu_va) { + struct tegra_vgpu_cmd_msg msg; + struct tegra_vgpu_gr_ctx_params *p = &msg.params.gr_ctx; + int err; - /* zcull_ctx, pm_ctx */ + msg.cmd = TEGRA_VGPU_CMD_GR_CTX_FREE; + msg.handle = vgpu_get_handle(g); + p->gr_ctx_handle = gr_ctx->virt_ctx; + err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); + WARN_ON(err || msg.ret); - memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a)); + __nvgpu_vm_free_va(vm, gr_ctx->mem.gpu_va, + gmmu_page_size_kernel); + + tsg = &g->fifo.tsg[gr_ctx->tsgid]; + vgpu_gr_unmap_global_ctx_buffers(tsg); + vgpu_gr_free_channel_patch_ctx(tsg); + vgpu_gr_free_channel_pm_ctx(tsg); + + nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer); + nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer); + nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer); + nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer); - c->first_init = false; + memset(gr_ctx, 0, sizeof(*gr_ctx)); + } } static int vgpu_gr_ch_bind_gr_ctx(struct channel_gk20a *c) { - 
struct gr_ctx_desc *gr_ctx = c->ch_ctx.gr_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; struct tegra_vgpu_cmd_msg msg = {0}; struct tegra_vgpu_channel_bind_gr_ctx_params *p = &msg.params.ch_bind_gr_ctx; int err; + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + msg.cmd = TEGRA_VGPU_CMD_CHANNEL_BIND_GR_CTX; msg.handle = vgpu_get_handle(c->g); p->ch_handle = c->virt_ctx; @@ -474,7 +445,7 @@ static int vgpu_gr_ch_bind_gr_ctx(struct channel_gk20a *c) static int vgpu_gr_tsg_bind_gr_ctx(struct tsg_gk20a *tsg) { - struct gr_ctx_desc *gr_ctx = tsg->tsg_gr_ctx; + struct nvgpu_gr_ctx *gr_ctx = &tsg->gr_ctx; struct tegra_vgpu_cmd_msg msg = {0}; struct tegra_vgpu_tsg_bind_gr_ctx_params *p = &msg.params.tsg_bind_gr_ctx; @@ -495,7 +466,7 @@ int vgpu_gr_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags) { struct gk20a *g = c->g; struct fifo_gk20a *f = &g->fifo; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct nvgpu_gr_ctx *gr_ctx = NULL; struct tsg_gk20a *tsg = NULL; int err = 0; @@ -515,95 +486,87 @@ int vgpu_gr_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags) } c->obj_class = class_num; - if (gk20a_is_channel_marked_as_tsg(c)) - tsg = &f->tsg[c->tsgid]; - - if (!tsg) { - /* allocate gr ctx buffer */ - if (!ch_ctx->gr_ctx) { - err = g->ops.gr.alloc_gr_ctx(g, &c->ch_ctx.gr_ctx, - c->vm, - class_num, - flags); - if (!err) - err = vgpu_gr_ch_bind_gr_ctx(c); - if (err) { - nvgpu_err(g, "fail to allocate gr ctx buffer"); - goto out; - } - } else { - /*TBD: needs to be more subtle about which is - * being allocated as some are allowed to be - * allocated along same channel */ + if (!gk20a_is_channel_marked_as_tsg(c)) + return -EINVAL; + + tsg = &f->tsg[c->tsgid]; + gr_ctx = &tsg->gr_ctx; + + if (!nvgpu_mem_is_valid(&gr_ctx->mem)) { + tsg->vm = c->vm; + nvgpu_vm_get(tsg->vm); + err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, + c->vm, + class_num, + flags); + if (!err) + err = vgpu_gr_tsg_bind_gr_ctx(tsg); + if (err) { nvgpu_err(g, - "too many classes alloc'd on same channel"); - err = -EINVAL; + "fail to allocate TSG gr ctx buffer, err=%d", err); + nvgpu_vm_put(tsg->vm); + tsg->vm = NULL; goto out; } - } else { - if (!tsg->tsg_gr_ctx) { - tsg->vm = c->vm; - nvgpu_vm_get(tsg->vm); - err = g->ops.gr.alloc_gr_ctx(g, &tsg->tsg_gr_ctx, - c->vm, - class_num, - flags); - if (!err) - err = vgpu_gr_tsg_bind_gr_ctx(tsg); - if (err) { - nvgpu_err(g, - "fail to allocate TSG gr ctx buffer, err=%d", err); - nvgpu_vm_put(tsg->vm); - tsg->vm = NULL; - goto out; - } - } - ch_ctx->gr_ctx = tsg->tsg_gr_ctx; err = vgpu_gr_ch_bind_gr_ctx(c); if (err) { nvgpu_err(g, "fail to bind gr ctx buffer"); goto out; } - } - /* commit gr ctx buffer */ - err = g->ops.gr.commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); - if (err) { - nvgpu_err(g, "fail to commit gr ctx buffer"); - goto out; - } + /* commit gr ctx buffer */ + err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va); + if (err) { + nvgpu_err(g, "fail to commit gr ctx buffer"); + goto out; + } - /* allocate patch buffer */ - if (ch_ctx->patch_ctx.mem.priv.pages == NULL) { + /* allocate patch buffer */ err = vgpu_gr_alloc_channel_patch_ctx(g, c); if (err) { nvgpu_err(g, "fail to allocate patch buffer"); goto out; } - } - /* map global buffer to channel gpu_va and commit */ - if (!ch_ctx->global_ctx_buffer_mapped) { + /* map global buffer to channel gpu_va and commit */ err = vgpu_gr_map_global_ctx_buffers(g, c); if (err) { nvgpu_err(g, "fail to map global ctx buffer"); goto out; } - 
vgpu_gr_commit_global_ctx_buffers(g, c, true); - } - /* load golden image */ - if (!c->first_init) { + err = vgpu_gr_commit_global_ctx_buffers(g, c, true); + if (err) { + nvgpu_err(g, "fail to commit global ctx buffers"); + goto out; + } + + /* load golden image */ err = gr_gk20a_elpg_protected_call(g, vgpu_gr_load_golden_ctx_image(g, c)); if (err) { nvgpu_err(g, "fail to load golden ctx image"); goto out; } - c->first_init = true; + } else { + err = vgpu_gr_ch_bind_gr_ctx(c); + if (err) { + nvgpu_err(g, "fail to bind gr ctx buffer"); + goto out; + } + + /* commit gr ctx buffer */ + err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va); + if (err) { + nvgpu_err(g, "fail to commit gr ctx buffer"); + goto out; + } } + /* PM ctxt switch is off by default */ + gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); + gk20a_dbg_fn("done"); return 0; out: @@ -1055,15 +1018,30 @@ int vgpu_gr_update_smpc_ctxsw_mode(struct gk20a *g, int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g, struct channel_gk20a *ch, bool enable) { - struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; - struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *ch_ctx; + struct pm_ctx_desc *pm_ctx; struct tegra_vgpu_cmd_msg msg; struct tegra_vgpu_channel_set_ctxsw_mode *p = &msg.params.set_ctxsw_mode; int err; gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + ch_ctx = &tsg->gr_ctx; + pm_ctx = &ch_ctx->pm_ctx; + if (enable) { + /* + * send command to enable HWPM only once - otherwise server + * will return an error due to using the same GPU VA twice. + */ + if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) + return 0; + p->mode = TEGRA_VGPU_CTXSW_MODE_CTXSW; /* Allocate buffer if necessary */ @@ -1076,8 +1054,12 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g, return -ENOMEM; pm_ctx->mem.size = g->gr.ctx_vars.pm_ctxsw_image_size; } - } else + } else { + if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) + return 0; + p->mode = TEGRA_VGPU_CTXSW_MODE_NO_CTXSW; + } msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SET_HWPM_CTXSW_MODE; msg.handle = vgpu_get_handle(g); @@ -1086,8 +1068,13 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g, err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); WARN_ON(err || msg.ret); + err = err ? err : msg.ret; + if (!err) + pm_ctx->pm_mode = enable ? + ctxsw_prog_main_image_pm_mode_ctxsw_f() : + ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); - return err ? 
err : msg.ret; + return err; } int vgpu_gr_clear_sm_error_state(struct gk20a *g, diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.h b/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.h index 16aa92a9..4b81da91 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.h +++ b/drivers/gpu/nvgpu/common/linux/vgpu/gr_vgpu.h @@ -29,6 +29,7 @@ struct dbg_session_gk20a; void vgpu_gr_detect_sm_arch(struct gk20a *g); void vgpu_gr_free_channel_ctx(struct channel_gk20a *c, bool is_tsg); +void vgpu_gr_free_tsg_ctx(struct tsg_gk20a *tsg); int vgpu_gr_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags); int vgpu_gr_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr, struct channel_gk20a *c, u64 zcull_va, diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_hal_gv11b.c index 968eae10..132ce6e5 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_hal_gv11b.c @@ -131,7 +131,6 @@ static const struct gpu_ops vgpu_gv11b_ops = { .load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode, .set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask, .get_gpc_tpc_mask = vgpu_gr_get_gpc_tpc_mask, - .free_channel_ctx = vgpu_gr_free_channel_ctx, .alloc_obj_ctx = vgpu_gr_alloc_obj_ctx, .bind_ctxsw_zcull = vgpu_gr_bind_ctxsw_zcull, .get_zcull_info = vgpu_gr_get_zcull_info, diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_subctx_gv11b.c b/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_subctx_gv11b.c index d59f0381..a0099f03 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_subctx_gv11b.c +++ b/drivers/gpu/nvgpu/common/linux/vgpu/gv11b/vgpu_subctx_gv11b.c @@ -21,7 +21,7 @@ int vgpu_gv11b_alloc_subctx_header(struct channel_gk20a *c) { - struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct ctx_header_desc *ctx = &c->ctx_header; struct tegra_vgpu_cmd_msg msg = {}; struct tegra_vgpu_alloc_ctx_header_params *p = &msg.params.alloc_ctx_header; @@ -52,7 +52,7 @@ int vgpu_gv11b_alloc_subctx_header(struct channel_gk20a *c) void vgpu_gv11b_free_subctx_header(struct channel_gk20a *c) { - struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct ctx_header_desc *ctx = &c->ctx_header; struct tegra_vgpu_cmd_msg msg = {}; struct tegra_vgpu_free_ctx_header_params *p = &msg.params.free_ctx_header; diff --git a/drivers/gpu/nvgpu/common/linux/vgpu/vgpu.h b/drivers/gpu/nvgpu/common/linux/vgpu/vgpu.h index 8c306ea0..20624240 100644 --- a/drivers/gpu/nvgpu/common/linux/vgpu/vgpu.h +++ b/drivers/gpu/nvgpu/common/linux/vgpu/vgpu.h @@ -79,12 +79,12 @@ int vgpu_gr_isr(struct gk20a *g, struct tegra_vgpu_gr_intr_info *info); int vgpu_gr_nonstall_isr(struct gk20a *g, struct tegra_vgpu_gr_nonstall_intr_info *info); int vgpu_gr_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags); void vgpu_gr_free_gr_ctx(struct gk20a *g, struct vm_gk20a *vm, - struct gr_ctx_desc *gr_ctx); + struct nvgpu_gr_ctx *gr_ctx); void vgpu_gr_handle_sm_esr_event(struct gk20a *g, struct tegra_vgpu_sm_esr_info *info); int vgpu_gr_init_ctx_state(struct gk20a *g); @@ -141,7 +141,7 @@ static inline int vgpu_gr_isr(struct gk20a *g, return 0; } static inline int vgpu_gr_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags) @@ -149,7 +149,7 @@ static inline int vgpu_gr_alloc_gr_ctx(struct gk20a *g, return -ENOSYS; } static inline void 
vgpu_gr_free_gr_ctx(struct gk20a *g, struct vm_gk20a *vm, - struct gr_ctx_desc *gr_ctx) + struct nvgpu_gr_ctx *gr_ctx) { } static inline int vgpu_gr_init_ctx_state(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 16d4711f..64266fe5 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -259,7 +259,7 @@ void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt) ch->g->ops.fifo.disable_channel(ch); - if (channel_preempt && ch->ch_ctx.gr_ctx) + if (channel_preempt && gk20a_is_channel_marked_as_tsg(ch)) ch->g->ops.fifo.preempt_channel(ch->g, ch->chid); gk20a_channel_abort_clean_up(ch); @@ -421,8 +421,8 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force) g->ops.fecs_trace.unbind_channel(g, ch); #endif - /* release channel ctx */ - g->ops.gr.free_channel_ctx(ch, was_tsg); + if(g->ops.fifo.free_channel_ctx_header) + g->ops.fifo.free_channel_ctx_header(ch); gk20a_gr_flush_channel_tlb(gr); diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index c13b1c58..29fa302f 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -31,7 +31,6 @@ #include struct gk20a; -struct gr_gk20a; struct dbg_session_gk20a; struct gk20a_fence; struct fifo_profile_gk20a; @@ -50,10 +49,6 @@ struct fifo_profile_gk20a; #define NVGPU_GPFIFO_FLAGS_SUPPORT_DETERMINISTIC (1 << 1) #define NVGPU_GPFIFO_FLAGS_REPLAYABLE_FAULTS_ENABLE (1 << 2) -/* Flags to be passed to g->ops.gr.alloc_obj_ctx() */ -#define NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP (1 << 1) -#define NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP (1 << 2) - struct notification { struct { u32 nanoseconds[2]; @@ -63,19 +58,6 @@ struct notification { u16 status; }; -/* contexts associated with a channel */ -struct channel_ctx_gk20a { - struct gr_ctx_desc *gr_ctx; - struct patch_desc patch_ctx; - struct zcull_ctx_desc zcull_ctx; - struct pm_ctx_desc pm_ctx; - u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA]; - u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA]; - int global_ctx_buffer_index[NR_GLOBAL_CTX_BUF_VA]; - bool global_ctx_buffer_mapped; - struct ctx_header_desc ctx_header; -}; - struct channel_gk20a_job { struct nvgpu_mapped_buf **mapped_buffers; int num_mapped_buffers; @@ -190,7 +172,6 @@ struct channel_gk20a { int chid; bool wdt_enabled; nvgpu_atomic_t bound; - bool first_init; bool vpr; bool deterministic; /* deterministic, but explicitly idle and submits disallowed */ @@ -210,8 +191,6 @@ struct channel_gk20a { struct gpfifo_desc gpfifo; - struct channel_ctx_gk20a ch_ctx; - struct nvgpu_mem inst_block; u64 userd_iova; @@ -262,6 +241,8 @@ struct channel_gk20a { struct channel_t19x t19x; #endif + struct ctx_header_desc ctx_header; + /* Any operating system specific data. 
*/ void *os_priv; }; diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c index d283a82e..409661fc 100644 --- a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c @@ -625,9 +625,10 @@ int gk20a_fecs_trace_bind_channel(struct gk20a *g, u32 lo; u32 hi; u64 pa; - struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *ch_ctx; struct gk20a_fecs_trace *trace = g->fecs_trace; - struct nvgpu_mem *mem = &ch_ctx->gr_ctx->mem; + struct nvgpu_mem *mem; u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(g, ch); pid_t pid; u32 aperture; @@ -637,6 +638,13 @@ int gk20a_fecs_trace_bind_channel(struct gk20a *g, ch->chid, context_ptr, nvgpu_inst_block_addr(g, &ch->inst_block)); + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + ch_ctx = &tsg->gr_ctx; + mem = &ch_ctx->mem; + if (!trace) return -ENOMEM; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 070b26b6..685976b1 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -187,16 +187,16 @@ struct gpu_ops { void (*cb_size_default)(struct gk20a *g); int (*calc_global_ctx_buffer_size)(struct gk20a *g); void (*commit_global_attrib_cb)(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, bool patch); void (*commit_global_bundle_cb)(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u64 size, bool patch); int (*commit_global_cb_manager)(struct gk20a *g, struct channel_gk20a *ch, bool patch); void (*commit_global_pagepool)(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u32 size, bool patch); void (*init_gpc_mmu)(struct gk20a *g); int (*handle_sw_method)(struct gk20a *g, u32 addr, @@ -230,7 +230,6 @@ struct gpu_ops { int (*load_ctxsw_ucode)(struct gk20a *g); u32 (*get_gpc_tpc_mask)(struct gk20a *g, u32 gpc_index); void (*set_gpc_tpc_mask)(struct gk20a *g, u32 gpc_index); - void (*free_channel_ctx)(struct channel_gk20a *c, bool is_tsg); int (*alloc_obj_ctx)(struct channel_gk20a *c, u32 class_num, u32 flags); int (*bind_ctxsw_zcull)(struct gk20a *g, struct gr_gk20a *gr, @@ -285,13 +284,12 @@ struct gpu_ops { u32 (*pagepool_default_size)(struct gk20a *g); int (*init_ctx_state)(struct gk20a *g); int (*alloc_gr_ctx)(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 padding); void (*free_gr_ctx)(struct gk20a *g, - struct vm_gk20a *vm, - struct gr_ctx_desc *gr_ctx); + struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx); void (*update_ctxsw_preemption_mode)(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *c, struct nvgpu_mem *mem); int (*update_smpc_ctxsw_mode)(struct gk20a *g, struct channel_gk20a *c, @@ -384,14 +382,14 @@ struct gpu_ops { int (*get_preemption_mode_flags)(struct gk20a *g, struct nvgpu_preemption_modes_rec *preemption_modes_rec); int (*set_ctxsw_preemption_mode)(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode); int (*set_boosted_ctx)(struct channel_gk20a *ch, bool boost); void (*update_boosted_ctx)(struct gk20a *g, struct nvgpu_mem *mem, - struct gr_ctx_desc *gr_ctx); + struct nvgpu_gr_ctx *gr_ctx); int (*init_sm_id_table)(struct gk20a *g); int (*load_smid_config)(struct gk20a *g); void 
(*program_sm_id_numbering)(struct gk20a *g, @@ -440,7 +438,7 @@ struct gpu_ops { u32 (*get_gpcs_swdx_dss_zbc_c_format_reg)(struct gk20a *g); u32 (*get_gpcs_swdx_dss_zbc_z_format_reg)(struct gk20a *g); void (*dump_ctxsw_stats)(struct gk20a *g, struct vm_gk20a *vm, - struct gr_ctx_desc *gr_ctx); + struct nvgpu_gr_ctx *gr_ctx); } gr; struct { void (*init_hw)(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 263ae030..f8af091b 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -85,18 +85,19 @@ static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g); static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g); static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g, struct channel_gk20a *c); -static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c); - -/* channel gr ctx buffer */ -static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g, - struct channel_gk20a *c, - u32 class, u32 padding); -static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c); +static void gr_gk20a_unmap_global_ctx_buffers(struct gk20a *g, + struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx); +static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g, + struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx); /* channel patch ctx buffer */ static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g, struct channel_gk20a *c); -static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c); +static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g, + struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx); /* golden ctx image */ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, @@ -108,8 +109,16 @@ int gr_gk20a_get_ctx_id(struct gk20a *g, struct channel_gk20a *c, u32 *ctx_id) { - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - struct nvgpu_mem *mem = &ch_ctx->gr_ctx->mem; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx = NULL; + struct nvgpu_mem *mem = NULL; + + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + mem = &gr_ctx->mem; /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ @@ -671,62 +680,62 @@ int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va) */ int gr_gk20a_ctx_patch_write_begin(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, bool update_patch_count) { int err = 0; - err = nvgpu_mem_begin(g, &ch_ctx->patch_ctx.mem); + err = nvgpu_mem_begin(g, &gr_ctx->patch_ctx.mem); if (err) return err; if (update_patch_count) { /* reset patch count if ucode has already processed it */ - ch_ctx->patch_ctx.data_count = nvgpu_mem_rd(g, - &ch_ctx->gr_ctx->mem, + gr_ctx->patch_ctx.data_count = nvgpu_mem_rd(g, + &gr_ctx->mem, ctxsw_prog_main_image_patch_count_o()); nvgpu_log(g, gpu_dbg_info, "patch count reset to %d", - ch_ctx->patch_ctx.data_count); + gr_ctx->patch_ctx.data_count); } return 0; } void gr_gk20a_ctx_patch_write_end(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, bool update_patch_count) { - nvgpu_mem_end(g, &ch_ctx->patch_ctx.mem); + nvgpu_mem_end(g, &gr_ctx->patch_ctx.mem); /* Write context count to context image if it is mapped */ if (update_patch_count) { - nvgpu_mem_wr(g, &ch_ctx->gr_ctx->mem, + nvgpu_mem_wr(g, &gr_ctx->mem, ctxsw_prog_main_image_patch_count_o(), - ch_ctx->patch_ctx.data_count); + gr_ctx->patch_ctx.data_count); nvgpu_log(g, gpu_dbg_info, "write patch count %d", - ch_ctx->patch_ctx.data_count); + gr_ctx->patch_ctx.data_count); } } void gr_gk20a_ctx_patch_write(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, u32 addr, u32 data, bool patch) { if (patch) { - u32 patch_slot = ch_ctx->patch_ctx.data_count * + u32 patch_slot = gr_ctx->patch_ctx.data_count * PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY; if (patch_slot > (PATCH_CTX_ENTRIES_FROM_SIZE( - ch_ctx->patch_ctx.mem.size) - + gr_ctx->patch_ctx.mem.size) - PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY)) { nvgpu_err(g, "failed to access patch_slot %d", patch_slot); return; } - nvgpu_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot, addr); - nvgpu_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot + 1, data); - ch_ctx->patch_ctx.data_count++; + nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot, addr); + nvgpu_mem_wr32(g, &gr_ctx->patch_ctx.mem, patch_slot + 1, data); + gr_ctx->patch_ctx.data_count++; nvgpu_log(g, gpu_dbg_info, "patch addr = 0x%x data = 0x%x data_count %d", - addr, data, ch_ctx->patch_ctx.data_count); + addr, data, gr_ctx->patch_ctx.data_count); } else { gk20a_writel(g, addr, data); } @@ -793,14 +802,22 @@ void gr_gk20a_write_pm_ptr(struct gk20a *g, static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) { - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - struct nvgpu_mem *mem = &ch_ctx->gr_ctx->mem; - struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx = NULL; + struct nvgpu_mem *mem = NULL; + struct ctx_header_desc *ctx = &c->ctx_header; struct nvgpu_mem *ctxheader = &ctx->mem; int ret = 0; gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + mem = &gr_ctx->mem; + if (nvgpu_mem_begin(g, mem)) return -ENOMEM; @@ -809,8 +826,8 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) goto clean_up_mem; } - if (ch_ctx->zcull_ctx.gpu_va == 0 && - ch_ctx->zcull_ctx.ctx_sw_mode == + if (gr_ctx->zcull_ctx.gpu_va == 0 && + gr_ctx->zcull_ctx.ctx_sw_mode == ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) { ret = -EINVAL; goto clean_up; @@ -830,13 +847,13 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a 
*c) nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_zcull_o(), - ch_ctx->zcull_ctx.ctx_sw_mode); + gr_ctx->zcull_ctx.ctx_sw_mode); if (ctxheader->gpu_va) g->ops.gr.write_zcull_ptr(g, ctxheader, - ch_ctx->zcull_ctx.gpu_va); + gr_ctx->zcull_ctx.gpu_va); else - g->ops.gr.write_zcull_ptr(g, mem, ch_ctx->zcull_ctx.gpu_va); + g->ops.gr.write_zcull_ptr(g, mem, gr_ctx->zcull_ctx.gpu_va); gk20a_enable_channel_tsg(g, c); @@ -869,22 +886,29 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx = NULL; u64 addr; u32 size; gk20a_dbg_fn(""); + + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; if (patch) { int err; - err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false); + err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false); if (err) return err; } /* global pagepool buffer */ - addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >> + addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >> gr_scc_pagepool_base_addr_39_8_align_bits_v()) | - (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) << + (u64_hi32(gr_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) << (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v())); size = gr->global_ctx_buffer[PAGEPOOL].mem.size / @@ -896,12 +920,12 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d", addr, size); - g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch); + g->ops.gr.commit_global_pagepool(g, gr_ctx, addr, size, patch); /* global bundle cb */ - addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >> + addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >> gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) | - (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) << + (u64_hi32(gr_ctx->global_ctx_buffer_va[CIRCULAR_VA]) << (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v())); size = gr->bundle_cb_default_size; @@ -909,20 +933,20 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d", addr, size); - g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch); + g->ops.gr.commit_global_bundle_cb(g, gr_ctx, addr, size, patch); /* global attrib cb */ - addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >> + addr = (u64_lo32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >> gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) | - (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) << + (u64_hi32(gr_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) << (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v())); gk20a_dbg_info("attrib cb addr : 0x%016llx", addr); - g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch); + g->ops.gr.commit_global_attrib_cb(g, gr_ctx, addr, patch); g->ops.gr.commit_global_cb_manager(g, c, patch); if (patch) - gr_gk20a_ctx_patch_write_end(g, ch_ctx, false); + gr_gk20a_ctx_patch_write_end(g, gr_ctx, false); return 0; } @@ -930,7 +954,7 @@ static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c) { struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = NULL; + struct nvgpu_gr_ctx *gr_ctx = NULL; u32 gpm_pd_cfg; u32 pd_ab_dist_cfg0; u32 ds_debug; @@ -956,22 +980,22 @@ int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c) 
ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug; - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false); } else { gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg; pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0; ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug; - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, false); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false); } return 0; @@ -1360,13 +1384,14 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, struct channel_gk20a *c) { struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx = NULL; u32 ctx_header_bytes = ctxsw_prog_fecs_header_v(); u32 ctx_header_words; u32 i; u32 data; struct nvgpu_mem *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem; - struct nvgpu_mem *gr_mem = &ch_ctx->gr_ctx->mem; + struct nvgpu_mem *gr_mem; u32 err = 0; struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load; struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init; @@ -1374,6 +1399,13 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + gr_mem = &gr_ctx->mem; + /* golden ctx is global to all channels. 
Although only the first channel initializes golden image, driver needs to prevent multiple channels from initializing golden ctx at the same time */ @@ -1565,7 +1597,7 @@ restore_fe_go_idle: g->ops.gr.write_zcull_ptr(g, gold_mem, 0); - err = g->ops.gr.commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]); + err = g->ops.gr.commit_inst(c, gr_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]); if (err) goto clean_up; @@ -1614,20 +1646,25 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, struct channel_gk20a *c, bool enable_smpc_ctxsw) { - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - struct nvgpu_mem *mem; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx = NULL; + struct nvgpu_mem *mem = NULL; u32 data; int ret; gk20a_dbg_fn(""); - if (!ch_ctx->gr_ctx) { + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + mem = &gr_ctx->mem; + if (!nvgpu_mem_is_valid(mem)) { nvgpu_err(g, "no graphics context allocated"); return -EFAULT; } - mem = &ch_ctx->gr_ctx->mem; - ret = gk20a_disable_channel_tsg(g, c); if (ret) { nvgpu_err(g, "failed to disable channel/TSG"); @@ -1670,24 +1707,30 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, struct channel_gk20a *c, bool enable_hwpm_ctxsw) { - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; - struct nvgpu_mem *gr_mem; + struct tsg_gk20a *tsg; + struct nvgpu_mem *gr_mem = NULL; + struct nvgpu_gr_ctx *gr_ctx; + struct pm_ctx_desc *pm_ctx; u32 data; u64 virt_addr; - struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct ctx_header_desc *ctx = &c->ctx_header; struct nvgpu_mem *ctxheader = &ctx->mem; int ret; gk20a_dbg_fn(""); - if (!ch_ctx->gr_ctx) { + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + pm_ctx = &gr_ctx->pm_ctx; + gr_mem = &gr_ctx->mem; + if (!nvgpu_mem_is_valid(gr_mem)) { nvgpu_err(g, "no graphics context allocated"); return -EFAULT; } - gr_mem = &ch_ctx->gr_ctx->mem; - if (enable_hwpm_ctxsw) { if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) return 0; @@ -1816,20 +1859,25 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, struct channel_gk20a *c) { struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; u32 virt_addr_lo; u32 virt_addr_hi; u64 virt_addr = 0; u32 v, data; int ret = 0; - struct nvgpu_mem *mem = &ch_ctx->gr_ctx->mem; - struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; - struct nvgpu_mem *ctxheader = &ctx->mem; + struct nvgpu_mem *mem; gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + mem = &gr_ctx->mem; if (gr->ctx_vars.local_golden_image == NULL) - return -1; + return -EINVAL; /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ @@ -1838,11 +1886,6 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, if (nvgpu_mem_begin(g, mem)) return -ENOMEM; - if (nvgpu_mem_begin(g, ctxheader)) { - ret = -ENOMEM; - goto clean_up_mem; - } - nvgpu_mem_wr_n(g, mem, 0, gr->ctx_vars.local_golden_image, gr->ctx_vars.golden_image_size); @@ -1855,9 +1898,9 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, /* set priv access map */ virt_addr_lo = - u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); + u64_lo32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); virt_addr_hi = - u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); + u64_hi32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); if (g->allow_all) data = ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f(); @@ -1867,21 +1910,13 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(), data); - if (ctxheader->gpu_va) { - nvgpu_mem_wr(g, ctxheader, - ctxsw_prog_main_image_priv_access_map_addr_lo_o(), - virt_addr_lo); - nvgpu_mem_wr(g, ctxheader, - ctxsw_prog_main_image_priv_access_map_addr_hi_o(), - virt_addr_hi); - } else { - nvgpu_mem_wr(g, mem, - ctxsw_prog_main_image_priv_access_map_addr_lo_o(), - virt_addr_lo); - nvgpu_mem_wr(g, mem, - ctxsw_prog_main_image_priv_access_map_addr_hi_o(), - virt_addr_hi); - } + nvgpu_mem_wr(g, mem, + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), + virt_addr_lo); + nvgpu_mem_wr(g, mem, + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), + virt_addr_hi); + /* disable verif features */ v = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o()); v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m()); @@ -1889,65 +1924,50 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v); if (g->ops.gr.update_ctxsw_preemption_mode) - g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, mem); + g->ops.gr.update_ctxsw_preemption_mode(g, c, mem); if (g->ops.gr.update_boosted_ctx) - g->ops.gr.update_boosted_ctx(g, mem, ch_ctx->gr_ctx); + g->ops.gr.update_boosted_ctx(g, mem, gr_ctx); - virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); - virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); + virt_addr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va); + virt_addr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va); nvgpu_log(g, gpu_dbg_info, "write patch count = %d", - ch_ctx->patch_ctx.data_count); + gr_ctx->patch_ctx.data_count); nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), - ch_ctx->patch_ctx.data_count); - - if (ctxheader->gpu_va) { - nvgpu_mem_wr(g, ctxheader, - ctxsw_prog_main_image_patch_adr_lo_o(), - virt_addr_lo); - nvgpu_mem_wr(g, ctxheader, - ctxsw_prog_main_image_patch_adr_hi_o(), - virt_addr_hi); - } else { - nvgpu_mem_wr(g, mem, - ctxsw_prog_main_image_patch_adr_lo_o(), - virt_addr_lo); - nvgpu_mem_wr(g, mem, - ctxsw_prog_main_image_patch_adr_hi_o(), - virt_addr_hi); - } + gr_ctx->patch_ctx.data_count); + + nvgpu_mem_wr(g, mem, + ctxsw_prog_main_image_patch_adr_lo_o(), + virt_addr_lo); + nvgpu_mem_wr(g, mem, + ctxsw_prog_main_image_patch_adr_hi_o(), + virt_addr_hi); /* Update main header region of the context buffer with the info needed * for PM context switching, including mode and possibly a pointer to * the PM backing store. 
*/ - if (ch_ctx->pm_ctx.pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) { - if (ch_ctx->pm_ctx.mem.gpu_va == 0) { + if (gr_ctx->pm_ctx.pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) { + if (gr_ctx->pm_ctx.mem.gpu_va == 0) { nvgpu_err(g, "context switched pm with no pm buffer!"); nvgpu_mem_end(g, mem); return -EFAULT; } - virt_addr = ch_ctx->pm_ctx.mem.gpu_va; + virt_addr = gr_ctx->pm_ctx.mem.gpu_va; } else virt_addr = 0; data = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_mode_m(); - data |= ch_ctx->pm_ctx.pm_mode; + data |= gr_ctx->pm_ctx.pm_mode; nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_pm_o(), data); - if (ctxheader->gpu_va) - g->ops.gr.write_pm_ptr(g, ctxheader, virt_addr); - else - g->ops.gr.write_pm_ptr(g, mem, virt_addr); - + g->ops.gr.write_pm_ptr(g, mem, virt_addr); - nvgpu_mem_end(g, ctxheader); -clean_up_mem: nvgpu_mem_end(g, mem); return ret; @@ -2568,13 +2588,13 @@ static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g) return -ENOMEM; } -static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c) +static void gr_gk20a_unmap_global_ctx_buffers(struct gk20a *g, + struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx) { - struct vm_gk20a *ch_vm = c->vm; - struct gr_gk20a *gr = &c->g->gr; - u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va; - u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size; - int *g_bfr_index = c->ch_ctx.global_ctx_buffer_index; + u64 *g_bfr_va = gr_ctx->global_ctx_buffer_va; + u64 *g_bfr_size = gr_ctx->global_ctx_buffer_size; + int *g_bfr_index = gr_ctx->global_ctx_buffer_index; u32 i; gk20a_dbg_fn(""); @@ -2588,32 +2608,41 @@ static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c) * the correct struct nvgpu_mem to use. Handles the VPR * vs non-VPR difference in context images. 
*/ - mem = &gr->global_ctx_buffer[g_bfr_index[i]].mem; + mem = &g->gr.global_ctx_buffer[g_bfr_index[i]].mem; - nvgpu_gmmu_unmap(ch_vm, mem, g_bfr_va[i]); + nvgpu_gmmu_unmap(vm, mem, g_bfr_va[i]); } } - memset(g_bfr_va, 0, sizeof(c->ch_ctx.global_ctx_buffer_va)); - memset(g_bfr_size, 0, sizeof(c->ch_ctx.global_ctx_buffer_size)); - memset(g_bfr_index, 0, sizeof(c->ch_ctx.global_ctx_buffer_index)); + memset(g_bfr_va, 0, sizeof(gr_ctx->global_ctx_buffer_va)); + memset(g_bfr_size, 0, sizeof(gr_ctx->global_ctx_buffer_size)); + memset(g_bfr_index, 0, sizeof(gr_ctx->global_ctx_buffer_index)); - c->ch_ctx.global_ctx_buffer_mapped = false; + gr_ctx->global_ctx_buffer_mapped = false; } static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g, struct channel_gk20a *c) { + struct tsg_gk20a *tsg; struct vm_gk20a *ch_vm = c->vm; - u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va; - u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size; - int *g_bfr_index = c->ch_ctx.global_ctx_buffer_index; + u64 *g_bfr_va; + u64 *g_bfr_size; + int *g_bfr_index; struct gr_gk20a *gr = &g->gr; struct nvgpu_mem *mem; u64 gpu_va; gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + g_bfr_va = tsg->gr_ctx.global_ctx_buffer_va; + g_bfr_size = tsg->gr_ctx.global_ctx_buffer_size; + g_bfr_index = tsg->gr_ctx.global_ctx_buffer_index; + /* Circular Buffer */ if (c->vpr && nvgpu_mem_is_valid(&gr->global_ctx_buffer[CIRCULAR_VPR].mem)) { @@ -2688,21 +2717,20 @@ static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g, g_bfr_size[PRIV_ACCESS_MAP_VA] = mem->size; g_bfr_index[PRIV_ACCESS_MAP_VA] = PRIV_ACCESS_MAP; - c->ch_ctx.global_ctx_buffer_mapped = true; + tsg->gr_ctx.global_ctx_buffer_mapped = true; return 0; clean_up: - gr_gk20a_unmap_global_ctx_buffers(c); + gr_gk20a_unmap_global_ctx_buffers(g, ch_vm, &tsg->gr_ctx); return -ENOMEM; } int gr_gk20a_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 padding) { - struct gr_ctx_desc *gr_ctx = NULL; struct gr_gk20a *gr = &g->gr; int err = 0; @@ -2715,15 +2743,11 @@ int gr_gk20a_alloc_gr_ctx(struct gk20a *g, gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size; gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size; - gr_ctx = nvgpu_kzalloc(g, sizeof(*gr_ctx)); - if (!gr_ctx) - return -ENOMEM; - err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_NO_KERNEL_MAPPING, gr->ctx_vars.buffer_total_size, &gr_ctx->mem); if (err) - goto err_free_ctx; + return err; gr_ctx->mem.gpu_va = nvgpu_gmmu_map(vm, &gr_ctx->mem, @@ -2734,15 +2758,10 @@ int gr_gk20a_alloc_gr_ctx(struct gk20a *g, if (!gr_ctx->mem.gpu_va) goto err_free_mem; - *__gr_ctx = gr_ctx; - return 0; err_free_mem: nvgpu_dma_free(g, &gr_ctx->mem); - err_free_ctx: - nvgpu_kfree(g, gr_ctx); - gr_ctx = NULL; return err; } @@ -2750,7 +2769,7 @@ int gr_gk20a_alloc_gr_ctx(struct gk20a *g, static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g, struct tsg_gk20a *tsg, u32 class, u32 padding) { - struct gr_ctx_desc **gr_ctx = &tsg->tsg_gr_ctx; + struct nvgpu_gr_ctx *gr_ctx = &tsg->gr_ctx; int err; if (!tsg->vm) { @@ -2762,57 +2781,44 @@ static int gr_gk20a_alloc_tsg_gr_ctx(struct gk20a *g, if (err) return err; - return 0; -} - -static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g, - struct channel_gk20a *c, - u32 class, - u32 padding) -{ - struct gr_ctx_desc **gr_ctx = &c->ch_ctx.gr_ctx; - int err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, c->vm, class, padding); - if (err) - return err; + gr_ctx->tsgid = tsg->tsgid; return 
0; } void gr_gk20a_free_gr_ctx(struct gk20a *g, - struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx) + struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx) { gk20a_dbg_fn(""); - if (!gr_ctx || !gr_ctx->mem.gpu_va) - return; + if (gr_ctx->mem.gpu_va) { + gr_gk20a_unmap_global_ctx_buffers(g, vm, gr_ctx); + gr_gk20a_free_channel_patch_ctx(g, vm, gr_ctx); + gr_gk20a_free_channel_pm_ctx(g, vm, gr_ctx); - if (g->ops.gr.dump_ctxsw_stats && - g->gr.ctx_vars.dump_ctxsw_stats_on_channel_close) - g->ops.gr.dump_ctxsw_stats(g, vm, gr_ctx); + if (g->ops.gr.dump_ctxsw_stats && + g->gr.ctx_vars.dump_ctxsw_stats_on_channel_close) + g->ops.gr.dump_ctxsw_stats(g, vm, gr_ctx); - nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer); - nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer); - nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer); - nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer); - nvgpu_gmmu_unmap(vm, &gr_ctx->mem, gr_ctx->mem.gpu_va); - nvgpu_dma_free(g, &gr_ctx->mem); - nvgpu_kfree(g, gr_ctx); + nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer); + nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer); + nvgpu_dma_unmap_free(vm, &gr_ctx->spill_ctxsw_buffer); + nvgpu_dma_unmap_free(vm, &gr_ctx->preempt_ctxsw_buffer); + nvgpu_dma_unmap_free(vm, &gr_ctx->mem); + + memset(gr_ctx, 0, sizeof(*gr_ctx)); + } } void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg) { + struct gk20a *g = tsg->g; + if (!tsg->vm) { - nvgpu_err(tsg->g, "No address space bound"); + nvgpu_err(g, "No address space bound"); return; } - tsg->g->ops.gr.free_gr_ctx(tsg->g, tsg->vm, tsg->tsg_gr_ctx); - tsg->tsg_gr_ctx = NULL; -} - -static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c) -{ - c->g->ops.gr.free_gr_ctx(c->g, c->vm, c->ch_ctx.gr_ctx); - c->ch_ctx.gr_ctx = NULL; + tsg->g->ops.gr.free_gr_ctx(g, tsg->vm, &tsg->gr_ctx); } u32 gr_gk20a_get_patch_slots(struct gk20a *g) @@ -2823,13 +2829,19 @@ u32 gr_gk20a_get_patch_slots(struct gk20a *g) static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g, struct channel_gk20a *c) { - struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx; + struct tsg_gk20a *tsg; + struct patch_desc *patch_ctx; struct vm_gk20a *ch_vm = c->vm; u32 alloc_size; int err = 0; gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + patch_ctx = &tsg->gr_ctx.patch_ctx; alloc_size = g->ops.gr.get_patch_slots(g) * PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY; @@ -2845,57 +2857,42 @@ static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g, return 0; } -static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c) +static void gr_gk20a_free_channel_patch_ctx(struct gk20a *g, + struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx) { - struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx; - struct gk20a *g = c->g; + struct patch_desc *patch_ctx = &gr_ctx->patch_ctx; gk20a_dbg_fn(""); if (patch_ctx->mem.gpu_va) - nvgpu_gmmu_unmap(c->vm, &patch_ctx->mem, + nvgpu_gmmu_unmap(vm, &patch_ctx->mem, patch_ctx->mem.gpu_va); nvgpu_dma_free(g, &patch_ctx->mem); patch_ctx->data_count = 0; } -static void gr_gk20a_free_channel_pm_ctx(struct channel_gk20a *c) +static void gr_gk20a_free_channel_pm_ctx(struct gk20a *g, + struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx) { - struct pm_ctx_desc *pm_ctx = &c->ch_ctx.pm_ctx; - struct gk20a *g = c->g; + struct pm_ctx_desc *pm_ctx = &gr_ctx->pm_ctx; gk20a_dbg_fn(""); if (pm_ctx->mem.gpu_va) { - nvgpu_gmmu_unmap(c->vm, &pm_ctx->mem, pm_ctx->mem.gpu_va); + nvgpu_gmmu_unmap(vm, &pm_ctx->mem, pm_ctx->mem.gpu_va); nvgpu_dma_free(g, 
&pm_ctx->mem); } } -void gk20a_free_channel_ctx(struct channel_gk20a *c, bool is_tsg) -{ - if(c->g->ops.fifo.free_channel_ctx_header) - c->g->ops.fifo.free_channel_ctx_header(c); - gr_gk20a_unmap_global_ctx_buffers(c); - gr_gk20a_free_channel_patch_ctx(c); - gr_gk20a_free_channel_pm_ctx(c); - if (!is_tsg) - gr_gk20a_free_channel_gr_ctx(c); - - /* zcull_ctx */ - - memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a)); - - c->first_init = false; -} - int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags) { struct gk20a *g = c->g; struct fifo_gk20a *f = &g->fifo; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct nvgpu_gr_ctx *gr_ctx; struct tsg_gk20a *tsg = NULL; int err = 0; @@ -2917,92 +2914,64 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags) } c->obj_class = class_num; - if (gk20a_is_channel_marked_as_tsg(c)) - tsg = &f->tsg[c->tsgid]; + if (!gk20a_is_channel_marked_as_tsg(c)) + return -EINVAL; - /* allocate gr ctx buffer */ - if (!tsg) { - if (!ch_ctx->gr_ctx) { - err = gr_gk20a_alloc_channel_gr_ctx(g, c, - class_num, - flags); - if (err) { - nvgpu_err(g, - "fail to allocate gr ctx buffer"); - goto out; - } - } else { - /*TBD: needs to be more subtle about which is - * being allocated as some are allowed to be - * allocated along same channel */ + tsg = &f->tsg[c->tsgid]; + gr_ctx = &tsg->gr_ctx; + + if (!nvgpu_mem_is_valid(&gr_ctx->mem)) { + tsg->vm = c->vm; + nvgpu_vm_get(tsg->vm); + err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg, + class_num, + flags); + if (err) { nvgpu_err(g, - "too many classes alloc'd on same channel"); - err = -EINVAL; + "fail to allocate TSG gr ctx buffer"); + nvgpu_vm_put(tsg->vm); + tsg->vm = NULL; goto out; } - } else { - if (!tsg->tsg_gr_ctx) { - tsg->vm = c->vm; - nvgpu_vm_get(tsg->vm); - err = gr_gk20a_alloc_tsg_gr_ctx(g, tsg, - class_num, - flags); + + /* allocate patch buffer */ + if (!nvgpu_mem_is_valid(&gr_ctx->patch_ctx.mem)) { + gr_ctx->patch_ctx.data_count = 0; + err = gr_gk20a_alloc_channel_patch_ctx(g, c); if (err) { nvgpu_err(g, - "fail to allocate TSG gr ctx buffer"); - nvgpu_vm_put(tsg->vm); - tsg->vm = NULL; + "fail to allocate patch buffer"); goto out; } } - ch_ctx->gr_ctx = tsg->tsg_gr_ctx; - } - - /* PM ctxt switch is off by default */ - ch_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); - /* commit gr ctx buffer */ - err = g->ops.gr.commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); - if (err) { - nvgpu_err(g, - "fail to commit gr ctx buffer"); - goto out; - } - - /* allocate patch buffer */ - if (!nvgpu_mem_is_valid(&ch_ctx->patch_ctx.mem)) { - ch_ctx->patch_ctx.data_count = 0; - err = gr_gk20a_alloc_channel_patch_ctx(g, c); + /* map global buffer to channel gpu_va and commit */ + err = gr_gk20a_map_global_ctx_buffers(g, c); if (err) { nvgpu_err(g, - "fail to allocate patch buffer"); + "fail to map global ctx buffer"); goto out; } - } + gr_gk20a_commit_global_ctx_buffers(g, c, true); - /* map global buffer to channel gpu_va and commit */ - if (!ch_ctx->global_ctx_buffer_mapped) { - err = gr_gk20a_map_global_ctx_buffers(g, c); + /* commit gr ctx buffer */ + err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va); if (err) { nvgpu_err(g, - "fail to map global ctx buffer"); + "fail to commit gr ctx buffer"); goto out; } - gr_gk20a_commit_global_ctx_buffers(g, c, true); - } - /* init golden image, ELPG enabled after this is done */ - err = gr_gk20a_init_golden_ctx_image(g, c); - if (err) { - nvgpu_err(g, - "fail to init golden ctx image"); - goto out; - } + /* init golden image, ELPG 
enabled after this is done */ + err = gr_gk20a_init_golden_ctx_image(g, c); + if (err) { + nvgpu_err(g, + "fail to init golden ctx image"); + goto out; + } - /* load golden image */ - if (!c->first_init) { - err = gr_gk20a_elpg_protected_call(g, - gr_gk20a_load_golden_ctx_image(g, c)); + /* load golden image */ + gr_gk20a_load_golden_ctx_image(g, c); if (err) { nvgpu_err(g, "fail to load golden ctx image"); @@ -3016,11 +2985,21 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags) "fail to bind channel for ctxsw trace"); } #endif - c->first_init = true; - } - if (g->ops.gr.set_czf_bypass) - g->ops.gr.set_czf_bypass(g, c); + if (g->ops.gr.set_czf_bypass) + g->ops.gr.set_czf_bypass(g, c); + + /* PM ctxt switch is off by default */ + gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f(); + } else { + /* commit gr ctx buffer */ + err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va); + if (err) { + nvgpu_err(g, + "fail to commit gr ctx buffer"); + goto out; + } + } gk20a_dbg_fn("done"); return 0; @@ -3553,8 +3532,14 @@ u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr) int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr, struct channel_gk20a *c, u64 zcull_va, u32 mode) { - struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx; + struct tsg_gk20a *tsg; + struct zcull_ctx_desc *zcull_ctx; + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + zcull_ctx = &tsg->gr_ctx.zcull_ctx; zcull_ctx->ctx_sw_mode = mode; zcull_ctx->gpu_va = zcull_va; @@ -6516,7 +6501,7 @@ void gk20a_gr_init_ovr_sm_dsm_perf(void) * write will actually occur. so later we should put a lazy, * map-and-hold system in the patch write state */ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *ch, u32 addr, u32 data, struct nvgpu_mem *mem) { @@ -6531,9 +6516,16 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, u32 *ovr_perf_regs = NULL; u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); - struct ctx_header_desc *ctx = &ch_ctx->ctx_header; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; + struct ctx_header_desc *ctx = &ch->ctx_header; struct nvgpu_mem *ctxheader = &ctx->mem; + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; g->ops.gr.init_ovr_sm_dsm_perf(); g->ops.gr.init_sm_dsm_reg_info(); g->ops.gr.get_ovr_perf_regs(g, &num_ovr_perf_regs, &ovr_perf_regs); @@ -6556,17 +6548,17 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, ctxsw_prog_main_image_patch_count_o()); if (!tmp) - ch_ctx->patch_ctx.data_count = 0; + gr_ctx->patch_ctx.data_count = 0; - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, addr, data, true); - vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); - vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); + vaddr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va); + vaddr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va); nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), - ch_ctx->patch_ctx.data_count); + gr_ctx->patch_ctx.data_count); if (ctxheader->gpu_va) { /* * Main context can be gr_ctx or pm_ctx. @@ -6575,7 +6567,7 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, * __gr_gk20a_exec_ctx_ops. Need to take * care of cpu access to ctxheader here. 
*/ - if (nvgpu_mem_begin(g, ctxheader)) + if (nvgpu_mem_begin(g, ctxheader)) return -ENOMEM; nvgpu_mem_wr(g, ctxheader, ctxsw_prog_main_image_patch_adr_lo_o(), @@ -7690,7 +7682,8 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, bool ch_is_curr_ctx) { struct gk20a *g = ch->g; - struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; bool gr_ctx_ready = false; bool pm_ctx_ready = false; struct nvgpu_mem *current_mem = NULL; @@ -7707,6 +7700,12 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d", num_ctx_wr_ops, num_ctx_rd_ops); + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + if (ch_is_curr_ctx) { for (pass = 0; pass < 2; pass++) { ctx_op_nr = 0; @@ -7778,7 +7777,7 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, } offset_addrs = offsets + max_offsets; - err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false); + err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false); if (err) goto cleanup; @@ -7812,13 +7811,13 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, * gr_gk20a_apply_instmem_overrides, * recoded in-place instead. */ - if (nvgpu_mem_begin(g, &ch_ctx->gr_ctx->mem)) { + if (nvgpu_mem_begin(g, &gr_ctx->mem)) { err = -ENOMEM; goto cleanup; } gr_ctx_ready = true; } - current_mem = &ch_ctx->gr_ctx->mem; + current_mem = &gr_ctx->mem; } else { err = gr_gk20a_get_pm_ctx_buffer_offsets(g, ctx_ops[i].offset, @@ -7835,19 +7834,19 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, } if (!pm_ctx_ready) { /* Make sure ctx buffer was initialized */ - if (!nvgpu_mem_is_valid(&ch_ctx->pm_ctx.mem)) { + if (!nvgpu_mem_is_valid(&gr_ctx->pm_ctx.mem)) { nvgpu_err(g, "Invalid ctx buffer"); err = -EINVAL; goto cleanup; } - if (nvgpu_mem_begin(g, &ch_ctx->pm_ctx.mem)) { + if (nvgpu_mem_begin(g, &gr_ctx->pm_ctx.mem)) { err = -ENOMEM; goto cleanup; } pm_ctx_ready = true; } - current_mem = &ch_ctx->pm_ctx.mem; + current_mem = &gr_ctx->pm_ctx.mem; } /* if this is a quad access, setup for special access*/ @@ -7860,7 +7859,7 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* sanity check gr ctxt offsets, * don't write outside, worst case */ - if ((current_mem == &ch_ctx->gr_ctx->mem) && + if ((current_mem == &gr_ctx->mem) && (offsets[j] >= g->gr.ctx_vars.golden_image_size)) continue; if (pass == 0) { /* write pass */ @@ -7886,7 +7885,7 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* check to see if we need to add a special WAR for some of the SMPC perf regs */ - gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], + gr_gk20a_ctx_patch_smpc(g, ch, offset_addrs[j], v, current_mem); } else { /* read pass */ @@ -7915,12 +7914,12 @@ int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, if (offsets) nvgpu_kfree(g, offsets); - if (ch_ctx->patch_ctx.mem.cpu_va) - gr_gk20a_ctx_patch_write_end(g, ch_ctx, gr_ctx_ready); + if (gr_ctx->patch_ctx.mem.cpu_va) + gr_gk20a_ctx_patch_write_end(g, gr_ctx, gr_ctx_ready); if (gr_ctx_ready) - nvgpu_mem_end(g, &ch_ctx->gr_ctx->mem); + nvgpu_mem_end(g, &gr_ctx->mem); if (pm_ctx_ready) - nvgpu_mem_end(g, &ch_ctx->pm_ctx.mem); + nvgpu_mem_end(g, &gr_ctx->pm_ctx.mem); return err; } @@ -7962,23 +7961,23 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, } void gr_gk20a_commit_global_pagepool(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, u64 addr, u32 size, bool patch) { - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(), + 
gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_base_r(), gr_scc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_r(), gr_scc_pagepool_total_pages_f(size) | gr_scc_pagepool_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_base_r(), gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_r(), gr_gpcs_gcc_pagepool_total_pages_f(size), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_pagepool_r(), gr_pd_pagepool_total_pages_f(size) | gr_pd_pagepool_valid_true_f(), patch); } diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 1c22923b..6cc15c94 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h @@ -28,7 +28,6 @@ #include "gr_t19x.h" #endif -#include "tsg_gk20a.h" #include "gr_ctx_gk20a.h" #include "mm_gk20a.h" @@ -48,6 +47,10 @@ #define GK20A_TIMEOUT_FPGA 100000 /* 100 sec */ +/* Flags to be passed to g->ops.gr.alloc_obj_ctx() */ +#define NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP (1 << 1) +#define NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP (1 << 2) + /* * allocate a minimum of 1 page (4KB) worth of patch space, this is 512 entries * of address and data pairs @@ -64,6 +67,7 @@ #define NVGPU_PREEMPTION_MODE_COMPUTE_CTA (1 << 1) #define NVGPU_PREEMPTION_MODE_COMPUTE_CILP (1 << 2) +struct tsg_gk20a; struct channel_gk20a; struct nvgpu_warpstate; @@ -433,7 +437,12 @@ struct gr_gk20a { void gk20a_fecs_dump_falcon_stats(struct gk20a *g); -struct gr_ctx_desc { +struct ctx_header_desc { + struct nvgpu_mem mem; +}; + +/* contexts associated with a TSG */ +struct nvgpu_gr_ctx { struct nvgpu_mem mem; u32 graphics_preempt_mode; @@ -452,10 +461,16 @@ struct gr_ctx_desc { u64 virt_ctx; #endif bool golden_img_loaded; -}; -struct ctx_header_desc { - struct nvgpu_mem mem; + struct patch_desc patch_ctx; + struct zcull_ctx_desc zcull_ctx; + struct pm_ctx_desc pm_ctx; + u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA]; + u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA]; + int global_ctx_buffer_index[NR_GLOBAL_CTX_BUF_VA]; + bool global_ctx_buffer_mapped; + + u32 tsgid; }; struct gk20a_ctxsw_ucode_segment { @@ -552,7 +567,6 @@ int gk20a_init_gr_channel(struct channel_gk20a *ch_gk20a); int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr); int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags); -void gk20a_free_channel_ctx(struct channel_gk20a *c, bool is_tsg); int gk20a_gr_isr(struct gk20a *g); int gk20a_gr_nonstall_isr(struct gk20a *g); @@ -633,17 +647,17 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, struct channel_gk20a *c, bool enable_hwpm_ctxsw); -struct channel_ctx_gk20a; -void gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, +struct nvgpu_gr_ctx; +void gr_gk20a_ctx_patch_write(struct gk20a *g, struct nvgpu_gr_ctx *ch_ctx, u32 addr, u32 data, bool patch); int gr_gk20a_ctx_patch_write_begin(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, bool update_patch_count); void gr_gk20a_ctx_patch_write_end(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, bool update_patch_count); void gr_gk20a_commit_global_pagepool(struct gk20a *g, - struct 
channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u32 size, bool patch); void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data); void gr_gk20a_enable_hww_exceptions(struct gk20a *g); @@ -694,10 +708,10 @@ int gr_gk20a_submit_fecs_method_op(struct gk20a *g, int gr_gk20a_submit_fecs_sideband_method_op(struct gk20a *g, struct fecs_method_op_gk20a op); int gr_gk20a_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **__gr_ctx, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 padding); void gr_gk20a_free_gr_ctx(struct gk20a *g, - struct vm_gk20a *vm, struct gr_ctx_desc *gr_ctx); + struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx); int gr_gk20a_halt_pipe(struct gk20a *g); #if defined(CONFIG_GK20A_CYCLE_STATS) diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c index d9ddc011..19d0ecce 100644 --- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c @@ -280,7 +280,6 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g) tsg->num_active_channels = 0; nvgpu_ref_init(&tsg->refcount); - tsg->tsg_gr_ctx = NULL; tsg->vm = NULL; tsg->interleave_level = NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW; tsg->timeslice_us = 0; @@ -319,10 +318,8 @@ void gk20a_tsg_release(struct nvgpu_ref *ref) if (g->ops.fifo.tsg_release) g->ops.fifo.tsg_release(tsg); - if (tsg->tsg_gr_ctx) { + if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) gr_gk20a_free_tsg_gr_ctx(tsg); - tsg->tsg_gr_ctx = NULL; - } if (g->ops.fifo.deinit_eng_method_buffers) g->ops.fifo.deinit_eng_method_buffers(g, tsg); diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h index 08fe0365..2168cb4f 100644 --- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h @@ -26,6 +26,8 @@ #include #include +#include "gr_gk20a.h" + #ifdef CONFIG_TEGRA_19x_GPU #include "tsg_t19x.h" #endif @@ -56,8 +58,6 @@ struct tsg_gk20a { unsigned int timeslice_timeout; unsigned int timeslice_scale; - struct gr_ctx_desc *tsg_gr_ctx; - struct vm_gk20a *vm; u32 interleave_level; @@ -71,6 +71,8 @@ struct tsg_gk20a { #ifdef CONFIG_TEGRA_19x_GPU struct tsg_t19x t19x; #endif + + struct nvgpu_gr_ctx gr_ctx; }; int gk20a_enable_tsg(struct tsg_gk20a *tsg); diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c index 36fad8b3..a2434320 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c @@ -124,7 +124,7 @@ int gr_gm20b_calc_global_ctx_buffer_size(struct gk20a *g) } void gr_gm20b_commit_global_attrib_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, bool patch) { gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(), @@ -141,7 +141,7 @@ void gr_gm20b_commit_global_attrib_cb(struct gk20a *g, } void gr_gm20b_commit_global_bundle_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u64 size, bool patch) { u32 data; @@ -180,7 +180,8 @@ int gr_gm20b_commit_global_cb_manager(struct gk20a *g, struct channel_gk20a *c, bool patch) { struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *ch_ctx; u32 attrib_offset_in_chunk = 0; u32 alpha_offset_in_chunk = 0; u32 pd_ab_max_output; @@ -193,6 +194,12 @@ int gr_gm20b_commit_global_cb_manager(struct gk20a *g, gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + ch_ctx = &tsg->gr_ctx; + gr_gk20a_ctx_patch_write(g, ch_ctx, 
gr_ds_tga_constraintlogic_r(), gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) | gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size), @@ -257,7 +264,7 @@ int gr_gm20b_commit_global_cb_manager(struct gk20a *g, } void gr_gm20b_commit_global_pagepool(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u32 size, bool patch) { gr_gk20a_commit_global_pagepool(g, ch_ctx, addr, size, patch); @@ -845,7 +852,7 @@ u32 gr_gm20b_pagepool_default_size(struct gk20a *g) } int gr_gm20b_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **gr_ctx, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags) { @@ -858,7 +865,7 @@ int gr_gm20b_alloc_gr_ctx(struct gk20a *g, return err; if (class == MAXWELL_COMPUTE_B) - (*gr_ctx)->compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CTA; + gr_ctx->compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CTA; gk20a_dbg_fn("done"); @@ -866,15 +873,21 @@ int gr_gm20b_alloc_gr_ctx(struct gk20a *g, } void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *c, struct nvgpu_mem *mem) { - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; u32 cta_preempt_option = ctxsw_prog_main_image_preemption_options_control_cta_enabled_f(); gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return; + + gr_ctx = &tsg->gr_ctx; if (gr_ctx->compute_preempt_mode == NVGPU_PREEMPTION_MODE_COMPUTE_CTA) { gk20a_dbg_info("CTA: %x", cta_preempt_option); nvgpu_mem_wr(g, mem, @@ -1026,16 +1039,22 @@ int gr_gm20b_dump_gr_status_regs(struct gk20a *g, int gr_gm20b_update_pc_sampling(struct channel_gk20a *c, bool enable) { - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; struct nvgpu_mem *mem; u32 v; gk20a_dbg_fn(""); - if (!ch_ctx || !ch_ctx->gr_ctx || c->vpr) + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + mem = &gr_ctx->mem; + if (!nvgpu_mem_is_valid(mem) || c->vpr) return -EINVAL; - mem = &ch_ctx->gr_ctx->mem; if (nvgpu_mem_begin(c->g, mem)) return -ENOMEM; @@ -1289,12 +1308,19 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g, { u32 gpc, tpc, offset; struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *ch_ctx; u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); int err = 0; + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + ch_ctx = &tsg->gr_ctx; + nvgpu_mutex_acquire(&g->dbg_sessions_lock); gr->sm_error_states[sm_id].hww_global_esr = diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h index 18e6b032..bddf6412 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.h +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.h @@ -46,7 +46,7 @@ enum { #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0 void gr_gm20b_commit_global_attrib_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, bool patch); int gr_gm20b_init_fs_state(struct gk20a *g); int gm20b_gr_tpc_disable_override(struct gk20a *g, u32 mask); @@ -57,12 +57,12 @@ void gr_gm20b_bundle_cb_defaults(struct gk20a *g); void gr_gm20b_cb_size_default(struct gk20a *g); int gr_gm20b_calc_global_ctx_buffer_size(struct gk20a *g); void gr_gm20b_commit_global_bundle_cb(struct gk20a *g, - struct 
channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u64 size, bool patch); int gr_gm20b_commit_global_cb_manager(struct gk20a *g, struct channel_gk20a *c, bool patch); void gr_gm20b_commit_global_pagepool(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u32 size, bool patch); int gr_gm20b_handle_sw_method(struct gk20a *g, u32 addr, u32 class_num, u32 offset, u32 data); @@ -96,11 +96,11 @@ int gr_gm20b_load_ctxsw_ucode(struct gk20a *g); void gr_gm20b_detect_sm_arch(struct gk20a *g); u32 gr_gm20b_pagepool_default_size(struct gk20a *g); int gr_gm20b_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **gr_ctx, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags); void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *c, struct nvgpu_mem *mem); int gr_gm20b_dump_gr_status_regs(struct gk20a *g, struct gk20a_debug_output *o); diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c index c29f7267..3ee22ed1 100644 --- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c @@ -226,7 +226,6 @@ static const struct gpu_ops gm20b_ops = { .load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode, .set_gpc_tpc_mask = gr_gm20b_set_gpc_tpc_mask, .get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask, - .free_channel_ctx = gk20a_free_channel_ctx, .alloc_obj_ctx = gk20a_alloc_obj_ctx, .bind_ctxsw_zcull = gr_gk20a_bind_ctxsw_zcull, .get_zcull_info = gr_gk20a_get_zcull_info, diff --git a/drivers/gpu/nvgpu/gp106/gr_gp106.c b/drivers/gpu/nvgpu/gp106/gr_gp106.c index bedc0b78..02cecf53 100644 --- a/drivers/gpu/nvgpu/gp106/gr_gp106.c +++ b/drivers/gpu/nvgpu/gp106/gr_gp106.c @@ -135,7 +135,7 @@ void gr_gp106_cb_size_default(struct gk20a *g) } int gr_gp106_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode) diff --git a/drivers/gpu/nvgpu/gp106/gr_gp106.h b/drivers/gpu/nvgpu/gp106/gr_gp106.h index 9f76e4ac..491ced4e 100644 --- a/drivers/gpu/nvgpu/gp106/gr_gp106.h +++ b/drivers/gpu/nvgpu/gp106/gr_gp106.h @@ -38,7 +38,7 @@ int gr_gp106_handle_sw_method(struct gk20a *g, u32 addr, u32 class_num, u32 offset, u32 data); void gr_gp106_cb_size_default(struct gk20a *g); int gr_gp106_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode); diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c index 1498d1c0..3073668e 100644 --- a/drivers/gpu/nvgpu/gp106/hal_gp106.c +++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c @@ -272,7 +272,6 @@ static const struct gpu_ops gp106_ops = { .falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments, .set_gpc_tpc_mask = gr_gp10b_set_gpc_tpc_mask, .get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask, - .free_channel_ctx = gk20a_free_channel_ctx, .alloc_obj_ctx = gk20a_alloc_obj_ctx, .bind_ctxsw_zcull = gr_gk20a_bind_ctxsw_zcull, .get_zcull_info = gr_gk20a_get_zcull_info, diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c index 56acc732..549a4da4 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c @@ -389,9 +389,9 @@ int gr_gp10b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, int gr_gp10b_commit_global_cb_manager(struct gk20a *g, struct 
channel_gk20a *c, bool patch) { + struct tsg_gk20a *tsg; struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; + struct nvgpu_gr_ctx *gr_ctx; u32 attrib_offset_in_chunk = 0; u32 alpha_offset_in_chunk = 0; u32 pd_ab_max_output; @@ -405,6 +405,12 @@ int gr_gp10b_commit_global_cb_manager(struct gk20a *g, gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + if (gr_ctx->graphics_preempt_mode == NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP) { attrib_size_in_chunk = gr->attrib_cb_gfxp_size; cb_attrib_cache_size_init = gr->attrib_cb_gfxp_default_size; @@ -413,9 +419,9 @@ int gr_gp10b_commit_global_cb_manager(struct gk20a *g, cb_attrib_cache_size_init = gr->attrib_cb_default_size; } - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_beta_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_tga_constraintlogic_beta_r(), gr->attrib_cb_default_size, patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_alpha_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_ds_tga_constraintlogic_alpha_r(), gr->alpha_cb_default_size, patch); pd_ab_max_output = (gr->alpha_cb_default_size * @@ -423,11 +429,11 @@ int gr_gp10b_commit_global_cb_manager(struct gk20a *g, gr_pd_ab_dist_cfg1_max_output_granularity_v(); if (g->gr.pd_max_batches) { - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg1_r(), gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) | gr_pd_ab_dist_cfg1_max_batches_f(g->gr.pd_max_batches), patch); } else { - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg1_r(), gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) | gr_pd_ab_dist_cfg1_max_batches_init_f(), patch); } @@ -447,17 +453,17 @@ int gr_gp10b_commit_global_cb_manager(struct gk20a *g, cbm_cfg_size_steadystate = gr->attrib_cb_default_size * gr->pes_tpc_count[ppc_index][gpc_index]; - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_ppc0_cbm_beta_cb_size_r() + temp + ppc_in_gpc_stride * ppc_index, cbm_cfg_size_beta, patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_ppc0_cbm_beta_cb_offset_r() + temp + ppc_in_gpc_stride * ppc_index, attrib_offset_in_chunk, patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_ppc0_cbm_beta_steady_state_cb_size_r() + temp + ppc_in_gpc_stride * ppc_index, cbm_cfg_size_steadystate, @@ -466,12 +472,12 @@ int gr_gp10b_commit_global_cb_manager(struct gk20a *g, attrib_offset_in_chunk += attrib_size_in_chunk * gr->pes_tpc_count[ppc_index][gpc_index]; - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_ppc0_cbm_alpha_cb_size_r() + temp + ppc_in_gpc_stride * ppc_index, cbm_cfg_size_alpha, patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_ppc0_cbm_alpha_cb_offset_r() + temp + ppc_in_gpc_stride * ppc_index, alpha_offset_in_chunk, patch); @@ -479,7 +485,7 @@ int gr_gp10b_commit_global_cb_manager(struct gk20a *g, alpha_offset_in_chunk += gr->alpha_cb_size * gr->pes_tpc_count[ppc_index][gpc_index]; - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_swdx_tc_beta_cb_size_r(ppc_index + temp2), gr_gpcs_swdx_tc_beta_cb_size_v_f(cbm_cfg_size_steadystate), patch); @@ -490,20 +496,20 @@ int gr_gp10b_commit_global_cb_manager(struct gk20a *g, } void 
gr_gp10b_commit_global_pagepool(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, u64 addr, u32 size, bool patch) { - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_base_r(), gr_scc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_r(), gr_scc_pagepool_total_pages_f(size) | gr_scc_pagepool_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_base_r(), gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_r(), gr_gpcs_gcc_pagepool_total_pages_f(size), patch); } @@ -947,7 +953,7 @@ fail_free: } int gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode) @@ -1071,7 +1077,7 @@ fail: } int gr_gp10b_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **gr_ctx, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags) { @@ -1085,7 +1091,7 @@ int gr_gp10b_alloc_gr_ctx(struct gk20a *g, if (err) return err; - (*gr_ctx)->ctx_id_valid = false; + gr_ctx->ctx_id_valid = false; if (flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP; @@ -1094,7 +1100,7 @@ int gr_gp10b_alloc_gr_ctx(struct gk20a *g, if (graphics_preempt_mode || compute_preempt_mode) { if (g->ops.gr.set_ctxsw_preemption_mode) { - err = g->ops.gr.set_ctxsw_preemption_mode(g, *gr_ctx, vm, + err = g->ops.gr.set_ctxsw_preemption_mode(g, gr_ctx, vm, class, graphics_preempt_mode, compute_preempt_mode); if (err) { nvgpu_err(g, "set_ctxsw_preemption_mode failed"); @@ -1109,14 +1115,13 @@ int gr_gp10b_alloc_gr_ctx(struct gk20a *g, return 0; fail_free_gk20a_ctx: - gr_gk20a_free_gr_ctx(g, vm, *gr_ctx); - *gr_ctx = NULL; + gr_gk20a_free_gr_ctx(g, vm, gr_ctx); return err; } void gr_gp10b_dump_ctxsw_stats(struct gk20a *g, struct vm_gk20a *vm, - struct gr_ctx_desc *gr_ctx) + struct nvgpu_gr_ctx *gr_ctx) { struct nvgpu_mem *mem = &gr_ctx->mem; @@ -1168,13 +1173,13 @@ void gr_gp10b_dump_ctxsw_stats(struct gk20a *g, struct vm_gk20a *vm, } void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *c, struct nvgpu_mem *mem) { - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; - struct ctx_header_desc *ctx = &ch_ctx->ctx_header; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; + struct ctx_header_desc *ctx = &c->ctx_header; struct nvgpu_mem *ctxheader = &ctx->mem; - u32 gfxp_preempt_option = ctxsw_prog_main_image_graphics_preemption_options_control_gfxp_f(); u32 cilp_preempt_option = @@ -1185,6 +1190,12 @@ void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g, gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return; + + gr_ctx = &tsg->gr_ctx; + if (gr_ctx->graphics_preempt_mode == NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP) { gk20a_dbg_info("GfxP: %x", gfxp_preempt_option); nvgpu_mem_wr(g, mem, @@ -1220,7 +1231,7 @@ void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g, gr_ctx->preempt_ctxsw_buffer.gpu_va); } - err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, true); + err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, true); if (err) { 
nvgpu_err(g, "can't map patch context"); goto out; @@ -1232,7 +1243,7 @@ void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g, (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v())); gk20a_dbg_info("attrib cb addr : 0x%016x", addr); - g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, true); + g->ops.gr.commit_global_attrib_cb(g, gr_ctx, addr, true); addr = (u64_lo32(gr_ctx->pagepool_ctxsw_buffer.gpu_va) >> gr_scc_pagepool_base_addr_39_8_align_bits_v()) | @@ -1243,7 +1254,7 @@ void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g, if (size == g->ops.gr.pagepool_default_size(g)) size = gr_scc_pagepool_total_pages_hwmax_v(); - g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, true); + g->ops.gr.commit_global_pagepool(g, gr_ctx, addr, size, true); addr = (u64_lo32(gr_ctx->spill_ctxsw_buffer.gpu_va) >> gr_gpc0_swdx_rm_spill_buffer_addr_39_8_align_bits_v()) | @@ -1252,28 +1263,28 @@ void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g, size = gr_ctx->spill_ctxsw_buffer.size / gr_gpc0_swdx_rm_spill_buffer_size_256b_byte_granularity_v(); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_swdx_rm_spill_buffer_addr_r(), gr_gpc0_swdx_rm_spill_buffer_addr_39_8_f(addr), true); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_swdx_rm_spill_buffer_size_r(), gr_gpc0_swdx_rm_spill_buffer_size_256b_f(size), true); cbes_reserve = gr_gpcs_swdx_beta_cb_ctrl_cbes_reserve_gfxp_v(); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_swdx_beta_cb_ctrl_r(), gr_gpcs_swdx_beta_cb_ctrl_cbes_reserve_f( cbes_reserve), true); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_ppcs_cbm_beta_cb_ctrl_r(), gr_gpcs_ppcs_cbm_beta_cb_ctrl_cbes_reserve_f( cbes_reserve), true); - gr_gk20a_ctx_patch_write_end(g, ch_ctx, true); + gr_gk20a_ctx_patch_write_end(g, gr_ctx, true); } out: @@ -1478,10 +1489,9 @@ int gr_gp10b_wait_empty(struct gk20a *g, unsigned long duration_ms, } void gr_gp10b_commit_global_attrib_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, u64 addr, bool patch) { - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; int attrBufferSize; if (gr_ctx->preempt_ctxsw_buffer.gpu_va) @@ -1491,37 +1501,37 @@ void gr_gp10b_commit_global_attrib_cb(struct gk20a *g, attrBufferSize /= gr_gpcs_tpcs_tex_rm_cb_1_size_div_128b_granularity_f(); - gr_gm20b_commit_global_attrib_cb(g, ch_ctx, addr, patch); + gr_gm20b_commit_global_attrib_cb(g, gr_ctx, addr, patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_r(), gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_v_f(addr) | gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_tex_rm_cb_0_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_tex_rm_cb_0_r(), gr_gpcs_tpcs_tex_rm_cb_0_base_addr_43_12_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_tex_rm_cb_1_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_tex_rm_cb_1_r(), gr_gpcs_tpcs_tex_rm_cb_1_size_div_128b_f(attrBufferSize) | gr_gpcs_tpcs_tex_rm_cb_1_valid_true_f(), patch); } void gr_gp10b_commit_global_bundle_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, u64 addr, u64 size, bool patch) { u32 data; - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, 
gr_scc_bundle_cb_base_r(), gr_scc_bundle_cb_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_bundle_cb_size_r(), gr_scc_bundle_cb_size_div_256b_f(size) | gr_scc_bundle_cb_size_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_swdx_bundle_cb_base_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_swdx_bundle_cb_base_r(), gr_gpcs_swdx_bundle_cb_base_addr_39_8_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_swdx_bundle_cb_size_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_swdx_bundle_cb_size_r(), gr_gpcs_swdx_bundle_cb_size_div_256b_f(size) | gr_gpcs_swdx_bundle_cb_size_valid_true_f(), patch); @@ -1535,7 +1545,7 @@ void gr_gp10b_commit_global_bundle_cb(struct gk20a *g, gk20a_dbg_info("bundle cb token limit : %d, state limit : %d", g->gr.bundle_cb_token_limit, data); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg2_r(), gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) | gr_pd_ab_dist_cfg2_state_limit_f(data), patch); } @@ -1706,14 +1716,17 @@ int gr_gp10b_set_cilp_preempt_pending(struct gk20a *g, struct channel_gk20a *fault_ch) { int ret; - struct gr_ctx_desc *gr_ctx = fault_ch->ch_ctx.gr_ctx; struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg | gpu_dbg_intr, ""); - if (!gr_ctx) + tsg = tsg_gk20a_from_ch(fault_ch); + if (!tsg) return -EINVAL; + gr_ctx = &tsg->gr_ctx; + if (gr_ctx->cilp_preempt_pending) { gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg | gpu_dbg_intr, "CILP is already pending for chid %d", @@ -1783,13 +1796,17 @@ int gr_gp10b_set_cilp_preempt_pending(struct gk20a *g, static int gr_gp10b_clear_cilp_preempt_pending(struct gk20a *g, struct channel_gk20a *fault_ch) { - struct gr_ctx_desc *gr_ctx = fault_ch->ch_ctx.gr_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg | gpu_dbg_intr, ""); - if (!gr_ctx) + tsg = tsg_gk20a_from_ch(fault_ch); + if (!tsg) return -EINVAL; + gr_ctx = &tsg->gr_ctx; + /* The ucode is self-clearing, so all we need to do here is to clear cilp_preempt_pending. 
*/ if (!gr_ctx->cilp_preempt_pending) { @@ -1820,13 +1837,19 @@ int gr_gp10b_pre_process_sm_exception(struct gk20a *g, u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; + struct tsg_gk20a *tsg; *early_exit = false; *ignore_debugger = false; - if (fault_ch) - cilp_enabled = (fault_ch->ch_ctx.gr_ctx->compute_preempt_mode == + if (fault_ch) { + tsg = tsg_gk20a_from_ch(fault_ch); + if (!tsg) + return -EINVAL; + + cilp_enabled = (tsg->gr_ctx.compute_preempt_mode == NVGPU_PREEMPTION_MODE_COMPUTE_CILP); + } gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "SM Exception received on gpc %d tpc %d = %u\n", gpc, tpc, global_esr); @@ -1911,8 +1934,9 @@ int gr_gp10b_pre_process_sm_exception(struct gk20a *g, static int gr_gp10b_get_cilp_preempt_pending_chid(struct gk20a *g, int *__chid) { - struct gr_ctx_desc *gr_ctx; + struct nvgpu_gr_ctx *gr_ctx; struct channel_gk20a *ch; + struct tsg_gk20a *tsg; int chid; int ret = -EINVAL; @@ -1922,7 +1946,11 @@ static int gr_gp10b_get_cilp_preempt_pending_chid(struct gk20a *g, int *__chid) if (!ch) return ret; - gr_ctx = ch->ch_ctx.gr_ctx; + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; if (gr_ctx->cilp_preempt_pending) { *__chid = chid; @@ -2022,11 +2050,17 @@ static bool gr_gp10b_suspend_context(struct channel_gk20a *ch, bool *cilp_preempt_pending) { struct gk20a *g = ch->g; - struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; bool ctx_resident = false; int err = 0; + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; + *cilp_preempt_pending = false; if (gk20a_is_channel_ctx_resident(ch)) { @@ -2097,15 +2131,22 @@ int gr_gp10b_suspend_contexts(struct gk20a *g, nvgpu_mutex_release(&g->dbg_sessions_lock); if (cilp_preempt_pending_ch) { - struct channel_ctx_gk20a *ch_ctx = - &cilp_preempt_pending_ch->ch_ctx; - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; struct nvgpu_timeout timeout; gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg | gpu_dbg_intr, "CILP preempt pending, waiting %lu msecs for preemption", gk20a_get_gr_idle_timeout(g)); + tsg = tsg_gk20a_from_ch(cilp_preempt_pending_ch); + if (!tsg) { + err = -EINVAL; + goto clean_up; + } + + gr_ctx = &tsg->gr_ctx; + nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g), NVGPU_TIMER_CPU_TIMER); do { @@ -2130,12 +2171,19 @@ clean_up: int gr_gp10b_set_boosted_ctx(struct channel_gk20a *ch, bool boost) { - struct gr_ctx_desc *gr_ctx = ch->ch_ctx.gr_ctx; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; struct gk20a *g = ch->g; - struct nvgpu_mem *mem = &gr_ctx->mem; + struct nvgpu_mem *mem; int err = 0; + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; gr_ctx->boosted_ctx = boost; + mem = &gr_ctx->mem; if (nvgpu_mem_begin(g, mem)) return -ENOMEM; @@ -2162,7 +2210,7 @@ unmap_ctx: } void gr_gp10b_update_boosted_ctx(struct gk20a *g, struct nvgpu_mem *mem, - struct gr_ctx_desc *gr_ctx) { + struct nvgpu_gr_ctx *gr_ctx) { u32 v; v = ctxsw_prog_main_image_pmu_options_boost_clock_frequencies_f( @@ -2174,13 +2222,12 @@ int gr_gp10b_set_preemption_mode(struct channel_gk20a *ch, u32 graphics_preempt_mode, u32 compute_preempt_mode) { - struct gr_ctx_desc *gr_ctx = ch->ch_ctx.gr_ctx; - struct channel_ctx_gk20a *ch_ctx = 
&ch->ch_ctx; + struct nvgpu_gr_ctx *gr_ctx; struct gk20a *g = ch->g; struct tsg_gk20a *tsg; struct vm_gk20a *vm; - struct nvgpu_mem *mem = &gr_ctx->mem; - struct ctx_header_desc *ctx = &ch->ch_ctx.ctx_header; + struct nvgpu_mem *mem; + struct ctx_header_desc *ctx = &ch->ctx_header; struct nvgpu_mem *ctxheader = &ctx->mem; u32 class; int err = 0; @@ -2189,12 +2236,13 @@ int gr_gp10b_set_preemption_mode(struct channel_gk20a *ch, if (!class) return -EINVAL; - if (gk20a_is_channel_marked_as_tsg(ch)) { - tsg = &g->fifo.tsg[ch->tsgid]; - vm = tsg->vm; - } else { - vm = ch->vm; - } + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + vm = tsg->vm; + gr_ctx = &tsg->gr_ctx; + mem = &gr_ctx->mem; /* skip setting anything if both modes are already set */ if (graphics_preempt_mode && @@ -2241,15 +2289,15 @@ int gr_gp10b_set_preemption_mode(struct channel_gk20a *ch, if (g->ops.gr.update_ctxsw_preemption_mode) { g->ops.gr.update_ctxsw_preemption_mode(ch->g, - ch_ctx, mem); + ch, mem); - err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, true); + err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, true); if (err) { nvgpu_err(g, "can't map patch context"); goto enable_ch; } g->ops.gr.commit_global_cb_manager(g, ch, true); - gr_gk20a_ctx_patch_write_end(g, ch_ctx, true); + gr_gk20a_ctx_patch_write_end(g, gr_ctx, true); } enable_ch: diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.h b/drivers/gpu/nvgpu/gp10b/gr_gp10b.h index e3ef6304..8d553d37 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.h +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.h @@ -29,9 +29,8 @@ struct gk20a; struct gr_gk20a_isr_data; -struct channel_ctx_gk20a; +struct nvgpu_gr_ctx; struct zbc_entry; -struct gr_ctx_desc; struct nvgpu_preemption_modes_rec; struct gk20a_debug_output; @@ -75,7 +74,7 @@ int gr_gp10b_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, int gr_gp10b_commit_global_cb_manager(struct gk20a *g, struct channel_gk20a *c, bool patch); void gr_gp10b_commit_global_pagepool(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u32 size, bool patch); u32 gr_gp10b_get_gpcs_swdx_dss_zbc_c_format_reg(struct gk20a *g); u32 gr_gp10b_get_gpcs_swdx_dss_zbc_z_format_reg(struct gk20a *g); @@ -93,28 +92,28 @@ void gr_gp10b_set_alpha_circular_buffer_size(struct gk20a *g, u32 data); void gr_gp10b_set_circular_buffer_size(struct gk20a *g, u32 data); int gr_gp10b_init_ctx_state(struct gk20a *g); int gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode); int gr_gp10b_alloc_gr_ctx(struct gk20a *g, - struct gr_ctx_desc **gr_ctx, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 flags); void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *c, struct nvgpu_mem *mem); int gr_gp10b_dump_gr_status_regs(struct gk20a *g, struct gk20a_debug_output *o); void gr_gp10b_dump_ctxsw_stats(struct gk20a *g, struct vm_gk20a *vm, - struct gr_ctx_desc *gr_ctx); + struct nvgpu_gr_ctx *gr_ctx); int gr_gp10b_wait_empty(struct gk20a *g, unsigned long duration_ms, u32 expect_delay); void gr_gp10b_commit_global_attrib_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, bool patch); void gr_gp10b_commit_global_bundle_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, u64 size, bool patch); int 
gr_gp10b_load_smid_config(struct gk20a *g); void gr_gp10b_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index); @@ -133,7 +132,7 @@ int gr_gp10b_suspend_contexts(struct gk20a *g, int gr_gp10b_set_boosted_ctx(struct channel_gk20a *ch, bool boost); void gr_gp10b_update_boosted_ctx(struct gk20a *g, struct nvgpu_mem *mem, - struct gr_ctx_desc *gr_ctx); + struct nvgpu_gr_ctx *gr_ctx); int gr_gp10b_set_preemption_mode(struct channel_gk20a *ch, u32 graphics_preempt_mode, u32 compute_preempt_mode); diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c index aaee595d..7041c5bd 100644 --- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c @@ -236,7 +236,6 @@ static const struct gpu_ops gp10b_ops = { .load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode, .set_gpc_tpc_mask = gr_gp10b_set_gpc_tpc_mask, .get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask, - .free_channel_ctx = gk20a_free_channel_ctx, .alloc_obj_ctx = gk20a_alloc_obj_ctx, .bind_ctxsw_zcull = gr_gk20a_bind_ctxsw_zcull, .get_zcull_info = gr_gk20a_get_zcull_info, diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index b29a73d4..95d1f076 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -305,7 +305,6 @@ static const struct gpu_ops gv100_ops = { .load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode, .set_gpc_tpc_mask = gr_gv100_set_gpc_tpc_mask, .get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask, - .free_channel_ctx = gk20a_free_channel_ctx, .alloc_obj_ctx = gk20a_alloc_obj_ctx, .bind_ctxsw_zcull = gr_gk20a_bind_ctxsw_zcull, .get_zcull_info = gr_gk20a_get_zcull_info, diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index d5924169..3030def8 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -1373,7 +1373,7 @@ fail_free: } int gr_gv11b_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode) @@ -1497,13 +1497,13 @@ fail: } void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *c, struct nvgpu_mem *mem) { - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; - struct ctx_header_desc *ctx = &ch_ctx->ctx_header; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; + struct ctx_header_desc *ctx = &c->ctx_header; struct nvgpu_mem *ctxheader = &ctx->mem; - u32 gfxp_preempt_option = ctxsw_prog_main_image_graphics_preemption_options_control_gfxp_f(); u32 cilp_preempt_option = @@ -1514,6 +1514,12 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, gk20a_dbg_fn(""); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return; + + gr_ctx = &tsg->gr_ctx; + if (gr_ctx->graphics_preempt_mode == NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP) { gk20a_dbg_info("GfxP: %x", gfxp_preempt_option); @@ -1552,7 +1558,7 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, gr_ctx->preempt_ctxsw_buffer.gpu_va); } - err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, true); + err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, true); if (err) { nvgpu_err(g, "can't map patch context"); goto out; @@ -1564,7 +1570,7 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v())); gk20a_dbg_info("attrib cb addr : 0x%016x", addr); - g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, true); + g->ops.gr.commit_global_attrib_cb(g, gr_ctx, addr, 
true); addr = (u64_lo32(gr_ctx->pagepool_ctxsw_buffer.gpu_va) >> gr_scc_pagepool_base_addr_39_8_align_bits_v()) | @@ -1575,7 +1581,7 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, if (size == g->ops.gr.pagepool_default_size(g)) size = gr_scc_pagepool_total_pages_hwmax_v(); - g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, true); + g->ops.gr.commit_global_pagepool(g, gr_ctx, addr, size, true); addr = (u64_lo32(gr_ctx->spill_ctxsw_buffer.gpu_va) >> gr_gpc0_swdx_rm_spill_buffer_addr_39_8_align_bits_v()) | @@ -1584,28 +1590,28 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, size = gr_ctx->spill_ctxsw_buffer.size / gr_gpc0_swdx_rm_spill_buffer_size_256b_byte_granularity_v(); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_swdx_rm_spill_buffer_addr_r(), gr_gpc0_swdx_rm_spill_buffer_addr_39_8_f(addr), true); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpc0_swdx_rm_spill_buffer_size_r(), gr_gpc0_swdx_rm_spill_buffer_size_256b_f(size), true); cbes_reserve = gr_gpcs_swdx_beta_cb_ctrl_cbes_reserve_gfxp_v(); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_swdx_beta_cb_ctrl_r(), gr_gpcs_swdx_beta_cb_ctrl_cbes_reserve_f( cbes_reserve), true); - gr_gk20a_ctx_patch_write(g, ch_ctx, + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_ppcs_cbm_beta_cb_ctrl_r(), gr_gpcs_ppcs_cbm_beta_cb_ctrl_cbes_reserve_f( cbes_reserve), true); - gr_gk20a_ctx_patch_write_end(g, ch_ctx, true); + gr_gk20a_ctx_patch_write_end(g, gr_ctx, true); } out: @@ -1902,10 +1908,9 @@ int gr_gv11b_wait_empty(struct gk20a *g, unsigned long duration_ms, } void gr_gv11b_commit_global_attrib_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *gr_ctx, u64 addr, bool patch) { - struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; int attrBufferSize; if (gr_ctx->preempt_ctxsw_buffer.gpu_va) @@ -1915,16 +1920,16 @@ void gr_gv11b_commit_global_attrib_cb(struct gk20a *g, attrBufferSize /= gr_gpcs_tpcs_tex_rm_cb_1_size_div_128b_granularity_f(); - gr_gm20b_commit_global_attrib_cb(g, ch_ctx, addr, patch); + gr_gm20b_commit_global_attrib_cb(g, gr_ctx, addr, patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_r(), gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_v_f(addr) | gr_gpcs_tpcs_mpc_vtg_cb_global_base_addr_valid_true_f(), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_tex_rm_cb_0_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_tex_rm_cb_0_r(), gr_gpcs_tpcs_tex_rm_cb_0_base_addr_43_12_f(addr), patch); - gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_tex_rm_cb_1_r(), + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_tex_rm_cb_1_r(), gr_gpcs_tpcs_tex_rm_cb_1_size_div_128b_f(attrBufferSize) | gr_gpcs_tpcs_tex_rm_cb_1_valid_true_f(), patch); } @@ -2042,6 +2047,7 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc) + gv11b_gr_sm_offset(g, sm); + struct tsg_gk20a *tsg; *early_exit = false; *ignore_debugger = false; @@ -2054,9 +2060,14 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g, return gr_gv11b_handle_warp_esr_error_mmu_nack(g, gpc, tpc, sm, warp_esr, fault_ch); - if (fault_ch) - cilp_enabled = (fault_ch->ch_ctx.gr_ctx->compute_preempt_mode == + if (fault_ch) { + tsg = tsg_gk20a_from_ch(fault_ch); + if (!tsg) + return -EINVAL; + + cilp_enabled = 
(tsg->gr_ctx.compute_preempt_mode == NVGPU_PREEMPTION_MODE_COMPUTE_CILP); + } gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "SM Exception received on gpc %d tpc %d sm %d = 0x%08x", @@ -2509,7 +2520,7 @@ int gr_gv11b_commit_inst(struct channel_gk20a *c, u64 gpu_va) if (err) return err; - ctx = &c->ch_ctx.ctx_header; + ctx = &c->ctx_header; addr_lo = u64_lo32(ctx->mem.gpu_va) >> ram_in_base_shift_v(); addr_hi = u64_hi32(ctx->mem.gpu_va); @@ -2529,7 +2540,7 @@ int gr_gv11b_commit_inst(struct channel_gk20a *c, u64 gpu_va) int gr_gv11b_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c) { - struct channel_ctx_gk20a *ch_ctx = NULL; + struct nvgpu_gr_ctx *ch_ctx = NULL; u32 pd_ab_dist_cfg0; u32 ds_debug; u32 mpc_vtg_debug; @@ -2836,11 +2847,18 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g, struct channel_gk20a *ch, u32 sm_id, struct nvgpu_gr_sm_error_state *sm_error_state) { + struct tsg_gk20a *tsg; u32 gpc, tpc, sm, offset; struct gr_gk20a *gr = &g->gr; - struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; + struct nvgpu_gr_ctx *ch_ctx; int err = 0; + tsg = tsg_gk20a_from_ch(ch); + if (!tsg) + return -EINVAL; + + ch_ctx = &tsg->gr_ctx; + nvgpu_mutex_acquire(&g->dbg_sessions_lock); gr->sm_error_states[sm_id].hww_global_esr = diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h index b69e69bd..022a7698 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.h +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.h @@ -41,9 +41,10 @@ struct zbc_s_table { }; struct gk20a; +struct gr_gk20a; struct zbc_entry; struct zbc_query_params; -struct channel_ctx_gk20a; +struct nvgpu_gr_ctx; struct nvgpu_warpstate; struct nvgpu_gr_sm_error_state; struct gr_ctx_desc; @@ -128,7 +129,7 @@ int gr_gv11b_dump_gr_status_regs(struct gk20a *g, int gr_gv11b_wait_empty(struct gk20a *g, unsigned long duration_ms, u32 expect_delay); void gr_gv11b_commit_global_attrib_cb(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct nvgpu_gr_ctx *ch_ctx, u64 addr, bool patch); void gr_gv11b_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index); void gr_gv11b_get_access_map(struct gk20a *g, @@ -222,13 +223,13 @@ unsigned long gr_gv11b_get_max_gfxp_wfi_timeout_count(struct gk20a *g); void gr_gv11b_ecc_init_scrub_reg(struct gk20a *g); int gr_gv11b_set_ctxsw_preemption_mode(struct gk20a *g, - struct gr_ctx_desc *gr_ctx, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class, u32 graphics_preempt_mode, u32 compute_preempt_mode); void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, - struct channel_ctx_gk20a *ch_ctx, + struct channel_gk20a *ch_ctx, struct nvgpu_mem *mem); #endif diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index aa3d52af..0a552f5b 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -272,7 +272,6 @@ static const struct gpu_ops gv11b_ops = { .load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode, .set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask, .get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask, - .free_channel_ctx = gk20a_free_channel_ctx, .alloc_obj_ctx = gk20a_alloc_obj_ctx, .bind_ctxsw_zcull = gr_gk20a_bind_ctxsw_zcull, .get_zcull_info = gr_gk20a_get_zcull_info, diff --git a/drivers/gpu/nvgpu/gv11b/subctx_gv11b.c b/drivers/gpu/nvgpu/gv11b/subctx_gv11b.c index fe1aa8a5..607fff91 100644 --- a/drivers/gpu/nvgpu/gv11b/subctx_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/subctx_gv11b.c @@ -43,7 +43,7 @@ static void gv11b_subctx_commit_pdb(struct channel_gk20a *c, void gv11b_free_subctx_header(struct channel_gk20a *c) { - struct 
ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct ctx_header_desc *ctx = &c->ctx_header; struct gk20a *g = c->g; nvgpu_log(g, gpu_dbg_fn, "gv11b_free_subctx_header"); @@ -57,13 +57,13 @@ void gv11b_free_subctx_header(struct channel_gk20a *c) int gv11b_alloc_subctx_header(struct channel_gk20a *c) { - struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct ctx_header_desc *ctx = &c->ctx_header; struct gk20a *g = c->g; int ret = 0; nvgpu_log(g, gpu_dbg_fn, "gv11b_alloc_subctx_header"); - if (ctx->mem.gpu_va == 0) { + if (!nvgpu_mem_is_valid(&ctx->mem)) { ret = nvgpu_dma_alloc_flags_sys(g, 0, /* No Special flags */ ctxsw_prog_fecs_header_v(), @@ -111,20 +111,50 @@ static void gv11b_init_subcontext_pdb(struct channel_gk20a *c, int gv11b_update_subctx_header(struct channel_gk20a *c, u64 gpu_va) { - struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct ctx_header_desc *ctx = &c->ctx_header; struct nvgpu_mem *gr_mem; struct gk20a *g = c->g; int ret = 0; u32 addr_lo, addr_hi; + struct tsg_gk20a *tsg; + struct nvgpu_gr_ctx *gr_ctx; - addr_lo = u64_lo32(gpu_va); - addr_hi = u64_hi32(gpu_va); + tsg = tsg_gk20a_from_ch(c); + if (!tsg) + return -EINVAL; + + gr_ctx = &tsg->gr_ctx; gr_mem = &ctx->mem; g->ops.mm.l2_flush(g, true); if (nvgpu_mem_begin(g, gr_mem)) return -ENOMEM; + /* set priv access map */ + addr_lo = u64_lo32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); + addr_hi = u64_hi32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]); + nvgpu_mem_wr(g, gr_mem, + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), + addr_lo); + nvgpu_mem_wr(g, gr_mem, + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), + addr_hi); + + addr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va); + addr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va); + nvgpu_mem_wr(g, gr_mem, + ctxsw_prog_main_image_patch_adr_lo_o(), + addr_lo); + nvgpu_mem_wr(g, gr_mem, + ctxsw_prog_main_image_patch_adr_hi_o(), + addr_hi); + + g->ops.gr.write_pm_ptr(g, gr_mem, gr_ctx->pm_ctx.mem.gpu_va); + g->ops.gr.write_zcull_ptr(g, gr_mem, gr_ctx->zcull_ctx.gpu_va); + + addr_lo = u64_lo32(gpu_va); + addr_hi = u64_hi32(gpu_va); + nvgpu_mem_wr(g, gr_mem, ctxsw_prog_main_image_context_buffer_ptr_hi_o(), addr_hi); nvgpu_mem_wr(g, gr_mem, -- cgit v1.2.2
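
For reference, below is a minimal sketch (not part of the commit) of the access pattern this change establishes, assuming the nvgpu driver headers that declare struct channel_gk20a, struct tsg_gk20a, struct nvgpu_gr_ctx and tsg_gk20a_from_ch(). The helper name is hypothetical; its body mirrors the callers converted in the hunks above: resolve the owning TSG from the channel, then use the embedded tsg->gr_ctx rather than the removed ch->ch_ctx.gr_ctx pointer.

/*
 * Illustrative sketch only -- not taken from the commit above.
 * Assumes the nvgpu headers providing channel_gk20a, tsg_gk20a,
 * nvgpu_gr_ctx and tsg_gk20a_from_ch().
 */
#include <linux/errno.h>

static int example_query_preempt_modes(struct channel_gk20a *ch,
				       u32 *graphics_mode, u32 *compute_mode)
{
	struct tsg_gk20a *tsg;
	struct nvgpu_gr_ctx *gr_ctx;

	/* Channels without a TSG no longer carry a graphics context. */
	tsg = tsg_gk20a_from_ch(ch);
	if (!tsg)
		return -EINVAL;

	/* The context is now embedded in the TSG, not hung off the channel. */
	gr_ctx = &tsg->gr_ctx;

	*graphics_mode = gr_ctx->graphics_preempt_mode;
	*compute_mode = gr_ctx->compute_preempt_mode;
	return 0;
}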