From 6eebc87d99f9f04b2b68e0bc0142c161ab3e669d Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Thu, 12 May 2016 09:32:05 +0300
Subject: gpu: nvgpu: refactor gk20a_mem_{wr,rd} for vidmem

To support vidmem, pass g and mem_desc to the buffer memory accessor
functions. This allows the functions to select the memory access method
based on the buffer aperture instead of using the cpu pointer directly
(as was done until now). The selection and aperture support will come in
another patch; this patch only refactors these accessors and keeps the
underlying functionality as-is.

gk20a_mem_{rd,wr}32() work as before; also add gk20a_mem_{rd,wr}() for
byte-indexed accesses, gk20a_mem_{rd,wr}_n() for memcpy()-like
functionality, and gk20a_memset() for filling buffers with a constant.
The 8- and 16-bit accessor functions are removed.

vmap()/vunmap() pairs are abstracted into gk20a_mem_{begin,end}() to
support other types of mappings or conditions where mapping the buffer
is unnecessary or different.

Several function arguments that would access these buffers are also
changed to take a mem_desc instead of a plain cpu pointer. Some relevant
call sites are changed to use the accessor functions instead of raw cpu
pointers (e.g., memcpying to and from), but the majority of direct
accesses will be adjusted later, when the buffers are moved to support
vidmem.

JIRA DNVGPU-23

Change-Id: I3dd22e14290c4ab742d42e2dd327ebeb5cd3f25a
Signed-off-by: Konsta Holtta
Reviewed-on: http://git-master/r/1121143
Reviewed-by: Ken Adams
Tested-by: Ken Adams
---
 drivers/gpu/nvgpu/gk20a/channel_gk20a.c    |  73 +++---
 drivers/gpu/nvgpu/gk20a/channel_gk20a.h    |   1 -
 drivers/gpu/nvgpu/gk20a/debug_gk20a.c      |  59 +++--
 drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c |  27 +--
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c       |   2 -
 drivers/gpu/nvgpu/gk20a/gk20a.h            |  57 +----
 drivers/gpu/nvgpu/gk20a/gr_gk20a.c         | 359 ++++++++++++-----------------
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c         | 144 ++++++++++--
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h         |  31 ++-
 drivers/gpu/nvgpu/gk20a/pmu_gk20a.c        |  10 +-
 drivers/gpu/nvgpu/gm20b/acr_gm20b.c        | 206 +++++++----------
 drivers/gpu/nvgpu/gm20b/gr_gm20b.c         |  26 +--
 drivers/gpu/nvgpu/gm20b/mm_gm20b.c         |   6 +-
 drivers/gpu/nvgpu/vgpu/fifo_vgpu.c         |   2 -
 14 files changed, 493 insertions(+), 510 deletions(-)

(limited to 'drivers/gpu/nvgpu')

diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 990972e4..065e8ab1 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -129,28 +129,25 @@ static int channel_gk20a_commit_userd(struct channel_gk20a *c)
 {
 	u32 addr_lo;
 	u32 addr_hi;
-	void *inst_ptr;
 	struct gk20a *g = c->g;
 
 	gk20a_dbg_fn("");
 
-	inst_ptr = c->inst_block.cpu_va;
-	if (!inst_ptr)
-		return -ENOMEM;
-
 	addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v());
 	addr_hi = u64_hi32(c->userd_iova);
 
 	gk20a_dbg_info("channel %d : set ramfc userd 0x%16llx",
 		c->hw_chid, (u64)c->userd_iova);
 
-	gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_w(),
+	gk20a_mem_wr32(g, &c->inst_block,
+		       ram_in_ramfc_w() + ram_fc_userd_w(),
 		       (g->mm.vidmem_is_vidmem ?
pbdma_userd_target_sys_mem_ncoh_f() : pbdma_userd_target_vid_mem_f()) | pbdma_userd_addr_f(addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_hi_w(), + gk20a_mem_wr32(g, &c->inst_block, + ram_in_ramfc_w() + ram_fc_userd_hi_w(), pbdma_userd_hi_addr_f(addr_hi)); return 0; @@ -186,13 +183,8 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g, static int channel_gk20a_set_schedule_params(struct channel_gk20a *c) { - void *inst_ptr; int shift = 0, value = 0; - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - gk20a_channel_get_timescale_from_timeslice(c->g, c->timeslice_us, &value, &shift); @@ -203,7 +195,7 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c) WARN_ON(c->g->ops.fifo.preempt_channel(c->g, c->hw_chid)); /* set new timeslice */ - gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_fc_runlist_timeslice_w(), value | (shift << 12) | fifo_runlist_timeslice_enable_true_f()); @@ -255,33 +247,30 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c) int channel_gk20a_setup_ramfc(struct channel_gk20a *c, u64 gpfifo_base, u32 gpfifo_entries, u32 flags) { - void *inst_ptr; + struct gk20a *g = c->g; + struct mem_desc *mem = &c->inst_block; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - - memset(inst_ptr, 0, ram_fc_size_val_v()); + gk20a_memset(g, mem, 0, 0, ram_fc_size_val_v()); - gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_w(), + gk20a_mem_wr32(g, mem, ram_fc_gp_base_w(), pbdma_gp_base_offset_f( u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s()))); - gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_hi_w(), + gk20a_mem_wr32(g, mem, ram_fc_gp_base_hi_w(), pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) | pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries))); - gk20a_mem_wr32(inst_ptr, ram_fc_signature_w(), + gk20a_mem_wr32(g, mem, ram_fc_signature_w(), c->g->ops.fifo.get_pbdma_signature(c->g)); - gk20a_mem_wr32(inst_ptr, ram_fc_formats_w(), + gk20a_mem_wr32(g, mem, ram_fc_formats_w(), pbdma_formats_gp_fermi0_f() | pbdma_formats_pb_fermi1_f() | pbdma_formats_mp_fermi0_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_pb_header_w(), + gk20a_mem_wr32(g, mem, ram_fc_pb_header_w(), pbdma_pb_header_priv_user_f() | pbdma_pb_header_method_zero_f() | pbdma_pb_header_subchannel_zero_f() | @@ -289,47 +278,49 @@ int channel_gk20a_setup_ramfc(struct channel_gk20a *c, pbdma_pb_header_first_true_f() | pbdma_pb_header_type_inc_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_subdevice_w(), + gk20a_mem_wr32(g, mem, ram_fc_subdevice_w(), pbdma_subdevice_id_f(1) | pbdma_subdevice_status_active_f() | pbdma_subdevice_channel_dma_enable_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_target_w(), pbdma_target_engine_sw_f()); + gk20a_mem_wr32(g, mem, ram_fc_target_w(), pbdma_target_engine_sw_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_acquire_w(), + gk20a_mem_wr32(g, mem, ram_fc_acquire_w(), channel_gk20a_pbdma_acquire_val(c)); - gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), + gk20a_mem_wr32(g, mem, ram_fc_runlist_timeslice_w(), fifo_runlist_timeslice_timeout_128_f() | fifo_runlist_timeslice_timescale_3_f() | fifo_runlist_timeslice_enable_true_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_pb_timeslice_w(), + gk20a_mem_wr32(g, mem, ram_fc_pb_timeslice_w(), fifo_pb_timeslice_timeout_16_f() | fifo_pb_timeslice_timescale_0_f() | fifo_pb_timeslice_enable_true_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid)); + gk20a_mem_wr32(g, mem, ram_fc_chid_w(), 
ram_fc_chid_id_f(c->hw_chid)); return channel_gk20a_commit_userd(c); } static int channel_gk20a_setup_userd(struct channel_gk20a *c) { - BUG_ON(!c->userd_cpu_va); + struct gk20a *g = c->g; + struct mem_desc *mem = &g->fifo.userd; + u32 offset = c->hw_chid * g->fifo.userd_entry_size / sizeof(u32); gk20a_dbg_fn(""); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_threshold_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_put_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_put_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_ref_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_put_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_ref_threshold_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_get_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_put_w(), 0); return 0; } diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 8840a3ae..b1355f92 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -130,7 +130,6 @@ struct channel_gk20a { struct mem_desc inst_block; struct mem_desc_sub ramfc; - void *userd_cpu_va; u64 userd_iova; u64 userd_gpu_va; diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c index c2285c8a..a3fa2ea5 100644 --- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c @@ -36,7 +36,7 @@ unsigned int gk20a_debug_trace_cmdbuf; struct ch_state { int pid; int refs; - u8 inst_block[0]; + u32 inst_block[0]; }; static const char * const ccsr_chan_status_str[] = { @@ -108,15 +108,15 @@ static void gk20a_debug_show_channel(struct gk20a *g, u32 channel = gk20a_readl(g, ccsr_channel_r(hw_chid)); u32 status = ccsr_channel_status_v(channel); u32 syncpointa, syncpointb; - void *inst_ptr; + u32 *inst_mem; if (!ch_state) return; - inst_ptr = &ch_state->inst_block[0]; + inst_mem = &ch_state->inst_block[0]; - syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w()); - syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w()); + syncpointa = inst_mem[ram_fc_syncpointa_w()]; + syncpointb = inst_mem[ram_fc_syncpointb_w()]; gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, dev_name(g->dev), @@ -129,23 +129,22 @@ static void gk20a_debug_show_channel(struct gk20a *g, gk20a_debug_output(o, "TOP: %016llx PUT: %016llx GET: %016llx " "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n" "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n", - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_top_level_get_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, - ram_fc_pb_top_level_get_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_w()) + - 
((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_hi_w()) << 32ULL), - gk20a_mem_rd32(inst_ptr, ram_fc_pb_header_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_pb_count_w()), + (u64)inst_mem[ram_fc_pb_top_level_get_w()] + + ((u64)inst_mem[ram_fc_pb_top_level_get_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_put_w()] + + ((u64)inst_mem[ram_fc_pb_put_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_get_w()] + + ((u64)inst_mem[ram_fc_pb_get_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_fetch_w()] + + ((u64)inst_mem[ram_fc_pb_fetch_hi_w()] << 32ULL), + inst_mem[ram_fc_pb_header_w()], + inst_mem[ram_fc_pb_count_w()], syncpointa, syncpointb, - gk20a_mem_rd32(inst_ptr, ram_fc_semaphorea_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphoreb_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphorec_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphored_w())); + inst_mem[ram_fc_semaphorea_w()], + inst_mem[ram_fc_semaphoreb_w()], + inst_mem[ram_fc_semaphorec_w()], + inst_mem[ram_fc_semaphored_w()]); #ifdef CONFIG_TEGRA_GK20A if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v()) @@ -246,17 +245,15 @@ void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o) for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *ch = &f->channel[chid]; - if (ch_state[chid]) { - if (ch->inst_block.cpu_va) { - ch_state[chid]->pid = ch->pid; - ch_state[chid]->refs = - atomic_read(&ch->ref_count); - memcpy(&ch_state[chid]->inst_block[0], - ch->inst_block.cpu_va, - ram_in_alloc_size_v()); - } - gk20a_channel_put(ch); - } + if (!ch_state[chid]) + continue; + + ch_state[chid]->pid = ch->pid; + ch_state[chid]->refs = atomic_read(&ch->ref_count); + gk20a_mem_rd_n(g, &ch->inst_block, 0, + &ch_state[chid]->inst_block[0], + ram_in_alloc_size_v()); + gk20a_channel_put(ch); } for (chid = 0; chid < f->num_channels; chid++) { if (ch_state[chid]) { diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c index f9cddc41..edddcdc1 100644 --- a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c @@ -619,7 +619,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, phys_addr_t pa; struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; struct gk20a_fecs_trace *trace = g->fecs_trace; - void *ctx_ptr; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch); gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, @@ -634,10 +634,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, if (!pa) return -ENOMEM; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0, - pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; lo = u64_lo32(pa); @@ -646,18 +643,18 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi, lo, GK20A_FECS_TRACE_NUM_RECORDS); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), - 0, lo); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), - 0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi)); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_control_o(), - 0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f( + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), + 
lo); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), + ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi)); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_control_o(), + ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f( GK20A_FECS_TRACE_NUM_RECORDS)); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid); return 0; diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index dc3debf2..71400331 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -520,8 +520,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) mutex_init(&f->free_chs_mutex); for (chid = 0; chid < f->num_channels; chid++) { - f->channel[chid].userd_cpu_va = - f->userd.cpu_va + chid * f->userd_entry_size; f->channel[chid].userd_iova = g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) + chid * f->userd_entry_size; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index f228cce4..2f85bf96 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -201,7 +201,7 @@ struct gpu_ops { struct gr_ctx_desc *gr_ctx); void (*update_ctxsw_preemption_mode)(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, - void *ctx_ptr); + struct mem_desc *mem); int (*update_smpc_ctxsw_mode)(struct gk20a *g, struct channel_gk20a *c, bool enable); @@ -221,7 +221,8 @@ struct gpu_ops { int (*wait_empty)(struct gk20a *g, unsigned long end_jiffies, u32 expect_delay); void (*init_cyclestats)(struct gk20a *g); - void (*enable_cde_in_fecs)(void *ctx_ptr); + void (*enable_cde_in_fecs)(struct gk20a *g, + struct mem_desc *mem); int (*set_sm_debug_mode)(struct gk20a *g, struct channel_gk20a *ch, u64 sms, bool enable); void (*bpt_reg_info)(struct gk20a *g, @@ -484,7 +485,7 @@ struct gpu_ops { void (*cbc_clean)(struct gk20a *g); void (*tlb_invalidate)(struct vm_gk20a *vm); void (*set_big_page_size)(struct gk20a *g, - void *inst_ptr, int size); + struct mem_desc *mem, int size); u32 (*get_big_page_sizes)(void); u32 (*get_physical_addr_bits)(struct gk20a *g); int (*init_mm_setup_hw)(struct gk20a *g); @@ -493,7 +494,8 @@ struct gpu_ops { void (*remove_bar2_vm)(struct gk20a *g); const struct gk20a_mmu_level * (*get_mmu_levels)(struct gk20a *g, u32 big_page_size); - void (*init_pdb)(struct gk20a *g, void *inst_ptr, u64 pdb_addr); + void (*init_pdb)(struct gk20a *g, struct mem_desc *mem, + u64 pdb_addr); u64 (*get_iova_addr)(struct gk20a *g, struct scatterlist *sgl, u32 flags); int (*bar1_bind)(struct gk20a *g, u64 bar1_iova); @@ -859,53 +861,6 @@ do { \ #define gk20a_dbg_info(fmt, arg...) 
\ gk20a_dbg(gpu_dbg_info, fmt, ##arg) -/* mem access with dbg_mem logging */ -static inline u8 gk20a_mem_rd08(void *ptr, int b) -{ - u8 _b = ((const u8 *)ptr)[b]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, _b); -#endif - return _b; -} -static inline u16 gk20a_mem_rd16(void *ptr, int s) -{ - u16 _s = ((const u16 *)ptr)[s]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, _s); -#endif - return _s; -} -static inline u32 gk20a_mem_rd32(void *ptr, int w) -{ - u32 _w = ((const u32 *)ptr)[w]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + sizeof(u32)*w, _w); -#endif - return _w; -} -static inline void gk20a_mem_wr08(void *ptr, int b, u8 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, data); -#endif - ((u8 *)ptr)[b] = data; -} -static inline void gk20a_mem_wr16(void *ptr, int s, u16 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, data); -#endif - ((u16 *)ptr)[s] = data; -} -static inline void gk20a_mem_wr32(void *ptr, int w, u32 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u32)*w, data); -#endif - ((u32 *)ptr)[w] = data; -} - void gk20a_init_clk_ops(struct gpu_ops *gops); /* register accessors */ diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 4e7c36ee..e7e6662a 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -97,22 +97,18 @@ int gr_gk20a_get_ctx_id(struct gk20a *g, u32 *ctx_id) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem)) return -ENOMEM; - *ctx_id = gk20a_mem_rd32(ctx_ptr + - ctxsw_prog_main_image_context_id_o(), 0); + *ctx_id = gk20a_mem_rd(g, &ch_ctx->gr_ctx->mem, + ctxsw_prog_main_image_context_id_o()); - vunmap(ctx_ptr); + gk20a_mem_end(g, &ch_ctx->gr_ctx->mem); return 0; } @@ -619,22 +615,17 @@ static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va) { u32 addr_lo; u32 addr_hi; - void *inst_ptr = NULL; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - addr_lo = u64_lo32(gpu_va) >> 12; addr_hi = u64_hi32(gpu_va); - gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(), ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() | ram_in_gr_wfi_ptr_lo_f(addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(), ram_in_gr_wfi_ptr_hi_f(addr_hi)); return 0; @@ -658,11 +649,7 @@ int gr_gk20a_ctx_patch_write_begin(struct gk20a *g, return -EBUSY; } - ch_ctx->patch_ctx.mem.cpu_va = vmap(ch_ctx->patch_ctx.mem.pages, - PAGE_ALIGN(ch_ctx->patch_ctx.mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - - if (!ch_ctx->patch_ctx.mem.cpu_va) + if (gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem)) return -ENOMEM; return 0; @@ -677,8 +664,7 @@ int gr_gk20a_ctx_patch_write_end(struct gk20a *g, return -EINVAL; } - vunmap(ch_ctx->patch_ctx.mem.cpu_va); - ch_ctx->patch_ctx.mem.cpu_va = NULL; + gk20a_mem_end(g, &ch_ctx->patch_ctx.mem); return 0; } @@ -687,7 +673,6 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g, u32 addr, u32 data, bool patch) { u32 patch_slot = 0; - void *patch_ptr = NULL; bool mapped_here = false; BUG_ON(patch != 0 && ch_ctx == NULL); @@ -708,11 +693,10 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g, } else mapped_here = false; - patch_ptr = ch_ctx->patch_ctx.mem.cpu_va; patch_slot = ch_ctx->patch_ctx.data_count * 2; - gk20a_mem_wr32(patch_ptr, patch_slot++, addr); - gk20a_mem_wr32(patch_ptr, patch_slot++, data); + gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, addr); + gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, data); ch_ctx->patch_ctx.data_count++; @@ -760,16 +744,13 @@ static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g, static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; u32 va_lo, va_hi, va; int ret = 0; - void *ctx_ptr = NULL; gk20a_dbg_fn(""); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; if (ch_ctx->zcull_ctx.gpu_va == 0 && @@ -792,15 +773,17 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) goto clean_up; } - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0, + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_zcull_o(), ch_ctx->zcull_ctx.ctx_sw_mode); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_zcull_ptr_o(), va); c->g->ops.fifo.enable_channel(c); clean_up: - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); return ret; } @@ -1500,8 +1483,8 @@ static int 
gr_gk20a_init_golden_ctx_image(struct gk20a *g, u32 ctx_header_words; u32 i; u32 data; - void *ctx_ptr = NULL; - void *gold_ptr = NULL; + struct mem_desc *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem; + struct mem_desc *gr_mem = &ch_ctx->gr_ctx->mem; u32 err = 0; gk20a_dbg_fn(""); @@ -1527,16 +1510,10 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, if (err) goto clean_up; - gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].mem.pages, - PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].mem.size) >> - PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!gold_ptr) + if (gk20a_mem_begin(g, gold_mem)) goto clean_up; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, gr_mem)) goto clean_up; ctx_header_words = roundup(ctx_header_bytes, sizeof(u32)); @@ -1545,14 +1522,14 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, g->ops.mm.l2_flush(g, true); for (i = 0; i < ctx_header_words; i++) { - data = gk20a_mem_rd32(ctx_ptr, i); - gk20a_mem_wr32(gold_ptr, i, data); + data = gk20a_mem_rd32(g, gr_mem, i); + gk20a_mem_wr32(g, gold_mem, i, data); } - gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0, + gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(), ctxsw_prog_main_image_zcull_mode_no_ctxsw_v()); - gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0); + gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_ptr_o(), 0); gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]); @@ -1568,12 +1545,12 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, goto clean_up; } - for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) - gr->ctx_vars.local_golden_image[i] = - gk20a_mem_rd32(gold_ptr, i); + gk20a_mem_rd_n(g, gold_mem, 0, + gr->ctx_vars.local_golden_image, + gr->ctx_vars.golden_image_size); } - gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); + gr_gk20a_commit_inst(c, gr_mem->gpu_va); gr->ctx_vars.golden_image_initialized = true; @@ -1586,10 +1563,8 @@ clean_up: else gk20a_dbg_fn("done"); - if (gold_ptr) - vunmap(gold_ptr); - if (ctx_ptr) - vunmap(ctx_ptr); + gk20a_mem_end(g, gold_mem); + gk20a_mem_end(g, gr_mem); mutex_unlock(&gr->ctx_mutex); return err; @@ -1600,7 +1575,7 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, bool enable_smpc_ctxsw) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; + struct mem_desc *mem; u32 data; int ret; @@ -1611,46 +1586,39 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, return -EFAULT; } + mem = &ch_ctx->gr_ctx->mem; + c->g->ops.fifo.disable_channel(c); ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); if (ret) { - c->g->ops.fifo.enable_channel(c); - gk20a_err(dev_from_gk20a(g), - "failed to preempt channel\n"); - return ret; + gk20a_err(dev_from_gk20a(g), "failed to preempt channel"); + goto out; } /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - if (!ch_ctx->gr_ctx) { - gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); - return -EFAULT; - } - - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { - c->g->ops.fifo.enable_channel(c); - return -ENOMEM; + if (gk20a_mem_begin(g, mem)) { + ret = -ENOMEM; + goto out; } - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, mem, + ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m(); data |= enable_smpc_ctxsw ? ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() : ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, - data); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_pm_o(), + data); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); - /* enable channel */ +out: c->g->ops.fifo.enable_channel(c); - - return 0; + return ret; } int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, @@ -1659,8 +1627,7 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; - void *ctx_ptr = NULL; - void *pm_ctx_ptr; + struct mem_desc *gr_mem; u32 data, virt_addr; int ret; @@ -1671,6 +1638,8 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, return -EFAULT; } + gr_mem = &ch_ctx->gr_ctx->mem; + if (enable_hwpm_ctxsw) { if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) return 0; @@ -1721,29 +1690,22 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, } /* Now clear the buffer */ - pm_ctx_ptr = vmap(pm_ctx->mem.pages, - PAGE_ALIGN(pm_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - - if (!pm_ctx_ptr) { + if (gk20a_mem_begin(g, &pm_ctx->mem)) { ret = -ENOMEM; goto cleanup_pm_buf; } - memset(pm_ctx_ptr, 0, pm_ctx->mem.size); + gk20a_memset(g, &pm_ctx->mem, 0, 0, pm_ctx->mem.size); - vunmap(pm_ctx_ptr); + gk20a_mem_end(g, &pm_ctx->mem); } - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { + if (gk20a_mem_begin(g, gr_mem)) { ret = -ENOMEM; goto cleanup_pm_buf; } - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_mode_m(); if (enable_hwpm_ctxsw) { @@ -1760,10 +1722,10 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, data |= pm_ctx->pm_mode; - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data); + gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr); - vunmap(ctx_ptr); + gk20a_mem_end(g, gr_mem); /* enable channel */ c->g->ops.fifo.enable_channel(c); @@ -1788,9 +1750,9 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, u32 virt_addr_lo; u32 virt_addr_hi; u32 virt_addr = 0; - u32 i, v, data; + u32 v, data; int ret = 0; - void *ctx_ptr = NULL; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; gk20a_dbg_fn(""); @@ -1801,20 +1763,18 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; - for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) - gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]); + gk20a_mem_wr_n(g, mem, 0, + gr->ctx_vars.local_golden_image, + gr->ctx_vars.golden_image_size); if (g->ops.gr.enable_cde_in_fecs && c->cde) - g->ops.gr.enable_cde_in_fecs(ctx_ptr); + g->ops.gr.enable_cde_in_fecs(g, mem); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_save_ops_o(), 0); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_restore_ops_o(), 0); /* set priv access map */ virt_addr_lo = @@ -1827,29 +1787,29 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, else data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(), data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_lo_o(), virt_addr_lo); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_hi_o(), virt_addr_hi); /* disable verif features */ - v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0); + v = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o()); v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m()); v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v); if (g->ops.gr.update_ctxsw_preemption_mode) - g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, ctx_ptr); + g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, mem); virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), ch_ctx->patch_ctx.data_count); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_lo_o(), virt_addr_lo); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_hi_o(), virt_addr_hi); /* Update main header region of the context buffer with the info needed @@ -1860,7 +1820,7 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, if (ch_ctx->pm_ctx.mem.gpu_va == 0) { gk20a_err(dev_from_gk20a(g), "context switched pm with no pm buffer!"); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); return -EFAULT; } @@ -1871,14 +1831,14 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, } else virt_addr = 0; - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_mode_m(); data |= ch_ctx->pm_ctx.pm_mode; - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + gk20a_mem_wr(g, mem, 
ctxsw_prog_main_image_pm_o(), data); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); if (tegra_platform_is_linsim()) { u32 inst_base_ptr = @@ -1978,16 +1938,20 @@ static void gr_gk20a_init_ctxsw_ucode_segments( } static int gr_gk20a_copy_ctxsw_ucode_segments( - u8 *buf, + struct gk20a *g, + struct mem_desc *dst, struct gk20a_ctxsw_ucode_segments *segments, u32 *bootimage, u32 *code, u32 *data) { int i; - memcpy(buf + segments->boot.offset, bootimage, segments->boot.size); - memcpy(buf + segments->code.offset, code, segments->code.size); - memcpy(buf + segments->data.offset, data, segments->data.size); + gk20a_mem_wr_n(g, dst, segments->boot.offset, bootimage, + segments->boot.size); + gk20a_mem_wr_n(g, dst, segments->code.offset, code, + segments->code.size); + gk20a_mem_wr_n(g, dst, segments->data.offset, data, + segments->data.size); /* compute a "checksum" for the boot binary to detect its version */ segments->boot_signature = 0; @@ -2009,7 +1973,6 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) u32 *fecs_boot_image; u32 *gpccs_boot_image; struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; - u8 *buf; u32 ucode_size; int err = 0; @@ -2049,14 +2012,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) if (err) goto clean_up; - buf = (u8 *)ucode_info->surface_desc.cpu_va; - if (!buf) { - gk20a_err(d, "failed to map surface desc buffer"); - err = -ENOMEM; - goto clean_up; - } - - gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs, + gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, + &ucode_info->fecs, fecs_boot_image, g->gr.ctx_vars.ucode.fecs.inst.l, g->gr.ctx_vars.ucode.fecs.data.l); @@ -2064,7 +2021,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) release_firmware(fecs_fw); fecs_fw = NULL; - gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs, + gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, + &ucode_info->gpccs, gpccs_boot_image, g->gr.ctx_vars.ucode.gpccs.inst.l, g->gr.ctx_vars.ucode.gpccs.data.l); @@ -4690,41 +4648,38 @@ out: static int gr_gk20a_init_access_map(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; - void *data; - int err = 0; + struct mem_desc *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem; u32 w, nr_pages = DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size, PAGE_SIZE); u32 *whitelist = NULL; int num_entries = 0; - data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.pages, - PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.size) >> - PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!data) { + if (gk20a_mem_begin(g, mem)) { gk20a_err(dev_from_gk20a(g), "failed to map priv access map memory"); - err = -ENOMEM; - goto clean_up; + return -ENOMEM; } - memset(data, 0x0, PAGE_SIZE * nr_pages); + gk20a_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages); g->ops.gr.get_access_map(g, &whitelist, &num_entries); for (w = 0; w < num_entries; w++) { - u32 map_bit, map_byte, map_shift; + u32 map_bit, map_byte, map_shift, x; map_bit = whitelist[w] >> 2; map_byte = map_bit >> 3; map_shift = map_bit & 0x7; /* i.e. 
0-7 */ gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d", whitelist[w], map_byte, map_shift); - ((u8 *)data)[map_byte] |= 1 << map_shift; + x = gk20a_mem_rd32(g, mem, map_byte / sizeof(u32)); + x |= 1 << ( + (map_byte % sizeof(u32) * BITS_PER_BYTE) + + map_shift); + gk20a_mem_wr32(g, mem, map_byte / sizeof(u32), x); } -clean_up: - if (data) - vunmap(data); + gk20a_mem_end(g, mem); return 0; } @@ -6659,7 +6614,7 @@ static void gr_gk20a_init_sm_dsm_reg_info(void) static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, u32 addr, u32 data, - u8 *context) + struct mem_desc *mem) { u32 num_gpc = g->gr.gpc_count; u32 num_tpc; @@ -6688,8 +6643,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, /* reset the patch count from previous runs,if ucode has already processed it */ - tmp = gk20a_mem_rd32(context + - ctxsw_prog_main_image_patch_count_o(), 0); + tmp = gk20a_mem_rd(g, mem, + ctxsw_prog_main_image_patch_count_o()); if (!tmp) ch_ctx->patch_ctx.data_count = 0; @@ -6700,15 +6655,15 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); - gk20a_mem_wr32(context + + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), - 0, ch_ctx->patch_ctx.data_count); - gk20a_mem_wr32(context + + ch_ctx->patch_ctx.data_count); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_lo_o(), - 0, vaddr_lo); - gk20a_mem_wr32(context + + vaddr_lo); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_hi_o(), - 0, vaddr_hi); + vaddr_hi); /* we're not caching these on cpu side, but later watch for it */ @@ -6760,17 +6715,15 @@ static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset) #define ILLEGAL_ID (~0) -static inline bool check_main_image_header_magic(void *context) +static inline bool check_main_image_header_magic(u8 *context) { - u32 magic = gk20a_mem_rd32(context + - ctxsw_prog_main_image_magic_value_o(), 0); + u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o()); gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic); return magic == ctxsw_prog_main_image_magic_value_v_value_v(); } -static inline bool check_local_header_magic(void *context) +static inline bool check_local_header_magic(u8 *context) { - u32 magic = gk20a_mem_rd32(context + - ctxsw_prog_local_magic_value_o(), 0); + u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o()); gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic); return magic == ctxsw_prog_local_magic_value_v_value_v(); @@ -6814,7 +6767,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, u32 num_gpcs, num_tpcs; u32 chk_addr; u32 ext_priv_offset, ext_priv_size; - void *context; + u8 *context; u32 offset_to_segment, offset_to_segment_end; u32 sm_dsm_perf_reg_id = ILLEGAL_ID; u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID; @@ -6856,14 +6809,14 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, /* note below is in words/num_registers */ marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2; - context = context_buffer; + context = (u8 *)context_buffer; /* sanity check main header */ if (!check_main_image_header_magic(context)) { gk20a_err(dev_from_gk20a(g), "Invalid main header: magic value"); return -EINVAL; } - num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o()); if (gpc_num >= num_gpcs) { gk20a_err(dev_from_gk20a(g), "GPC 0x%08x is greater than total count 
0x%08x!\n", @@ -6871,7 +6824,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o()); ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32); if (0 == ext_priv_size) { gk20a_dbg_info(" No extended memory in context buffer"); @@ -7149,7 +7102,7 @@ gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g, } static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, - void *context, + u8 *context, u32 *num_ppcs, u32 *ppc_mask, u32 *reg_ppc_count) { @@ -7165,7 +7118,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, (num_pes_per_gpc > 1))) return -EINVAL; - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o()); *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32); *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32); @@ -7177,7 +7130,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, /* * This function will return the 32 bit offset for a priv register if it is - * present in the context buffer. + * present in the context buffer. The context buffer is in CPU memory. */ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 addr, @@ -7196,7 +7149,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 offset; u32 sys_priv_offset, gpc_priv_offset; u32 ppc_mask, reg_list_ppc_count; - void *context; + u8 *context; u32 offset_to_segment; gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); @@ -7207,13 +7160,13 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, if (err) return err; - context = context_buffer; + context = (u8 *)context_buffer; if (!check_main_image_header_magic(context)) { gk20a_err(dev_from_gk20a(g), "Invalid main header: magic value"); return -EINVAL; } - num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o()); /* Parse the FECS local header. */ context += ctxsw_prog_ucode_header_size_in_bytes(); @@ -7222,7 +7175,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, "Invalid FECS local header: magic value\n"); return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o()); sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); /* If found in Ext buffer, ok. 
@@ -7268,7 +7221,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o()); gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); err = gr_gk20a_determine_ppc_configuration(g, context, @@ -7277,7 +7230,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, if (err) return err; - num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0); + num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o()); if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) { gk20a_err(dev_from_gk20a(g), @@ -7689,9 +7642,9 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, { struct gk20a *g = ch->g; struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; - void *ctx_ptr = NULL; - void *pm_ctx_ptr = NULL; - void *base_ptr = NULL; + bool gr_ctx_ready = false; + bool pm_ctx_ready = false; + struct mem_desc *current_mem = NULL; bool ch_is_curr_ctx, restart_gr_ctxsw = false; u32 i, j, offset, v; struct gr_gk20a *gr = &g->gr; @@ -7821,20 +7774,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), ctx_ops[i].quad); if (!err) { - if (!ctx_ptr) { + if (!gr_ctx_ready) { /* would have been a variant of * gr_gk20a_apply_instmem_overrides, * recoded in-place instead. */ - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { + if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem)) { err = -ENOMEM; goto cleanup; } + gr_ctx_ready = true; } - base_ptr = ctx_ptr; + current_mem = &ch_ctx->gr_ctx->mem; } else { err = gr_gk20a_get_pm_ctx_buffer_offsets(g, ctx_ops[i].offset, @@ -7849,7 +7800,7 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; continue; } - if (!pm_ctx_ptr) { + if (!pm_ctx_ready) { /* Make sure ctx buffer was initialized */ if (!ch_ctx->pm_ctx.mem.pages) { gk20a_err(dev_from_gk20a(g), @@ -7857,15 +7808,13 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, err = -EINVAL; goto cleanup; } - pm_ctx_ptr = vmap(ch_ctx->pm_ctx.mem.pages, - PAGE_ALIGN(ch_ctx->pm_ctx.mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!pm_ctx_ptr) { + if (gk20a_mem_begin(g, &ch_ctx->pm_ctx.mem)) { err = -ENOMEM; goto cleanup; } + pm_ctx_ready = true; } - base_ptr = pm_ctx_ptr; + current_mem = &ch_ctx->pm_ctx.mem; } /* if this is a quad access, setup for special access*/ @@ -7878,24 +7827,24 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* sanity check gr ctxt offsets, * don't write outside, worst case */ - if ((base_ptr == ctx_ptr) && + if ((current_mem == &ch_ctx->gr_ctx->mem) && (offsets[j] >= g->gr.ctx_vars.golden_image_size)) continue; if (pass == 0) { /* write pass */ - v = gk20a_mem_rd32(base_ptr + offsets[j], 0); + v = gk20a_mem_rd(g, current_mem, offsets[j]); v &= ~ctx_ops[i].and_n_mask_lo; v |= ctx_ops[i].value_lo; - gk20a_mem_wr32(base_ptr + offsets[j], 0, v); + gk20a_mem_wr(g, current_mem, offsets[j], v); gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", offsets[j], v); if (ctx_ops[i].op == REGOP(WRITE_64)) { - v = gk20a_mem_rd32(base_ptr + offsets[j] + 4, 0); + v = gk20a_mem_rd(g, current_mem, offsets[j] + 4); v &= ~ctx_ops[i].and_n_mask_hi; v |= ctx_ops[i].value_hi; - gk20a_mem_wr32(base_ptr + offsets[j] + 4, 0, v); + gk20a_mem_wr(g, current_mem, offsets[j] + 4, v); 
gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", @@ -7905,18 +7854,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* check to see if we need to add a special WAR for some of the SMPC perf regs */ gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], - v, base_ptr); + v, current_mem); } else { /* read pass */ ctx_ops[i].value_lo = - gk20a_mem_rd32(base_ptr + offsets[0], 0); + gk20a_mem_rd(g, current_mem, offsets[0]); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", offsets[0], ctx_ops[i].value_lo); if (ctx_ops[i].op == REGOP(READ_64)) { ctx_ops[i].value_hi = - gk20a_mem_rd32(base_ptr + offsets[0] + 4, 0); + gk20a_mem_rd(g, current_mem, offsets[0] + 4); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", @@ -7943,12 +7892,10 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, if (ch_ctx->patch_ctx.mem.cpu_va) gr_gk20a_ctx_patch_write_end(g, ch_ctx); - - if (ctx_ptr) - vunmap(ctx_ptr); - - if (pm_ctx_ptr) - vunmap(pm_ctx_ptr); + if (gr_ctx_ready) + gk20a_mem_end(g, &ch_ctx->gr_ctx->mem); + if (pm_ctx_ready) + gk20a_mem_end(g, &ch_ctx->pm_ctx.mem); if (restart_gr_ctxsw) { int tmp_err = gr_gk20a_enable_ctxsw(g); diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 6f6734b4..13382416 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -44,6 +44,112 @@ #include "kind_gk20a.h" #include "semaphore_gk20a.h" +int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem) +{ + void *cpu_va; + + if (WARN_ON(mem->cpu_va)) { + gk20a_warn(dev_from_gk20a(g), "nested %s", __func__); + return -EBUSY; + } + + cpu_va = vmap(mem->pages, + PAGE_ALIGN(mem->size) >> PAGE_SHIFT, + 0, pgprot_writecombine(PAGE_KERNEL)); + + if (WARN_ON(!cpu_va)) + return -ENOMEM; + + mem->cpu_va = cpu_va; + return 0; +} + +void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem) +{ + vunmap(mem->cpu_va); + mem->cpu_va = NULL; +} + +u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w) +{ + u32 *ptr = mem->cpu_va; + u32 data; + + WARN_ON(!ptr); + data = ptr[w]; +#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); +#endif + return data; +} + +u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset) +{ + WARN_ON(offset & 3); + return gk20a_mem_rd32(g, mem, offset / sizeof(u32)); +} + +void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, + u32 offset, void *dest, u32 size) +{ + u32 i; + u32 *dest_u32 = dest; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + dest_u32[i] = gk20a_mem_rd32(g, mem, offset + i); +} + +void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data) +{ + u32 *ptr = mem->cpu_va; + + WARN_ON(!ptr); +#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); +#endif + ptr[w] = data; +} + +void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data) +{ + WARN_ON(offset & 3); + gk20a_mem_wr32(g, mem, offset / sizeof(u32), data); +} + +void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *src, u32 size) +{ + u32 i; + u32 *src_u32 = src; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + gk20a_mem_wr32(g, mem, offset + i, src_u32[i]); +} + +void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset, + u32 value, u32 size) +{ + u32 i; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + 
offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + gk20a_mem_wr32(g, mem, offset + i, value); +} + /* * GPU mapping life cycle * ====================== @@ -780,9 +886,14 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm, *pde_lo, *pde_hi); } -u32 *pde_from_index(struct vm_gk20a *vm, u32 i) +static u32 pde_from_index(u32 i) +{ + return i * gmmu_pde__size_v() / sizeof(u32); +} + +static u32 pte_from_index(u32 i) { - return (u32 *) (((u8 *)vm->pdb.mem.cpu_va) + i*gmmu_pde__size_v()); + return i * gmmu_pte__size_v() / sizeof(u32); } u32 pte_index_from_vaddr(struct vm_gk20a *vm, @@ -2323,7 +2434,7 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm, u64 pte_addr_small = 0, pte_addr_big = 0; struct gk20a_mm_entry *entry = vm->pdb.entries + i; u32 pde_v[2] = {0, 0}; - u32 *pde; + u32 pde; gk20a_dbg_fn(""); @@ -2348,10 +2459,10 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm, (big_valid ? (gmmu_pde_vol_big_true_f()) : gmmu_pde_vol_big_false_f()); - pde = pde_from_index(vm, i); + pde = pde_from_index(i); - gk20a_mem_wr32(pde, 0, pde_v[0]); - gk20a_mem_wr32(pde, 1, pde_v[1]); + gk20a_mem_wr32(g, &vm->pdb.mem, pde + 0, pde_v[0]); + gk20a_mem_wr32(g, &vm->pdb.mem, pde + 1, pde_v[1]); gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); @@ -2432,8 +2543,8 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm, gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); } - gk20a_mem_wr32(pte->mem.cpu_va + i*8, 0, pte_w[0]); - gk20a_mem_wr32(pte->mem.cpu_va + i*8, 1, pte_w[1]); + gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 0, pte_w[0]); + gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 1, pte_w[1]); if (*iova) { *iova += page_size; @@ -3489,19 +3600,19 @@ static int gk20a_init_cde_vm(struct mm_gk20a *mm) false, false, "cde"); } -void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr) +void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr) { u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); - gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(), + gk20a_mem_wr32(g, mem, ram_in_page_dir_base_lo_w(), (g->mm.vidmem_is_vidmem ? 
ram_in_page_dir_base_target_sys_mem_ncoh_f() : ram_in_page_dir_base_target_vid_mem_f()) | ram_in_page_dir_base_vol_true_f() | ram_in_page_dir_base_lo_f(pdb_addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(), + gk20a_mem_wr32(g, mem, ram_in_page_dir_base_hi_w(), ram_in_page_dir_base_hi_f(pdb_addr_hi)); } @@ -3510,23 +3621,22 @@ void gk20a_init_inst_block(struct mem_desc *inst_block, struct vm_gk20a *vm, { struct gk20a *g = gk20a_from_vm(vm); u64 pde_addr = g->ops.mm.get_iova_addr(g, vm->pdb.mem.sgt->sgl, 0); - void *inst_ptr = inst_block->cpu_va; gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p", - gk20a_mm_inst_block_addr(g, inst_block), inst_ptr); + gk20a_mm_inst_block_addr(g, inst_block), inst_block->cpu_va); gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr); - g->ops.mm.init_pdb(g, inst_ptr, pde_addr); + g->ops.mm.init_pdb(g, inst_block, pde_addr); - gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(), + gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(), u64_lo32(vm->va_limit - 1) & ~0xfff); - gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(), + gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(), ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1))); if (big_page_size && g->ops.mm.set_big_page_size) - g->ops.mm.set_big_page_size(g, inst_ptr, big_page_size); + g->ops.mm.set_big_page_size(g, inst_block, big_page_size); } int gk20a_mm_fb_flush(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 7fa0b7fb..e9ac8f18 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -419,6 +419,34 @@ static inline enum gmmu_pgsz_gk20a __get_pte_size(struct vm_gk20a *vm, return gmmu_page_size_small; } +/* + * Buffer accessors - wrap between begin() and end() if there is no permanent + * kernel mapping for this buffer. 
+ */ + +int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem); +/* nop for null mem, like with free() or vunmap() */ +void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem); + +/* word-indexed offset */ +u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w); +/* byte offset (32b-aligned) */ +u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset); +/* memcpy to cpu, offset and size in bytes (32b-aligned) */ +void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *dest, u32 size); + +/* word-indexed offset */ +void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data); +/* byte offset (32b-aligned) */ +void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data); +/* memcpy from cpu, offset and size in bytes (32b-aligned) */ +void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *src, u32 size); +/* size and offset in bytes (32b-aligned), filled with u32s */ +void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset, + u32 value, u32 size); + #if 0 /*related to addr bits above, concern below TBD on which is accurate */ #define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\ bus_bar1_block_ptr_s()) @@ -673,7 +701,6 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm, u64 addr_lo, u64 addr_hi, u32 *pde_lo, u32 *pde_hi); int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm); -u32 *pde_from_index(struct vm_gk20a *vm, u32 i); u32 pte_index_from_vaddr(struct vm_gk20a *vm, u64 addr, enum gmmu_pgsz_gk20a pgsz_idx); void free_gmmu_pages(struct vm_gk20a *vm, @@ -685,7 +712,7 @@ struct gpu_ops; void gk20a_init_mm(struct gpu_ops *gops); const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g, u32 big_page_size); -void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr); +void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr); void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block); diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c index 56ad0c2a..54b2eef4 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c @@ -2421,11 +2421,10 @@ static int gk20a_init_pmu_reset_enable_hw(struct gk20a *g) static int gk20a_prepare_ucode(struct gk20a *g) { struct pmu_gk20a *pmu = &g->pmu; - int i, err = 0; + int err = 0; struct device *d = dev_from_gk20a(g); struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = &mm->pmu.vm; - void *ucode_ptr; if (g->pmu_fw) { gk20a_init_pmu(pmu); @@ -2449,11 +2448,8 @@ static int gk20a_prepare_ucode(struct gk20a *g) if (err) goto err_release_fw; - ucode_ptr = pmu->ucode.cpu_va; - - for (i = 0; i < (pmu->desc->app_start_offset + - pmu->desc->app_size) >> 2; i++) - gk20a_mem_wr32(ucode_ptr, i, pmu->ucode_image[i]); + gk20a_mem_wr_n(g, &pmu->ucode, 0, pmu->ucode_image, + pmu->desc->app_start_offset + pmu->desc->app_size); gk20a_init_pmu(pmu); diff --git a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c index 0e6e715d..3ac2cec8 100644 --- a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c @@ -43,8 +43,8 @@ static int lsfm_add_ucode_img(struct gk20a *g, struct ls_flcn_mgr *plsfm, static void lsfm_free_ucode_img_res(struct flcn_ucode_img *p_img); static void lsfm_free_nonpmu_ucode_img_res(struct flcn_ucode_img *p_img); static int lsf_gen_wpr_requirements(struct gk20a *g, struct ls_flcn_mgr *plsfm); -static int lsfm_init_wpr_contents(struct gk20a *g, struct 
ls_flcn_mgr *plsfm, - void *nonwpr_addr); +static void lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, + struct mem_desc *nonwpr); static int acr_ucode_patch_sig(struct gk20a *g, unsigned int *p_img, unsigned int *p_prod_sig, @@ -355,7 +355,7 @@ int prepare_ucode_blob(struct gk20a *g) gm20b_dbg_pmu("managed LS falcon %d, WPR size %d bytes.\n", plsfm->managed_flcn_cnt, plsfm->wpr_size); - lsfm_init_wpr_contents(g, plsfm, g->acr.ucode_blob.cpu_va); + lsfm_init_wpr_contents(g, plsfm, &g->acr.ucode_blob); } else { gm20b_dbg_pmu("LSFM is managing no falcons.\n"); } @@ -613,120 +613,91 @@ static int lsfm_fill_flcn_bl_gen_desc(struct gk20a *g, } /* Initialize WPR contents */ -static int lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, - void *nonwpr_addr) +static void lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, + struct mem_desc *ucode) { + struct lsfm_managed_ucode_img *pnode = plsfm->ucode_img_list; + u32 i; - int status = 0; - union flcn_bl_generic_desc *nonwpr_bl_gen_desc; - if (nonwpr_addr == NULL) { - status = -ENOMEM; - } else { - struct lsfm_managed_ucode_img *pnode = plsfm->ucode_img_list; - struct lsf_wpr_header *wpr_hdr; - struct lsf_lsb_header *lsb_hdr; - void *ucode_off; - u32 i; - - /* The WPR array is at the base of the WPR */ - wpr_hdr = (struct lsf_wpr_header *)nonwpr_addr; - pnode = plsfm->ucode_img_list; - i = 0; + /* The WPR array is at the base of the WPR */ + pnode = plsfm->ucode_img_list; + i = 0; - /* - * Walk the managed falcons, flush WPR and LSB headers to FB. - * flush any bl args to the storage area relative to the - * ucode image (appended on the end as a DMEM area). - */ - while (pnode) { - /* Flush WPR header to memory*/ - memcpy(&wpr_hdr[i], &pnode->wpr_header, - sizeof(struct lsf_wpr_header)); - gm20b_dbg_pmu("wpr header as in memory and pnode\n"); - gm20b_dbg_pmu("falconid :%d %d\n", - pnode->wpr_header.falcon_id, - wpr_hdr[i].falcon_id); - gm20b_dbg_pmu("lsb_offset :%x %x\n", - pnode->wpr_header.lsb_offset, - wpr_hdr[i].lsb_offset); - gm20b_dbg_pmu("bootstrap_owner :%d %d\n", - pnode->wpr_header.bootstrap_owner, - wpr_hdr[i].bootstrap_owner); - gm20b_dbg_pmu("lazy_bootstrap :%d %d\n", - pnode->wpr_header.lazy_bootstrap, - wpr_hdr[i].lazy_bootstrap); - gm20b_dbg_pmu("status :%d %d\n", - pnode->wpr_header.status, wpr_hdr[i].status); - - /*Flush LSB header to memory*/ - lsb_hdr = (struct lsf_lsb_header *)((u8 *)nonwpr_addr + - pnode->wpr_header.lsb_offset); - memcpy(lsb_hdr, &pnode->lsb_header, - sizeof(struct lsf_lsb_header)); - gm20b_dbg_pmu("lsb header as in memory and pnode\n"); - gm20b_dbg_pmu("ucode_off :%x %x\n", - pnode->lsb_header.ucode_off, - lsb_hdr->ucode_off); - gm20b_dbg_pmu("ucode_size :%x %x\n", - pnode->lsb_header.ucode_size, - lsb_hdr->ucode_size); - gm20b_dbg_pmu("data_size :%x %x\n", - pnode->lsb_header.data_size, - lsb_hdr->data_size); - gm20b_dbg_pmu("bl_code_size :%x %x\n", - pnode->lsb_header.bl_code_size, - lsb_hdr->bl_code_size); - gm20b_dbg_pmu("bl_imem_off :%x %x\n", - pnode->lsb_header.bl_imem_off, - lsb_hdr->bl_imem_off); - gm20b_dbg_pmu("bl_data_off :%x %x\n", - pnode->lsb_header.bl_data_off, - lsb_hdr->bl_data_off); - gm20b_dbg_pmu("bl_data_size :%x %x\n", - pnode->lsb_header.bl_data_size, - lsb_hdr->bl_data_size); - gm20b_dbg_pmu("app_code_off :%x %x\n", - pnode->lsb_header.app_code_off, - lsb_hdr->app_code_off); - gm20b_dbg_pmu("app_code_size :%x %x\n", - pnode->lsb_header.app_code_size, - lsb_hdr->app_code_size); - gm20b_dbg_pmu("app_data_off :%x %x\n", - 
pnode->lsb_header.app_data_off, - lsb_hdr->app_data_off); - gm20b_dbg_pmu("app_data_size :%x %x\n", - pnode->lsb_header.app_data_size, - lsb_hdr->app_data_size); - gm20b_dbg_pmu("flags :%x %x\n", - pnode->lsb_header.flags, lsb_hdr->flags); - - /*If this falcon has a boot loader and related args, - * flush them.*/ - if (!pnode->ucode_img.header) { - nonwpr_bl_gen_desc = - (union flcn_bl_generic_desc *) - ((u8 *)nonwpr_addr + - pnode->lsb_header.bl_data_off); - - /*Populate gen bl and flush to memory*/ - lsfm_fill_flcn_bl_gen_desc(g, pnode); - memcpy(nonwpr_bl_gen_desc, &pnode->bl_gen_desc, + /* + * Walk the managed falcons, flush WPR and LSB headers to FB. + * flush any bl args to the storage area relative to the + * ucode image (appended on the end as a DMEM area). + */ + while (pnode) { + /* Flush WPR header to memory*/ + gk20a_mem_wr_n(g, ucode, i * sizeof(pnode->wpr_header), + &pnode->wpr_header, sizeof(pnode->wpr_header)); + + gm20b_dbg_pmu("wpr header"); + gm20b_dbg_pmu("falconid :%d", + pnode->wpr_header.falcon_id); + gm20b_dbg_pmu("lsb_offset :%x", + pnode->wpr_header.lsb_offset); + gm20b_dbg_pmu("bootstrap_owner :%d", + pnode->wpr_header.bootstrap_owner); + gm20b_dbg_pmu("lazy_bootstrap :%d", + pnode->wpr_header.lazy_bootstrap); + gm20b_dbg_pmu("status :%d", + pnode->wpr_header.status); + + /*Flush LSB header to memory*/ + gk20a_mem_wr_n(g, ucode, pnode->wpr_header.lsb_offset, + &pnode->lsb_header, sizeof(pnode->lsb_header)); + + gm20b_dbg_pmu("lsb header"); + gm20b_dbg_pmu("ucode_off :%x", + pnode->lsb_header.ucode_off); + gm20b_dbg_pmu("ucode_size :%x", + pnode->lsb_header.ucode_size); + gm20b_dbg_pmu("data_size :%x", + pnode->lsb_header.data_size); + gm20b_dbg_pmu("bl_code_size :%x", + pnode->lsb_header.bl_code_size); + gm20b_dbg_pmu("bl_imem_off :%x", + pnode->lsb_header.bl_imem_off); + gm20b_dbg_pmu("bl_data_off :%x", + pnode->lsb_header.bl_data_off); + gm20b_dbg_pmu("bl_data_size :%x", + pnode->lsb_header.bl_data_size); + gm20b_dbg_pmu("app_code_off :%x", + pnode->lsb_header.app_code_off); + gm20b_dbg_pmu("app_code_size :%x", + pnode->lsb_header.app_code_size); + gm20b_dbg_pmu("app_data_off :%x", + pnode->lsb_header.app_data_off); + gm20b_dbg_pmu("app_data_size :%x", + pnode->lsb_header.app_data_size); + gm20b_dbg_pmu("flags :%x", + pnode->lsb_header.flags); + + /*If this falcon has a boot loader and related args, + * flush them.*/ + if (!pnode->ucode_img.header) { + /*Populate gen bl and flush to memory*/ + lsfm_fill_flcn_bl_gen_desc(g, pnode); + gk20a_mem_wr_n(g, ucode, + pnode->lsb_header.bl_data_off, + &pnode->bl_gen_desc, pnode->bl_gen_desc_size); - } - ucode_off = (void *)(pnode->lsb_header.ucode_off + - (u8 *)nonwpr_addr); - /*Copying of ucode*/ - memcpy(ucode_off, pnode->ucode_img.data, - pnode->ucode_img.data_size); - pnode = pnode->next; - i++; } - - /* Tag the terminator WPR header with an invalid falcon ID. */ - gk20a_mem_wr32(&wpr_hdr[plsfm->managed_flcn_cnt].falcon_id, - 0, LSF_FALCON_ID_INVALID); + /*Copying of ucode*/ + gk20a_mem_wr_n(g, ucode, pnode->lsb_header.ucode_off, + pnode->ucode_img.data, + pnode->ucode_img.data_size); + pnode = pnode->next; + i++; } - return status; + + /* Tag the terminator WPR header with an invalid falcon ID. */ + gk20a_mem_wr32(g, ucode, + plsfm->managed_flcn_cnt * sizeof(struct lsf_wpr_header) + + offsetof(struct lsf_wpr_header, falcon_id), + LSF_FALCON_ID_INVALID); } /*! 
@@ -1000,7 +971,7 @@ int gm20b_bootstrap_hs_flcn(struct gk20a *g) { struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = &mm->pmu.vm; - int i, err = 0; + int err = 0; u64 *acr_dmem; u32 img_size_in_bytes = 0; u32 status, size; @@ -1066,10 +1037,8 @@ int gm20b_bootstrap_hs_flcn(struct gk20a *g) ((struct flcn_acr_desc *)acr_dmem)->regions.no_regions = 2; ((struct flcn_acr_desc *)acr_dmem)->wpr_offset = 0; - for (i = 0; i < (img_size_in_bytes/4); i++) { - gk20a_mem_wr32(acr->acr_ucode.cpu_va, i, - acr_ucode_data_t210_load[i]); - } + gk20a_mem_wr_n(g, &acr->acr_ucode, 0, + acr_ucode_data_t210_load, img_size_in_bytes); /* * In order to execute this binary, we will be using * a bootloader which will load this image into PMU IMEM/DMEM. @@ -1323,7 +1292,7 @@ int pmu_exec_gen_bl(struct gk20a *g, void *desc, u8 b_wait_for_halt) struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = &mm->pmu.vm; struct device *d = dev_from_gk20a(g); - int i, err = 0; + int err = 0; u32 bl_sz; struct acr_gm20b *acr = &g->acr; const struct firmware *hsbl_fw = acr->hsbl_fw; @@ -1369,8 +1338,7 @@ int pmu_exec_gen_bl(struct gk20a *g, void *desc, u8 b_wait_for_halt) goto err_free_ucode; } - for (i = 0; i < (bl_sz) >> 2; i++) - gk20a_mem_wr32(acr->hsbl_ucode.cpu_va, i, pmu_bl_gm10x[i]); + gk20a_mem_wr_n(g, &acr->hsbl_ucode, 0, pmu_bl_gm10x, bl_sz); gm20b_dbg_pmu("Copied bl ucode to bl_cpuva\n"); } /* diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c index b9a1e685..2197bae5 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c @@ -849,7 +849,7 @@ static int gr_gm20b_alloc_gr_ctx(struct gk20a *g, static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, - void *ctx_ptr) + struct mem_desc *mem) { struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; u32 cta_preempt_option = @@ -859,7 +859,8 @@ static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g, if (gr_ctx->compute_preempt_mode == NVGPU_COMPUTE_PREEMPTION_MODE_CTA) { gk20a_dbg_info("CTA: %x", cta_preempt_option); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_preemption_options_o(), 0, + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_preemption_options_o(), cta_preempt_option); } @@ -1005,7 +1006,7 @@ static int gr_gm20b_update_pc_sampling(struct channel_gk20a *c, bool enable) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; + struct mem_desc *mem; u32 v; gk20a_dbg_fn(""); @@ -1013,18 +1014,17 @@ static int gr_gm20b_update_pc_sampling(struct channel_gk20a *c, if (!ch_ctx || !ch_ctx->gr_ctx || c->vpr) return -EINVAL; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + mem = &ch_ctx->gr_ctx->mem; + + if (gk20a_mem_begin(c->g, mem)) return -ENOMEM; - v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + v = gk20a_mem_rd(c->g, mem, ctxsw_prog_main_image_pm_o()); v &= ~ctxsw_prog_main_image_pm_pc_sampling_m(); v |= ctxsw_prog_main_image_pm_pc_sampling_f(enable); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, v); + gk20a_mem_wr(c->g, mem, ctxsw_prog_main_image_pm_o(), v); - vunmap(ctx_ptr); + gk20a_mem_end(c->g, mem); gk20a_dbg_fn("done"); @@ -1089,13 +1089,13 @@ static void gr_gm20b_init_cyclestats(struct gk20a *g) #endif } -static void gr_gm20b_enable_cde_in_fecs(void *ctx_ptr) +static void gr_gm20b_enable_cde_in_fecs(struct gk20a *g, struct mem_desc *mem) { u32 cde_v; - cde_v = gk20a_mem_rd32(ctx_ptr + 
ctxsw_prog_main_image_ctl_o(), 0); + cde_v = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_ctl_o()); cde_v |= ctxsw_prog_main_image_ctl_cde_enabled_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_ctl_o(), 0, cde_v); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_ctl_o(), cde_v); } static void gr_gm20b_bpt_reg_info(struct gk20a *g, struct warpstate *w_state) diff --git a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c index ac73b5c8..726d73ed 100644 --- a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c @@ -106,14 +106,14 @@ static void gm20b_mm_mmu_set_debug_mode(struct gk20a *g, bool enable) } static void gm20b_mm_set_big_page_size(struct gk20a *g, - void *inst_ptr, int size) + struct mem_desc *mem, int size) { u32 val; gk20a_dbg_fn(""); gk20a_dbg_info("big page size %d\n", size); - val = gk20a_mem_rd32(inst_ptr, ram_in_big_page_size_w()); + val = gk20a_mem_rd32(g, mem, ram_in_big_page_size_w()); val &= ~ram_in_big_page_size_m(); if (size == SZ_64K) @@ -121,7 +121,7 @@ static void gm20b_mm_set_big_page_size(struct gk20a *g, else val |= ram_in_big_page_size_128kb_f(); - gk20a_mem_wr32(inst_ptr, ram_in_big_page_size_w(), val); + gk20a_mem_wr32(g, mem, ram_in_big_page_size_w(), val); gk20a_dbg_fn("done"); } diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c index 66b5e410..d1cba979 100644 --- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c @@ -285,8 +285,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) mutex_init(&f->free_chs_mutex); for (chid = 0; chid < f->num_channels; chid++) { - f->channel[chid].userd_cpu_va = - f->userd.cpu_va + chid * f->userd_entry_size; f->channel[chid].userd_iova = g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) + chid * f->userd_entry_size; -- cgit v1.2.2
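
A minimal usage sketch of the new accessors (illustration only, not part of the patch): it assumes nothing beyond the prototypes added to mm_gk20a.h above, and mirrors the read-modify-write pattern used in gr_gm20b_update_pc_sampling(). The function name, the buffer passed in, the byte offset and the bitmask are hypothetical.

/* Read-modify-write one 32-bit word in a buffer through the mem_desc
 * accessors instead of a raw cpu_va pointer. */
static int example_rmw_word(struct gk20a *g, struct mem_desc *mem,
			    u32 byte_off, u32 set_bits)
{
	u32 v;

	/* Make the buffer CPU-accessible; pairs with gk20a_mem_end(),
	 * replacing the earlier open-coded vmap()/vunmap(). */
	if (gk20a_mem_begin(g, mem))
		return -ENOMEM;

	v = gk20a_mem_rd(g, mem, byte_off);	/* byte offset, 32b-aligned */
	v |= set_bits;
	gk20a_mem_wr(g, mem, byte_off, v);

	gk20a_mem_end(g, mem);
	return 0;
}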
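
Similarly, a sketch of the memcpy-like helpers that replace the per-word gk20a_mem_wr32() copy loops (as in the PMU and ACR ucode copies above). The function and parameter names here are made up for illustration; the gk20a_mem_wr_n() and gk20a_memset() signatures are the ones declared by this change.

/* Copy a CPU-side image into the buffer and zero-fill any remainder.
 * Offsets and sizes are in bytes and must be 32-bit aligned. */
static void example_load_image(struct gk20a *g, struct mem_desc *dst,
			       u32 *img, u32 img_size, u32 buf_size)
{
	gk20a_mem_wr_n(g, dst, 0, img, img_size);
	if (buf_size > img_size)
		gk20a_memset(g, dst, img_size, 0, buf_size - img_size);
}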