From 6eebc87d99f9f04b2b68e0bc0142c161ab3e669d Mon Sep 17 00:00:00 2001 From: Konsta Holtta Date: Thu, 12 May 2016 09:32:05 +0300 Subject: gpu: nvgpu: refactor gk20a_mem_{wr,rd} for vidmem To support vidmem, pass g and mem_desc to the buffer memory accessor functions. This allows the functions to select the memory access method based on the buffer aperture instead of using the cpu pointer directly (like until now). The selection and aperture support will be in another patch; this patch only refactors these accessors, but keeps the underlying functionality as-is. gk20a_mem_{rd,wr}32() work as previously; add also gk20a_mem_{rd,wr}() for byte-indexed accesses, gk20a_mem_{rd,wr}_n() for memcpy()-like functionality, and gk20a_memset() for filling buffers with a constant. The 8 and 16 bit accessor functions are removed. vmap()/vunmap() pairs are abstracted to gk20a_mem_{begin,end}() to support other types of mappings or conditions where mapping the buffer is unnecessary or different. Several function arguments that would access these buffers are also changed to take a mem_desc instead of a plain cpu pointer. Some relevant occasions are changed to use the accessor functions instead of cpu pointers without them (e.g., memcpying to and from), but the majority of direct accesses will be adjusted later, when the buffers are moved to support vidmem. JIRA DNVGPU-23 Change-Id: I3dd22e14290c4ab742d42e2dd327ebeb5cd3f25a Signed-off-by: Konsta Holtta Reviewed-on: http://git-master/r/1121143 Reviewed-by: Ken Adams Tested-by: Ken Adams --- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 73 +++--- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 1 - drivers/gpu/nvgpu/gk20a/debug_gk20a.c | 59 +++-- drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c | 27 +-- drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 2 - drivers/gpu/nvgpu/gk20a/gk20a.h | 57 +---- drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 359 ++++++++++++----------------- drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 144 ++++++++++-- drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 31 ++- drivers/gpu/nvgpu/gk20a/pmu_gk20a.c | 10 +- 10 files changed, 390 insertions(+), 373 deletions(-) (limited to 'drivers/gpu/nvgpu/gk20a') diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 990972e4..065e8ab1 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -129,28 +129,25 @@ static int channel_gk20a_commit_userd(struct channel_gk20a *c) { u32 addr_lo; u32 addr_hi; - void *inst_ptr; struct gk20a *g = c->g; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v()); addr_hi = u64_hi32(c->userd_iova); gk20a_dbg_info("channel %d : set ramfc userd 0x%16llx", c->hw_chid, (u64)c->userd_iova); - gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_w(), + gk20a_mem_wr32(g, &c->inst_block, + ram_in_ramfc_w() + ram_fc_userd_w(), (g->mm.vidmem_is_vidmem ? 
pbdma_userd_target_sys_mem_ncoh_f() : pbdma_userd_target_vid_mem_f()) | pbdma_userd_addr_f(addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_hi_w(), + gk20a_mem_wr32(g, &c->inst_block, + ram_in_ramfc_w() + ram_fc_userd_hi_w(), pbdma_userd_hi_addr_f(addr_hi)); return 0; @@ -186,13 +183,8 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g, static int channel_gk20a_set_schedule_params(struct channel_gk20a *c) { - void *inst_ptr; int shift = 0, value = 0; - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - gk20a_channel_get_timescale_from_timeslice(c->g, c->timeslice_us, &value, &shift); @@ -203,7 +195,7 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c) WARN_ON(c->g->ops.fifo.preempt_channel(c->g, c->hw_chid)); /* set new timeslice */ - gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_fc_runlist_timeslice_w(), value | (shift << 12) | fifo_runlist_timeslice_enable_true_f()); @@ -255,33 +247,30 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c) int channel_gk20a_setup_ramfc(struct channel_gk20a *c, u64 gpfifo_base, u32 gpfifo_entries, u32 flags) { - void *inst_ptr; + struct gk20a *g = c->g; + struct mem_desc *mem = &c->inst_block; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - - memset(inst_ptr, 0, ram_fc_size_val_v()); + gk20a_memset(g, mem, 0, 0, ram_fc_size_val_v()); - gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_w(), + gk20a_mem_wr32(g, mem, ram_fc_gp_base_w(), pbdma_gp_base_offset_f( u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s()))); - gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_hi_w(), + gk20a_mem_wr32(g, mem, ram_fc_gp_base_hi_w(), pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) | pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries))); - gk20a_mem_wr32(inst_ptr, ram_fc_signature_w(), + gk20a_mem_wr32(g, mem, ram_fc_signature_w(), c->g->ops.fifo.get_pbdma_signature(c->g)); - gk20a_mem_wr32(inst_ptr, ram_fc_formats_w(), + gk20a_mem_wr32(g, mem, ram_fc_formats_w(), pbdma_formats_gp_fermi0_f() | pbdma_formats_pb_fermi1_f() | pbdma_formats_mp_fermi0_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_pb_header_w(), + gk20a_mem_wr32(g, mem, ram_fc_pb_header_w(), pbdma_pb_header_priv_user_f() | pbdma_pb_header_method_zero_f() | pbdma_pb_header_subchannel_zero_f() | @@ -289,47 +278,49 @@ int channel_gk20a_setup_ramfc(struct channel_gk20a *c, pbdma_pb_header_first_true_f() | pbdma_pb_header_type_inc_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_subdevice_w(), + gk20a_mem_wr32(g, mem, ram_fc_subdevice_w(), pbdma_subdevice_id_f(1) | pbdma_subdevice_status_active_f() | pbdma_subdevice_channel_dma_enable_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_target_w(), pbdma_target_engine_sw_f()); + gk20a_mem_wr32(g, mem, ram_fc_target_w(), pbdma_target_engine_sw_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_acquire_w(), + gk20a_mem_wr32(g, mem, ram_fc_acquire_w(), channel_gk20a_pbdma_acquire_val(c)); - gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), + gk20a_mem_wr32(g, mem, ram_fc_runlist_timeslice_w(), fifo_runlist_timeslice_timeout_128_f() | fifo_runlist_timeslice_timescale_3_f() | fifo_runlist_timeslice_enable_true_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_pb_timeslice_w(), + gk20a_mem_wr32(g, mem, ram_fc_pb_timeslice_w(), fifo_pb_timeslice_timeout_16_f() | fifo_pb_timeslice_timescale_0_f() | fifo_pb_timeslice_enable_true_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid)); + gk20a_mem_wr32(g, mem, ram_fc_chid_w(), 
ram_fc_chid_id_f(c->hw_chid)); return channel_gk20a_commit_userd(c); } static int channel_gk20a_setup_userd(struct channel_gk20a *c) { - BUG_ON(!c->userd_cpu_va); + struct gk20a *g = c->g; + struct mem_desc *mem = &g->fifo.userd; + u32 offset = c->hw_chid * g->fifo.userd_entry_size / sizeof(u32); gk20a_dbg_fn(""); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_threshold_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_put_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_put_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_ref_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_put_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_ref_threshold_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_get_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_put_w(), 0); return 0; } diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 8840a3ae..b1355f92 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -130,7 +130,6 @@ struct channel_gk20a { struct mem_desc inst_block; struct mem_desc_sub ramfc; - void *userd_cpu_va; u64 userd_iova; u64 userd_gpu_va; diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c index c2285c8a..a3fa2ea5 100644 --- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c @@ -36,7 +36,7 @@ unsigned int gk20a_debug_trace_cmdbuf; struct ch_state { int pid; int refs; - u8 inst_block[0]; + u32 inst_block[0]; }; static const char * const ccsr_chan_status_str[] = { @@ -108,15 +108,15 @@ static void gk20a_debug_show_channel(struct gk20a *g, u32 channel = gk20a_readl(g, ccsr_channel_r(hw_chid)); u32 status = ccsr_channel_status_v(channel); u32 syncpointa, syncpointb; - void *inst_ptr; + u32 *inst_mem; if (!ch_state) return; - inst_ptr = &ch_state->inst_block[0]; + inst_mem = &ch_state->inst_block[0]; - syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w()); - syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w()); + syncpointa = inst_mem[ram_fc_syncpointa_w()]; + syncpointb = inst_mem[ram_fc_syncpointb_w()]; gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, dev_name(g->dev), @@ -129,23 +129,22 @@ static void gk20a_debug_show_channel(struct gk20a *g, gk20a_debug_output(o, "TOP: %016llx PUT: %016llx GET: %016llx " "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n" "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n", - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_top_level_get_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, - ram_fc_pb_top_level_get_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_w()) + - 
((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_hi_w()) << 32ULL), - gk20a_mem_rd32(inst_ptr, ram_fc_pb_header_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_pb_count_w()), + (u64)inst_mem[ram_fc_pb_top_level_get_w()] + + ((u64)inst_mem[ram_fc_pb_top_level_get_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_put_w()] + + ((u64)inst_mem[ram_fc_pb_put_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_get_w()] + + ((u64)inst_mem[ram_fc_pb_get_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_fetch_w()] + + ((u64)inst_mem[ram_fc_pb_fetch_hi_w()] << 32ULL), + inst_mem[ram_fc_pb_header_w()], + inst_mem[ram_fc_pb_count_w()], syncpointa, syncpointb, - gk20a_mem_rd32(inst_ptr, ram_fc_semaphorea_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphoreb_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphorec_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphored_w())); + inst_mem[ram_fc_semaphorea_w()], + inst_mem[ram_fc_semaphoreb_w()], + inst_mem[ram_fc_semaphorec_w()], + inst_mem[ram_fc_semaphored_w()]); #ifdef CONFIG_TEGRA_GK20A if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v()) @@ -246,17 +245,15 @@ void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o) for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *ch = &f->channel[chid]; - if (ch_state[chid]) { - if (ch->inst_block.cpu_va) { - ch_state[chid]->pid = ch->pid; - ch_state[chid]->refs = - atomic_read(&ch->ref_count); - memcpy(&ch_state[chid]->inst_block[0], - ch->inst_block.cpu_va, - ram_in_alloc_size_v()); - } - gk20a_channel_put(ch); - } + if (!ch_state[chid]) + continue; + + ch_state[chid]->pid = ch->pid; + ch_state[chid]->refs = atomic_read(&ch->ref_count); + gk20a_mem_rd_n(g, &ch->inst_block, 0, + &ch_state[chid]->inst_block[0], + ram_in_alloc_size_v()); + gk20a_channel_put(ch); } for (chid = 0; chid < f->num_channels; chid++) { if (ch_state[chid]) { diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c index f9cddc41..edddcdc1 100644 --- a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c @@ -619,7 +619,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, phys_addr_t pa; struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; struct gk20a_fecs_trace *trace = g->fecs_trace; - void *ctx_ptr; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch); gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, @@ -634,10 +634,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, if (!pa) return -ENOMEM; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0, - pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; lo = u64_lo32(pa); @@ -646,18 +643,18 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi, lo, GK20A_FECS_TRACE_NUM_RECORDS); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), - 0, lo); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), - 0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi)); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_control_o(), - 0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f( + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), + 
lo); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), + ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi)); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_control_o(), + ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f( GK20A_FECS_TRACE_NUM_RECORDS)); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid); return 0; diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index dc3debf2..71400331 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -520,8 +520,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) mutex_init(&f->free_chs_mutex); for (chid = 0; chid < f->num_channels; chid++) { - f->channel[chid].userd_cpu_va = - f->userd.cpu_va + chid * f->userd_entry_size; f->channel[chid].userd_iova = g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) + chid * f->userd_entry_size; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index f228cce4..2f85bf96 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -201,7 +201,7 @@ struct gpu_ops { struct gr_ctx_desc *gr_ctx); void (*update_ctxsw_preemption_mode)(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, - void *ctx_ptr); + struct mem_desc *mem); int (*update_smpc_ctxsw_mode)(struct gk20a *g, struct channel_gk20a *c, bool enable); @@ -221,7 +221,8 @@ struct gpu_ops { int (*wait_empty)(struct gk20a *g, unsigned long end_jiffies, u32 expect_delay); void (*init_cyclestats)(struct gk20a *g); - void (*enable_cde_in_fecs)(void *ctx_ptr); + void (*enable_cde_in_fecs)(struct gk20a *g, + struct mem_desc *mem); int (*set_sm_debug_mode)(struct gk20a *g, struct channel_gk20a *ch, u64 sms, bool enable); void (*bpt_reg_info)(struct gk20a *g, @@ -484,7 +485,7 @@ struct gpu_ops { void (*cbc_clean)(struct gk20a *g); void (*tlb_invalidate)(struct vm_gk20a *vm); void (*set_big_page_size)(struct gk20a *g, - void *inst_ptr, int size); + struct mem_desc *mem, int size); u32 (*get_big_page_sizes)(void); u32 (*get_physical_addr_bits)(struct gk20a *g); int (*init_mm_setup_hw)(struct gk20a *g); @@ -493,7 +494,8 @@ struct gpu_ops { void (*remove_bar2_vm)(struct gk20a *g); const struct gk20a_mmu_level * (*get_mmu_levels)(struct gk20a *g, u32 big_page_size); - void (*init_pdb)(struct gk20a *g, void *inst_ptr, u64 pdb_addr); + void (*init_pdb)(struct gk20a *g, struct mem_desc *mem, + u64 pdb_addr); u64 (*get_iova_addr)(struct gk20a *g, struct scatterlist *sgl, u32 flags); int (*bar1_bind)(struct gk20a *g, u64 bar1_iova); @@ -859,53 +861,6 @@ do { \ #define gk20a_dbg_info(fmt, arg...) 
\ gk20a_dbg(gpu_dbg_info, fmt, ##arg) -/* mem access with dbg_mem logging */ -static inline u8 gk20a_mem_rd08(void *ptr, int b) -{ - u8 _b = ((const u8 *)ptr)[b]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, _b); -#endif - return _b; -} -static inline u16 gk20a_mem_rd16(void *ptr, int s) -{ - u16 _s = ((const u16 *)ptr)[s]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, _s); -#endif - return _s; -} -static inline u32 gk20a_mem_rd32(void *ptr, int w) -{ - u32 _w = ((const u32 *)ptr)[w]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + sizeof(u32)*w, _w); -#endif - return _w; -} -static inline void gk20a_mem_wr08(void *ptr, int b, u8 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, data); -#endif - ((u8 *)ptr)[b] = data; -} -static inline void gk20a_mem_wr16(void *ptr, int s, u16 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, data); -#endif - ((u16 *)ptr)[s] = data; -} -static inline void gk20a_mem_wr32(void *ptr, int w, u32 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u32)*w, data); -#endif - ((u32 *)ptr)[w] = data; -} - void gk20a_init_clk_ops(struct gpu_ops *gops); /* register accessors */ diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 4e7c36ee..e7e6662a 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -97,22 +97,18 @@ int gr_gk20a_get_ctx_id(struct gk20a *g, u32 *ctx_id) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem)) return -ENOMEM; - *ctx_id = gk20a_mem_rd32(ctx_ptr + - ctxsw_prog_main_image_context_id_o(), 0); + *ctx_id = gk20a_mem_rd(g, &ch_ctx->gr_ctx->mem, + ctxsw_prog_main_image_context_id_o()); - vunmap(ctx_ptr); + gk20a_mem_end(g, &ch_ctx->gr_ctx->mem); return 0; } @@ -619,22 +615,17 @@ static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va) { u32 addr_lo; u32 addr_hi; - void *inst_ptr = NULL; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - addr_lo = u64_lo32(gpu_va) >> 12; addr_hi = u64_hi32(gpu_va); - gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(), ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() | ram_in_gr_wfi_ptr_lo_f(addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(), ram_in_gr_wfi_ptr_hi_f(addr_hi)); return 0; @@ -658,11 +649,7 @@ int gr_gk20a_ctx_patch_write_begin(struct gk20a *g, return -EBUSY; } - ch_ctx->patch_ctx.mem.cpu_va = vmap(ch_ctx->patch_ctx.mem.pages, - PAGE_ALIGN(ch_ctx->patch_ctx.mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - - if (!ch_ctx->patch_ctx.mem.cpu_va) + if (gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem)) return -ENOMEM; return 0; @@ -677,8 +664,7 @@ int gr_gk20a_ctx_patch_write_end(struct gk20a *g, return -EINVAL; } - vunmap(ch_ctx->patch_ctx.mem.cpu_va); - ch_ctx->patch_ctx.mem.cpu_va = NULL; + gk20a_mem_end(g, &ch_ctx->patch_ctx.mem); return 0; } @@ -687,7 +673,6 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g, u32 addr, u32 data, bool patch) { u32 patch_slot = 0; - void *patch_ptr = NULL; bool mapped_here = false; BUG_ON(patch != 0 && ch_ctx == NULL); @@ -708,11 +693,10 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g, } else mapped_here = false; - patch_ptr = ch_ctx->patch_ctx.mem.cpu_va; patch_slot = ch_ctx->patch_ctx.data_count * 2; - gk20a_mem_wr32(patch_ptr, patch_slot++, addr); - gk20a_mem_wr32(patch_ptr, patch_slot++, data); + gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, addr); + gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, data); ch_ctx->patch_ctx.data_count++; @@ -760,16 +744,13 @@ static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g, static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; u32 va_lo, va_hi, va; int ret = 0; - void *ctx_ptr = NULL; gk20a_dbg_fn(""); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; if (ch_ctx->zcull_ctx.gpu_va == 0 && @@ -792,15 +773,17 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) goto clean_up; } - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0, + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_zcull_o(), ch_ctx->zcull_ctx.ctx_sw_mode); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_zcull_ptr_o(), va); c->g->ops.fifo.enable_channel(c); clean_up: - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); return ret; } @@ -1500,8 +1483,8 @@ static int 
gr_gk20a_init_golden_ctx_image(struct gk20a *g, u32 ctx_header_words; u32 i; u32 data; - void *ctx_ptr = NULL; - void *gold_ptr = NULL; + struct mem_desc *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem; + struct mem_desc *gr_mem = &ch_ctx->gr_ctx->mem; u32 err = 0; gk20a_dbg_fn(""); @@ -1527,16 +1510,10 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, if (err) goto clean_up; - gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].mem.pages, - PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].mem.size) >> - PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!gold_ptr) + if (gk20a_mem_begin(g, gold_mem)) goto clean_up; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, gr_mem)) goto clean_up; ctx_header_words = roundup(ctx_header_bytes, sizeof(u32)); @@ -1545,14 +1522,14 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, g->ops.mm.l2_flush(g, true); for (i = 0; i < ctx_header_words; i++) { - data = gk20a_mem_rd32(ctx_ptr, i); - gk20a_mem_wr32(gold_ptr, i, data); + data = gk20a_mem_rd32(g, gr_mem, i); + gk20a_mem_wr32(g, gold_mem, i, data); } - gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0, + gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(), ctxsw_prog_main_image_zcull_mode_no_ctxsw_v()); - gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0); + gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_ptr_o(), 0); gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]); @@ -1568,12 +1545,12 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, goto clean_up; } - for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) - gr->ctx_vars.local_golden_image[i] = - gk20a_mem_rd32(gold_ptr, i); + gk20a_mem_rd_n(g, gold_mem, 0, + gr->ctx_vars.local_golden_image, + gr->ctx_vars.golden_image_size); } - gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); + gr_gk20a_commit_inst(c, gr_mem->gpu_va); gr->ctx_vars.golden_image_initialized = true; @@ -1586,10 +1563,8 @@ clean_up: else gk20a_dbg_fn("done"); - if (gold_ptr) - vunmap(gold_ptr); - if (ctx_ptr) - vunmap(ctx_ptr); + gk20a_mem_end(g, gold_mem); + gk20a_mem_end(g, gr_mem); mutex_unlock(&gr->ctx_mutex); return err; @@ -1600,7 +1575,7 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, bool enable_smpc_ctxsw) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; + struct mem_desc *mem; u32 data; int ret; @@ -1611,46 +1586,39 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, return -EFAULT; } + mem = &ch_ctx->gr_ctx->mem; + c->g->ops.fifo.disable_channel(c); ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); if (ret) { - c->g->ops.fifo.enable_channel(c); - gk20a_err(dev_from_gk20a(g), - "failed to preempt channel\n"); - return ret; + gk20a_err(dev_from_gk20a(g), "failed to preempt channel"); + goto out; } /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - if (!ch_ctx->gr_ctx) { - gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); - return -EFAULT; - } - - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { - c->g->ops.fifo.enable_channel(c); - return -ENOMEM; + if (gk20a_mem_begin(g, mem)) { + ret = -ENOMEM; + goto out; } - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, mem, + ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m(); data |= enable_smpc_ctxsw ? ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() : ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, - data); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_pm_o(), + data); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); - /* enable channel */ +out: c->g->ops.fifo.enable_channel(c); - - return 0; + return ret; } int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, @@ -1659,8 +1627,7 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; - void *ctx_ptr = NULL; - void *pm_ctx_ptr; + struct mem_desc *gr_mem; u32 data, virt_addr; int ret; @@ -1671,6 +1638,8 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, return -EFAULT; } + gr_mem = &ch_ctx->gr_ctx->mem; + if (enable_hwpm_ctxsw) { if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) return 0; @@ -1721,29 +1690,22 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, } /* Now clear the buffer */ - pm_ctx_ptr = vmap(pm_ctx->mem.pages, - PAGE_ALIGN(pm_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - - if (!pm_ctx_ptr) { + if (gk20a_mem_begin(g, &pm_ctx->mem)) { ret = -ENOMEM; goto cleanup_pm_buf; } - memset(pm_ctx_ptr, 0, pm_ctx->mem.size); + gk20a_memset(g, &pm_ctx->mem, 0, 0, pm_ctx->mem.size); - vunmap(pm_ctx_ptr); + gk20a_mem_end(g, &pm_ctx->mem); } - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { + if (gk20a_mem_begin(g, gr_mem)) { ret = -ENOMEM; goto cleanup_pm_buf; } - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_mode_m(); if (enable_hwpm_ctxsw) { @@ -1760,10 +1722,10 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, data |= pm_ctx->pm_mode; - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data); + gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr); - vunmap(ctx_ptr); + gk20a_mem_end(g, gr_mem); /* enable channel */ c->g->ops.fifo.enable_channel(c); @@ -1788,9 +1750,9 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, u32 virt_addr_lo; u32 virt_addr_hi; u32 virt_addr = 0; - u32 i, v, data; + u32 v, data; int ret = 0; - void *ctx_ptr = NULL; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; gk20a_dbg_fn(""); @@ -1801,20 +1763,18 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; - for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) - gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]); + gk20a_mem_wr_n(g, mem, 0, + gr->ctx_vars.local_golden_image, + gr->ctx_vars.golden_image_size); if (g->ops.gr.enable_cde_in_fecs && c->cde) - g->ops.gr.enable_cde_in_fecs(ctx_ptr); + g->ops.gr.enable_cde_in_fecs(g, mem); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_save_ops_o(), 0); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_restore_ops_o(), 0); /* set priv access map */ virt_addr_lo = @@ -1827,29 +1787,29 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, else data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(), data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_lo_o(), virt_addr_lo); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_hi_o(), virt_addr_hi); /* disable verif features */ - v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0); + v = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o()); v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m()); v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v); if (g->ops.gr.update_ctxsw_preemption_mode) - g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, ctx_ptr); + g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, mem); virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), ch_ctx->patch_ctx.data_count); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_lo_o(), virt_addr_lo); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_hi_o(), virt_addr_hi); /* Update main header region of the context buffer with the info needed @@ -1860,7 +1820,7 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, if (ch_ctx->pm_ctx.mem.gpu_va == 0) { gk20a_err(dev_from_gk20a(g), "context switched pm with no pm buffer!"); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); return -EFAULT; } @@ -1871,14 +1831,14 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, } else virt_addr = 0; - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_mode_m(); data |= ch_ctx->pm_ctx.pm_mode; - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + gk20a_mem_wr(g, mem, 
ctxsw_prog_main_image_pm_o(), data); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); if (tegra_platform_is_linsim()) { u32 inst_base_ptr = @@ -1978,16 +1938,20 @@ static void gr_gk20a_init_ctxsw_ucode_segments( } static int gr_gk20a_copy_ctxsw_ucode_segments( - u8 *buf, + struct gk20a *g, + struct mem_desc *dst, struct gk20a_ctxsw_ucode_segments *segments, u32 *bootimage, u32 *code, u32 *data) { int i; - memcpy(buf + segments->boot.offset, bootimage, segments->boot.size); - memcpy(buf + segments->code.offset, code, segments->code.size); - memcpy(buf + segments->data.offset, data, segments->data.size); + gk20a_mem_wr_n(g, dst, segments->boot.offset, bootimage, + segments->boot.size); + gk20a_mem_wr_n(g, dst, segments->code.offset, code, + segments->code.size); + gk20a_mem_wr_n(g, dst, segments->data.offset, data, + segments->data.size); /* compute a "checksum" for the boot binary to detect its version */ segments->boot_signature = 0; @@ -2009,7 +1973,6 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) u32 *fecs_boot_image; u32 *gpccs_boot_image; struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; - u8 *buf; u32 ucode_size; int err = 0; @@ -2049,14 +2012,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) if (err) goto clean_up; - buf = (u8 *)ucode_info->surface_desc.cpu_va; - if (!buf) { - gk20a_err(d, "failed to map surface desc buffer"); - err = -ENOMEM; - goto clean_up; - } - - gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs, + gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, + &ucode_info->fecs, fecs_boot_image, g->gr.ctx_vars.ucode.fecs.inst.l, g->gr.ctx_vars.ucode.fecs.data.l); @@ -2064,7 +2021,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) release_firmware(fecs_fw); fecs_fw = NULL; - gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs, + gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, + &ucode_info->gpccs, gpccs_boot_image, g->gr.ctx_vars.ucode.gpccs.inst.l, g->gr.ctx_vars.ucode.gpccs.data.l); @@ -4690,41 +4648,38 @@ out: static int gr_gk20a_init_access_map(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; - void *data; - int err = 0; + struct mem_desc *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem; u32 w, nr_pages = DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size, PAGE_SIZE); u32 *whitelist = NULL; int num_entries = 0; - data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.pages, - PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.size) >> - PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!data) { + if (gk20a_mem_begin(g, mem)) { gk20a_err(dev_from_gk20a(g), "failed to map priv access map memory"); - err = -ENOMEM; - goto clean_up; + return -ENOMEM; } - memset(data, 0x0, PAGE_SIZE * nr_pages); + gk20a_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages); g->ops.gr.get_access_map(g, &whitelist, &num_entries); for (w = 0; w < num_entries; w++) { - u32 map_bit, map_byte, map_shift; + u32 map_bit, map_byte, map_shift, x; map_bit = whitelist[w] >> 2; map_byte = map_bit >> 3; map_shift = map_bit & 0x7; /* i.e. 
0-7 */ gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d", whitelist[w], map_byte, map_shift); - ((u8 *)data)[map_byte] |= 1 << map_shift; + x = gk20a_mem_rd32(g, mem, map_byte / sizeof(u32)); + x |= 1 << ( + (map_byte % sizeof(u32) * BITS_PER_BYTE) + + map_shift); + gk20a_mem_wr32(g, mem, map_byte / sizeof(u32), x); } -clean_up: - if (data) - vunmap(data); + gk20a_mem_end(g, mem); return 0; } @@ -6659,7 +6614,7 @@ static void gr_gk20a_init_sm_dsm_reg_info(void) static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, u32 addr, u32 data, - u8 *context) + struct mem_desc *mem) { u32 num_gpc = g->gr.gpc_count; u32 num_tpc; @@ -6688,8 +6643,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, /* reset the patch count from previous runs,if ucode has already processed it */ - tmp = gk20a_mem_rd32(context + - ctxsw_prog_main_image_patch_count_o(), 0); + tmp = gk20a_mem_rd(g, mem, + ctxsw_prog_main_image_patch_count_o()); if (!tmp) ch_ctx->patch_ctx.data_count = 0; @@ -6700,15 +6655,15 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); - gk20a_mem_wr32(context + + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), - 0, ch_ctx->patch_ctx.data_count); - gk20a_mem_wr32(context + + ch_ctx->patch_ctx.data_count); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_lo_o(), - 0, vaddr_lo); - gk20a_mem_wr32(context + + vaddr_lo); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_hi_o(), - 0, vaddr_hi); + vaddr_hi); /* we're not caching these on cpu side, but later watch for it */ @@ -6760,17 +6715,15 @@ static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset) #define ILLEGAL_ID (~0) -static inline bool check_main_image_header_magic(void *context) +static inline bool check_main_image_header_magic(u8 *context) { - u32 magic = gk20a_mem_rd32(context + - ctxsw_prog_main_image_magic_value_o(), 0); + u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o()); gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic); return magic == ctxsw_prog_main_image_magic_value_v_value_v(); } -static inline bool check_local_header_magic(void *context) +static inline bool check_local_header_magic(u8 *context) { - u32 magic = gk20a_mem_rd32(context + - ctxsw_prog_local_magic_value_o(), 0); + u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o()); gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic); return magic == ctxsw_prog_local_magic_value_v_value_v(); @@ -6814,7 +6767,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, u32 num_gpcs, num_tpcs; u32 chk_addr; u32 ext_priv_offset, ext_priv_size; - void *context; + u8 *context; u32 offset_to_segment, offset_to_segment_end; u32 sm_dsm_perf_reg_id = ILLEGAL_ID; u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID; @@ -6856,14 +6809,14 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, /* note below is in words/num_registers */ marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2; - context = context_buffer; + context = (u8 *)context_buffer; /* sanity check main header */ if (!check_main_image_header_magic(context)) { gk20a_err(dev_from_gk20a(g), "Invalid main header: magic value"); return -EINVAL; } - num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o()); if (gpc_num >= num_gpcs) { gk20a_err(dev_from_gk20a(g), "GPC 0x%08x is greater than total count 
0x%08x!\n", @@ -6871,7 +6824,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o()); ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32); if (0 == ext_priv_size) { gk20a_dbg_info(" No extended memory in context buffer"); @@ -7149,7 +7102,7 @@ gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g, } static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, - void *context, + u8 *context, u32 *num_ppcs, u32 *ppc_mask, u32 *reg_ppc_count) { @@ -7165,7 +7118,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, (num_pes_per_gpc > 1))) return -EINVAL; - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o()); *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32); *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32); @@ -7177,7 +7130,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, /* * This function will return the 32 bit offset for a priv register if it is - * present in the context buffer. + * present in the context buffer. The context buffer is in CPU memory. */ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 addr, @@ -7196,7 +7149,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 offset; u32 sys_priv_offset, gpc_priv_offset; u32 ppc_mask, reg_list_ppc_count; - void *context; + u8 *context; u32 offset_to_segment; gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); @@ -7207,13 +7160,13 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, if (err) return err; - context = context_buffer; + context = (u8 *)context_buffer; if (!check_main_image_header_magic(context)) { gk20a_err(dev_from_gk20a(g), "Invalid main header: magic value"); return -EINVAL; } - num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o()); /* Parse the FECS local header. */ context += ctxsw_prog_ucode_header_size_in_bytes(); @@ -7222,7 +7175,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, "Invalid FECS local header: magic value\n"); return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o()); sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); /* If found in Ext buffer, ok. 
@@ -7268,7 +7221,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o()); gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); err = gr_gk20a_determine_ppc_configuration(g, context, @@ -7277,7 +7230,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, if (err) return err; - num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0); + num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o()); if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) { gk20a_err(dev_from_gk20a(g), @@ -7689,9 +7642,9 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, { struct gk20a *g = ch->g; struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; - void *ctx_ptr = NULL; - void *pm_ctx_ptr = NULL; - void *base_ptr = NULL; + bool gr_ctx_ready = false; + bool pm_ctx_ready = false; + struct mem_desc *current_mem = NULL; bool ch_is_curr_ctx, restart_gr_ctxsw = false; u32 i, j, offset, v; struct gr_gk20a *gr = &g->gr; @@ -7821,20 +7774,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), ctx_ops[i].quad); if (!err) { - if (!ctx_ptr) { + if (!gr_ctx_ready) { /* would have been a variant of * gr_gk20a_apply_instmem_overrides, * recoded in-place instead. */ - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { + if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem)) { err = -ENOMEM; goto cleanup; } + gr_ctx_ready = true; } - base_ptr = ctx_ptr; + current_mem = &ch_ctx->gr_ctx->mem; } else { err = gr_gk20a_get_pm_ctx_buffer_offsets(g, ctx_ops[i].offset, @@ -7849,7 +7800,7 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; continue; } - if (!pm_ctx_ptr) { + if (!pm_ctx_ready) { /* Make sure ctx buffer was initialized */ if (!ch_ctx->pm_ctx.mem.pages) { gk20a_err(dev_from_gk20a(g), @@ -7857,15 +7808,13 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, err = -EINVAL; goto cleanup; } - pm_ctx_ptr = vmap(ch_ctx->pm_ctx.mem.pages, - PAGE_ALIGN(ch_ctx->pm_ctx.mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!pm_ctx_ptr) { + if (gk20a_mem_begin(g, &ch_ctx->pm_ctx.mem)) { err = -ENOMEM; goto cleanup; } + pm_ctx_ready = true; } - base_ptr = pm_ctx_ptr; + current_mem = &ch_ctx->pm_ctx.mem; } /* if this is a quad access, setup for special access*/ @@ -7878,24 +7827,24 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* sanity check gr ctxt offsets, * don't write outside, worst case */ - if ((base_ptr == ctx_ptr) && + if ((current_mem == &ch_ctx->gr_ctx->mem) && (offsets[j] >= g->gr.ctx_vars.golden_image_size)) continue; if (pass == 0) { /* write pass */ - v = gk20a_mem_rd32(base_ptr + offsets[j], 0); + v = gk20a_mem_rd(g, current_mem, offsets[j]); v &= ~ctx_ops[i].and_n_mask_lo; v |= ctx_ops[i].value_lo; - gk20a_mem_wr32(base_ptr + offsets[j], 0, v); + gk20a_mem_wr(g, current_mem, offsets[j], v); gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", offsets[j], v); if (ctx_ops[i].op == REGOP(WRITE_64)) { - v = gk20a_mem_rd32(base_ptr + offsets[j] + 4, 0); + v = gk20a_mem_rd(g, current_mem, offsets[j] + 4); v &= ~ctx_ops[i].and_n_mask_hi; v |= ctx_ops[i].value_hi; - gk20a_mem_wr32(base_ptr + offsets[j] + 4, 0, v); + gk20a_mem_wr(g, current_mem, offsets[j] + 4, v); 
gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", @@ -7905,18 +7854,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* check to see if we need to add a special WAR for some of the SMPC perf regs */ gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], - v, base_ptr); + v, current_mem); } else { /* read pass */ ctx_ops[i].value_lo = - gk20a_mem_rd32(base_ptr + offsets[0], 0); + gk20a_mem_rd(g, current_mem, offsets[0]); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", offsets[0], ctx_ops[i].value_lo); if (ctx_ops[i].op == REGOP(READ_64)) { ctx_ops[i].value_hi = - gk20a_mem_rd32(base_ptr + offsets[0] + 4, 0); + gk20a_mem_rd(g, current_mem, offsets[0] + 4); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", @@ -7943,12 +7892,10 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, if (ch_ctx->patch_ctx.mem.cpu_va) gr_gk20a_ctx_patch_write_end(g, ch_ctx); - - if (ctx_ptr) - vunmap(ctx_ptr); - - if (pm_ctx_ptr) - vunmap(pm_ctx_ptr); + if (gr_ctx_ready) + gk20a_mem_end(g, &ch_ctx->gr_ctx->mem); + if (pm_ctx_ready) + gk20a_mem_end(g, &ch_ctx->pm_ctx.mem); if (restart_gr_ctxsw) { int tmp_err = gr_gk20a_enable_ctxsw(g); diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 6f6734b4..13382416 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -44,6 +44,112 @@ #include "kind_gk20a.h" #include "semaphore_gk20a.h" +int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem) +{ + void *cpu_va; + + if (WARN_ON(mem->cpu_va)) { + gk20a_warn(dev_from_gk20a(g), "nested %s", __func__); + return -EBUSY; + } + + cpu_va = vmap(mem->pages, + PAGE_ALIGN(mem->size) >> PAGE_SHIFT, + 0, pgprot_writecombine(PAGE_KERNEL)); + + if (WARN_ON(!cpu_va)) + return -ENOMEM; + + mem->cpu_va = cpu_va; + return 0; +} + +void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem) +{ + vunmap(mem->cpu_va); + mem->cpu_va = NULL; +} + +u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w) +{ + u32 *ptr = mem->cpu_va; + u32 data; + + WARN_ON(!ptr); + data = ptr[w]; +#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); +#endif + return data; +} + +u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset) +{ + WARN_ON(offset & 3); + return gk20a_mem_rd32(g, mem, offset / sizeof(u32)); +} + +void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, + u32 offset, void *dest, u32 size) +{ + u32 i; + u32 *dest_u32 = dest; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + dest_u32[i] = gk20a_mem_rd32(g, mem, offset + i); +} + +void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data) +{ + u32 *ptr = mem->cpu_va; + + WARN_ON(!ptr); +#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); +#endif + ptr[w] = data; +} + +void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data) +{ + WARN_ON(offset & 3); + gk20a_mem_wr32(g, mem, offset / sizeof(u32), data); +} + +void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *src, u32 size) +{ + u32 i; + u32 *src_u32 = src; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + gk20a_mem_wr32(g, mem, offset + i, src_u32[i]); +} + +void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset, + u32 value, u32 size) +{ + u32 i; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + 
offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + gk20a_mem_wr32(g, mem, offset + i, value); +} + /* * GPU mapping life cycle * ====================== @@ -780,9 +886,14 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm, *pde_lo, *pde_hi); } -u32 *pde_from_index(struct vm_gk20a *vm, u32 i) +static u32 pde_from_index(u32 i) +{ + return i * gmmu_pde__size_v() / sizeof(u32); +} + +static u32 pte_from_index(u32 i) { - return (u32 *) (((u8 *)vm->pdb.mem.cpu_va) + i*gmmu_pde__size_v()); + return i * gmmu_pte__size_v() / sizeof(u32); } u32 pte_index_from_vaddr(struct vm_gk20a *vm, @@ -2323,7 +2434,7 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm, u64 pte_addr_small = 0, pte_addr_big = 0; struct gk20a_mm_entry *entry = vm->pdb.entries + i; u32 pde_v[2] = {0, 0}; - u32 *pde; + u32 pde; gk20a_dbg_fn(""); @@ -2348,10 +2459,10 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm, (big_valid ? (gmmu_pde_vol_big_true_f()) : gmmu_pde_vol_big_false_f()); - pde = pde_from_index(vm, i); + pde = pde_from_index(i); - gk20a_mem_wr32(pde, 0, pde_v[0]); - gk20a_mem_wr32(pde, 1, pde_v[1]); + gk20a_mem_wr32(g, &vm->pdb.mem, pde + 0, pde_v[0]); + gk20a_mem_wr32(g, &vm->pdb.mem, pde + 1, pde_v[1]); gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); @@ -2432,8 +2543,8 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm, gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); } - gk20a_mem_wr32(pte->mem.cpu_va + i*8, 0, pte_w[0]); - gk20a_mem_wr32(pte->mem.cpu_va + i*8, 1, pte_w[1]); + gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 0, pte_w[0]); + gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 1, pte_w[1]); if (*iova) { *iova += page_size; @@ -3489,19 +3600,19 @@ static int gk20a_init_cde_vm(struct mm_gk20a *mm) false, false, "cde"); } -void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr) +void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr) { u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); - gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(), + gk20a_mem_wr32(g, mem, ram_in_page_dir_base_lo_w(), (g->mm.vidmem_is_vidmem ? 
ram_in_page_dir_base_target_sys_mem_ncoh_f() : ram_in_page_dir_base_target_vid_mem_f()) | ram_in_page_dir_base_vol_true_f() | ram_in_page_dir_base_lo_f(pdb_addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(), + gk20a_mem_wr32(g, mem, ram_in_page_dir_base_hi_w(), ram_in_page_dir_base_hi_f(pdb_addr_hi)); } @@ -3510,23 +3621,22 @@ void gk20a_init_inst_block(struct mem_desc *inst_block, struct vm_gk20a *vm, { struct gk20a *g = gk20a_from_vm(vm); u64 pde_addr = g->ops.mm.get_iova_addr(g, vm->pdb.mem.sgt->sgl, 0); - void *inst_ptr = inst_block->cpu_va; gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p", - gk20a_mm_inst_block_addr(g, inst_block), inst_ptr); + gk20a_mm_inst_block_addr(g, inst_block), inst_block->cpu_va); gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr); - g->ops.mm.init_pdb(g, inst_ptr, pde_addr); + g->ops.mm.init_pdb(g, inst_block, pde_addr); - gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(), + gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(), u64_lo32(vm->va_limit - 1) & ~0xfff); - gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(), + gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(), ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1))); if (big_page_size && g->ops.mm.set_big_page_size) - g->ops.mm.set_big_page_size(g, inst_ptr, big_page_size); + g->ops.mm.set_big_page_size(g, inst_block, big_page_size); } int gk20a_mm_fb_flush(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 7fa0b7fb..e9ac8f18 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -419,6 +419,34 @@ static inline enum gmmu_pgsz_gk20a __get_pte_size(struct vm_gk20a *vm, return gmmu_page_size_small; } +/* + * Buffer accessors - wrap between begin() and end() if there is no permanent + * kernel mapping for this buffer. 
+ */ + +int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem); +/* nop for null mem, like with free() or vunmap() */ +void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem); + +/* word-indexed offset */ +u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w); +/* byte offset (32b-aligned) */ +u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset); +/* memcpy to cpu, offset and size in bytes (32b-aligned) */ +void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *dest, u32 size); + +/* word-indexed offset */ +void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data); +/* byte offset (32b-aligned) */ +void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data); +/* memcpy from cpu, offset and size in bytes (32b-aligned) */ +void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *src, u32 size); +/* size and offset in bytes (32b-aligned), filled with u32s */ +void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset, + u32 value, u32 size); + #if 0 /*related to addr bits above, concern below TBD on which is accurate */ #define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\ bus_bar1_block_ptr_s()) @@ -673,7 +701,6 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm, u64 addr_lo, u64 addr_hi, u32 *pde_lo, u32 *pde_hi); int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm); -u32 *pde_from_index(struct vm_gk20a *vm, u32 i); u32 pte_index_from_vaddr(struct vm_gk20a *vm, u64 addr, enum gmmu_pgsz_gk20a pgsz_idx); void free_gmmu_pages(struct vm_gk20a *vm, @@ -685,7 +712,7 @@ struct gpu_ops; void gk20a_init_mm(struct gpu_ops *gops); const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g, u32 big_page_size); -void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr); +void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr); void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block); diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c index 56ad0c2a..54b2eef4 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c @@ -2421,11 +2421,10 @@ static int gk20a_init_pmu_reset_enable_hw(struct gk20a *g) static int gk20a_prepare_ucode(struct gk20a *g) { struct pmu_gk20a *pmu = &g->pmu; - int i, err = 0; + int err = 0; struct device *d = dev_from_gk20a(g); struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = &mm->pmu.vm; - void *ucode_ptr; if (g->pmu_fw) { gk20a_init_pmu(pmu); @@ -2449,11 +2448,8 @@ static int gk20a_prepare_ucode(struct gk20a *g) if (err) goto err_release_fw; - ucode_ptr = pmu->ucode.cpu_va; - - for (i = 0; i < (pmu->desc->app_start_offset + - pmu->desc->app_size) >> 2; i++) - gk20a_mem_wr32(ucode_ptr, i, pmu->ucode_image[i]); + gk20a_mem_wr_n(g, &pmu->ucode, 0, pmu->ucode_image, + pmu->desc->app_start_offset + pmu->desc->app_size); gk20a_init_pmu(pmu); -- cgit v1.2.2
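
For readers skimming the diff, here is a minimal usage sketch of the refactored accessor API. It is not part of the patch; the function names and signatures are taken directly from the mm_gk20a.h hunk above, while the caller, buffer, offsets, and values are hypothetical. The gk20a_mem_begin()/gk20a_mem_end() pair stands in for the open-coded vmap()/vunmap() of mem->pages that the patch removes.

```c
/*
 * Usage sketch only (not part of the patch). "example_poke_buffer" and
 * the offsets/values are hypothetical; the accessor calls follow the
 * declarations added to mm_gk20a.h in this change.
 */
static int example_poke_buffer(struct gk20a *g, struct mem_desc *my_buf)
{
	u32 v;

	/* Map the buffer for CPU access if it is not permanently mapped. */
	if (gk20a_mem_begin(g, my_buf))
		return -ENOMEM;

	/* Word-indexed access, as with the old gk20a_mem_rd32()/wr32(). */
	v = gk20a_mem_rd32(g, my_buf, 0);
	gk20a_mem_wr32(g, my_buf, 1, v | 0x1);

	/* Byte-indexed (32-bit aligned) access via the new rd()/wr(). */
	gk20a_mem_wr(g, my_buf, 0x20, 0x1234);

	/* memset()-like fill: zero 256 bytes starting at byte offset 0x100. */
	gk20a_memset(g, my_buf, 0x100, 0, 256);

	/* Unmap; a no-op-style counterpart to begin(). */
	gk20a_mem_end(g, my_buf);
	return 0;
}
```

As the commit message notes, passing g and the mem_desc (rather than a raw CPU pointer) is what later allows these helpers to pick the access method from the buffer's aperture; in this patch they still go through the CPU mapping only.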