-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c     |  73
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h     |   1
-rw-r--r--  drivers/gpu/nvgpu/gk20a/debug_gk20a.c       |  59
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c  |  27
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c        |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h             |  57
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c          | 359
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c          | 144
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h          |  31
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.c         |  10
-rw-r--r--  drivers/gpu/nvgpu/gm20b/acr_gm20b.c         | 206
-rw-r--r--  drivers/gpu/nvgpu/gm20b/gr_gm20b.c          |  26
-rw-r--r--  drivers/gpu/nvgpu/gm20b/mm_gm20b.c          |   6
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fifo_vgpu.c          |   2
14 files changed, 493 insertions(+), 510 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 990972e4..065e8ab1 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -129,28 +129,25 @@ static int channel_gk20a_commit_userd(struct channel_gk20a *c)
129{ 129{
130 u32 addr_lo; 130 u32 addr_lo;
131 u32 addr_hi; 131 u32 addr_hi;
132 void *inst_ptr;
133 struct gk20a *g = c->g; 132 struct gk20a *g = c->g;
134 133
135 gk20a_dbg_fn(""); 134 gk20a_dbg_fn("");
136 135
137 inst_ptr = c->inst_block.cpu_va;
138 if (!inst_ptr)
139 return -ENOMEM;
140
141 addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v()); 136 addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v());
142 addr_hi = u64_hi32(c->userd_iova); 137 addr_hi = u64_hi32(c->userd_iova);
143 138
144 gk20a_dbg_info("channel %d : set ramfc userd 0x%16llx", 139 gk20a_dbg_info("channel %d : set ramfc userd 0x%16llx",
145 c->hw_chid, (u64)c->userd_iova); 140 c->hw_chid, (u64)c->userd_iova);
146 141
147 gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_w(), 142 gk20a_mem_wr32(g, &c->inst_block,
143 ram_in_ramfc_w() + ram_fc_userd_w(),
148 (g->mm.vidmem_is_vidmem ? 144 (g->mm.vidmem_is_vidmem ?
149 pbdma_userd_target_sys_mem_ncoh_f() : 145 pbdma_userd_target_sys_mem_ncoh_f() :
150 pbdma_userd_target_vid_mem_f()) | 146 pbdma_userd_target_vid_mem_f()) |
151 pbdma_userd_addr_f(addr_lo)); 147 pbdma_userd_addr_f(addr_lo));
152 148
153 gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_hi_w(), 149 gk20a_mem_wr32(g, &c->inst_block,
150 ram_in_ramfc_w() + ram_fc_userd_hi_w(),
154 pbdma_userd_hi_addr_f(addr_hi)); 151 pbdma_userd_hi_addr_f(addr_hi));
155 152
156 return 0; 153 return 0;
@@ -186,13 +183,8 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
186 183
187static int channel_gk20a_set_schedule_params(struct channel_gk20a *c) 184static int channel_gk20a_set_schedule_params(struct channel_gk20a *c)
188{ 185{
189 void *inst_ptr;
190 int shift = 0, value = 0; 186 int shift = 0, value = 0;
191 187
192 inst_ptr = c->inst_block.cpu_va;
193 if (!inst_ptr)
194 return -ENOMEM;
195
196 gk20a_channel_get_timescale_from_timeslice(c->g, 188 gk20a_channel_get_timescale_from_timeslice(c->g,
197 c->timeslice_us, &value, &shift); 189 c->timeslice_us, &value, &shift);
198 190
@@ -203,7 +195,7 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c)
203 WARN_ON(c->g->ops.fifo.preempt_channel(c->g, c->hw_chid)); 195 WARN_ON(c->g->ops.fifo.preempt_channel(c->g, c->hw_chid));
204 196
205 /* set new timeslice */ 197 /* set new timeslice */
206 gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), 198 gk20a_mem_wr32(c->g, &c->inst_block, ram_fc_runlist_timeslice_w(),
207 value | (shift << 12) | 199 value | (shift << 12) |
208 fifo_runlist_timeslice_enable_true_f()); 200 fifo_runlist_timeslice_enable_true_f());
209 201
@@ -255,33 +247,30 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c)
255int channel_gk20a_setup_ramfc(struct channel_gk20a *c, 247int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
256 u64 gpfifo_base, u32 gpfifo_entries, u32 flags) 248 u64 gpfifo_base, u32 gpfifo_entries, u32 flags)
257{ 249{
258 void *inst_ptr; 250 struct gk20a *g = c->g;
251 struct mem_desc *mem = &c->inst_block;
259 252
260 gk20a_dbg_fn(""); 253 gk20a_dbg_fn("");
261 254
262 inst_ptr = c->inst_block.cpu_va; 255 gk20a_memset(g, mem, 0, 0, ram_fc_size_val_v());
263 if (!inst_ptr)
264 return -ENOMEM;
265
266 memset(inst_ptr, 0, ram_fc_size_val_v());
267 256
268 gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_w(), 257 gk20a_mem_wr32(g, mem, ram_fc_gp_base_w(),
269 pbdma_gp_base_offset_f( 258 pbdma_gp_base_offset_f(
270 u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s()))); 259 u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s())));
271 260
272 gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_hi_w(), 261 gk20a_mem_wr32(g, mem, ram_fc_gp_base_hi_w(),
273 pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) | 262 pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) |
274 pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries))); 263 pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries)));
275 264
276 gk20a_mem_wr32(inst_ptr, ram_fc_signature_w(), 265 gk20a_mem_wr32(g, mem, ram_fc_signature_w(),
277 c->g->ops.fifo.get_pbdma_signature(c->g)); 266 c->g->ops.fifo.get_pbdma_signature(c->g));
278 267
279 gk20a_mem_wr32(inst_ptr, ram_fc_formats_w(), 268 gk20a_mem_wr32(g, mem, ram_fc_formats_w(),
280 pbdma_formats_gp_fermi0_f() | 269 pbdma_formats_gp_fermi0_f() |
281 pbdma_formats_pb_fermi1_f() | 270 pbdma_formats_pb_fermi1_f() |
282 pbdma_formats_mp_fermi0_f()); 271 pbdma_formats_mp_fermi0_f());
283 272
284 gk20a_mem_wr32(inst_ptr, ram_fc_pb_header_w(), 273 gk20a_mem_wr32(g, mem, ram_fc_pb_header_w(),
285 pbdma_pb_header_priv_user_f() | 274 pbdma_pb_header_priv_user_f() |
286 pbdma_pb_header_method_zero_f() | 275 pbdma_pb_header_method_zero_f() |
287 pbdma_pb_header_subchannel_zero_f() | 276 pbdma_pb_header_subchannel_zero_f() |
@@ -289,47 +278,49 @@ int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
289 pbdma_pb_header_first_true_f() | 278 pbdma_pb_header_first_true_f() |
290 pbdma_pb_header_type_inc_f()); 279 pbdma_pb_header_type_inc_f());
291 280
292 gk20a_mem_wr32(inst_ptr, ram_fc_subdevice_w(), 281 gk20a_mem_wr32(g, mem, ram_fc_subdevice_w(),
293 pbdma_subdevice_id_f(1) | 282 pbdma_subdevice_id_f(1) |
294 pbdma_subdevice_status_active_f() | 283 pbdma_subdevice_status_active_f() |
295 pbdma_subdevice_channel_dma_enable_f()); 284 pbdma_subdevice_channel_dma_enable_f());
296 285
297 gk20a_mem_wr32(inst_ptr, ram_fc_target_w(), pbdma_target_engine_sw_f()); 286 gk20a_mem_wr32(g, mem, ram_fc_target_w(), pbdma_target_engine_sw_f());
298 287
299 gk20a_mem_wr32(inst_ptr, ram_fc_acquire_w(), 288 gk20a_mem_wr32(g, mem, ram_fc_acquire_w(),
300 channel_gk20a_pbdma_acquire_val(c)); 289 channel_gk20a_pbdma_acquire_val(c));
301 290
302 gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), 291 gk20a_mem_wr32(g, mem, ram_fc_runlist_timeslice_w(),
303 fifo_runlist_timeslice_timeout_128_f() | 292 fifo_runlist_timeslice_timeout_128_f() |
304 fifo_runlist_timeslice_timescale_3_f() | 293 fifo_runlist_timeslice_timescale_3_f() |
305 fifo_runlist_timeslice_enable_true_f()); 294 fifo_runlist_timeslice_enable_true_f());
306 295
307 gk20a_mem_wr32(inst_ptr, ram_fc_pb_timeslice_w(), 296 gk20a_mem_wr32(g, mem, ram_fc_pb_timeslice_w(),
308 fifo_pb_timeslice_timeout_16_f() | 297 fifo_pb_timeslice_timeout_16_f() |
309 fifo_pb_timeslice_timescale_0_f() | 298 fifo_pb_timeslice_timescale_0_f() |
310 fifo_pb_timeslice_enable_true_f()); 299 fifo_pb_timeslice_enable_true_f());
311 300
312 gk20a_mem_wr32(inst_ptr, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid)); 301 gk20a_mem_wr32(g, mem, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid));
313 302
314 return channel_gk20a_commit_userd(c); 303 return channel_gk20a_commit_userd(c);
315} 304}
316 305
317static int channel_gk20a_setup_userd(struct channel_gk20a *c) 306static int channel_gk20a_setup_userd(struct channel_gk20a *c)
318{ 307{
319 BUG_ON(!c->userd_cpu_va); 308 struct gk20a *g = c->g;
309 struct mem_desc *mem = &g->fifo.userd;
310 u32 offset = c->hw_chid * g->fifo.userd_entry_size / sizeof(u32);
320 311
321 gk20a_dbg_fn(""); 312 gk20a_dbg_fn("");
322 313
323 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_w(), 0); 314 gk20a_mem_wr32(g, mem, offset + ram_userd_put_w(), 0);
324 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_w(), 0); 315 gk20a_mem_wr32(g, mem, offset + ram_userd_get_w(), 0);
325 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_w(), 0); 316 gk20a_mem_wr32(g, mem, offset + ram_userd_ref_w(), 0);
326 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_hi_w(), 0); 317 gk20a_mem_wr32(g, mem, offset + ram_userd_put_hi_w(), 0);
327 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_threshold_w(), 0); 318 gk20a_mem_wr32(g, mem, offset + ram_userd_ref_threshold_w(), 0);
328 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_w(), 0); 319 gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_w(), 0);
329 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_hi_w(), 0); 320 gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_hi_w(), 0);
330 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_hi_w(), 0); 321 gk20a_mem_wr32(g, mem, offset + ram_userd_get_hi_w(), 0);
331 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_get_w(), 0); 322 gk20a_mem_wr32(g, mem, offset + ram_userd_gp_get_w(), 0);
332 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_put_w(), 0); 323 gk20a_mem_wr32(g, mem, offset + ram_userd_gp_put_w(), 0);
333 324
334 return 0; 325 return 0;
335} 326}
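Note: the channel_gk20a.c hunks above switch the ramfc/userd writes from the raw-pointer form gk20a_mem_wr32(inst_ptr, w, data) to accessors that take the device and a struct mem_desc, which is why the inst_block.cpu_va NULL checks disappear. gk20a_mem_wr32() addresses the buffer by 32-bit word index, so per-channel USERD access folds the channel's byte offset into the word index. A minimal sketch, using only names visible in this patch (the helper itself is hypothetical, not driver code):

    /* Hypothetical helper, assuming the accessor semantics used above:
     * gk20a_mem_wr32(g, mem, word, val) writes one 32-bit word into the
     * buffer described by mem. */
    static void userd_clear_word(struct gk20a *g, struct channel_gk20a *c,
                                 u32 word)
    {
            /* per-channel byte offset into the shared USERD buffer,
             * converted to a word index for gk20a_mem_wr32() */
            u32 offset = c->hw_chid * g->fifo.userd_entry_size / sizeof(u32);

            gk20a_mem_wr32(g, &g->fifo.userd, offset + word, 0);
    }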
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 8840a3ae..b1355f92 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -130,7 +130,6 @@ struct channel_gk20a {
130 struct mem_desc inst_block; 130 struct mem_desc inst_block;
131 struct mem_desc_sub ramfc; 131 struct mem_desc_sub ramfc;
132 132
133 void *userd_cpu_va;
134 u64 userd_iova; 133 u64 userd_iova;
135 u64 userd_gpu_va; 134 u64 userd_gpu_va;
136 135
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
index c2285c8a..a3fa2ea5 100644
--- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
@@ -36,7 +36,7 @@ unsigned int gk20a_debug_trace_cmdbuf;
36struct ch_state { 36struct ch_state {
37 int pid; 37 int pid;
38 int refs; 38 int refs;
39 u8 inst_block[0]; 39 u32 inst_block[0];
40}; 40};
41 41
42static const char * const ccsr_chan_status_str[] = { 42static const char * const ccsr_chan_status_str[] = {
@@ -108,15 +108,15 @@ static void gk20a_debug_show_channel(struct gk20a *g,
108 u32 channel = gk20a_readl(g, ccsr_channel_r(hw_chid)); 108 u32 channel = gk20a_readl(g, ccsr_channel_r(hw_chid));
109 u32 status = ccsr_channel_status_v(channel); 109 u32 status = ccsr_channel_status_v(channel);
110 u32 syncpointa, syncpointb; 110 u32 syncpointa, syncpointb;
111 void *inst_ptr; 111 u32 *inst_mem;
112 112
113 if (!ch_state) 113 if (!ch_state)
114 return; 114 return;
115 115
116 inst_ptr = &ch_state->inst_block[0]; 116 inst_mem = &ch_state->inst_block[0];
117 117
118 syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w()); 118 syncpointa = inst_mem[ram_fc_syncpointa_w()];
119 syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w()); 119 syncpointb = inst_mem[ram_fc_syncpointb_w()];
120 120
121 gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, 121 gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid,
122 dev_name(g->dev), 122 dev_name(g->dev),
@@ -129,23 +129,22 @@ static void gk20a_debug_show_channel(struct gk20a *g,
129 gk20a_debug_output(o, "TOP: %016llx PUT: %016llx GET: %016llx " 129 gk20a_debug_output(o, "TOP: %016llx PUT: %016llx GET: %016llx "
130 "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n" 130 "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n"
131 "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n", 131 "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n",
132 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_top_level_get_w()) + 132 (u64)inst_mem[ram_fc_pb_top_level_get_w()] +
133 ((u64)gk20a_mem_rd32(inst_ptr, 133 ((u64)inst_mem[ram_fc_pb_top_level_get_hi_w()] << 32ULL),
134 ram_fc_pb_top_level_get_hi_w()) << 32ULL), 134 (u64)inst_mem[ram_fc_pb_put_w()] +
135 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_w()) + 135 ((u64)inst_mem[ram_fc_pb_put_hi_w()] << 32ULL),
136 ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_hi_w()) << 32ULL), 136 (u64)inst_mem[ram_fc_pb_get_w()] +
137 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_w()) + 137 ((u64)inst_mem[ram_fc_pb_get_hi_w()] << 32ULL),
138 ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_hi_w()) << 32ULL), 138 (u64)inst_mem[ram_fc_pb_fetch_w()] +
139 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_w()) + 139 ((u64)inst_mem[ram_fc_pb_fetch_hi_w()] << 32ULL),
140 ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_hi_w()) << 32ULL), 140 inst_mem[ram_fc_pb_header_w()],
141 gk20a_mem_rd32(inst_ptr, ram_fc_pb_header_w()), 141 inst_mem[ram_fc_pb_count_w()],
142 gk20a_mem_rd32(inst_ptr, ram_fc_pb_count_w()),
143 syncpointa, 142 syncpointa,
144 syncpointb, 143 syncpointb,
145 gk20a_mem_rd32(inst_ptr, ram_fc_semaphorea_w()), 144 inst_mem[ram_fc_semaphorea_w()],
146 gk20a_mem_rd32(inst_ptr, ram_fc_semaphoreb_w()), 145 inst_mem[ram_fc_semaphoreb_w()],
147 gk20a_mem_rd32(inst_ptr, ram_fc_semaphorec_w()), 146 inst_mem[ram_fc_semaphorec_w()],
148 gk20a_mem_rd32(inst_ptr, ram_fc_semaphored_w())); 147 inst_mem[ram_fc_semaphored_w()]);
149 148
150#ifdef CONFIG_TEGRA_GK20A 149#ifdef CONFIG_TEGRA_GK20A
151 if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v()) 150 if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v())
@@ -246,17 +245,15 @@ void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o)
246 245
247 for (chid = 0; chid < f->num_channels; chid++) { 246 for (chid = 0; chid < f->num_channels; chid++) {
248 struct channel_gk20a *ch = &f->channel[chid]; 247 struct channel_gk20a *ch = &f->channel[chid];
249 if (ch_state[chid]) { 248 if (!ch_state[chid])
250 if (ch->inst_block.cpu_va) { 249 continue;
251 ch_state[chid]->pid = ch->pid; 250
252 ch_state[chid]->refs = 251 ch_state[chid]->pid = ch->pid;
253 atomic_read(&ch->ref_count); 252 ch_state[chid]->refs = atomic_read(&ch->ref_count);
254 memcpy(&ch_state[chid]->inst_block[0], 253 gk20a_mem_rd_n(g, &ch->inst_block, 0,
255 ch->inst_block.cpu_va, 254 &ch_state[chid]->inst_block[0],
256 ram_in_alloc_size_v()); 255 ram_in_alloc_size_v());
257 } 256 gk20a_channel_put(ch);
258 gk20a_channel_put(ch);
259 }
260 } 257 }
261 for (chid = 0; chid < f->num_channels; chid++) { 258 for (chid = 0; chid < f->num_channels; chid++) {
262 if (ch_state[chid]) { 259 if (ch_state[chid]) {
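Note: in debug_gk20a.c the per-channel snapshot becomes a u32 array filled by gk20a_mem_rd_n(), so the dump indexes the local copy directly with the ram_fc_*_w() word offsets instead of calling the removed pointer-based gk20a_mem_rd32(). For instance, a 64-bit PB pointer is reassembled from two snapshot words (sketch reusing the names from the hunk above):

    /* illustrative only: inst_mem is the local u32 copy of the RAMFC image */
    u64 pb_get = (u64)inst_mem[ram_fc_pb_get_w()] +
                 ((u64)inst_mem[ram_fc_pb_get_hi_w()] << 32);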
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
index f9cddc41..edddcdc1 100644
--- a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
@@ -619,7 +619,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
619 phys_addr_t pa; 619 phys_addr_t pa;
620 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; 620 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
621 struct gk20a_fecs_trace *trace = g->fecs_trace; 621 struct gk20a_fecs_trace *trace = g->fecs_trace;
622 void *ctx_ptr; 622 struct mem_desc *mem = &ch_ctx->gr_ctx->mem;
623 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch); 623 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
624 624
625 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, 625 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
@@ -634,10 +634,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
634 if (!pa) 634 if (!pa)
635 return -ENOMEM; 635 return -ENOMEM;
636 636
637 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 637 if (gk20a_mem_begin(g, mem))
638 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0,
639 pgprot_writecombine(PAGE_KERNEL));
640 if (!ctx_ptr)
641 return -ENOMEM; 638 return -ENOMEM;
642 639
643 lo = u64_lo32(pa); 640 lo = u64_lo32(pa);
@@ -646,18 +643,18 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
646 gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi, 643 gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
647 lo, GK20A_FECS_TRACE_NUM_RECORDS); 644 lo, GK20A_FECS_TRACE_NUM_RECORDS);
648 645
649 gk20a_mem_wr32(ctx_ptr 646 gk20a_mem_wr(g, mem,
650 + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), 647 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
651 0, lo); 648 lo);
652 gk20a_mem_wr32(ctx_ptr 649 gk20a_mem_wr(g, mem,
653 + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), 650 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
654 0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi)); 651 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi));
655 gk20a_mem_wr32(ctx_ptr 652 gk20a_mem_wr(g, mem,
656 + ctxsw_prog_main_image_context_timestamp_buffer_control_o(), 653 ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
657 0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f( 654 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
658 GK20A_FECS_TRACE_NUM_RECORDS)); 655 GK20A_FECS_TRACE_NUM_RECORDS));
659 656
660 vunmap(ctx_ptr); 657 gk20a_mem_end(g, mem);
661 gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid); 658 gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid);
662 659
663 return 0; 660 return 0;
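Note: fecs_trace_gk20a.c shows the other half of the new API. Instead of vmap()/vunmap() on mem->pages, buffers are bracketed with gk20a_mem_begin()/gk20a_mem_end(), and gk20a_mem_wr() takes a byte offset (the *_o() register constants) where gk20a_mem_wr32() takes a word index. A minimal sketch of the bracket, assuming those semantics (the helper is hypothetical):

    static int write_ctx_field(struct gk20a *g, struct mem_desc *mem,
                               u32 byte_off, u32 val)
    {
            if (gk20a_mem_begin(g, mem))    /* map/prepare the buffer */
                    return -ENOMEM;

            gk20a_mem_wr(g, mem, byte_off, val);

            gk20a_mem_end(g, mem);          /* drop the temporary mapping */
            return 0;
    }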
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index dc3debf2..71400331 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -520,8 +520,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g)
520 mutex_init(&f->free_chs_mutex); 520 mutex_init(&f->free_chs_mutex);
521 521
522 for (chid = 0; chid < f->num_channels; chid++) { 522 for (chid = 0; chid < f->num_channels; chid++) {
523 f->channel[chid].userd_cpu_va =
524 f->userd.cpu_va + chid * f->userd_entry_size;
525 f->channel[chid].userd_iova = 523 f->channel[chid].userd_iova =
526 g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) 524 g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0)
527 + chid * f->userd_entry_size; 525 + chid * f->userd_entry_size;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index f228cce4..2f85bf96 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -201,7 +201,7 @@ struct gpu_ops {
201 struct gr_ctx_desc *gr_ctx); 201 struct gr_ctx_desc *gr_ctx);
202 void (*update_ctxsw_preemption_mode)(struct gk20a *g, 202 void (*update_ctxsw_preemption_mode)(struct gk20a *g,
203 struct channel_ctx_gk20a *ch_ctx, 203 struct channel_ctx_gk20a *ch_ctx,
204 void *ctx_ptr); 204 struct mem_desc *mem);
205 int (*update_smpc_ctxsw_mode)(struct gk20a *g, 205 int (*update_smpc_ctxsw_mode)(struct gk20a *g,
206 struct channel_gk20a *c, 206 struct channel_gk20a *c,
207 bool enable); 207 bool enable);
@@ -221,7 +221,8 @@ struct gpu_ops {
221 int (*wait_empty)(struct gk20a *g, unsigned long end_jiffies, 221 int (*wait_empty)(struct gk20a *g, unsigned long end_jiffies,
222 u32 expect_delay); 222 u32 expect_delay);
223 void (*init_cyclestats)(struct gk20a *g); 223 void (*init_cyclestats)(struct gk20a *g);
224 void (*enable_cde_in_fecs)(void *ctx_ptr); 224 void (*enable_cde_in_fecs)(struct gk20a *g,
225 struct mem_desc *mem);
225 int (*set_sm_debug_mode)(struct gk20a *g, struct channel_gk20a *ch, 226 int (*set_sm_debug_mode)(struct gk20a *g, struct channel_gk20a *ch,
226 u64 sms, bool enable); 227 u64 sms, bool enable);
227 void (*bpt_reg_info)(struct gk20a *g, 228 void (*bpt_reg_info)(struct gk20a *g,
@@ -484,7 +485,7 @@ struct gpu_ops {
484 void (*cbc_clean)(struct gk20a *g); 485 void (*cbc_clean)(struct gk20a *g);
485 void (*tlb_invalidate)(struct vm_gk20a *vm); 486 void (*tlb_invalidate)(struct vm_gk20a *vm);
486 void (*set_big_page_size)(struct gk20a *g, 487 void (*set_big_page_size)(struct gk20a *g,
487 void *inst_ptr, int size); 488 struct mem_desc *mem, int size);
488 u32 (*get_big_page_sizes)(void); 489 u32 (*get_big_page_sizes)(void);
489 u32 (*get_physical_addr_bits)(struct gk20a *g); 490 u32 (*get_physical_addr_bits)(struct gk20a *g);
490 int (*init_mm_setup_hw)(struct gk20a *g); 491 int (*init_mm_setup_hw)(struct gk20a *g);
@@ -493,7 +494,8 @@ struct gpu_ops {
493 void (*remove_bar2_vm)(struct gk20a *g); 494 void (*remove_bar2_vm)(struct gk20a *g);
494 const struct gk20a_mmu_level * 495 const struct gk20a_mmu_level *
495 (*get_mmu_levels)(struct gk20a *g, u32 big_page_size); 496 (*get_mmu_levels)(struct gk20a *g, u32 big_page_size);
496 void (*init_pdb)(struct gk20a *g, void *inst_ptr, u64 pdb_addr); 497 void (*init_pdb)(struct gk20a *g, struct mem_desc *mem,
498 u64 pdb_addr);
497 u64 (*get_iova_addr)(struct gk20a *g, struct scatterlist *sgl, 499 u64 (*get_iova_addr)(struct gk20a *g, struct scatterlist *sgl,
498 u32 flags); 500 u32 flags);
499 int (*bar1_bind)(struct gk20a *g, u64 bar1_iova); 501 int (*bar1_bind)(struct gk20a *g, u64 bar1_iova);
@@ -859,53 +861,6 @@ do { \
859#define gk20a_dbg_info(fmt, arg...) \ 861#define gk20a_dbg_info(fmt, arg...) \
860 gk20a_dbg(gpu_dbg_info, fmt, ##arg) 862 gk20a_dbg(gpu_dbg_info, fmt, ##arg)
861 863
862/* mem access with dbg_mem logging */
863static inline u8 gk20a_mem_rd08(void *ptr, int b)
864{
865 u8 _b = ((const u8 *)ptr)[b];
866#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
867 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, _b);
868#endif
869 return _b;
870}
871static inline u16 gk20a_mem_rd16(void *ptr, int s)
872{
873 u16 _s = ((const u16 *)ptr)[s];
874#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
875 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, _s);
876#endif
877 return _s;
878}
879static inline u32 gk20a_mem_rd32(void *ptr, int w)
880{
881 u32 _w = ((const u32 *)ptr)[w];
882#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
883 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + sizeof(u32)*w, _w);
884#endif
885 return _w;
886}
887static inline void gk20a_mem_wr08(void *ptr, int b, u8 data)
888{
889#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
890 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, data);
891#endif
892 ((u8 *)ptr)[b] = data;
893}
894static inline void gk20a_mem_wr16(void *ptr, int s, u16 data)
895{
896#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
897 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, data);
898#endif
899 ((u16 *)ptr)[s] = data;
900}
901static inline void gk20a_mem_wr32(void *ptr, int w, u32 data)
902{
903#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
904 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u32)*w, data);
905#endif
906 ((u32 *)ptr)[w] = data;
907}
908
909void gk20a_init_clk_ops(struct gpu_ops *gops); 864void gk20a_init_clk_ops(struct gpu_ops *gops);
910 865
911/* register accessors */ 866/* register accessors */
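Note: the raw-pointer inline accessors removed above are replaced by mem_desc-based ones that are not visible in this hunk (presumably declared in mm_gk20a.h, which the diffstat also touches). Inferred purely from the call sites in this patch, their shapes would be roughly:

    /* inferred from call sites in this patch; not copied from mm_gk20a.h,
     * return types of the _n helpers are a guess */
    int  gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem);
    void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem);
    u32  gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w);      /* word index  */
    u32  gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset);   /* byte offset */
    void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data);
    void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data);
    void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
                        void *dest, u32 size);
    void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
                        void *src, u32 size);
    void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
                      u32 value, u32 size);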
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 4e7c36ee..e7e6662a 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -97,22 +97,18 @@ int gr_gk20a_get_ctx_id(struct gk20a *g,
97 u32 *ctx_id) 97 u32 *ctx_id)
98{ 98{
99 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; 99 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
100 void *ctx_ptr = NULL;
101 100
102 /* Channel gr_ctx buffer is gpu cacheable. 101 /* Channel gr_ctx buffer is gpu cacheable.
103 Flush and invalidate before cpu update. */ 102 Flush and invalidate before cpu update. */
104 g->ops.mm.l2_flush(g, true); 103 g->ops.mm.l2_flush(g, true);
105 104
106 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 105 if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem))
107 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
108 0, pgprot_writecombine(PAGE_KERNEL));
109 if (!ctx_ptr)
110 return -ENOMEM; 106 return -ENOMEM;
111 107
112 *ctx_id = gk20a_mem_rd32(ctx_ptr + 108 *ctx_id = gk20a_mem_rd(g, &ch_ctx->gr_ctx->mem,
113 ctxsw_prog_main_image_context_id_o(), 0); 109 ctxsw_prog_main_image_context_id_o());
114 110
115 vunmap(ctx_ptr); 111 gk20a_mem_end(g, &ch_ctx->gr_ctx->mem);
116 112
117 return 0; 113 return 0;
118} 114}
@@ -619,22 +615,17 @@ static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
619{ 615{
620 u32 addr_lo; 616 u32 addr_lo;
621 u32 addr_hi; 617 u32 addr_hi;
622 void *inst_ptr = NULL;
623 618
624 gk20a_dbg_fn(""); 619 gk20a_dbg_fn("");
625 620
626 inst_ptr = c->inst_block.cpu_va;
627 if (!inst_ptr)
628 return -ENOMEM;
629
630 addr_lo = u64_lo32(gpu_va) >> 12; 621 addr_lo = u64_lo32(gpu_va) >> 12;
631 addr_hi = u64_hi32(gpu_va); 622 addr_hi = u64_hi32(gpu_va);
632 623
633 gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(), 624 gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(),
634 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() | 625 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
635 ram_in_gr_wfi_ptr_lo_f(addr_lo)); 626 ram_in_gr_wfi_ptr_lo_f(addr_lo));
636 627
637 gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(), 628 gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(),
638 ram_in_gr_wfi_ptr_hi_f(addr_hi)); 629 ram_in_gr_wfi_ptr_hi_f(addr_hi));
639 630
640 return 0; 631 return 0;
@@ -658,11 +649,7 @@ int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
658 return -EBUSY; 649 return -EBUSY;
659 } 650 }
660 651
661 ch_ctx->patch_ctx.mem.cpu_va = vmap(ch_ctx->patch_ctx.mem.pages, 652 if (gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem))
662 PAGE_ALIGN(ch_ctx->patch_ctx.mem.size) >> PAGE_SHIFT,
663 0, pgprot_writecombine(PAGE_KERNEL));
664
665 if (!ch_ctx->patch_ctx.mem.cpu_va)
666 return -ENOMEM; 653 return -ENOMEM;
667 654
668 return 0; 655 return 0;
@@ -677,8 +664,7 @@ int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
677 return -EINVAL; 664 return -EINVAL;
678 } 665 }
679 666
680 vunmap(ch_ctx->patch_ctx.mem.cpu_va); 667 gk20a_mem_end(g, &ch_ctx->patch_ctx.mem);
681 ch_ctx->patch_ctx.mem.cpu_va = NULL;
682 return 0; 668 return 0;
683} 669}
684 670
@@ -687,7 +673,6 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g,
687 u32 addr, u32 data, bool patch) 673 u32 addr, u32 data, bool patch)
688{ 674{
689 u32 patch_slot = 0; 675 u32 patch_slot = 0;
690 void *patch_ptr = NULL;
691 bool mapped_here = false; 676 bool mapped_here = false;
692 677
693 BUG_ON(patch != 0 && ch_ctx == NULL); 678 BUG_ON(patch != 0 && ch_ctx == NULL);
@@ -708,11 +693,10 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g,
708 } else 693 } else
709 mapped_here = false; 694 mapped_here = false;
710 695
711 patch_ptr = ch_ctx->patch_ctx.mem.cpu_va;
712 patch_slot = ch_ctx->patch_ctx.data_count * 2; 696 patch_slot = ch_ctx->patch_ctx.data_count * 2;
713 697
714 gk20a_mem_wr32(patch_ptr, patch_slot++, addr); 698 gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, addr);
715 gk20a_mem_wr32(patch_ptr, patch_slot++, data); 699 gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, data);
716 700
717 ch_ctx->patch_ctx.data_count++; 701 ch_ctx->patch_ctx.data_count++;
718 702
@@ -760,16 +744,13 @@ static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
760static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) 744static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c)
761{ 745{
762 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; 746 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
747 struct mem_desc *mem = &ch_ctx->gr_ctx->mem;
763 u32 va_lo, va_hi, va; 748 u32 va_lo, va_hi, va;
764 int ret = 0; 749 int ret = 0;
765 void *ctx_ptr = NULL;
766 750
767 gk20a_dbg_fn(""); 751 gk20a_dbg_fn("");
768 752
769 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 753 if (gk20a_mem_begin(g, mem))
770 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
771 0, pgprot_writecombine(PAGE_KERNEL));
772 if (!ctx_ptr)
773 return -ENOMEM; 754 return -ENOMEM;
774 755
775 if (ch_ctx->zcull_ctx.gpu_va == 0 && 756 if (ch_ctx->zcull_ctx.gpu_va == 0 &&
@@ -792,15 +773,17 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c)
792 goto clean_up; 773 goto clean_up;
793 } 774 }
794 775
795 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0, 776 gk20a_mem_wr(g, mem,
777 ctxsw_prog_main_image_zcull_o(),
796 ch_ctx->zcull_ctx.ctx_sw_mode); 778 ch_ctx->zcull_ctx.ctx_sw_mode);
797 779
798 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va); 780 gk20a_mem_wr(g, mem,
781 ctxsw_prog_main_image_zcull_ptr_o(), va);
799 782
800 c->g->ops.fifo.enable_channel(c); 783 c->g->ops.fifo.enable_channel(c);
801 784
802clean_up: 785clean_up:
803 vunmap(ctx_ptr); 786 gk20a_mem_end(g, mem);
804 787
805 return ret; 788 return ret;
806} 789}
@@ -1500,8 +1483,8 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1500 u32 ctx_header_words; 1483 u32 ctx_header_words;
1501 u32 i; 1484 u32 i;
1502 u32 data; 1485 u32 data;
1503 void *ctx_ptr = NULL; 1486 struct mem_desc *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem;
1504 void *gold_ptr = NULL; 1487 struct mem_desc *gr_mem = &ch_ctx->gr_ctx->mem;
1505 u32 err = 0; 1488 u32 err = 0;
1506 1489
1507 gk20a_dbg_fn(""); 1490 gk20a_dbg_fn("");
@@ -1527,16 +1510,10 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1527 if (err) 1510 if (err)
1528 goto clean_up; 1511 goto clean_up;
1529 1512
1530 gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].mem.pages, 1513 if (gk20a_mem_begin(g, gold_mem))
1531 PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].mem.size) >>
1532 PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL));
1533 if (!gold_ptr)
1534 goto clean_up; 1514 goto clean_up;
1535 1515
1536 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 1516 if (gk20a_mem_begin(g, gr_mem))
1537 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
1538 0, pgprot_writecombine(PAGE_KERNEL));
1539 if (!ctx_ptr)
1540 goto clean_up; 1517 goto clean_up;
1541 1518
1542 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32)); 1519 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
@@ -1545,14 +1522,14 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1545 g->ops.mm.l2_flush(g, true); 1522 g->ops.mm.l2_flush(g, true);
1546 1523
1547 for (i = 0; i < ctx_header_words; i++) { 1524 for (i = 0; i < ctx_header_words; i++) {
1548 data = gk20a_mem_rd32(ctx_ptr, i); 1525 data = gk20a_mem_rd32(g, gr_mem, i);
1549 gk20a_mem_wr32(gold_ptr, i, data); 1526 gk20a_mem_wr32(g, gold_mem, i, data);
1550 } 1527 }
1551 1528
1552 gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0, 1529 gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(),
1553 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v()); 1530 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1554 1531
1555 gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0); 1532 gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_ptr_o(), 0);
1556 1533
1557 gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]); 1534 gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1558 1535
@@ -1568,12 +1545,12 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1568 goto clean_up; 1545 goto clean_up;
1569 } 1546 }
1570 1547
1571 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) 1548 gk20a_mem_rd_n(g, gold_mem, 0,
1572 gr->ctx_vars.local_golden_image[i] = 1549 gr->ctx_vars.local_golden_image,
1573 gk20a_mem_rd32(gold_ptr, i); 1550 gr->ctx_vars.golden_image_size);
1574 } 1551 }
1575 1552
1576 gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); 1553 gr_gk20a_commit_inst(c, gr_mem->gpu_va);
1577 1554
1578 gr->ctx_vars.golden_image_initialized = true; 1555 gr->ctx_vars.golden_image_initialized = true;
1579 1556
@@ -1586,10 +1563,8 @@ clean_up:
1586 else 1563 else
1587 gk20a_dbg_fn("done"); 1564 gk20a_dbg_fn("done");
1588 1565
1589 if (gold_ptr) 1566 gk20a_mem_end(g, gold_mem);
1590 vunmap(gold_ptr); 1567 gk20a_mem_end(g, gr_mem);
1591 if (ctx_ptr)
1592 vunmap(ctx_ptr);
1593 1568
1594 mutex_unlock(&gr->ctx_mutex); 1569 mutex_unlock(&gr->ctx_mutex);
1595 return err; 1570 return err;
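Note: the golden-image copy collapses from a per-word loop into one bulk gk20a_mem_rd_n() call. The _n accessors take byte offsets and byte sizes, which is why the old "/ 4" loop bound disappears:

    /* before (per-word, via the vmap()ed gold_ptr):
     *      for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
     *              gr->ctx_vars.local_golden_image[i] =
     *                      gk20a_mem_rd32(gold_ptr, i);
     * after (bulk copy; offset and size are in bytes): */
    gk20a_mem_rd_n(g, gold_mem, 0,
                   gr->ctx_vars.local_golden_image,
                   gr->ctx_vars.golden_image_size);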
@@ -1600,7 +1575,7 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1600 bool enable_smpc_ctxsw) 1575 bool enable_smpc_ctxsw)
1601{ 1576{
1602 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; 1577 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1603 void *ctx_ptr = NULL; 1578 struct mem_desc *mem;
1604 u32 data; 1579 u32 data;
1605 int ret; 1580 int ret;
1606 1581
@@ -1611,46 +1586,39 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1611 return -EFAULT; 1586 return -EFAULT;
1612 } 1587 }
1613 1588
1589 mem = &ch_ctx->gr_ctx->mem;
1590
1614 c->g->ops.fifo.disable_channel(c); 1591 c->g->ops.fifo.disable_channel(c);
1615 ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); 1592 ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid);
1616 if (ret) { 1593 if (ret) {
1617 c->g->ops.fifo.enable_channel(c); 1594 gk20a_err(dev_from_gk20a(g), "failed to preempt channel");
1618 gk20a_err(dev_from_gk20a(g), 1595 goto out;
1619 "failed to preempt channel\n");
1620 return ret;
1621 } 1596 }
1622 1597
1623 /* Channel gr_ctx buffer is gpu cacheable. 1598 /* Channel gr_ctx buffer is gpu cacheable.
1624 Flush and invalidate before cpu update. */ 1599 Flush and invalidate before cpu update. */
1625 g->ops.mm.l2_flush(g, true); 1600 g->ops.mm.l2_flush(g, true);
1626 1601
1627 if (!ch_ctx->gr_ctx) { 1602 if (gk20a_mem_begin(g, mem)) {
1628 gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); 1603 ret = -ENOMEM;
1629 return -EFAULT; 1604 goto out;
1630 }
1631
1632 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
1633 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
1634 0, pgprot_writecombine(PAGE_KERNEL));
1635 if (!ctx_ptr) {
1636 c->g->ops.fifo.enable_channel(c);
1637 return -ENOMEM;
1638 } 1605 }
1639 1606
1640 data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); 1607 data = gk20a_mem_rd(g, mem,
1608 ctxsw_prog_main_image_pm_o());
1641 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m(); 1609 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1642 data |= enable_smpc_ctxsw ? 1610 data |= enable_smpc_ctxsw ?
1643 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() : 1611 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1644 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f(); 1612 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1645 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, 1613 gk20a_mem_wr(g, mem,
1646 data); 1614 ctxsw_prog_main_image_pm_o(),
1615 data);
1647 1616
1648 vunmap(ctx_ptr); 1617 gk20a_mem_end(g, mem);
1649 1618
1650 /* enable channel */ 1619out:
1651 c->g->ops.fifo.enable_channel(c); 1620 c->g->ops.fifo.enable_channel(c);
1652 1621 return ret;
1653 return 0;
1654} 1622}
1655 1623
1656int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, 1624int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
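Note: besides the accessor conversion, gr_gk20a_update_smpc_ctxsw_mode() is restructured so every exit path re-enables the channel through a single out: label instead of duplicating the enable/return in each error branch; the resulting shape is:

    c->g->ops.fifo.disable_channel(c);
    ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid);
    if (ret)
            goto out;               /* channel still gets re-enabled */

    /* ... flush, gk20a_mem_begin/rd/wr/end on the gr ctx image ... */

    out:
            c->g->ops.fifo.enable_channel(c);
            return ret;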
@@ -1659,8 +1627,7 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1659{ 1627{
1660 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; 1628 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1661 struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; 1629 struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx;
1662 void *ctx_ptr = NULL; 1630 struct mem_desc *gr_mem;
1663 void *pm_ctx_ptr;
1664 u32 data, virt_addr; 1631 u32 data, virt_addr;
1665 int ret; 1632 int ret;
1666 1633
@@ -1671,6 +1638,8 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1671 return -EFAULT; 1638 return -EFAULT;
1672 } 1639 }
1673 1640
1641 gr_mem = &ch_ctx->gr_ctx->mem;
1642
1674 if (enable_hwpm_ctxsw) { 1643 if (enable_hwpm_ctxsw) {
1675 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) 1644 if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f())
1676 return 0; 1645 return 0;
@@ -1721,29 +1690,22 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1721 } 1690 }
1722 1691
1723 /* Now clear the buffer */ 1692 /* Now clear the buffer */
1724 pm_ctx_ptr = vmap(pm_ctx->mem.pages, 1693 if (gk20a_mem_begin(g, &pm_ctx->mem)) {
1725 PAGE_ALIGN(pm_ctx->mem.size) >> PAGE_SHIFT,
1726 0, pgprot_writecombine(PAGE_KERNEL));
1727
1728 if (!pm_ctx_ptr) {
1729 ret = -ENOMEM; 1694 ret = -ENOMEM;
1730 goto cleanup_pm_buf; 1695 goto cleanup_pm_buf;
1731 } 1696 }
1732 1697
1733 memset(pm_ctx_ptr, 0, pm_ctx->mem.size); 1698 gk20a_memset(g, &pm_ctx->mem, 0, 0, pm_ctx->mem.size);
1734 1699
1735 vunmap(pm_ctx_ptr); 1700 gk20a_mem_end(g, &pm_ctx->mem);
1736 } 1701 }
1737 1702
1738 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 1703 if (gk20a_mem_begin(g, gr_mem)) {
1739 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
1740 0, pgprot_writecombine(PAGE_KERNEL));
1741 if (!ctx_ptr) {
1742 ret = -ENOMEM; 1704 ret = -ENOMEM;
1743 goto cleanup_pm_buf; 1705 goto cleanup_pm_buf;
1744 } 1706 }
1745 1707
1746 data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); 1708 data = gk20a_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o());
1747 data = data & ~ctxsw_prog_main_image_pm_mode_m(); 1709 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1748 1710
1749 if (enable_hwpm_ctxsw) { 1711 if (enable_hwpm_ctxsw) {
@@ -1760,10 +1722,10 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
1760 1722
1761 data |= pm_ctx->pm_mode; 1723 data |= pm_ctx->pm_mode;
1762 1724
1763 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); 1725 gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data);
1764 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); 1726 gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr);
1765 1727
1766 vunmap(ctx_ptr); 1728 gk20a_mem_end(g, gr_mem);
1767 1729
1768 /* enable channel */ 1730 /* enable channel */
1769 c->g->ops.fifo.enable_channel(c); 1731 c->g->ops.fifo.enable_channel(c);
@@ -1788,9 +1750,9 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1788 u32 virt_addr_lo; 1750 u32 virt_addr_lo;
1789 u32 virt_addr_hi; 1751 u32 virt_addr_hi;
1790 u32 virt_addr = 0; 1752 u32 virt_addr = 0;
1791 u32 i, v, data; 1753 u32 v, data;
1792 int ret = 0; 1754 int ret = 0;
1793 void *ctx_ptr = NULL; 1755 struct mem_desc *mem = &ch_ctx->gr_ctx->mem;
1794 1756
1795 gk20a_dbg_fn(""); 1757 gk20a_dbg_fn("");
1796 1758
@@ -1801,20 +1763,18 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1801 Flush and invalidate before cpu update. */ 1763 Flush and invalidate before cpu update. */
1802 g->ops.mm.l2_flush(g, true); 1764 g->ops.mm.l2_flush(g, true);
1803 1765
1804 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 1766 if (gk20a_mem_begin(g, mem))
1805 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
1806 0, pgprot_writecombine(PAGE_KERNEL));
1807 if (!ctx_ptr)
1808 return -ENOMEM; 1767 return -ENOMEM;
1809 1768
1810 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) 1769 gk20a_mem_wr_n(g, mem, 0,
1811 gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]); 1770 gr->ctx_vars.local_golden_image,
1771 gr->ctx_vars.golden_image_size);
1812 1772
1813 if (g->ops.gr.enable_cde_in_fecs && c->cde) 1773 if (g->ops.gr.enable_cde_in_fecs && c->cde)
1814 g->ops.gr.enable_cde_in_fecs(ctx_ptr); 1774 g->ops.gr.enable_cde_in_fecs(g, mem);
1815 1775
1816 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0); 1776 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_save_ops_o(), 0);
1817 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0); 1777 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_restore_ops_o(), 0);
1818 1778
1819 /* set priv access map */ 1779 /* set priv access map */
1820 virt_addr_lo = 1780 virt_addr_lo =
@@ -1827,29 +1787,29 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1827 else 1787 else
1828 data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f(); 1788 data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();
1829 1789
1830 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0, 1790 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(),
1831 data); 1791 data);
1832 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0, 1792 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
1833 virt_addr_lo); 1793 virt_addr_lo);
1834 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0, 1794 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
1835 virt_addr_hi); 1795 virt_addr_hi);
1836 /* disable verif features */ 1796 /* disable verif features */
1837 v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0); 1797 v = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o());
1838 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m()); 1798 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1839 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f(); 1799 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1840 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v); 1800 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v);
1841 1801
1842 if (g->ops.gr.update_ctxsw_preemption_mode) 1802 if (g->ops.gr.update_ctxsw_preemption_mode)
1843 g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, ctx_ptr); 1803 g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, mem);
1844 1804
1845 virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); 1805 virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va);
1846 virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); 1806 virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va);
1847 1807
1848 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0, 1808 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(),
1849 ch_ctx->patch_ctx.data_count); 1809 ch_ctx->patch_ctx.data_count);
1850 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0, 1810 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_lo_o(),
1851 virt_addr_lo); 1811 virt_addr_lo);
1852 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0, 1812 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_hi_o(),
1853 virt_addr_hi); 1813 virt_addr_hi);
1854 1814
1855 /* Update main header region of the context buffer with the info needed 1815 /* Update main header region of the context buffer with the info needed
@@ -1860,7 +1820,7 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1860 if (ch_ctx->pm_ctx.mem.gpu_va == 0) { 1820 if (ch_ctx->pm_ctx.mem.gpu_va == 0) {
1861 gk20a_err(dev_from_gk20a(g), 1821 gk20a_err(dev_from_gk20a(g),
1862 "context switched pm with no pm buffer!"); 1822 "context switched pm with no pm buffer!");
1863 vunmap(ctx_ptr); 1823 gk20a_mem_end(g, mem);
1864 return -EFAULT; 1824 return -EFAULT;
1865 } 1825 }
1866 1826
@@ -1871,14 +1831,14 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1871 } else 1831 } else
1872 virt_addr = 0; 1832 virt_addr = 0;
1873 1833
1874 data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); 1834 data = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_pm_o());
1875 data = data & ~ctxsw_prog_main_image_pm_mode_m(); 1835 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1876 data |= ch_ctx->pm_ctx.pm_mode; 1836 data |= ch_ctx->pm_ctx.pm_mode;
1877 1837
1878 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); 1838 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_pm_o(), data);
1879 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); 1839 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr);
1880 1840
1881 vunmap(ctx_ptr); 1841 gk20a_mem_end(g, mem);
1882 1842
1883 if (tegra_platform_is_linsim()) { 1843 if (tegra_platform_is_linsim()) {
1884 u32 inst_base_ptr = 1844 u32 inst_base_ptr =
@@ -1978,16 +1938,20 @@ static void gr_gk20a_init_ctxsw_ucode_segments(
1978} 1938}
1979 1939
1980static int gr_gk20a_copy_ctxsw_ucode_segments( 1940static int gr_gk20a_copy_ctxsw_ucode_segments(
1981 u8 *buf, 1941 struct gk20a *g,
1942 struct mem_desc *dst,
1982 struct gk20a_ctxsw_ucode_segments *segments, 1943 struct gk20a_ctxsw_ucode_segments *segments,
1983 u32 *bootimage, 1944 u32 *bootimage,
1984 u32 *code, u32 *data) 1945 u32 *code, u32 *data)
1985{ 1946{
1986 int i; 1947 int i;
1987 1948
1988 memcpy(buf + segments->boot.offset, bootimage, segments->boot.size); 1949 gk20a_mem_wr_n(g, dst, segments->boot.offset, bootimage,
1989 memcpy(buf + segments->code.offset, code, segments->code.size); 1950 segments->boot.size);
1990 memcpy(buf + segments->data.offset, data, segments->data.size); 1951 gk20a_mem_wr_n(g, dst, segments->code.offset, code,
1952 segments->code.size);
1953 gk20a_mem_wr_n(g, dst, segments->data.offset, data,
1954 segments->data.size);
1991 1955
1992 /* compute a "checksum" for the boot binary to detect its version */ 1956 /* compute a "checksum" for the boot binary to detect its version */
1993 segments->boot_signature = 0; 1957 segments->boot_signature = 0;
@@ -2009,7 +1973,6 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2009 u32 *fecs_boot_image; 1973 u32 *fecs_boot_image;
2010 u32 *gpccs_boot_image; 1974 u32 *gpccs_boot_image;
2011 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; 1975 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2012 u8 *buf;
2013 u32 ucode_size; 1976 u32 ucode_size;
2014 int err = 0; 1977 int err = 0;
2015 1978
@@ -2049,14 +2012,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2049 if (err) 2012 if (err)
2050 goto clean_up; 2013 goto clean_up;
2051 2014
2052 buf = (u8 *)ucode_info->surface_desc.cpu_va; 2015 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2053 if (!buf) { 2016 &ucode_info->fecs,
2054 gk20a_err(d, "failed to map surface desc buffer");
2055 err = -ENOMEM;
2056 goto clean_up;
2057 }
2058
2059 gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
2060 fecs_boot_image, 2017 fecs_boot_image,
2061 g->gr.ctx_vars.ucode.fecs.inst.l, 2018 g->gr.ctx_vars.ucode.fecs.inst.l,
2062 g->gr.ctx_vars.ucode.fecs.data.l); 2019 g->gr.ctx_vars.ucode.fecs.data.l);
@@ -2064,7 +2021,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
2064 release_firmware(fecs_fw); 2021 release_firmware(fecs_fw);
2065 fecs_fw = NULL; 2022 fecs_fw = NULL;
2066 2023
2067 gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs, 2024 gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc,
2025 &ucode_info->gpccs,
2068 gpccs_boot_image, 2026 gpccs_boot_image,
2069 g->gr.ctx_vars.ucode.gpccs.inst.l, 2027 g->gr.ctx_vars.ucode.gpccs.inst.l,
2070 g->gr.ctx_vars.ucode.gpccs.data.l); 2028 g->gr.ctx_vars.ucode.gpccs.data.l);
@@ -4690,41 +4648,38 @@ out:
4690static int gr_gk20a_init_access_map(struct gk20a *g) 4648static int gr_gk20a_init_access_map(struct gk20a *g)
4691{ 4649{
4692 struct gr_gk20a *gr = &g->gr; 4650 struct gr_gk20a *gr = &g->gr;
4693 void *data; 4651 struct mem_desc *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem;
4694 int err = 0;
4695 u32 w, nr_pages = 4652 u32 w, nr_pages =
4696 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size, 4653 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4697 PAGE_SIZE); 4654 PAGE_SIZE);
4698 u32 *whitelist = NULL; 4655 u32 *whitelist = NULL;
4699 int num_entries = 0; 4656 int num_entries = 0;
4700 4657
4701 data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.pages, 4658 if (gk20a_mem_begin(g, mem)) {
4702 PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.size) >>
4703 PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL));
4704 if (!data) {
4705 gk20a_err(dev_from_gk20a(g), 4659 gk20a_err(dev_from_gk20a(g),
4706 "failed to map priv access map memory"); 4660 "failed to map priv access map memory");
4707 err = -ENOMEM; 4661 return -ENOMEM;
4708 goto clean_up;
4709 } 4662 }
4710 4663
4711 memset(data, 0x0, PAGE_SIZE * nr_pages); 4664 gk20a_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages);
4712 4665
4713 g->ops.gr.get_access_map(g, &whitelist, &num_entries); 4666 g->ops.gr.get_access_map(g, &whitelist, &num_entries);
4714 4667
4715 for (w = 0; w < num_entries; w++) { 4668 for (w = 0; w < num_entries; w++) {
4716 u32 map_bit, map_byte, map_shift; 4669 u32 map_bit, map_byte, map_shift, x;
4717 map_bit = whitelist[w] >> 2; 4670 map_bit = whitelist[w] >> 2;
4718 map_byte = map_bit >> 3; 4671 map_byte = map_bit >> 3;
4719 map_shift = map_bit & 0x7; /* i.e. 0-7 */ 4672 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4720 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d", 4673 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4721 whitelist[w], map_byte, map_shift); 4674 whitelist[w], map_byte, map_shift);
4722 ((u8 *)data)[map_byte] |= 1 << map_shift; 4675 x = gk20a_mem_rd32(g, mem, map_byte / sizeof(u32));
4676 x |= 1 << (
4677 (map_byte % sizeof(u32) * BITS_PER_BYTE)
4678 + map_shift);
4679 gk20a_mem_wr32(g, mem, map_byte / sizeof(u32), x);
4723 } 4680 }
4724 4681
4725clean_up: 4682 gk20a_mem_end(g, mem);
4726 if (data)
4727 vunmap(data);
4728 return 0; 4683 return 0;
4729} 4684}
4730 4685
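Note: gr_gk20a_init_access_map() used to set whitelist bits byte-wise through the vmap()ed map; with word-granular accessors it becomes a read-modify-write of the containing u32. A worked example with an illustrative whitelist address (the byte-to-word equivalence assumes the little-endian layout of these Tegra SoCs):

    /* whitelist[w] = 0x418c (illustrative value)
     *   map_bit   = 0x418c >> 2 = 0x1063
     *   map_byte  = 0x1063 >> 3 = 0x20c
     *   map_shift = 0x1063 & 7  = 3
     * old: ((u8 *)data)[0x20c] |= 1 << 3;
     * new: word 0x20c / 4 = 0x83, bit (0x20c % 4) * 8 + 3 = 3 */
    x = gk20a_mem_rd32(g, mem, map_byte / sizeof(u32));
    x |= 1 << ((map_byte % sizeof(u32)) * BITS_PER_BYTE + map_shift);
    gk20a_mem_wr32(g, mem, map_byte / sizeof(u32), x);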
@@ -6659,7 +6614,7 @@ static void gr_gk20a_init_sm_dsm_reg_info(void)
6659static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, 6614static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6660 struct channel_ctx_gk20a *ch_ctx, 6615 struct channel_ctx_gk20a *ch_ctx,
6661 u32 addr, u32 data, 6616 u32 addr, u32 data,
6662 u8 *context) 6617 struct mem_desc *mem)
6663{ 6618{
6664 u32 num_gpc = g->gr.gpc_count; 6619 u32 num_gpc = g->gr.gpc_count;
6665 u32 num_tpc; 6620 u32 num_tpc;
@@ -6688,8 +6643,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6688 /* reset the patch count from previous 6643 /* reset the patch count from previous
6689 runs,if ucode has already processed 6644 runs,if ucode has already processed
6690 it */ 6645 it */
6691 tmp = gk20a_mem_rd32(context + 6646 tmp = gk20a_mem_rd(g, mem,
6692 ctxsw_prog_main_image_patch_count_o(), 0); 6647 ctxsw_prog_main_image_patch_count_o());
6693 6648
6694 if (!tmp) 6649 if (!tmp)
6695 ch_ctx->patch_ctx.data_count = 0; 6650 ch_ctx->patch_ctx.data_count = 0;
@@ -6700,15 +6655,15 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
6700 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); 6655 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va);
6701 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); 6656 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va);
6702 6657
6703 gk20a_mem_wr32(context + 6658 gk20a_mem_wr(g, mem,
6704 ctxsw_prog_main_image_patch_count_o(), 6659 ctxsw_prog_main_image_patch_count_o(),
6705 0, ch_ctx->patch_ctx.data_count); 6660 ch_ctx->patch_ctx.data_count);
6706 gk20a_mem_wr32(context + 6661 gk20a_mem_wr(g, mem,
6707 ctxsw_prog_main_image_patch_adr_lo_o(), 6662 ctxsw_prog_main_image_patch_adr_lo_o(),
6708 0, vaddr_lo); 6663 vaddr_lo);
6709 gk20a_mem_wr32(context + 6664 gk20a_mem_wr(g, mem,
6710 ctxsw_prog_main_image_patch_adr_hi_o(), 6665 ctxsw_prog_main_image_patch_adr_hi_o(),
6711 0, vaddr_hi); 6666 vaddr_hi);
6712 6667
6713 /* we're not caching these on cpu side, 6668 /* we're not caching these on cpu side,
6714 but later watch for it */ 6669 but later watch for it */
@@ -6760,17 +6715,15 @@ static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
6760 6715
6761#define ILLEGAL_ID (~0) 6716#define ILLEGAL_ID (~0)
6762 6717
6763static inline bool check_main_image_header_magic(void *context) 6718static inline bool check_main_image_header_magic(u8 *context)
6764{ 6719{
6765 u32 magic = gk20a_mem_rd32(context + 6720 u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o());
6766 ctxsw_prog_main_image_magic_value_o(), 0);
6767 gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic); 6721 gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic);
6768 return magic == ctxsw_prog_main_image_magic_value_v_value_v(); 6722 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
6769} 6723}
6770static inline bool check_local_header_magic(void *context) 6724static inline bool check_local_header_magic(u8 *context)
6771{ 6725{
6772 u32 magic = gk20a_mem_rd32(context + 6726 u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o());
6773 ctxsw_prog_local_magic_value_o(), 0);
6774 gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic); 6727 gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic);
6775 return magic == ctxsw_prog_local_magic_value_v_value_v(); 6728 return magic == ctxsw_prog_local_magic_value_v_value_v();
6776 6729
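Note: check_main_image_header_magic()/check_local_header_magic() and the parsing code below now take u8 * and read with plain casts rather than the removed gk20a_mem_rd32(); the context_buffer they walk is an ordinary CPU-side copy (the comment added to gr_gk20a_find_priv_offset_in_buffer makes this explicit), so no mem_desc bracket is needed. A hypothetical wrapper, if one wanted to avoid repeating the cast (not part of the patch):

    static inline u32 ctx_rd32(const u8 *context, u32 byte_off)
    {
            /* direct load from the CPU-resident copy of the context image */
            return *(const u32 *)(context + byte_off);
    }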
@@ -6814,7 +6767,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6814 u32 num_gpcs, num_tpcs; 6767 u32 num_gpcs, num_tpcs;
6815 u32 chk_addr; 6768 u32 chk_addr;
6816 u32 ext_priv_offset, ext_priv_size; 6769 u32 ext_priv_offset, ext_priv_size;
6817 void *context; 6770 u8 *context;
6818 u32 offset_to_segment, offset_to_segment_end; 6771 u32 offset_to_segment, offset_to_segment_end;
6819 u32 sm_dsm_perf_reg_id = ILLEGAL_ID; 6772 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
6820 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID; 6773 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
@@ -6856,14 +6809,14 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6856 /* note below is in words/num_registers */ 6809 /* note below is in words/num_registers */
6857 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2; 6810 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
6858 6811
6859 context = context_buffer; 6812 context = (u8 *)context_buffer;
6860 /* sanity check main header */ 6813 /* sanity check main header */
6861 if (!check_main_image_header_magic(context)) { 6814 if (!check_main_image_header_magic(context)) {
6862 gk20a_err(dev_from_gk20a(g), 6815 gk20a_err(dev_from_gk20a(g),
6863 "Invalid main header: magic value"); 6816 "Invalid main header: magic value");
6864 return -EINVAL; 6817 return -EINVAL;
6865 } 6818 }
6866 num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); 6819 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
6867 if (gpc_num >= num_gpcs) { 6820 if (gpc_num >= num_gpcs) {
6868 gk20a_err(dev_from_gk20a(g), 6821 gk20a_err(dev_from_gk20a(g),
6869 "GPC 0x%08x is greater than total count 0x%08x!\n", 6822 "GPC 0x%08x is greater than total count 0x%08x!\n",
@@ -6871,7 +6824,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
6871 return -EINVAL; 6824 return -EINVAL;
6872 } 6825 }
6873 6826
6874 data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0); 6827 data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o());
6875 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32); 6828 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
6876 if (0 == ext_priv_size) { 6829 if (0 == ext_priv_size) {
6877 gk20a_dbg_info(" No extended memory in context buffer"); 6830 gk20a_dbg_info(" No extended memory in context buffer");
@@ -7149,7 +7102,7 @@ gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
7149} 7102}
7150 7103
7151static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, 7104static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
7152 void *context, 7105 u8 *context,
7153 u32 *num_ppcs, u32 *ppc_mask, 7106 u32 *num_ppcs, u32 *ppc_mask,
7154 u32 *reg_ppc_count) 7107 u32 *reg_ppc_count)
7155{ 7108{
@@ -7165,7 +7118,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
7165 (num_pes_per_gpc > 1))) 7118 (num_pes_per_gpc > 1)))
7166 return -EINVAL; 7119 return -EINVAL;
7167 7120
7168 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0); 7121 data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o());
7169 7122
7170 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32); 7123 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
7171 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32); 7124 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
@@ -7177,7 +7130,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
7177 7130
7178/* 7131/*
7179 * This function will return the 32 bit offset for a priv register if it is 7132 * This function will return the 32 bit offset for a priv register if it is
7180 * present in the context buffer. 7133 * present in the context buffer. The context buffer is in CPU memory.
7181 */ 7134 */
7182static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, 7135static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7183 u32 addr, 7136 u32 addr,
@@ -7196,7 +7149,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7196 u32 offset; 7149 u32 offset;
7197 u32 sys_priv_offset, gpc_priv_offset; 7150 u32 sys_priv_offset, gpc_priv_offset;
7198 u32 ppc_mask, reg_list_ppc_count; 7151 u32 ppc_mask, reg_list_ppc_count;
7199 void *context; 7152 u8 *context;
7200 u32 offset_to_segment; 7153 u32 offset_to_segment;
7201 7154
7202 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); 7155 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
@@ -7207,13 +7160,13 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7207 if (err) 7160 if (err)
7208 return err; 7161 return err;
7209 7162
7210 context = context_buffer; 7163 context = (u8 *)context_buffer;
7211 if (!check_main_image_header_magic(context)) { 7164 if (!check_main_image_header_magic(context)) {
7212 gk20a_err(dev_from_gk20a(g), 7165 gk20a_err(dev_from_gk20a(g),
7213 "Invalid main header: magic value"); 7166 "Invalid main header: magic value");
7214 return -EINVAL; 7167 return -EINVAL;
7215 } 7168 }
7216 num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); 7169 num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
7217 7170
7218 /* Parse the FECS local header. */ 7171 /* Parse the FECS local header. */
7219 context += ctxsw_prog_ucode_header_size_in_bytes(); 7172 context += ctxsw_prog_ucode_header_size_in_bytes();
@@ -7222,7 +7175,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7222 "Invalid FECS local header: magic value\n"); 7175 "Invalid FECS local header: magic value\n");
7223 return -EINVAL; 7176 return -EINVAL;
7224 } 7177 }
7225 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); 7178 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7226 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); 7179 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7227 7180
7228 /* If found in Ext buffer, ok. 7181 /* If found in Ext buffer, ok.
@@ -7268,7 +7221,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7268 return -EINVAL; 7221 return -EINVAL;
7269 7222
7270 } 7223 }
7271 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); 7224 data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
7272 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); 7225 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
7273 7226
7274 err = gr_gk20a_determine_ppc_configuration(g, context, 7227 err = gr_gk20a_determine_ppc_configuration(g, context,
@@ -7277,7 +7230,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
7277 if (err) 7230 if (err)
7278 return err; 7231 return err;
7279 7232
7280 num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0); 7233 num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o());
7281 7234
7282 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) { 7235 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
7283 gk20a_err(dev_from_gk20a(g), 7236 gk20a_err(dev_from_gk20a(g),
@@ -7689,9 +7642,9 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7689{ 7642{
7690 struct gk20a *g = ch->g; 7643 struct gk20a *g = ch->g;
7691 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; 7644 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
7692 void *ctx_ptr = NULL; 7645 bool gr_ctx_ready = false;
7693 void *pm_ctx_ptr = NULL; 7646 bool pm_ctx_ready = false;
7694 void *base_ptr = NULL; 7647 struct mem_desc *current_mem = NULL;
7695 bool ch_is_curr_ctx, restart_gr_ctxsw = false; 7648 bool ch_is_curr_ctx, restart_gr_ctxsw = false;
7696 u32 i, j, offset, v; 7649 u32 i, j, offset, v;
7697 struct gr_gk20a *gr = &g->gr; 7650 struct gr_gk20a *gr = &g->gr;
@@ -7821,20 +7774,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7821 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), 7774 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
7822 ctx_ops[i].quad); 7775 ctx_ops[i].quad);
7823 if (!err) { 7776 if (!err) {
7824 if (!ctx_ptr) { 7777 if (!gr_ctx_ready) {
7825 /* would have been a variant of 7778 /* would have been a variant of
7826 * gr_gk20a_apply_instmem_overrides, 7779 * gr_gk20a_apply_instmem_overrides,
7827 * recoded in-place instead. 7780 * recoded in-place instead.
7828 */ 7781 */
7829 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 7782 if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem)) {
7830 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT,
7831 0, pgprot_writecombine(PAGE_KERNEL));
7832 if (!ctx_ptr) {
7833 err = -ENOMEM; 7783 err = -ENOMEM;
7834 goto cleanup; 7784 goto cleanup;
7835 } 7785 }
7786 gr_ctx_ready = true;
7836 } 7787 }
7837 base_ptr = ctx_ptr; 7788 current_mem = &ch_ctx->gr_ctx->mem;
7838 } else { 7789 } else {
7839 err = gr_gk20a_get_pm_ctx_buffer_offsets(g, 7790 err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
7840 ctx_ops[i].offset, 7791 ctx_ops[i].offset,
@@ -7849,7 +7800,7 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7849 NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; 7800 NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
7850 continue; 7801 continue;
7851 } 7802 }
7852 if (!pm_ctx_ptr) { 7803 if (!pm_ctx_ready) {
7853 /* Make sure ctx buffer was initialized */ 7804 /* Make sure ctx buffer was initialized */
7854 if (!ch_ctx->pm_ctx.mem.pages) { 7805 if (!ch_ctx->pm_ctx.mem.pages) {
7855 gk20a_err(dev_from_gk20a(g), 7806 gk20a_err(dev_from_gk20a(g),
@@ -7857,15 +7808,13 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7857 err = -EINVAL; 7808 err = -EINVAL;
7858 goto cleanup; 7809 goto cleanup;
7859 } 7810 }
7860 pm_ctx_ptr = vmap(ch_ctx->pm_ctx.mem.pages, 7811 if (gk20a_mem_begin(g, &ch_ctx->pm_ctx.mem)) {
7861 PAGE_ALIGN(ch_ctx->pm_ctx.mem.size) >> PAGE_SHIFT,
7862 0, pgprot_writecombine(PAGE_KERNEL));
7863 if (!pm_ctx_ptr) {
7864 err = -ENOMEM; 7812 err = -ENOMEM;
7865 goto cleanup; 7813 goto cleanup;
7866 } 7814 }
7815 pm_ctx_ready = true;
7867 } 7816 }
7868 base_ptr = pm_ctx_ptr; 7817 current_mem = &ch_ctx->pm_ctx.mem;
7869 } 7818 }
7870 7819
7871 /* if this is a quad access, setup for special access*/ 7820 /* if this is a quad access, setup for special access*/
@@ -7878,24 +7827,24 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7878 /* sanity check gr ctxt offsets, 7827 /* sanity check gr ctxt offsets,
7879 * don't write outside, worst case 7828 * don't write outside, worst case
7880 */ 7829 */
7881 if ((base_ptr == ctx_ptr) && 7830 if ((current_mem == &ch_ctx->gr_ctx->mem) &&
7882 (offsets[j] >= g->gr.ctx_vars.golden_image_size)) 7831 (offsets[j] >= g->gr.ctx_vars.golden_image_size))
7883 continue; 7832 continue;
7884 if (pass == 0) { /* write pass */ 7833 if (pass == 0) { /* write pass */
7885 v = gk20a_mem_rd32(base_ptr + offsets[j], 0); 7834 v = gk20a_mem_rd(g, current_mem, offsets[j]);
7886 v &= ~ctx_ops[i].and_n_mask_lo; 7835 v &= ~ctx_ops[i].and_n_mask_lo;
7887 v |= ctx_ops[i].value_lo; 7836 v |= ctx_ops[i].value_lo;
7888 gk20a_mem_wr32(base_ptr + offsets[j], 0, v); 7837 gk20a_mem_wr(g, current_mem, offsets[j], v);
7889 7838
7890 gk20a_dbg(gpu_dbg_gpu_dbg, 7839 gk20a_dbg(gpu_dbg_gpu_dbg,
7891 "context wr: offset=0x%x v=0x%x", 7840 "context wr: offset=0x%x v=0x%x",
7892 offsets[j], v); 7841 offsets[j], v);
7893 7842
7894 if (ctx_ops[i].op == REGOP(WRITE_64)) { 7843 if (ctx_ops[i].op == REGOP(WRITE_64)) {
7895 v = gk20a_mem_rd32(base_ptr + offsets[j] + 4, 0); 7844 v = gk20a_mem_rd(g, current_mem, offsets[j] + 4);
7896 v &= ~ctx_ops[i].and_n_mask_hi; 7845 v &= ~ctx_ops[i].and_n_mask_hi;
7897 v |= ctx_ops[i].value_hi; 7846 v |= ctx_ops[i].value_hi;
7898 gk20a_mem_wr32(base_ptr + offsets[j] + 4, 0, v); 7847 gk20a_mem_wr(g, current_mem, offsets[j] + 4, v);
7899 7848
7900 gk20a_dbg(gpu_dbg_gpu_dbg, 7849 gk20a_dbg(gpu_dbg_gpu_dbg,
7901 "context wr: offset=0x%x v=0x%x", 7850 "context wr: offset=0x%x v=0x%x",
@@ -7905,18 +7854,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7905 /* check to see if we need to add a special WAR 7854 /* check to see if we need to add a special WAR
7906 for some of the SMPC perf regs */ 7855 for some of the SMPC perf regs */
7907 gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], 7856 gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
7908 v, base_ptr); 7857 v, current_mem);
7909 7858
7910 } else { /* read pass */ 7859 } else { /* read pass */
7911 ctx_ops[i].value_lo = 7860 ctx_ops[i].value_lo =
7912 gk20a_mem_rd32(base_ptr + offsets[0], 0); 7861 gk20a_mem_rd(g, current_mem, offsets[0]);
7913 7862
7914 gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", 7863 gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
7915 offsets[0], ctx_ops[i].value_lo); 7864 offsets[0], ctx_ops[i].value_lo);
7916 7865
7917 if (ctx_ops[i].op == REGOP(READ_64)) { 7866 if (ctx_ops[i].op == REGOP(READ_64)) {
7918 ctx_ops[i].value_hi = 7867 ctx_ops[i].value_hi =
7919 gk20a_mem_rd32(base_ptr + offsets[0] + 4, 0); 7868 gk20a_mem_rd(g, current_mem, offsets[0] + 4);
7920 7869
7921 gk20a_dbg(gpu_dbg_gpu_dbg, 7870 gk20a_dbg(gpu_dbg_gpu_dbg,
7922 "context rd: offset=0x%x v=0x%x", 7871 "context rd: offset=0x%x v=0x%x",
@@ -7943,12 +7892,10 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
7943 7892
7944 if (ch_ctx->patch_ctx.mem.cpu_va) 7893 if (ch_ctx->patch_ctx.mem.cpu_va)
7945 gr_gk20a_ctx_patch_write_end(g, ch_ctx); 7894 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
7946 7895 if (gr_ctx_ready)
7947 if (ctx_ptr) 7896 gk20a_mem_end(g, &ch_ctx->gr_ctx->mem);
7948 vunmap(ctx_ptr); 7897 if (pm_ctx_ready)
7949 7898 gk20a_mem_end(g, &ch_ctx->pm_ctx.mem);
7950 if (pm_ctx_ptr)
7951 vunmap(pm_ctx_ptr);
7952 7899
7953 if (restart_gr_ctxsw) { 7900 if (restart_gr_ctxsw) {
7954 int tmp_err = gr_gk20a_enable_ctxsw(g); 7901 int tmp_err = gr_gk20a_enable_ctxsw(g);
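The exec_ctx_ops rework above amounts to bracketing each context-buffer access with gk20a_mem_begin()/gk20a_mem_end() instead of keeping a vmap()'d pointer for the whole call, and tracking which buffer is currently mapped via gr_ctx_ready/pm_ctx_ready and current_mem. A minimal sketch of the resulting read-modify-write pattern, using only the accessors introduced by this patch (the helper name patch_one_word is illustrative, not part of the change):

    static int patch_one_word(struct gk20a *g, struct mem_desc *mem,
                              u32 byte_offset, u32 clear_mask, u32 set_bits)
    {
            u32 v;
            int err;

            err = gk20a_mem_begin(g, mem);  /* vmap()s mem->pages on demand */
            if (err)
                    return err;

            v = gk20a_mem_rd(g, mem, byte_offset);   /* byte offset, 32-bit aligned */
            v = (v & ~clear_mask) | set_bits;
            gk20a_mem_wr(g, mem, byte_offset, v);

            gk20a_mem_end(g, mem);                   /* vunmap() and clear cpu_va */
            return 0;
    }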
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 6f6734b4..13382416 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -44,6 +44,112 @@
44#include "kind_gk20a.h" 44#include "kind_gk20a.h"
45#include "semaphore_gk20a.h" 45#include "semaphore_gk20a.h"
46 46
47int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem)
48{
49 void *cpu_va;
50
51 if (WARN_ON(mem->cpu_va)) {
52 gk20a_warn(dev_from_gk20a(g), "nested %s", __func__);
53 return -EBUSY;
54 }
55
56 cpu_va = vmap(mem->pages,
57 PAGE_ALIGN(mem->size) >> PAGE_SHIFT,
58 0, pgprot_writecombine(PAGE_KERNEL));
59
60 if (WARN_ON(!cpu_va))
61 return -ENOMEM;
62
63 mem->cpu_va = cpu_va;
64 return 0;
65}
66
67void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem)
68{
69 vunmap(mem->cpu_va);
70 mem->cpu_va = NULL;
71}
72
73u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w)
74{
75 u32 *ptr = mem->cpu_va;
76 u32 data;
77
78 WARN_ON(!ptr);
79 data = ptr[w];
80#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
81 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data);
82#endif
83 return data;
84}
85
86u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset)
87{
88 WARN_ON(offset & 3);
89 return gk20a_mem_rd32(g, mem, offset / sizeof(u32));
90}
91
92void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem,
93 u32 offset, void *dest, u32 size)
94{
95 u32 i;
96 u32 *dest_u32 = dest;
97
98 WARN_ON(offset & 3);
99 WARN_ON(size & 3);
100 offset /= sizeof(u32);
101 size /= sizeof(u32);
102
103 for (i = 0; i < size; i++)
104 dest_u32[i] = gk20a_mem_rd32(g, mem, offset + i);
105}
106
107void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data)
108{
109 u32 *ptr = mem->cpu_va;
110
111 WARN_ON(!ptr);
112#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
113 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data);
114#endif
115 ptr[w] = data;
116}
117
118void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data)
119{
120 WARN_ON(offset & 3);
121 gk20a_mem_wr32(g, mem, offset / sizeof(u32), data);
122}
123
124void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
125 void *src, u32 size)
126{
127 u32 i;
128 u32 *src_u32 = src;
129
130 WARN_ON(offset & 3);
131 WARN_ON(size & 3);
132 offset /= sizeof(u32);
133 size /= sizeof(u32);
134
135 for (i = 0; i < size; i++)
136 gk20a_mem_wr32(g, mem, offset + i, src_u32[i]);
137}
138
139void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
140 u32 value, u32 size)
141{
142 u32 i;
143
144 WARN_ON(offset & 3);
145 WARN_ON(size & 3);
146 offset /= sizeof(u32);
147 size /= sizeof(u32);
148
149 for (i = 0; i < size; i++)
150 gk20a_mem_wr32(g, mem, offset + i, value);
151}
152
47/* 153/*
48 * GPU mapping life cycle 154 * GPU mapping life cycle
49 * ====================== 155 * ======================
@@ -780,9 +886,14 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm,
780 *pde_lo, *pde_hi); 886 *pde_lo, *pde_hi);
781} 887}
782 888
783u32 *pde_from_index(struct vm_gk20a *vm, u32 i) 889static u32 pde_from_index(u32 i)
890{
891 return i * gmmu_pde__size_v() / sizeof(u32);
892}
893
894static u32 pte_from_index(u32 i)
784{ 895{
785 return (u32 *) (((u8 *)vm->pdb.mem.cpu_va) + i*gmmu_pde__size_v()); 896 return i * gmmu_pte__size_v() / sizeof(u32);
786} 897}
787 898
788u32 pte_index_from_vaddr(struct vm_gk20a *vm, 899u32 pte_index_from_vaddr(struct vm_gk20a *vm,
@@ -2323,7 +2434,7 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm,
2323 u64 pte_addr_small = 0, pte_addr_big = 0; 2434 u64 pte_addr_small = 0, pte_addr_big = 0;
2324 struct gk20a_mm_entry *entry = vm->pdb.entries + i; 2435 struct gk20a_mm_entry *entry = vm->pdb.entries + i;
2325 u32 pde_v[2] = {0, 0}; 2436 u32 pde_v[2] = {0, 0};
2326 u32 *pde; 2437 u32 pde;
2327 2438
2328 gk20a_dbg_fn(""); 2439 gk20a_dbg_fn("");
2329 2440
@@ -2348,10 +2459,10 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm,
2348 (big_valid ? (gmmu_pde_vol_big_true_f()) : 2459 (big_valid ? (gmmu_pde_vol_big_true_f()) :
2349 gmmu_pde_vol_big_false_f()); 2460 gmmu_pde_vol_big_false_f());
2350 2461
2351 pde = pde_from_index(vm, i); 2462 pde = pde_from_index(i);
2352 2463
2353 gk20a_mem_wr32(pde, 0, pde_v[0]); 2464 gk20a_mem_wr32(g, &vm->pdb.mem, pde + 0, pde_v[0]);
2354 gk20a_mem_wr32(pde, 1, pde_v[1]); 2465 gk20a_mem_wr32(g, &vm->pdb.mem, pde + 1, pde_v[1]);
2355 2466
2356 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", 2467 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
2357 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); 2468 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
@@ -2432,8 +2543,8 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm,
2432 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); 2543 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
2433 } 2544 }
2434 2545
2435 gk20a_mem_wr32(pte->mem.cpu_va + i*8, 0, pte_w[0]); 2546 gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 0, pte_w[0]);
2436 gk20a_mem_wr32(pte->mem.cpu_va + i*8, 1, pte_w[1]); 2547 gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 1, pte_w[1]);
2437 2548
2438 if (*iova) { 2549 if (*iova) {
2439 *iova += page_size; 2550 *iova += page_size;
@@ -3489,19 +3600,19 @@ static int gk20a_init_cde_vm(struct mm_gk20a *mm)
3489 false, false, "cde"); 3600 false, false, "cde");
3490} 3601}
3491 3602
3492void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr) 3603void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr)
3493{ 3604{
3494 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); 3605 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
3495 u32 pdb_addr_hi = u64_hi32(pdb_addr); 3606 u32 pdb_addr_hi = u64_hi32(pdb_addr);
3496 3607
3497 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(), 3608 gk20a_mem_wr32(g, mem, ram_in_page_dir_base_lo_w(),
3498 (g->mm.vidmem_is_vidmem ? 3609 (g->mm.vidmem_is_vidmem ?
3499 ram_in_page_dir_base_target_sys_mem_ncoh_f() : 3610 ram_in_page_dir_base_target_sys_mem_ncoh_f() :
3500 ram_in_page_dir_base_target_vid_mem_f()) | 3611 ram_in_page_dir_base_target_vid_mem_f()) |
3501 ram_in_page_dir_base_vol_true_f() | 3612 ram_in_page_dir_base_vol_true_f() |
3502 ram_in_page_dir_base_lo_f(pdb_addr_lo)); 3613 ram_in_page_dir_base_lo_f(pdb_addr_lo));
3503 3614
3504 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(), 3615 gk20a_mem_wr32(g, mem, ram_in_page_dir_base_hi_w(),
3505 ram_in_page_dir_base_hi_f(pdb_addr_hi)); 3616 ram_in_page_dir_base_hi_f(pdb_addr_hi));
3506} 3617}
3507 3618
@@ -3510,23 +3621,22 @@ void gk20a_init_inst_block(struct mem_desc *inst_block, struct vm_gk20a *vm,
3510{ 3621{
3511 struct gk20a *g = gk20a_from_vm(vm); 3622 struct gk20a *g = gk20a_from_vm(vm);
3512 u64 pde_addr = g->ops.mm.get_iova_addr(g, vm->pdb.mem.sgt->sgl, 0); 3623 u64 pde_addr = g->ops.mm.get_iova_addr(g, vm->pdb.mem.sgt->sgl, 0);
3513 void *inst_ptr = inst_block->cpu_va;
3514 3624
3515 gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p", 3625 gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p",
3516 gk20a_mm_inst_block_addr(g, inst_block), inst_ptr); 3626 gk20a_mm_inst_block_addr(g, inst_block), inst_block->cpu_va);
3517 3627
3518 gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr); 3628 gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr);
3519 3629
3520 g->ops.mm.init_pdb(g, inst_ptr, pde_addr); 3630 g->ops.mm.init_pdb(g, inst_block, pde_addr);
3521 3631
3522 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(), 3632 gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(),
3523 u64_lo32(vm->va_limit - 1) & ~0xfff); 3633 u64_lo32(vm->va_limit - 1) & ~0xfff);
3524 3634
3525 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(), 3635 gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(),
3526 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1))); 3636 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1)));
3527 3637
3528 if (big_page_size && g->ops.mm.set_big_page_size) 3638 if (big_page_size && g->ops.mm.set_big_page_size)
3529 g->ops.mm.set_big_page_size(g, inst_ptr, big_page_size); 3639 g->ops.mm.set_big_page_size(g, inst_block, big_page_size);
3530} 3640}
3531 3641
3532int gk20a_mm_fb_flush(struct gk20a *g) 3642int gk20a_mm_fb_flush(struct gk20a *g)
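The new pde_from_index()/pte_from_index() helpers above return word indices rather than CPU pointers, so their result can be fed straight to gk20a_mem_wr32() on vm->pdb.mem or pte->mem. A worked example of the arithmetic, assuming (as the replaced i*8 addressing suggests) that a PDE is two 32-bit words:

    /* Assuming gmmu_pde__size_v() == 8: PDE i occupies words 2*i and 2*i+1. */
    u32 pde = pde_from_index(5);                         /* 5 * 8 / sizeof(u32) == 10 */

    gk20a_mem_wr32(g, &vm->pdb.mem, pde + 0, pde_v[0]);  /* word 10, byte offset 40 */
    gk20a_mem_wr32(g, &vm->pdb.mem, pde + 1, pde_v[1]);  /* word 11, byte offset 44 */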
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 7fa0b7fb..e9ac8f18 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -419,6 +419,34 @@ static inline enum gmmu_pgsz_gk20a __get_pte_size(struct vm_gk20a *vm,
419 return gmmu_page_size_small; 419 return gmmu_page_size_small;
420} 420}
421 421
422/*
423 * Buffer accessors - wrap between begin() and end() if there is no permanent
424 * kernel mapping for this buffer.
425 */
426
427int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem);
428/* nop for null mem, like with free() or vunmap() */
429void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem);
430
431/* word-indexed offset */
432u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w);
433/* byte offset (32b-aligned) */
434u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset);
435/* memcpy to cpu, offset and size in bytes (32b-aligned) */
436void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
437 void *dest, u32 size);
438
439/* word-indexed offset */
440void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data);
441/* byte offset (32b-aligned) */
442void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data);
443/* memcpy from cpu, offset and size in bytes (32b-aligned) */
444void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
445 void *src, u32 size);
446/* size and offset in bytes (32b-aligned), filled with u32s */
447void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
448 u32 value, u32 size);
449
422#if 0 /*related to addr bits above, concern below TBD on which is accurate */ 450#if 0 /*related to addr bits above, concern below TBD on which is accurate */
423#define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\ 451#define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\
424 bus_bar1_block_ptr_s()) 452 bus_bar1_block_ptr_s())
@@ -673,7 +701,6 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm,
673 u64 addr_lo, u64 addr_hi, 701 u64 addr_lo, u64 addr_hi,
674 u32 *pde_lo, u32 *pde_hi); 702 u32 *pde_lo, u32 *pde_hi);
675int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm); 703int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm);
676u32 *pde_from_index(struct vm_gk20a *vm, u32 i);
677u32 pte_index_from_vaddr(struct vm_gk20a *vm, 704u32 pte_index_from_vaddr(struct vm_gk20a *vm,
678 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx); 705 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx);
679void free_gmmu_pages(struct vm_gk20a *vm, 706void free_gmmu_pages(struct vm_gk20a *vm,
@@ -685,7 +712,7 @@ struct gpu_ops;
685void gk20a_init_mm(struct gpu_ops *gops); 712void gk20a_init_mm(struct gpu_ops *gops);
686const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g, 713const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
687 u32 big_page_size); 714 u32 big_page_size);
688void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr); 715void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr);
689 716
690void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block); 717void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block);
691 718
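The declarations above establish two addressing conventions: the *32 variants take a word index, while the others take byte offsets and sizes that must be multiples of four (the implementations WARN_ON anything misaligned). A short usage sketch, assuming mem is large enough for the offsets used and src/len are a caller-provided staging buffer:

    static int mem_accessor_demo(struct gk20a *g, struct mem_desc *mem,
                                 void *src, u32 len)
    {
            int err = gk20a_mem_begin(g, mem);   /* temporary CPU mapping */

            if (err)
                    return err;

            /* byte offset 0x10 and word index 4 name the same 32-bit cell */
            gk20a_mem_wr(g, mem, 0x10, 0xcafe0000);
            WARN_ON(gk20a_mem_rd32(g, mem, 0x10 / 4) != 0xcafe0000);

            gk20a_memset(g, mem, 0x20, 0, 0x40);       /* 0x40 bytes of zeroes at 0x20 */
            gk20a_mem_wr_n(g, mem, 0x100, src, len);   /* bulk copy, len multiple of 4 */

            gk20a_mem_end(g, mem);
            return 0;
    }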
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index 56ad0c2a..54b2eef4 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -2421,11 +2421,10 @@ static int gk20a_init_pmu_reset_enable_hw(struct gk20a *g)
2421static int gk20a_prepare_ucode(struct gk20a *g) 2421static int gk20a_prepare_ucode(struct gk20a *g)
2422{ 2422{
2423 struct pmu_gk20a *pmu = &g->pmu; 2423 struct pmu_gk20a *pmu = &g->pmu;
2424 int i, err = 0; 2424 int err = 0;
2425 struct device *d = dev_from_gk20a(g); 2425 struct device *d = dev_from_gk20a(g);
2426 struct mm_gk20a *mm = &g->mm; 2426 struct mm_gk20a *mm = &g->mm;
2427 struct vm_gk20a *vm = &mm->pmu.vm; 2427 struct vm_gk20a *vm = &mm->pmu.vm;
2428 void *ucode_ptr;
2429 2428
2430 if (g->pmu_fw) { 2429 if (g->pmu_fw) {
2431 gk20a_init_pmu(pmu); 2430 gk20a_init_pmu(pmu);
@@ -2449,11 +2448,8 @@ static int gk20a_prepare_ucode(struct gk20a *g)
2449 if (err) 2448 if (err)
2450 goto err_release_fw; 2449 goto err_release_fw;
2451 2450
2452 ucode_ptr = pmu->ucode.cpu_va; 2451 gk20a_mem_wr_n(g, &pmu->ucode, 0, pmu->ucode_image,
2453 2452 pmu->desc->app_start_offset + pmu->desc->app_size);
2454 for (i = 0; i < (pmu->desc->app_start_offset +
2455 pmu->desc->app_size) >> 2; i++)
2456 gk20a_mem_wr32(ucode_ptr, i, pmu->ucode_image[i]);
2457 2453
2458 gk20a_init_pmu(pmu); 2454 gk20a_init_pmu(pmu);
2459 2455
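The ucode upload above collapses a word-by-word copy loop into one bulk write. Expressed against the new accessors for comparison, the two forms below are equivalent whenever the byte count is a multiple of four (gk20a_mem_wr_n() performs the copy as 32-bit writes and WARN_ONs otherwise); nbytes stands for app_start_offset + app_size as in the hunk:

    u32 nbytes = pmu->desc->app_start_offset + pmu->desc->app_size;
    u32 i;

    /* word-at-a-time, word-indexed */
    for (i = 0; i < nbytes >> 2; i++)
            gk20a_mem_wr32(g, &pmu->ucode, i, pmu->ucode_image[i]);

    /* single call: byte offset 0, nbytes bytes from the CPU-side image */
    gk20a_mem_wr_n(g, &pmu->ucode, 0, pmu->ucode_image, nbytes);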
diff --git a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
index 0e6e715d..3ac2cec8 100644
--- a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
@@ -43,8 +43,8 @@ static int lsfm_add_ucode_img(struct gk20a *g, struct ls_flcn_mgr *plsfm,
43static void lsfm_free_ucode_img_res(struct flcn_ucode_img *p_img); 43static void lsfm_free_ucode_img_res(struct flcn_ucode_img *p_img);
44static void lsfm_free_nonpmu_ucode_img_res(struct flcn_ucode_img *p_img); 44static void lsfm_free_nonpmu_ucode_img_res(struct flcn_ucode_img *p_img);
45static int lsf_gen_wpr_requirements(struct gk20a *g, struct ls_flcn_mgr *plsfm); 45static int lsf_gen_wpr_requirements(struct gk20a *g, struct ls_flcn_mgr *plsfm);
46static int lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, 46static void lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm,
47 void *nonwpr_addr); 47 struct mem_desc *nonwpr);
48static int acr_ucode_patch_sig(struct gk20a *g, 48static int acr_ucode_patch_sig(struct gk20a *g,
49 unsigned int *p_img, 49 unsigned int *p_img,
50 unsigned int *p_prod_sig, 50 unsigned int *p_prod_sig,
@@ -355,7 +355,7 @@ int prepare_ucode_blob(struct gk20a *g)
355 355
356 gm20b_dbg_pmu("managed LS falcon %d, WPR size %d bytes.\n", 356 gm20b_dbg_pmu("managed LS falcon %d, WPR size %d bytes.\n",
357 plsfm->managed_flcn_cnt, plsfm->wpr_size); 357 plsfm->managed_flcn_cnt, plsfm->wpr_size);
358 lsfm_init_wpr_contents(g, plsfm, g->acr.ucode_blob.cpu_va); 358 lsfm_init_wpr_contents(g, plsfm, &g->acr.ucode_blob);
359 } else { 359 } else {
360 gm20b_dbg_pmu("LSFM is managing no falcons.\n"); 360 gm20b_dbg_pmu("LSFM is managing no falcons.\n");
361 } 361 }
@@ -613,120 +613,91 @@ static int lsfm_fill_flcn_bl_gen_desc(struct gk20a *g,
613} 613}
614 614
615/* Initialize WPR contents */ 615/* Initialize WPR contents */
616static int lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, 616static void lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm,
617 void *nonwpr_addr) 617 struct mem_desc *ucode)
618{ 618{
619 struct lsfm_managed_ucode_img *pnode = plsfm->ucode_img_list;
620 u32 i;
619 621
620 int status = 0; 622 /* The WPR array is at the base of the WPR */
621 union flcn_bl_generic_desc *nonwpr_bl_gen_desc; 623 pnode = plsfm->ucode_img_list;
622 if (nonwpr_addr == NULL) { 624 i = 0;
623 status = -ENOMEM;
624 } else {
625 struct lsfm_managed_ucode_img *pnode = plsfm->ucode_img_list;
626 struct lsf_wpr_header *wpr_hdr;
627 struct lsf_lsb_header *lsb_hdr;
628 void *ucode_off;
629 u32 i;
630
631 /* The WPR array is at the base of the WPR */
632 wpr_hdr = (struct lsf_wpr_header *)nonwpr_addr;
633 pnode = plsfm->ucode_img_list;
634 i = 0;
635 625
636 /* 626 /*
637 * Walk the managed falcons, flush WPR and LSB headers to FB. 627 * Walk the managed falcons, flush WPR and LSB headers to FB.
638 * flush any bl args to the storage area relative to the 628 * flush any bl args to the storage area relative to the
639 * ucode image (appended on the end as a DMEM area). 629 * ucode image (appended on the end as a DMEM area).
640 */ 630 */
641 while (pnode) { 631 while (pnode) {
642 /* Flush WPR header to memory*/ 632 /* Flush WPR header to memory*/
643 memcpy(&wpr_hdr[i], &pnode->wpr_header, 633 gk20a_mem_wr_n(g, ucode, i * sizeof(pnode->wpr_header),
644 sizeof(struct lsf_wpr_header)); 634 &pnode->wpr_header, sizeof(pnode->wpr_header));
645 gm20b_dbg_pmu("wpr header as in memory and pnode\n"); 635
646 gm20b_dbg_pmu("falconid :%d %d\n", 636 gm20b_dbg_pmu("wpr header");
647 pnode->wpr_header.falcon_id, 637 gm20b_dbg_pmu("falconid :%d",
648 wpr_hdr[i].falcon_id); 638 pnode->wpr_header.falcon_id);
649 gm20b_dbg_pmu("lsb_offset :%x %x\n", 639 gm20b_dbg_pmu("lsb_offset :%x",
650 pnode->wpr_header.lsb_offset, 640 pnode->wpr_header.lsb_offset);
651 wpr_hdr[i].lsb_offset); 641 gm20b_dbg_pmu("bootstrap_owner :%d",
652 gm20b_dbg_pmu("bootstrap_owner :%d %d\n", 642 pnode->wpr_header.bootstrap_owner);
653 pnode->wpr_header.bootstrap_owner, 643 gm20b_dbg_pmu("lazy_bootstrap :%d",
654 wpr_hdr[i].bootstrap_owner); 644 pnode->wpr_header.lazy_bootstrap);
655 gm20b_dbg_pmu("lazy_bootstrap :%d %d\n", 645 gm20b_dbg_pmu("status :%d",
656 pnode->wpr_header.lazy_bootstrap, 646 pnode->wpr_header.status);
657 wpr_hdr[i].lazy_bootstrap); 647
658 gm20b_dbg_pmu("status :%d %d\n", 648 /*Flush LSB header to memory*/
659 pnode->wpr_header.status, wpr_hdr[i].status); 649 gk20a_mem_wr_n(g, ucode, pnode->wpr_header.lsb_offset,
660 650 &pnode->lsb_header, sizeof(pnode->lsb_header));
661 /*Flush LSB header to memory*/ 651
662 lsb_hdr = (struct lsf_lsb_header *)((u8 *)nonwpr_addr + 652 gm20b_dbg_pmu("lsb header");
663 pnode->wpr_header.lsb_offset); 653 gm20b_dbg_pmu("ucode_off :%x",
664 memcpy(lsb_hdr, &pnode->lsb_header, 654 pnode->lsb_header.ucode_off);
665 sizeof(struct lsf_lsb_header)); 655 gm20b_dbg_pmu("ucode_size :%x",
666 gm20b_dbg_pmu("lsb header as in memory and pnode\n"); 656 pnode->lsb_header.ucode_size);
667 gm20b_dbg_pmu("ucode_off :%x %x\n", 657 gm20b_dbg_pmu("data_size :%x",
668 pnode->lsb_header.ucode_off, 658 pnode->lsb_header.data_size);
669 lsb_hdr->ucode_off); 659 gm20b_dbg_pmu("bl_code_size :%x",
670 gm20b_dbg_pmu("ucode_size :%x %x\n", 660 pnode->lsb_header.bl_code_size);
671 pnode->lsb_header.ucode_size, 661 gm20b_dbg_pmu("bl_imem_off :%x",
672 lsb_hdr->ucode_size); 662 pnode->lsb_header.bl_imem_off);
673 gm20b_dbg_pmu("data_size :%x %x\n", 663 gm20b_dbg_pmu("bl_data_off :%x",
674 pnode->lsb_header.data_size, 664 pnode->lsb_header.bl_data_off);
675 lsb_hdr->data_size); 665 gm20b_dbg_pmu("bl_data_size :%x",
676 gm20b_dbg_pmu("bl_code_size :%x %x\n", 666 pnode->lsb_header.bl_data_size);
677 pnode->lsb_header.bl_code_size, 667 gm20b_dbg_pmu("app_code_off :%x",
678 lsb_hdr->bl_code_size); 668 pnode->lsb_header.app_code_off);
679 gm20b_dbg_pmu("bl_imem_off :%x %x\n", 669 gm20b_dbg_pmu("app_code_size :%x",
680 pnode->lsb_header.bl_imem_off, 670 pnode->lsb_header.app_code_size);
681 lsb_hdr->bl_imem_off); 671 gm20b_dbg_pmu("app_data_off :%x",
682 gm20b_dbg_pmu("bl_data_off :%x %x\n", 672 pnode->lsb_header.app_data_off);
683 pnode->lsb_header.bl_data_off, 673 gm20b_dbg_pmu("app_data_size :%x",
684 lsb_hdr->bl_data_off); 674 pnode->lsb_header.app_data_size);
685 gm20b_dbg_pmu("bl_data_size :%x %x\n", 675 gm20b_dbg_pmu("flags :%x",
686 pnode->lsb_header.bl_data_size, 676 pnode->lsb_header.flags);
687 lsb_hdr->bl_data_size); 677
688 gm20b_dbg_pmu("app_code_off :%x %x\n", 678 /*If this falcon has a boot loader and related args,
689 pnode->lsb_header.app_code_off, 679 * flush them.*/
690 lsb_hdr->app_code_off); 680 if (!pnode->ucode_img.header) {
691 gm20b_dbg_pmu("app_code_size :%x %x\n", 681 /*Populate gen bl and flush to memory*/
692 pnode->lsb_header.app_code_size, 682 lsfm_fill_flcn_bl_gen_desc(g, pnode);
693 lsb_hdr->app_code_size); 683 gk20a_mem_wr_n(g, ucode,
694 gm20b_dbg_pmu("app_data_off :%x %x\n", 684 pnode->lsb_header.bl_data_off,
695 pnode->lsb_header.app_data_off, 685 &pnode->bl_gen_desc,
696 lsb_hdr->app_data_off);
697 gm20b_dbg_pmu("app_data_size :%x %x\n",
698 pnode->lsb_header.app_data_size,
699 lsb_hdr->app_data_size);
700 gm20b_dbg_pmu("flags :%x %x\n",
701 pnode->lsb_header.flags, lsb_hdr->flags);
702
703 /*If this falcon has a boot loader and related args,
704 * flush them.*/
705 if (!pnode->ucode_img.header) {
706 nonwpr_bl_gen_desc =
707 (union flcn_bl_generic_desc *)
708 ((u8 *)nonwpr_addr +
709 pnode->lsb_header.bl_data_off);
710
711 /*Populate gen bl and flush to memory*/
712 lsfm_fill_flcn_bl_gen_desc(g, pnode);
713 memcpy(nonwpr_bl_gen_desc, &pnode->bl_gen_desc,
714 pnode->bl_gen_desc_size); 686 pnode->bl_gen_desc_size);
715 }
716 ucode_off = (void *)(pnode->lsb_header.ucode_off +
717 (u8 *)nonwpr_addr);
718 /*Copying of ucode*/
719 memcpy(ucode_off, pnode->ucode_img.data,
720 pnode->ucode_img.data_size);
721 pnode = pnode->next;
722 i++;
723 } 687 }
724 688 /*Copying of ucode*/
725 /* Tag the terminator WPR header with an invalid falcon ID. */ 689 gk20a_mem_wr_n(g, ucode, pnode->lsb_header.ucode_off,
726 gk20a_mem_wr32(&wpr_hdr[plsfm->managed_flcn_cnt].falcon_id, 690 pnode->ucode_img.data,
727 0, LSF_FALCON_ID_INVALID); 691 pnode->ucode_img.data_size);
692 pnode = pnode->next;
693 i++;
728 } 694 }
729 return status; 695
696 /* Tag the terminator WPR header with an invalid falcon ID. */
697 gk20a_mem_wr32(g, ucode,
698 plsfm->managed_flcn_cnt * sizeof(struct lsf_wpr_header) +
699 offsetof(struct lsf_wpr_header, falcon_id),
700 LSF_FALCON_ID_INVALID);
730} 701}
731 702
732/*! 703/*!
@@ -1000,7 +971,7 @@ int gm20b_bootstrap_hs_flcn(struct gk20a *g)
1000{ 971{
1001 struct mm_gk20a *mm = &g->mm; 972 struct mm_gk20a *mm = &g->mm;
1002 struct vm_gk20a *vm = &mm->pmu.vm; 973 struct vm_gk20a *vm = &mm->pmu.vm;
1003 int i, err = 0; 974 int err = 0;
1004 u64 *acr_dmem; 975 u64 *acr_dmem;
1005 u32 img_size_in_bytes = 0; 976 u32 img_size_in_bytes = 0;
1006 u32 status, size; 977 u32 status, size;
@@ -1066,10 +1037,8 @@ int gm20b_bootstrap_hs_flcn(struct gk20a *g)
1066 ((struct flcn_acr_desc *)acr_dmem)->regions.no_regions = 2; 1037 ((struct flcn_acr_desc *)acr_dmem)->regions.no_regions = 2;
1067 ((struct flcn_acr_desc *)acr_dmem)->wpr_offset = 0; 1038 ((struct flcn_acr_desc *)acr_dmem)->wpr_offset = 0;
1068 1039
1069 for (i = 0; i < (img_size_in_bytes/4); i++) { 1040 gk20a_mem_wr_n(g, &acr->acr_ucode, 0,
1070 gk20a_mem_wr32(acr->acr_ucode.cpu_va, i, 1041 acr_ucode_data_t210_load, img_size_in_bytes);
1071 acr_ucode_data_t210_load[i]);
1072 }
1073 /* 1042 /*
1074 * In order to execute this binary, we will be using 1043 * In order to execute this binary, we will be using
1075 * a bootloader which will load this image into PMU IMEM/DMEM. 1044 * a bootloader which will load this image into PMU IMEM/DMEM.
@@ -1323,7 +1292,7 @@ int pmu_exec_gen_bl(struct gk20a *g, void *desc, u8 b_wait_for_halt)
1323 struct mm_gk20a *mm = &g->mm; 1292 struct mm_gk20a *mm = &g->mm;
1324 struct vm_gk20a *vm = &mm->pmu.vm; 1293 struct vm_gk20a *vm = &mm->pmu.vm;
1325 struct device *d = dev_from_gk20a(g); 1294 struct device *d = dev_from_gk20a(g);
1326 int i, err = 0; 1295 int err = 0;
1327 u32 bl_sz; 1296 u32 bl_sz;
1328 struct acr_gm20b *acr = &g->acr; 1297 struct acr_gm20b *acr = &g->acr;
1329 const struct firmware *hsbl_fw = acr->hsbl_fw; 1298 const struct firmware *hsbl_fw = acr->hsbl_fw;
@@ -1369,8 +1338,7 @@ int pmu_exec_gen_bl(struct gk20a *g, void *desc, u8 b_wait_for_halt)
1369 goto err_free_ucode; 1338 goto err_free_ucode;
1370 } 1339 }
1371 1340
1372 for (i = 0; i < (bl_sz) >> 2; i++) 1341 gk20a_mem_wr_n(g, &acr->hsbl_ucode, 0, pmu_bl_gm10x, bl_sz);
1373 gk20a_mem_wr32(acr->hsbl_ucode.cpu_va, i, pmu_bl_gm10x[i]);
1374 gm20b_dbg_pmu("Copied bl ucode to bl_cpuva\n"); 1342 gm20b_dbg_pmu("Copied bl ucode to bl_cpuva\n");
1375 } 1343 }
1376 /* 1344 /*
diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
index b9a1e685..2197bae5 100644
--- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c
@@ -849,7 +849,7 @@ static int gr_gm20b_alloc_gr_ctx(struct gk20a *g,
849 849
850static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g, 850static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g,
851 struct channel_ctx_gk20a *ch_ctx, 851 struct channel_ctx_gk20a *ch_ctx,
852 void *ctx_ptr) 852 struct mem_desc *mem)
853{ 853{
854 struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; 854 struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx;
855 u32 cta_preempt_option = 855 u32 cta_preempt_option =
@@ -859,7 +859,8 @@ static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g,
859 859
860 if (gr_ctx->compute_preempt_mode == NVGPU_COMPUTE_PREEMPTION_MODE_CTA) { 860 if (gr_ctx->compute_preempt_mode == NVGPU_COMPUTE_PREEMPTION_MODE_CTA) {
861 gk20a_dbg_info("CTA: %x", cta_preempt_option); 861 gk20a_dbg_info("CTA: %x", cta_preempt_option);
862 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_preemption_options_o(), 0, 862 gk20a_mem_wr(g, mem,
863 ctxsw_prog_main_image_preemption_options_o(),
863 cta_preempt_option); 864 cta_preempt_option);
864 } 865 }
865 866
@@ -1005,7 +1006,7 @@ static int gr_gm20b_update_pc_sampling(struct channel_gk20a *c,
1005 bool enable) 1006 bool enable)
1006{ 1007{
1007 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; 1008 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1008 void *ctx_ptr = NULL; 1009 struct mem_desc *mem;
1009 u32 v; 1010 u32 v;
1010 1011
1011 gk20a_dbg_fn(""); 1012 gk20a_dbg_fn("");
@@ -1013,18 +1014,17 @@ static int gr_gm20b_update_pc_sampling(struct channel_gk20a *c,
1013 if (!ch_ctx || !ch_ctx->gr_ctx || c->vpr) 1014 if (!ch_ctx || !ch_ctx->gr_ctx || c->vpr)
1014 return -EINVAL; 1015 return -EINVAL;
1015 1016
1016 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, 1017 mem = &ch_ctx->gr_ctx->mem;
1017 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 1018
1018 0, pgprot_writecombine(PAGE_KERNEL)); 1019 if (gk20a_mem_begin(c->g, mem))
1019 if (!ctx_ptr)
1020 return -ENOMEM; 1020 return -ENOMEM;
1021 1021
1022 v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); 1022 v = gk20a_mem_rd(c->g, mem, ctxsw_prog_main_image_pm_o());
1023 v &= ~ctxsw_prog_main_image_pm_pc_sampling_m(); 1023 v &= ~ctxsw_prog_main_image_pm_pc_sampling_m();
1024 v |= ctxsw_prog_main_image_pm_pc_sampling_f(enable); 1024 v |= ctxsw_prog_main_image_pm_pc_sampling_f(enable);
1025 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, v); 1025 gk20a_mem_wr(c->g, mem, ctxsw_prog_main_image_pm_o(), v);
1026 1026
1027 vunmap(ctx_ptr); 1027 gk20a_mem_end(c->g, mem);
1028 1028
1029 gk20a_dbg_fn("done"); 1029 gk20a_dbg_fn("done");
1030 1030
@@ -1089,13 +1089,13 @@ static void gr_gm20b_init_cyclestats(struct gk20a *g)
1089#endif 1089#endif
1090} 1090}
1091 1091
1092static void gr_gm20b_enable_cde_in_fecs(void *ctx_ptr) 1092static void gr_gm20b_enable_cde_in_fecs(struct gk20a *g, struct mem_desc *mem)
1093{ 1093{
1094 u32 cde_v; 1094 u32 cde_v;
1095 1095
1096 cde_v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_ctl_o(), 0); 1096 cde_v = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_ctl_o());
1097 cde_v |= ctxsw_prog_main_image_ctl_cde_enabled_f(); 1097 cde_v |= ctxsw_prog_main_image_ctl_cde_enabled_f();
1098 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_ctl_o(), 0, cde_v); 1098 gk20a_mem_wr(g, mem, ctxsw_prog_main_image_ctl_o(), cde_v);
1099} 1099}
1100 1100
1101static void gr_gm20b_bpt_reg_info(struct gk20a *g, struct warpstate *w_state) 1101static void gr_gm20b_bpt_reg_info(struct gk20a *g, struct warpstate *w_state)
diff --git a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
index ac73b5c8..726d73ed 100644
--- a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
@@ -106,14 +106,14 @@ static void gm20b_mm_mmu_set_debug_mode(struct gk20a *g, bool enable)
106} 106}
107 107
108static void gm20b_mm_set_big_page_size(struct gk20a *g, 108static void gm20b_mm_set_big_page_size(struct gk20a *g,
109 void *inst_ptr, int size) 109 struct mem_desc *mem, int size)
110{ 110{
111 u32 val; 111 u32 val;
112 112
113 gk20a_dbg_fn(""); 113 gk20a_dbg_fn("");
114 114
115 gk20a_dbg_info("big page size %d\n", size); 115 gk20a_dbg_info("big page size %d\n", size);
116 val = gk20a_mem_rd32(inst_ptr, ram_in_big_page_size_w()); 116 val = gk20a_mem_rd32(g, mem, ram_in_big_page_size_w());
117 val &= ~ram_in_big_page_size_m(); 117 val &= ~ram_in_big_page_size_m();
118 118
119 if (size == SZ_64K) 119 if (size == SZ_64K)
@@ -121,7 +121,7 @@ static void gm20b_mm_set_big_page_size(struct gk20a *g,
121 else 121 else
122 val |= ram_in_big_page_size_128kb_f(); 122 val |= ram_in_big_page_size_128kb_f();
123 123
124 gk20a_mem_wr32(inst_ptr, ram_in_big_page_size_w(), val); 124 gk20a_mem_wr32(g, mem, ram_in_big_page_size_w(), val);
125 gk20a_dbg_fn("done"); 125 gk20a_dbg_fn("done");
126} 126}
127 127
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index 66b5e410..d1cba979 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -285,8 +285,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
285 mutex_init(&f->free_chs_mutex); 285 mutex_init(&f->free_chs_mutex);
286 286
287 for (chid = 0; chid < f->num_channels; chid++) { 287 for (chid = 0; chid < f->num_channels; chid++) {
288 f->channel[chid].userd_cpu_va =
289 f->userd.cpu_va + chid * f->userd_entry_size;
290 f->channel[chid].userd_iova = 288 f->channel[chid].userd_iova =
291 g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) 289 g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0)
292 + chid * f->userd_entry_size; 290 + chid * f->userd_entry_size;