author     Arto Merilainen <amerilainen@nvidia.com>    2014-03-19 03:38:25 -0400
committer  Dan Willemsen <dwillemsen@nvidia.com>       2015-03-18 15:08:53 -0400
commit     a9785995d5f22aaeb659285f8aeb64d8b56982e0 (patch)
tree       cc75f75bcf43db316a002a7a240b81f299bf6d7f
parent     61efaf843c22b85424036ec98015121c08f5f16c (diff)
gpu: nvgpu: Add NVIDIA GPU Driver
This patch moves the NVIDIA GPU driver to a new location.

Bug 1482562

Change-Id: I24293810b9d0f1504fd9be00135e21dad656ccb6
Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Reviewed-on: http://git-master/r/383722
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
-rw-r--r--  drivers/gpu/nvgpu/Kconfig  60
-rw-r--r--  drivers/gpu/nvgpu/gk20a/Makefile  36
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.c  293
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.h  50
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c  2111
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h  172
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c  356
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h  102
-rw-r--r--  drivers/gpu/nvgpu/gk20a/clk_gk20a.c  865
-rw-r--r--  drivers/gpu/nvgpu/gk20a/clk_gk20a.h  94
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c  240
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h  28
-rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c  699
-rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h  83
-rw-r--r--  drivers/gpu/nvgpu/gk20a/debug_gk20a.c  295
-rw-r--r--  drivers/gpu/nvgpu/gk20a/debug_gk20a.h  25
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fb_gk20a.c  37
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fb_gk20a.h  21
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c  1836
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h  164
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c  1681
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h  559
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_allocator.c  1247
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_allocator.h  177
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c  374
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h  39
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_scale.c  358
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_scale.h  51
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c  335
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c  333
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h  149
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c  256
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c  6747
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.h  406
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h  179
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal.c  33
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal.h  25
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal_gk20a.c  50
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal_gk20a.h  28
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h  105
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h  113
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h  85
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h  245
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h  213
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h  565
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h  141
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h  1141
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h  3173
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h  221
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h  253
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h  469
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h  137
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h  226
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h  226
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h  69
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h  141
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h  737
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h  389
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h  2150
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h  225
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h  101
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h  137
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h  301
-rw-r--r--  drivers/gpu/nvgpu/gk20a/kind_gk20a.c  424
-rw-r--r--  drivers/gpu/nvgpu/gk20a/kind_gk20a.h  67
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_common.c  243
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_gk20a.c  203
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_gk20a.h  21
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c  2984
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h  464
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a.h  160
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c  35
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c  561
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.c  3796
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.h  1097
-rw-r--r--  drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c  91
-rw-r--r--  drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h  27
-rw-r--r--  drivers/gpu/nvgpu/gk20a/regops_gk20a.c  704
-rw-r--r--  drivers/gpu/nvgpu/gk20a/regops_gk20a.h  47
-rw-r--r--  drivers/gpu/nvgpu/gk20a/sim_gk20a.h  62
-rw-r--r--  drivers/gpu/nvgpu/gk20a/therm_gk20a.c  142
-rw-r--r--  drivers/gpu/nvgpu/gk20a/therm_gk20a.h  33
82 files changed, 43318 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
new file mode 100644
index 00000000..160ec8be
--- /dev/null
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -0,0 +1,60 @@
1config GK20A
2 bool "Nvidia GK20A GPU support"
3 help
4 Choose this option if you have an SoC with integrated
5 Nvidia GPU IP.
6
7config GK20A_DEFAULT_TIMEOUT
8 depends on GK20A
9 int "Default timeout for submits"
10 default 10000
11 help
12 Default timeout for jobs in milliseconds. Set to zero for no timeout.
13
14config GK20A_PMU
15 bool "Support GK20A PMU"
16 depends on GK20A
17 default n
18 help
19 Say Y here to enable GK20A PMU features.
20
21choice
22 depends on GK20A
23 prompt "Enable GK20A frequency scaling"
24 default GK20A_PERFMON
25 optional
26 help
27 Select this entry to enable gk20a frequency scaling.
28
29config GK20A_PERFMON
30 bool "Use Perfmon"
31 help
32 Select this to enable built-in perfmon scaling.
33 The built-in scaling option uses a simplistic
34 scaling mechanism (increase the frequency when busy,
35 decrease it when idle).
36
37config GK20A_DEVFREQ
38 bool "Use Devfreq"
39 help
40 Select this to use devfreq-based scaling.
41 Devfreq is a common framework that allows using
42 a variety of different governors and changing
43 between governors on the fly. By default, no
44 governor is selected.
45
46endchoice
47
48config GK20A_CYCLE_STATS
49 bool "Support GK20A GPU CYCLE STATS"
50 depends on GK20A
51 default y
52 help
53 Say Y here to enable the cycle stats debugging features.
54
55config GK20A_PHYS_PAGE_TABLES
56 bool "Use physical addressing for gk20a page tables"
57 default y if TEGRA_SIMULATION_PLATFORM
58 help
59 Use physical addressing for gk20a page tables. If this is off, we
60 use SMMU translation.
diff --git a/drivers/gpu/nvgpu/gk20a/Makefile b/drivers/gpu/nvgpu/gk20a/Makefile
new file mode 100644
index 00000000..f9b06b72
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/Makefile
@@ -0,0 +1,36 @@
1
2GCOV_PROFILE := y
3ccflags-y += -Idrivers/devfreq
4ccflags-y += -Wno-multichar
5ccflags-y += -Werror
6
7obj-$(CONFIG_GK20A) += \
8 gk20a.o \
9 as_gk20a.o \
10 ctrl_gk20a.o \
11 fifo_gk20a.o \
12 channel_gk20a.o \
13 channel_sync_gk20a.o \
14 debug_gk20a.o \
15 dbg_gpu_gk20a.o \
16 regops_gk20a.o \
17 gr_gk20a.o \
18 kind_gk20a.o \
19 mm_gk20a.o \
20 pmu_gk20a.o \
21 priv_ring_gk20a.o \
22 clk_gk20a.o \
23 therm_gk20a.o \
24 gr_ctx_gk20a_sim.o \
25 gr_ctx_gk20a.o \
26 gk20a_gating_reglist.o \
27 gk20a_scale.o \
28 gk20a_sysfs.o \
29 ltc_gk20a.o \
30 fb_gk20a.o \
31 hal.o \
32 hal_gk20a.o \
33 gk20a_allocator.o
34
35obj-$(CONFIG_GK20A) += platform_gk20a_generic.o
36obj-$(CONFIG_TEGRA_GK20A) += platform_gk20a_tegra.o
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
new file mode 100644
index 00000000..65c26938
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -0,0 +1,293 @@
1/*
2 * drivers/video/tegra/host/gk20a/as_gk20a.c
3 *
4 * GK20A Address Spaces
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17
18#include <linux/slab.h>
19#include <linux/fs.h>
20#include <linux/cdev.h>
21#include <linux/uaccess.h>
22
23#include <trace/events/gk20a.h>
24
25#include "gk20a.h"
26
27/* dumb allocator... */
28static int generate_as_share_id(struct gk20a_as *as)
29{
30 gk20a_dbg_fn("");
31 return ++as->last_share_id;
32}
33/* still dumb */
34static void release_as_share_id(struct gk20a_as *as, int id)
35{
36 gk20a_dbg_fn("");
37 return;
38}
39
40static int gk20a_as_alloc_share(struct gk20a_as *as,
41 struct gk20a_as_share **out)
42{
43 struct gk20a_as_share *as_share;
44 int err = 0;
45
46 gk20a_dbg_fn("");
47
48 *out = 0;
49 as_share = kzalloc(sizeof(*as_share), GFP_KERNEL);
50 if (!as_share)
51 return -ENOMEM;
52
53 as_share->as = as;
54 as_share->id = generate_as_share_id(as_share->as);
55 as_share->ref_cnt.counter = 1;
56
57 /* this will set as_share->vm. */
58 err = gk20a_vm_alloc_share(as_share);
59 if (err)
60 goto failed;
61
62 *out = as_share;
63 return 0;
64
65 failed:
66 kfree(as_share);
67 return err;
68}
69
70/*
71 * channels and the device nodes call this to release.
72 * once the ref_cnt hits zero the share is deleted.
73 */
74int gk20a_as_release_share(struct gk20a_as_share *as_share)
75{
76 int err;
77
78 gk20a_dbg_fn("");
79
80 if (atomic_dec_return(&as_share->ref_cnt) > 0)
81 return 0;
82
83 err = gk20a_vm_release_share(as_share);
84 release_as_share_id(as_share->as, as_share->id);
85 kfree(as_share);
86 return err;
87}
88
89static int gk20a_as_ioctl_bind_channel(
90 struct gk20a_as_share *as_share,
91 struct nvhost_as_bind_channel_args *args)
92{
93 int err = 0;
94 struct channel_gk20a *ch;
95
96 gk20a_dbg_fn("");
97
98 ch = gk20a_get_channel_from_file(args->channel_fd);
99 if (!ch || gk20a_channel_as_bound(ch))
100 return -EINVAL;
101
102 atomic_inc(&as_share->ref_cnt);
103
104 /* this will set channel_gk20a->vm */
105 err = gk20a_vm_bind_channel(as_share, ch);
106 if (err) {
107 atomic_dec(&as_share->ref_cnt);
108 return err;
109 }
110
111 return err;
112}
113
114static int gk20a_as_ioctl_alloc_space(
115 struct gk20a_as_share *as_share,
116 struct nvhost_as_alloc_space_args *args)
117{
118 gk20a_dbg_fn("");
119 return gk20a_vm_alloc_space(as_share, args);
120}
121
122static int gk20a_as_ioctl_free_space(
123 struct gk20a_as_share *as_share,
124 struct nvhost_as_free_space_args *args)
125{
126 gk20a_dbg_fn("");
127 return gk20a_vm_free_space(as_share, args);
128}
129
130static int gk20a_as_ioctl_map_buffer_ex(
131 struct gk20a_as_share *as_share,
132 struct nvhost_as_map_buffer_ex_args *args)
133{
134 int i;
135
136 gk20a_dbg_fn("");
137
138 /* ensure that padding is not set. this is required for ensuring that
139 * we can safely use these fields later */
140 for (i = 0; i < ARRAY_SIZE(args->padding); i++)
141 if (args->padding[i])
142 return -EINVAL;
143
144 return gk20a_vm_map_buffer(as_share, args->dmabuf_fd,
145 &args->offset, args->flags,
146 args->kind);
147}
148
149static int gk20a_as_ioctl_map_buffer(
150 struct gk20a_as_share *as_share,
151 struct nvhost_as_map_buffer_args *args)
152{
153 gk20a_dbg_fn("");
154 return gk20a_vm_map_buffer(as_share, args->nvmap_handle,
155 &args->o_a.align,
156 args->flags, NV_KIND_DEFAULT);
157 /* args->o_a.offset will be set if !err */
158}
159
160static int gk20a_as_ioctl_unmap_buffer(
161 struct gk20a_as_share *as_share,
162 struct nvhost_as_unmap_buffer_args *args)
163{
164 gk20a_dbg_fn("");
165 return gk20a_vm_unmap_buffer(as_share, args->offset);
166}
167
168int gk20a_as_dev_open(struct inode *inode, struct file *filp)
169{
170 struct gk20a_as_share *as_share;
171 struct gk20a *g;
172 int err;
173
174 gk20a_dbg_fn("");
175
176 g = container_of(inode->i_cdev, struct gk20a, as.cdev);
177
178 err = gk20a_get_client(g);
179 if (err) {
180 gk20a_dbg_fn("fail to get channel!");
181 return err;
182 }
183
184 err = gk20a_as_alloc_share(&g->as, &as_share);
185 if (err) {
186 gk20a_dbg_fn("failed to alloc share");
187 gk20a_put_client(g);
188 return err;
189 }
190
191 filp->private_data = as_share;
192 return 0;
193}
194
195int gk20a_as_dev_release(struct inode *inode, struct file *filp)
196{
197 struct gk20a_as_share *as_share = filp->private_data;
198 int ret;
199 struct gk20a *g = gk20a_from_as(as_share->as);
200
201 gk20a_dbg_fn("");
202
203 ret = gk20a_as_release_share(as_share);
204
205 gk20a_put_client(g);
206
207 return ret;
208}
209
210long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
211{
212 int err = 0;
213 struct gk20a_as_share *as_share = filp->private_data;
214 struct gk20a *g = gk20a_from_as(as_share->as);
215
216 u8 buf[NVHOST_AS_IOCTL_MAX_ARG_SIZE];
217
218 if ((_IOC_TYPE(cmd) != NVHOST_AS_IOCTL_MAGIC) ||
219 (_IOC_NR(cmd) == 0) ||
220 (_IOC_NR(cmd) > NVHOST_AS_IOCTL_LAST))
221 return -EFAULT;
222
223 BUG_ON(_IOC_SIZE(cmd) > NVHOST_AS_IOCTL_MAX_ARG_SIZE);
224
225 if (_IOC_DIR(cmd) & _IOC_WRITE) {
226 if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
227 return -EFAULT;
228 }
229
230 err = gk20a_channel_busy(g->dev);
231 if (err)
232 return err;
233
234 switch (cmd) {
235 case NVHOST_AS_IOCTL_BIND_CHANNEL:
236 trace_gk20a_as_ioctl_bind_channel(dev_name(dev_from_gk20a(g)));
237 err = gk20a_as_ioctl_bind_channel(as_share,
238 (struct nvhost_as_bind_channel_args *)buf);
239
240 break;
241 case NVHOST32_AS_IOCTL_ALLOC_SPACE:
242 {
243 struct nvhost32_as_alloc_space_args *args32 =
244 (struct nvhost32_as_alloc_space_args *)buf;
245 struct nvhost_as_alloc_space_args args;
246
247 args.pages = args32->pages;
248 args.page_size = args32->page_size;
249 args.flags = args32->flags;
250 args.o_a.offset = args32->o_a.offset;
251 trace_gk20a_as_ioctl_alloc_space(dev_name(dev_from_gk20a(g)));
252 err = gk20a_as_ioctl_alloc_space(as_share, &args);
253 args32->o_a.offset = args.o_a.offset;
254 break;
255 }
256 case NVHOST_AS_IOCTL_ALLOC_SPACE:
257 trace_gk20a_as_ioctl_alloc_space(dev_name(dev_from_gk20a(g)));
258 err = gk20a_as_ioctl_alloc_space(as_share,
259 (struct nvhost_as_alloc_space_args *)buf);
260 break;
261 case NVHOST_AS_IOCTL_FREE_SPACE:
262 trace_gk20a_as_ioctl_free_space(dev_name(dev_from_gk20a(g)));
263 err = gk20a_as_ioctl_free_space(as_share,
264 (struct nvhost_as_free_space_args *)buf);
265 break;
266 case NVHOST_AS_IOCTL_MAP_BUFFER:
267 trace_gk20a_as_ioctl_map_buffer(dev_name(dev_from_gk20a(g)));
268 err = gk20a_as_ioctl_map_buffer(as_share,
269 (struct nvhost_as_map_buffer_args *)buf);
270 break;
271 case NVHOST_AS_IOCTL_MAP_BUFFER_EX:
272 trace_gk20a_as_ioctl_map_buffer(dev_name(dev_from_gk20a(g)));
273 err = gk20a_as_ioctl_map_buffer_ex(as_share,
274 (struct nvhost_as_map_buffer_ex_args *)buf);
275 break;
276 case NVHOST_AS_IOCTL_UNMAP_BUFFER:
277 trace_gk20a_as_ioctl_unmap_buffer(dev_name(dev_from_gk20a(g)));
278 err = gk20a_as_ioctl_unmap_buffer(as_share,
279 (struct nvhost_as_unmap_buffer_args *)buf);
280 break;
281 default:
282 dev_err(dev_from_gk20a(g), "unrecognized as ioctl: 0x%x", cmd);
283 err = -ENOTTY;
284 break;
285 }
286
287 gk20a_channel_idle(g->dev);
288
289 if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
290 err = copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd));
291
292 return err;
293}
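For illustration, a minimal user-space sketch of driving the address-space ioctls dispatched above. The device node path and the direct use of linux/nvhost_as_ioctl.h from user space are assumptions; the ioctl name and the argument fields are taken from the switch statement in gk20a_as_dev_ioctl().

/*
 * Hypothetical user-space sketch: reserve a GPU VA range through the AS
 * device node. Path and header availability are assumptions.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvhost_as_ioctl.h>

int main(void)
{
	struct nvhost_as_alloc_space_args args = {
		.pages = 256,      /* number of pages to reserve */
		.page_size = 4096, /* small-page allocation */
		.flags = 0,
	};
	int fd = open("/dev/nvhost-as-gpu", O_RDWR); /* path is an assumption */

	if (fd < 0)
		return 1;

	/* the driver fills in args.o_a.offset on success */
	if (ioctl(fd, NVHOST_AS_IOCTL_ALLOC_SPACE, &args) == 0)
		printf("reserved VA at 0x%llx\n",
		       (unsigned long long)args.o_a.offset);

	close(fd);
	return 0;
}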
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.h b/drivers/gpu/nvgpu/gk20a/as_gk20a.h
new file mode 100644
index 00000000..be0e9707
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.h
@@ -0,0 +1,50 @@
1/*
2 * drivers/video/tegra/host/gk20a/as_gk20a.h
3 *
4 * GK20A Address Space
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17#ifndef __GK20A_AS_H
18#define __GK20A_AS_H
19
20#include <linux/atomic.h>
21#include <linux/cdev.h>
22#include <linux/fs.h>
23
24#include <linux/nvhost_as_ioctl.h>
25
26struct gk20a_as;
27struct gk20a_as_share;
28struct vm_gk20a;
29
30struct gk20a_as_share {
31 struct gk20a_as *as;
32 atomic_t ref_cnt;
33 int id;
34 struct vm_gk20a *vm;
35};
36
37struct gk20a_as {
38 int last_share_id; /* dummy allocator for now */
39 struct cdev cdev;
40 struct device *node;
41};
42
43int gk20a_as_release_share(struct gk20a_as_share *as_share);
44
45/* struct file_operations driver interface */
46int gk20a_as_dev_open(struct inode *inode, struct file *filp);
47int gk20a_as_dev_release(struct inode *inode, struct file *filp);
48long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
49
50#endif
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
new file mode 100644
index 00000000..6056f558
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -0,0 +1,2111 @@
1/*
2 * drivers/video/tegra/host/gk20a/channel_gk20a.c
3 *
4 * GK20A Graphics channel
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21
22#include <linux/nvhost.h>
23#include <linux/list.h>
24#include <linux/delay.h>
25#include <linux/highmem.h> /* needed for nvmap.h */
26#include <trace/events/gk20a.h>
27#include <linux/scatterlist.h>
28#include <linux/file.h>
29#include <linux/anon_inodes.h>
30#include <linux/dma-buf.h>
31
32#include "debug_gk20a.h"
33
34#include "gk20a.h"
35#include "dbg_gpu_gk20a.h"
36
37#include "hw_ram_gk20a.h"
38#include "hw_fifo_gk20a.h"
39#include "hw_pbdma_gk20a.h"
40#include "hw_ccsr_gk20a.h"
41#include "hw_ltc_gk20a.h"
42
43#define NVMAP_HANDLE_PARAM_SIZE 1
44
45static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f);
46static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
47
48static void free_priv_cmdbuf(struct channel_gk20a *c,
49 struct priv_cmd_entry *e);
50static void recycle_priv_cmdbuf(struct channel_gk20a *c);
51
52static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
53static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
54
55static int channel_gk20a_commit_userd(struct channel_gk20a *c);
56static int channel_gk20a_setup_userd(struct channel_gk20a *c);
57static int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
58 u64 gpfifo_base, u32 gpfifo_entries);
59
60static void channel_gk20a_bind(struct channel_gk20a *ch_gk20a);
61static void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
62
63static int channel_gk20a_alloc_inst(struct gk20a *g,
64 struct channel_gk20a *ch);
65static void channel_gk20a_free_inst(struct gk20a *g,
66 struct channel_gk20a *ch);
67
68static int channel_gk20a_update_runlist(struct channel_gk20a *c,
69 bool add);
70static void gk20a_free_error_notifiers(struct channel_gk20a *ch);
71
72static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f)
73{
74 struct channel_gk20a *ch = NULL;
75 int chid;
76
77 mutex_lock(&f->ch_inuse_mutex);
78 for (chid = 0; chid < f->num_channels; chid++) {
79 if (!f->channel[chid].in_use) {
80 f->channel[chid].in_use = true;
81 ch = &f->channel[chid];
82 break;
83 }
84 }
85 mutex_unlock(&f->ch_inuse_mutex);
86
87 return ch;
88}
89
90static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c)
91{
92 mutex_lock(&f->ch_inuse_mutex);
93 f->channel[c->hw_chid].in_use = false;
94 mutex_unlock(&f->ch_inuse_mutex);
95}
96
97int channel_gk20a_commit_va(struct channel_gk20a *c)
98{
99 u64 addr;
100 u32 addr_lo;
101 u32 addr_hi;
102 void *inst_ptr;
103
104 gk20a_dbg_fn("");
105
106 inst_ptr = c->inst_block.cpuva;
107 if (!inst_ptr)
108 return -ENOMEM;
109
110 addr = gk20a_mm_iova_addr(c->vm->pdes.sgt->sgl);
111 addr_lo = u64_lo32(addr >> 12);
112 addr_hi = u64_hi32(addr);
113
114 gk20a_dbg_info("pde pa=0x%llx addr_lo=0x%x addr_hi=0x%x",
115 (u64)addr, addr_lo, addr_hi);
116
117 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
118 ram_in_page_dir_base_target_vid_mem_f() |
119 ram_in_page_dir_base_vol_true_f() |
120 ram_in_page_dir_base_lo_f(addr_lo));
121
122 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
123 ram_in_page_dir_base_hi_f(addr_hi));
124
125 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
126 u64_lo32(c->vm->va_limit) | 0xFFF);
127
128 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
129 ram_in_adr_limit_hi_f(u64_hi32(c->vm->va_limit)));
130
131 gk20a_mm_l2_invalidate(c->g);
132
133 return 0;
134}
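For illustration, a stand-alone sketch of the page-directory base split performed in channel_gk20a_commit_va(). u64_lo32()/u64_hi32() are re-implemented locally so the snippet compiles outside the kernel, and the sample address is arbitrary.

/* Stand-alone sketch of the lo/hi field split used above. */
#include <stdio.h>
#include <stdint.h>

static uint32_t u64_lo32(uint64_t v) { return (uint32_t)v; }
static uint32_t u64_hi32(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t pde_base = 0x123456000ULL;          /* arbitrary 4 KiB-aligned base */
	uint32_t addr_lo = u64_lo32(pde_base >> 12); /* low field holds the 4K page number */
	uint32_t addr_hi = u64_hi32(pde_base);       /* high field holds bits 63:32 */

	/* prints lo=0x00123456 hi=0x00000001 */
	printf("lo=0x%08x hi=0x%08x\n", addr_lo, addr_hi);
	return 0;
}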
135
136static int channel_gk20a_commit_userd(struct channel_gk20a *c)
137{
138 u32 addr_lo;
139 u32 addr_hi;
140 void *inst_ptr;
141
142 gk20a_dbg_fn("");
143
144 inst_ptr = c->inst_block.cpuva;
145 if (!inst_ptr)
146 return -ENOMEM;
147
148 addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v());
149 addr_hi = u64_hi32(c->userd_iova);
150
151 gk20a_dbg_info("channel %d : set ramfc userd 0x%16llx",
152 c->hw_chid, (u64)c->userd_iova);
153
154 gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_w(),
155 pbdma_userd_target_vid_mem_f() |
156 pbdma_userd_addr_f(addr_lo));
157
158 gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_hi_w(),
159 pbdma_userd_target_vid_mem_f() |
160 pbdma_userd_hi_addr_f(addr_hi));
161
162 gk20a_mm_l2_invalidate(c->g);
163
164 return 0;
165}
166
167static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
168 u32 timeslice_timeout)
169{
170 void *inst_ptr;
171 int shift = 3;
172 int value = timeslice_timeout;
173
174 inst_ptr = c->inst_block.cpuva;
175 if (!inst_ptr)
176 return -ENOMEM;
177
178 /* disable channel */
179 gk20a_writel(c->g, ccsr_channel_r(c->hw_chid),
180 gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
181 ccsr_channel_enable_clr_true_f());
182
183 /* preempt the channel */
184 WARN_ON(gk20a_fifo_preempt_channel(c->g, c->hw_chid));
185
186 /* flush GPU cache */
187 gk20a_mm_l2_flush(c->g, true);
188
189 /* value field is 8 bits long */
190 while (value >= 1 << 8) {
191 value >>= 1;
192 shift++;
193 }
194
195 /* time slice register is only 18 bits long */
196 if ((value << shift) >= 1<<19) {
197 pr_err("Requested timeslice value is clamped to 18 bits\n");
198 value = 255;
199 shift = 10;
200 }
201
202 /* set new timeslice */
203 gk20a_mem_wr32(inst_ptr, ram_fc_eng_timeslice_w(),
204 value | (shift << 12) |
205 fifo_eng_timeslice_enable_true_f());
206
207 /* enable channel */
208 gk20a_writel(c->g, ccsr_channel_r(c->hw_chid),
209 gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
210 ccsr_channel_enable_set_true_f());
211
212 gk20a_mm_l2_invalidate(c->g);
213
214 return 0;
215}
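For illustration, a stand-alone sketch of the mantissa/shift reduction performed by the loop in channel_gk20a_set_schedule_params(). The requested value of 10000 is an arbitrary example.

/* Reduce a timeslice request to an 8-bit mantissa plus a shift, as above. */
#include <stdio.h>

int main(void)
{
	int value = 10000; /* requested timeslice */
	int shift = 3;     /* starting scale, as in the driver */

	/* mantissa field is 8 bits wide, so halve until it fits */
	while (value >= 1 << 8) {
		value >>= 1;
		shift++;
	}

	/* 10000 -> value 156, shift 9 (156 << 9 = 79872, close to 10000 << 3) */
	printf("value=%d shift=%d\n", value, shift);
	return 0;
}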
216
217static int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
218 u64 gpfifo_base, u32 gpfifo_entries)
219{
220 void *inst_ptr;
221
222 gk20a_dbg_fn("");
223
224 inst_ptr = c->inst_block.cpuva;
225 if (!inst_ptr)
226 return -ENOMEM;
227
228 memset(inst_ptr, 0, ram_fc_size_val_v());
229
230 gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_w(),
231 pbdma_gp_base_offset_f(
232 u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s())));
233
234 gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_hi_w(),
235 pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) |
236 pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries)));
237
238 gk20a_mem_wr32(inst_ptr, ram_fc_signature_w(),
239 pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f());
240
241 gk20a_mem_wr32(inst_ptr, ram_fc_formats_w(),
242 pbdma_formats_gp_fermi0_f() |
243 pbdma_formats_pb_fermi1_f() |
244 pbdma_formats_mp_fermi0_f());
245
246 gk20a_mem_wr32(inst_ptr, ram_fc_pb_header_w(),
247 pbdma_pb_header_priv_user_f() |
248 pbdma_pb_header_method_zero_f() |
249 pbdma_pb_header_subchannel_zero_f() |
250 pbdma_pb_header_level_main_f() |
251 pbdma_pb_header_first_true_f() |
252 pbdma_pb_header_type_inc_f());
253
254 gk20a_mem_wr32(inst_ptr, ram_fc_subdevice_w(),
255 pbdma_subdevice_id_f(1) |
256 pbdma_subdevice_status_active_f() |
257 pbdma_subdevice_channel_dma_enable_f());
258
259 gk20a_mem_wr32(inst_ptr, ram_fc_target_w(), pbdma_target_engine_sw_f());
260
261 gk20a_mem_wr32(inst_ptr, ram_fc_acquire_w(),
262 pbdma_acquire_retry_man_2_f() |
263 pbdma_acquire_retry_exp_2_f() |
264 pbdma_acquire_timeout_exp_max_f() |
265 pbdma_acquire_timeout_man_max_f() |
266 pbdma_acquire_timeout_en_disable_f());
267
268 gk20a_mem_wr32(inst_ptr, ram_fc_eng_timeslice_w(),
269 fifo_eng_timeslice_timeout_128_f() |
270 fifo_eng_timeslice_timescale_3_f() |
271 fifo_eng_timeslice_enable_true_f());
272
273 gk20a_mem_wr32(inst_ptr, ram_fc_pb_timeslice_w(),
274 fifo_pb_timeslice_timeout_16_f() |
275 fifo_pb_timeslice_timescale_0_f() |
276 fifo_pb_timeslice_enable_true_f());
277
278 gk20a_mem_wr32(inst_ptr, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid));
279
280 /* TBD: always priv mode? */
281 gk20a_mem_wr32(inst_ptr, ram_fc_hce_ctrl_w(),
282 pbdma_hce_ctrl_hce_priv_mode_yes_f());
283
284 gk20a_mm_l2_invalidate(c->g);
285
286 return 0;
287}
288
289static int channel_gk20a_setup_userd(struct channel_gk20a *c)
290{
291 BUG_ON(!c->userd_cpu_va);
292
293 gk20a_dbg_fn("");
294
295 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_w(), 0);
296 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_w(), 0);
297 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_w(), 0);
298 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_hi_w(), 0);
299 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_threshold_w(), 0);
300 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_w(), 0);
301 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_hi_w(), 0);
302 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_hi_w(), 0);
303 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_get_w(), 0);
304 gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_put_w(), 0);
305
306 gk20a_mm_l2_invalidate(c->g);
307
308 return 0;
309}
310
311static void channel_gk20a_bind(struct channel_gk20a *ch_gk20a)
312{
313 struct gk20a *g = ch_gk20a->g;
314 struct fifo_gk20a *f = &g->fifo;
315 struct fifo_engine_info_gk20a *engine_info =
316 f->engine_info + ENGINE_GR_GK20A;
317
318 u32 inst_ptr = ch_gk20a->inst_block.cpu_pa
319 >> ram_in_base_shift_v();
320
321 gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
322 ch_gk20a->hw_chid, inst_ptr);
323
324 ch_gk20a->bound = true;
325
326 gk20a_writel(g, ccsr_channel_r(ch_gk20a->hw_chid),
327 (gk20a_readl(g, ccsr_channel_r(ch_gk20a->hw_chid)) &
328 ~ccsr_channel_runlist_f(~0)) |
329 ccsr_channel_runlist_f(engine_info->runlist_id));
330
331 gk20a_writel(g, ccsr_channel_inst_r(ch_gk20a->hw_chid),
332 ccsr_channel_inst_ptr_f(inst_ptr) |
333 ccsr_channel_inst_target_vid_mem_f() |
334 ccsr_channel_inst_bind_true_f());
335
336 gk20a_writel(g, ccsr_channel_r(ch_gk20a->hw_chid),
337 (gk20a_readl(g, ccsr_channel_r(ch_gk20a->hw_chid)) &
338 ~ccsr_channel_enable_set_f(~0)) |
339 ccsr_channel_enable_set_true_f());
340}
341
342static void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a)
343{
344 struct gk20a *g = ch_gk20a->g;
345
346 gk20a_dbg_fn("");
347
348 if (ch_gk20a->bound)
349 gk20a_writel(g, ccsr_channel_inst_r(ch_gk20a->hw_chid),
350 ccsr_channel_inst_ptr_f(0) |
351 ccsr_channel_inst_bind_false_f());
352
353 ch_gk20a->bound = false;
354}
355
356static int channel_gk20a_alloc_inst(struct gk20a *g,
357 struct channel_gk20a *ch)
358{
359 struct device *d = dev_from_gk20a(g);
360 int err = 0;
361 dma_addr_t iova;
362
363 gk20a_dbg_fn("");
364
365 ch->inst_block.size = ram_in_alloc_size_v();
366 ch->inst_block.cpuva = dma_alloc_coherent(d,
367 ch->inst_block.size,
368 &iova,
369 GFP_KERNEL);
370 if (!ch->inst_block.cpuva) {
371 gk20a_err(d, "%s: memory allocation failed\n", __func__);
372 err = -ENOMEM;
373 goto clean_up;
374 }
375
376 ch->inst_block.iova = iova;
377 ch->inst_block.cpu_pa = gk20a_get_phys_from_iova(d,
378 ch->inst_block.iova);
379 if (!ch->inst_block.cpu_pa) {
380 gk20a_err(d, "%s: failed to get physical address\n", __func__);
381 err = -ENOMEM;
382 goto clean_up;
383 }
384
385 gk20a_dbg_info("channel %d inst block physical addr: 0x%16llx",
386 ch->hw_chid, (u64)ch->inst_block.cpu_pa);
387
388 gk20a_dbg_fn("done");
389 return 0;
390
391clean_up:
392 gk20a_err(d, "fail");
393 channel_gk20a_free_inst(g, ch);
394 return err;
395}
396
397static void channel_gk20a_free_inst(struct gk20a *g,
398 struct channel_gk20a *ch)
399{
400 struct device *d = dev_from_gk20a(g);
401
402 if (ch->inst_block.cpuva)
403 dma_free_coherent(d, ch->inst_block.size,
404 ch->inst_block.cpuva, ch->inst_block.iova);
405 ch->inst_block.cpuva = NULL;
406 ch->inst_block.iova = 0;
407 memset(&ch->inst_block, 0, sizeof(struct inst_desc));
408}
409
410static int channel_gk20a_update_runlist(struct channel_gk20a *c, bool add)
411{
412 return gk20a_fifo_update_runlist(c->g, 0, c->hw_chid, add, true);
413}
414
415void gk20a_disable_channel_no_update(struct channel_gk20a *ch)
416{
417 /* ensure no fences are pending */
418 if (ch->sync)
419 ch->sync->set_min_eq_max(ch->sync);
420
421 /* disable channel */
422 gk20a_writel(ch->g, ccsr_channel_r(ch->hw_chid),
423 gk20a_readl(ch->g,
424 ccsr_channel_r(ch->hw_chid)) |
425 ccsr_channel_enable_clr_true_f());
426}
427
428static int gk20a_wait_channel_idle(struct channel_gk20a *ch)
429{
430 bool channel_idle = false;
431 unsigned long end_jiffies = jiffies +
432 msecs_to_jiffies(gk20a_get_gr_idle_timeout(ch->g));
433
434 do {
435 mutex_lock(&ch->jobs_lock);
436 channel_idle = list_empty(&ch->jobs);
437 mutex_unlock(&ch->jobs_lock);
438 if (channel_idle)
439 break;
440
441 usleep_range(1000, 3000);
442 } while (time_before(jiffies, end_jiffies)
443 || !tegra_platform_is_silicon());
444
445 if (!channel_idle)
446 gk20a_err(dev_from_gk20a(ch->g), "channel jobs not freed");
447
448 return 0;
449}
450
451void gk20a_disable_channel(struct channel_gk20a *ch,
452 bool finish,
453 unsigned long finish_timeout)
454{
455 if (finish) {
456 int err = gk20a_channel_finish(ch, finish_timeout);
457 WARN_ON(err);
458 }
459
460 /* disable the channel from hw and increment syncpoints */
461 gk20a_disable_channel_no_update(ch);
462
463 gk20a_wait_channel_idle(ch);
464
465 /* preempt the channel */
466 gk20a_fifo_preempt_channel(ch->g, ch->hw_chid);
467
468 /* remove channel from runlist */
469 channel_gk20a_update_runlist(ch, false);
470}
471
472#if defined(CONFIG_GK20A_CYCLE_STATS)
473
474static void gk20a_free_cycle_stats_buffer(struct channel_gk20a *ch)
475{
476 /* disable existing cyclestats buffer */
477 mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
478 if (ch->cyclestate.cyclestate_buffer_handler) {
479 dma_buf_vunmap(ch->cyclestate.cyclestate_buffer_handler,
480 ch->cyclestate.cyclestate_buffer);
481 dma_buf_put(ch->cyclestate.cyclestate_buffer_handler);
482 ch->cyclestate.cyclestate_buffer_handler = NULL;
483 ch->cyclestate.cyclestate_buffer = NULL;
484 ch->cyclestate.cyclestate_buffer_size = 0;
485 }
486 mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
487}
488
489static int gk20a_channel_cycle_stats(struct channel_gk20a *ch,
490 struct nvhost_cycle_stats_args *args)
491{
492 struct dma_buf *dmabuf;
493 void *virtual_address;
494
495 if (args->nvmap_handle && !ch->cyclestate.cyclestate_buffer_handler) {
496
497 /* set up new cyclestats buffer */
498 dmabuf = dma_buf_get(args->nvmap_handle);
499 if (IS_ERR(dmabuf))
500 return PTR_ERR(dmabuf);
501 virtual_address = dma_buf_vmap(dmabuf);
502 if (!virtual_address)
503 return -ENOMEM;
504
505 ch->cyclestate.cyclestate_buffer_handler = dmabuf;
506 ch->cyclestate.cyclestate_buffer = virtual_address;
507 ch->cyclestate.cyclestate_buffer_size = dmabuf->size;
508 return 0;
509
510 } else if (!args->nvmap_handle &&
511 ch->cyclestate.cyclestate_buffer_handler) {
512 gk20a_free_cycle_stats_buffer(ch);
513 return 0;
514
515 } else if (!args->nvmap_handle &&
516 !ch->cyclestate.cyclestate_buffer_handler) {
517 /* no request from GL */
518 return 0;
519
520 } else {
521 pr_err("channel already has cyclestats buffer\n");
522 return -EINVAL;
523 }
524}
525#endif
526
527static int gk20a_init_error_notifier(struct channel_gk20a *ch,
528 struct nvhost_set_error_notifier *args) {
529 void *va;
530
531 struct dma_buf *dmabuf;
532
533 if (!args->mem) {
534 pr_err("gk20a_init_error_notifier: invalid memory handle\n");
535 return -EINVAL;
536 }
537
538 dmabuf = dma_buf_get(args->mem);
539
540 if (ch->error_notifier_ref)
541 gk20a_free_error_notifiers(ch);
542
543 if (IS_ERR(dmabuf)) {
544 pr_err("Invalid handle: %d\n", args->mem);
545 return -EINVAL;
546 }
547 /* map handle */
548 va = dma_buf_vmap(dmabuf);
549 if (!va) {
550 dma_buf_put(dmabuf);
551 pr_err("Cannot map notifier handle\n");
552 return -ENOMEM;
553 }
554
555 /* set channel notifiers pointer */
556 ch->error_notifier_ref = dmabuf;
557 ch->error_notifier = va + args->offset;
558 ch->error_notifier_va = va;
559 memset(ch->error_notifier, 0, sizeof(struct nvhost_notification));
560 return 0;
561}
562
563void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error)
564{
565 if (ch->error_notifier_ref) {
566 struct timespec time_data;
567 u64 nsec;
568 getnstimeofday(&time_data);
569 nsec = ((u64)time_data.tv_sec) * 1000000000u +
570 (u64)time_data.tv_nsec;
571 ch->error_notifier->time_stamp.nanoseconds[0] =
572 (u32)nsec;
573 ch->error_notifier->time_stamp.nanoseconds[1] =
574 (u32)(nsec >> 32);
575 ch->error_notifier->info32 = error;
576 ch->error_notifier->status = 0xffff;
577 gk20a_err(dev_from_gk20a(ch->g),
578 "error notifier set to %d\n", error);
579 }
580}
581
582static void gk20a_free_error_notifiers(struct channel_gk20a *ch)
583{
584 if (ch->error_notifier_ref) {
585 dma_buf_vunmap(ch->error_notifier_ref, ch->error_notifier_va);
586 dma_buf_put(ch->error_notifier_ref);
587 ch->error_notifier_ref = 0;
588 ch->error_notifier = 0;
589 ch->error_notifier_va = 0;
590 }
591}
592
593void gk20a_free_channel(struct channel_gk20a *ch, bool finish)
594{
595 struct gk20a *g = ch->g;
596 struct device *d = dev_from_gk20a(g);
597 struct fifo_gk20a *f = &g->fifo;
598 struct gr_gk20a *gr = &g->gr;
599 struct vm_gk20a *ch_vm = ch->vm;
600 unsigned long timeout = gk20a_get_gr_idle_timeout(g);
601 struct dbg_session_gk20a *dbg_s;
602
603 gk20a_dbg_fn("");
604
605 /* if engine reset was deferred, perform it now */
606 mutex_lock(&f->deferred_reset_mutex);
607 if (g->fifo.deferred_reset_pending) {
608 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
609 " deferred, running now");
610 fifo_gk20a_finish_mmu_fault_handling(g, g->fifo.mmu_fault_engines);
611 g->fifo.mmu_fault_engines = 0;
612 g->fifo.deferred_reset_pending = false;
613 }
614 mutex_unlock(&f->deferred_reset_mutex);
615
616 if (!ch->bound)
617 return;
618
619 if (!gk20a_channel_as_bound(ch))
620 goto unbind;
621
622 gk20a_dbg_info("freeing bound channel context, timeout=%ld",
623 timeout);
624
625 gk20a_disable_channel(ch, finish && !ch->has_timedout, timeout);
626
627 gk20a_free_error_notifiers(ch);
628
629 /* release channel ctx */
630 gk20a_free_channel_ctx(ch);
631
632 gk20a_gr_flush_channel_tlb(gr);
633
634 memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub));
635
636 /* free gpfifo */
637 if (ch->gpfifo.gpu_va)
638 gk20a_gmmu_unmap(ch_vm, ch->gpfifo.gpu_va,
639 ch->gpfifo.size, gk20a_mem_flag_none);
640 if (ch->gpfifo.cpu_va)
641 dma_free_coherent(d, ch->gpfifo.size,
642 ch->gpfifo.cpu_va, ch->gpfifo.iova);
643 ch->gpfifo.cpu_va = NULL;
644 ch->gpfifo.iova = 0;
645
646 gk20a_mm_l2_invalidate(ch->g);
647
648 memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc));
649
650#if defined(CONFIG_GK20A_CYCLE_STATS)
651 gk20a_free_cycle_stats_buffer(ch);
652#endif
653
654 channel_gk20a_free_priv_cmdbuf(ch);
655
656 if (ch->sync) {
657 ch->sync->destroy(ch->sync);
658 ch->sync = NULL;
659 }
660
661 /* release channel binding to the as_share */
662 gk20a_as_release_share(ch_vm->as_share);
663
664unbind:
665 channel_gk20a_unbind(ch);
666 channel_gk20a_free_inst(g, ch);
667
668 ch->vpr = false;
669 ch->vm = NULL;
670 WARN_ON(ch->sync);
671
672 /* unlink all debug sessions */
673 mutex_lock(&ch->dbg_s_lock);
674
675 list_for_each_entry(dbg_s, &ch->dbg_s_list, dbg_s_list_node) {
676 dbg_s->ch = NULL;
677 list_del_init(&dbg_s->dbg_s_list_node);
678 }
679
680 mutex_unlock(&ch->dbg_s_lock);
681
682 /* ALWAYS last */
683 release_used_channel(f, ch);
684}
685
686int gk20a_channel_release(struct inode *inode, struct file *filp)
687{
688 struct channel_gk20a *ch = (struct channel_gk20a *)filp->private_data;
689 struct gk20a *g = ch->g;
690
691 trace_gk20a_channel_release(dev_name(&g->dev->dev));
692
693 gk20a_channel_busy(ch->g->dev);
694 gk20a_free_channel(ch, true);
695 gk20a_channel_idle(ch->g->dev);
696
697 gk20a_put_client(g);
698 filp->private_data = NULL;
699 return 0;
700}
701
702static struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g)
703{
704 struct fifo_gk20a *f = &g->fifo;
705 struct channel_gk20a *ch;
706
707 ch = acquire_unused_channel(f);
708 if (ch == NULL) {
709 /* TBD: we want to make this virtualizable */
710 gk20a_err(dev_from_gk20a(g), "out of hw chids");
711 return 0;
712 }
713
714 ch->g = g;
715
716 if (channel_gk20a_alloc_inst(g, ch)) {
717 ch->in_use = false;
718 gk20a_err(dev_from_gk20a(g),
719 "failed to open gk20a channel, out of inst mem");
720
721 return 0;
722 }
723 g->ops.fifo.bind_channel(ch);
724 ch->pid = current->pid;
725
726 /* reset timeout counter and update timestamp */
727 ch->timeout_accumulated_ms = 0;
728 ch->timeout_gpfifo_get = 0;
729 /* set gr host default timeout */
730 ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
731 ch->timeout_debug_dump = true;
732 ch->has_timedout = false;
733
734 /* The channel is *not* runnable at this point. It still needs to have
735 * an address space bound and a gpfifo and grctx allocated. */
736
737 init_waitqueue_head(&ch->notifier_wq);
738 init_waitqueue_head(&ch->semaphore_wq);
739 init_waitqueue_head(&ch->submit_wq);
740
741 return ch;
742}
743
744static int __gk20a_channel_open(struct gk20a *g, struct file *filp)
745{
746 int err;
747 struct channel_gk20a *ch;
748
749 trace_gk20a_channel_open(dev_name(&g->dev->dev));
750
751 err = gk20a_get_client(g);
752 if (err) {
753 gk20a_err(dev_from_gk20a(g),
754 "failed to get client ref");
755 return err;
756 }
757
758 err = gk20a_channel_busy(g->dev);
759 if (err) {
760 gk20a_put_client(g);
761 gk20a_err(dev_from_gk20a(g), "failed to power on, %d", err);
762 return err;
763 }
764 ch = gk20a_open_new_channel(g);
765 gk20a_channel_idle(g->dev);
766 if (!ch) {
767 gk20a_put_client(g);
768 gk20a_err(dev_from_gk20a(g),
769 "failed to get f");
770 return -ENOMEM;
771 }
772
773 filp->private_data = ch;
774 return 0;
775}
776
777int gk20a_channel_open(struct inode *inode, struct file *filp)
778{
779 struct gk20a *g = container_of(inode->i_cdev,
780 struct gk20a, channel.cdev);
781 return __gk20a_channel_open(g, filp);
782}
783
784/* Allocate the private cmd buffer,
785 used for inserting commands before/after user-submitted buffers. */
786static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c)
787{
788 struct device *d = dev_from_gk20a(c->g);
789 struct vm_gk20a *ch_vm = c->vm;
790 struct priv_cmd_queue *q = &c->priv_cmd_q;
791 struct priv_cmd_entry *e;
792 u32 i = 0, size;
793 int err = 0;
794 struct sg_table *sgt;
795 dma_addr_t iova;
796
797 /* The kernel can insert gpfifos before and after user gpfifos.
798 Before user gpfifos, the kernel inserts fence_wait, which takes
799 syncpoint_a (2 dwords) + syncpoint_b (2 dwords) = 4 dwords.
800 After user gpfifos, the kernel inserts fence_get, which takes
801 wfi (2 dwords) + syncpoint_a (2 dwords) + syncpoint_b (2 dwords)
802 = 6 dwords.
803 Worst case, if the kernel adds both of them for every user gpfifo,
804 the max size of the priv_cmdbuf is:
805 gpfifo entry number * (2 / 3) * (4 + 6) * 4 bytes */
806 size = roundup_pow_of_two(
807 c->gpfifo.entry_num * 2 * 10 * sizeof(u32) / 3);
808
809 q->mem.base_cpuva = dma_alloc_coherent(d, size,
810 &iova,
811 GFP_KERNEL);
812 if (!q->mem.base_cpuva) {
813 gk20a_err(d, "%s: memory allocation failed\n", __func__);
814 err = -ENOMEM;
815 goto clean_up;
816 }
817
818 q->mem.base_iova = iova;
819 q->mem.size = size;
820
821 err = gk20a_get_sgtable(d, &sgt,
822 q->mem.base_cpuva, q->mem.base_iova, size);
823 if (err) {
824 gk20a_err(d, "%s: failed to create sg table\n", __func__);
825 goto clean_up;
826 }
827
828 memset(q->mem.base_cpuva, 0, size);
829
830 q->base_gpuva = gk20a_gmmu_map(ch_vm, &sgt,
831 size,
832 0, /* flags */
833 gk20a_mem_flag_none);
834 if (!q->base_gpuva) {
835 gk20a_err(d, "ch %d : failed to map gpu va"
836 "for priv cmd buffer", c->hw_chid);
837 err = -ENOMEM;
838 goto clean_up_sgt;
839 }
840
841 q->size = q->mem.size / sizeof (u32);
842
843 INIT_LIST_HEAD(&q->head);
844 INIT_LIST_HEAD(&q->free);
845
846 /* pre-alloc 25% of priv cmdbuf entries and put them on free list */
847 for (i = 0; i < q->size / 4; i++) {
848 e = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
849 if (!e) {
850 gk20a_err(d, "ch %d: fail to pre-alloc cmd entry",
851 c->hw_chid);
852 err = -ENOMEM;
853 goto clean_up_sgt;
854 }
855 e->pre_alloc = true;
856 list_add(&e->list, &q->free);
857 }
858
859 gk20a_free_sgtable(&sgt);
860
861 return 0;
862
863clean_up_sgt:
864 gk20a_free_sgtable(&sgt);
865clean_up:
866 channel_gk20a_free_priv_cmdbuf(c);
867 return err;
868}
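For illustration, a worked instance of the priv_cmdbuf sizing rule described in the comment above, computed for an arbitrary 1024-entry gpfifo. roundup_pow_of_two() is approximated locally so the snippet stands alone.

/* Worked example of the priv_cmdbuf size computation above. */
#include <stdio.h>
#include <stdint.h>

static uint32_t roundup_pow_of_two(uint32_t v)
{
	uint32_t r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

int main(void)
{
	uint32_t entry_num = 1024;                                  /* example gpfifo size */
	uint32_t raw = entry_num * 2 * 10 * sizeof(uint32_t) / 3;   /* 27306 bytes */
	uint32_t size = roundup_pow_of_two(raw);                    /* 32768 bytes */

	printf("raw=%u rounded=%u\n", raw, size);
	return 0;
}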
869
870static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c)
871{
872 struct device *d = dev_from_gk20a(c->g);
873 struct vm_gk20a *ch_vm = c->vm;
874 struct priv_cmd_queue *q = &c->priv_cmd_q;
875 struct priv_cmd_entry *e;
876 struct list_head *pos, *tmp, *head;
877
878 if (q->size == 0)
879 return;
880
881 if (q->base_gpuva)
882 gk20a_gmmu_unmap(ch_vm, q->base_gpuva,
883 q->mem.size, gk20a_mem_flag_none);
884 if (q->mem.base_cpuva)
885 dma_free_coherent(d, q->mem.size,
886 q->mem.base_cpuva, q->mem.base_iova);
887 q->mem.base_cpuva = NULL;
888 q->mem.base_iova = 0;
889
890 /* free used list */
891 head = &q->head;
892 list_for_each_safe(pos, tmp, head) {
893 e = container_of(pos, struct priv_cmd_entry, list);
894 free_priv_cmdbuf(c, e);
895 }
896
897 /* free free list */
898 head = &q->free;
899 list_for_each_safe(pos, tmp, head) {
900 e = container_of(pos, struct priv_cmd_entry, list);
901 e->pre_alloc = false;
902 free_priv_cmdbuf(c, e);
903 }
904
905 memset(q, 0, sizeof(struct priv_cmd_queue));
906}
907
908/* allocate a cmd buffer with given size. size is number of u32 entries */
909int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
910 struct priv_cmd_entry **entry)
911{
912 struct priv_cmd_queue *q = &c->priv_cmd_q;
913 struct priv_cmd_entry *e;
914 struct list_head *node;
915 u32 free_count;
916 u32 size = orig_size;
917 bool no_retry = false;
918
919 gk20a_dbg_fn("size %d", orig_size);
920
921 *entry = NULL;
922
923 /* if free space at the end is less than requested, increase the size
924 * so the real allocated space starts from the beginning. */
925 if (q->put + size > q->size)
926 size = orig_size + (q->size - q->put);
927
928 gk20a_dbg_info("ch %d: priv cmd queue get:put %d:%d",
929 c->hw_chid, q->get, q->put);
930
931TRY_AGAIN:
932 free_count = (q->size - (q->put - q->get) - 1) % q->size;
933
934 if (size > free_count) {
935 if (!no_retry) {
936 recycle_priv_cmdbuf(c);
937 no_retry = true;
938 goto TRY_AGAIN;
939 } else
940 return -EAGAIN;
941 }
942
943 if (unlikely(list_empty(&q->free))) {
944
945 gk20a_dbg_info("ch %d: run out of pre-alloc entries",
946 c->hw_chid);
947
948 e = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
949 if (!e) {
950 gk20a_err(dev_from_gk20a(c->g),
951 "ch %d: fail to allocate priv cmd entry",
952 c->hw_chid);
953 return -ENOMEM;
954 }
955 } else {
956 node = q->free.next;
957 list_del(node);
958 e = container_of(node, struct priv_cmd_entry, list);
959 }
960
961 e->size = orig_size;
962 e->gp_get = c->gpfifo.get;
963 e->gp_put = c->gpfifo.put;
964 e->gp_wrap = c->gpfifo.wrap;
965
966 /* if we have increased the size to skip free space at the end, set put
967 to the beginning of the cmd buffer (0) + size */
968 if (size != orig_size) {
969 e->ptr = q->mem.base_cpuva;
970 e->gva = q->base_gpuva;
971 q->put = orig_size;
972 } else {
973 e->ptr = q->mem.base_cpuva + q->put;
974 e->gva = q->base_gpuva + q->put * sizeof(u32);
975 q->put = (q->put + orig_size) & (q->size - 1);
976 }
977
978 /* we already handled q->put + size > q->size so BUG_ON this */
979 BUG_ON(q->put > q->size);
980
981 /* add new entry to head since we free from head */
982 list_add(&e->list, &q->head);
983
984 *entry = e;
985
986 gk20a_dbg_fn("done");
987
988 return 0;
989}
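For illustration, a minimal sketch of the put-pointer wrap used above: because the queue size is rounded up to a power of two, masking with (size - 1) behaves like a modulo. Values are arbitrary examples.

/* Power-of-two ring wrap: "& (size - 1)" is equivalent to "% size". */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t size = 8192; /* power-of-two queue size, in u32 entries */
	uint32_t put = 8000;
	uint32_t len = 300;

	uint32_t wrapped = (put + len) & (size - 1); /* 8300 & 8191 = 108 */
	uint32_t modulo  = (put + len) % size;       /* also 108 */

	printf("mask=%u mod=%u\n", wrapped, modulo);
	return 0;
}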
990
991/* Don't call this to free an explicit cmd entry.
992 * It doesn't update priv_cmd_queue get/put */
993static void free_priv_cmdbuf(struct channel_gk20a *c,
994 struct priv_cmd_entry *e)
995{
996 struct priv_cmd_queue *q = &c->priv_cmd_q;
997
998 if (!e)
999 return;
1000
1001 list_del(&e->list);
1002
1003 if (unlikely(!e->pre_alloc))
1004 kfree(e);
1005 else {
1006 memset(e, 0, sizeof(struct priv_cmd_entry));
1007 e->pre_alloc = true;
1008 list_add(&e->list, &q->free);
1009 }
1010}
1011
1012/* free entries if they're no longer being used */
1013static void recycle_priv_cmdbuf(struct channel_gk20a *c)
1014{
1015 struct priv_cmd_queue *q = &c->priv_cmd_q;
1016 struct priv_cmd_entry *e, *tmp;
1017 struct list_head *head = &q->head;
1018 bool wrap_around, found = false;
1019
1020 gk20a_dbg_fn("");
1021
1022 /* Find the most recent free entry. Free it and everything before it */
1023 list_for_each_entry(e, head, list) {
1024
1025 gk20a_dbg_info("ch %d: cmd entry get:put:wrap %d:%d:%d "
1026 "curr get:put:wrap %d:%d:%d",
1027 c->hw_chid, e->gp_get, e->gp_put, e->gp_wrap,
1028 c->gpfifo.get, c->gpfifo.put, c->gpfifo.wrap);
1029
1030 wrap_around = (c->gpfifo.wrap != e->gp_wrap);
1031 if (e->gp_get < e->gp_put) {
1032 if (c->gpfifo.get >= e->gp_put ||
1033 wrap_around) {
1034 found = true;
1035 break;
1036 } else
1037 e->gp_get = c->gpfifo.get;
1038 } else if (e->gp_get > e->gp_put) {
1039 if (wrap_around &&
1040 c->gpfifo.get >= e->gp_put) {
1041 found = true;
1042 break;
1043 } else
1044 e->gp_get = c->gpfifo.get;
1045 }
1046 }
1047
1048 if (found)
1049 q->get = (e->ptr - q->mem.base_cpuva) + e->size;
1050 else {
1051 gk20a_dbg_info("no free entry recycled");
1052 return;
1053 }
1054
1055 list_for_each_entry_safe_continue(e, tmp, head, list) {
1056 free_priv_cmdbuf(c, e);
1057 }
1058
1059 gk20a_dbg_fn("done");
1060}
1061
1062
1063static int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
1064 struct nvhost_alloc_gpfifo_args *args)
1065{
1066 struct gk20a *g = c->g;
1067 struct device *d = dev_from_gk20a(g);
1068 struct vm_gk20a *ch_vm;
1069 u32 gpfifo_size;
1070 int err = 0;
1071 struct sg_table *sgt;
1072 dma_addr_t iova;
1073
1074 /* The kernel can insert one extra gpfifo entry before user-submitted gpfifos
1075 and another one after them, for internal use. Triple the requested size. */
1076 gpfifo_size = roundup_pow_of_two(args->num_entries * 3);
1077
1078 if (args->flags & NVHOST_ALLOC_GPFIFO_FLAGS_VPR_ENABLED)
1079 c->vpr = true;
1080
1081 /* an address space needs to have been bound at this point. */
1082 if (!gk20a_channel_as_bound(c)) {
1083 gk20a_err(d,
1084 "not bound to an address space at time of gpfifo"
1085 " allocation. Attempting to create and bind to"
1086 " one...");
1087 return -EINVAL;
1088 }
1089 ch_vm = c->vm;
1090
1091 c->cmds_pending = false;
1092 c->last_submit_fence.valid = false;
1093
1094 c->ramfc.offset = 0;
1095 c->ramfc.size = ram_in_ramfc_s() / 8;
1096
1097 if (c->gpfifo.cpu_va) {
1098 gk20a_err(d, "channel %d: "
1099 "gpfifo already allocated", c->hw_chid);
1100 return -EEXIST;
1101 }
1102
1103 c->gpfifo.size = gpfifo_size * sizeof(struct gpfifo);
1104 c->gpfifo.cpu_va = (struct gpfifo *)dma_alloc_coherent(d,
1105 c->gpfifo.size,
1106 &iova,
1107 GFP_KERNEL);
1108 if (!c->gpfifo.cpu_va) {
1109 gk20a_err(d, "%s: memory allocation failed\n", __func__);
1110 err = -ENOMEM;
1111 goto clean_up;
1112 }
1113
1114 c->gpfifo.iova = iova;
1115 c->gpfifo.entry_num = gpfifo_size;
1116
1117 c->gpfifo.get = c->gpfifo.put = 0;
1118
1119 err = gk20a_get_sgtable(d, &sgt,
1120 c->gpfifo.cpu_va, c->gpfifo.iova, c->gpfifo.size);
1121 if (err) {
1122 gk20a_err(d, "%s: failed to allocate sg table\n", __func__);
1123 goto clean_up;
1124 }
1125
1126 c->gpfifo.gpu_va = gk20a_gmmu_map(ch_vm,
1127 &sgt,
1128 c->gpfifo.size,
1129 0, /* flags */
1130 gk20a_mem_flag_none);
1131 if (!c->gpfifo.gpu_va) {
1132 gk20a_err(d, "channel %d : failed to map"
1133 " gpu_va for gpfifo", c->hw_chid);
1134 err = -ENOMEM;
1135 goto clean_up_sgt;
1136 }
1137
1138 gk20a_dbg_info("channel %d : gpfifo_base 0x%016llx, size %d",
1139 c->hw_chid, c->gpfifo.gpu_va, c->gpfifo.entry_num);
1140
1141 channel_gk20a_setup_ramfc(c, c->gpfifo.gpu_va, c->gpfifo.entry_num);
1142
1143 channel_gk20a_setup_userd(c);
1144 channel_gk20a_commit_userd(c);
1145
1146 gk20a_mm_l2_invalidate(c->g);
1147
1148 /* TBD: setup engine contexts */
1149
1150 err = channel_gk20a_alloc_priv_cmdbuf(c);
1151 if (err)
1152 goto clean_up_unmap;
1153
1154 err = channel_gk20a_update_runlist(c, true);
1155 if (err)
1156 goto clean_up_unmap;
1157
1158 gk20a_free_sgtable(&sgt);
1159
1160 gk20a_dbg_fn("done");
1161 return 0;
1162
1163clean_up_unmap:
1164 gk20a_gmmu_unmap(ch_vm, c->gpfifo.gpu_va,
1165 c->gpfifo.size, gk20a_mem_flag_none);
1166clean_up_sgt:
1167 gk20a_free_sgtable(&sgt);
1168clean_up:
1169 dma_free_coherent(d, c->gpfifo.size,
1170 c->gpfifo.cpu_va, c->gpfifo.iova);
1171 c->gpfifo.cpu_va = NULL;
1172 c->gpfifo.iova = 0;
1173 memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc));
1174 gk20a_err(d, "fail");
1175 return err;
1176}
1177
1178static inline int wfi_cmd_size(void)
1179{
1180 return 2;
1181}
1182void add_wfi_cmd(struct priv_cmd_entry *cmd, int *i)
1183{
1184 /* wfi */
1185 cmd->ptr[(*i)++] = 0x2001001E;
1186 /* handle, ignored */
1187 cmd->ptr[(*i)++] = 0x00000000;
1188}
1189
1190static inline bool check_gp_put(struct gk20a *g,
1191 struct channel_gk20a *c)
1192{
1193 u32 put;
1194 /* gp_put changed unexpectedly since last update? */
1195 put = gk20a_bar1_readl(g,
1196 c->userd_gpu_va + 4 * ram_userd_gp_put_w());
1197 if (c->gpfifo.put != put) {
1198 /*TBD: BUG_ON/teardown on this*/
1199 gk20a_err(dev_from_gk20a(g), "gp_put changed unexpectedly "
1200 "since last update");
1201 c->gpfifo.put = put;
1202 return false; /* surprise! */
1203 }
1204 return true; /* checked out ok */
1205}
1206
1207/* Call this periodically to track how the gpfifo is draining. */
1208static inline u32 update_gp_get(struct gk20a *g,
1209 struct channel_gk20a *c)
1210{
1211 u32 new_get = gk20a_bar1_readl(g,
1212 c->userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w());
1213 if (new_get < c->gpfifo.get)
1214 c->gpfifo.wrap = !c->gpfifo.wrap;
1215 c->gpfifo.get = new_get;
1216 return new_get;
1217}
1218
1219static inline u32 gp_free_count(struct channel_gk20a *c)
1220{
1221 return (c->gpfifo.entry_num - (c->gpfifo.put - c->gpfifo.get) - 1) %
1222 c->gpfifo.entry_num;
1223}
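For illustration, a stand-alone sketch of the gpfifo free-count formula above: the "- 1" keeps one entry unused so a full ring can be distinguished from an empty one, and unsigned arithmetic handles the case where put has wrapped below get. Numbers are arbitrary examples.

/* Free-count formula of a ring buffer with one reserved slot. */
#include <stdio.h>
#include <stdint.h>

static uint32_t free_count(uint32_t entry_num, uint32_t put, uint32_t get)
{
	return (entry_num - (put - get) - 1) % entry_num;
}

int main(void)
{
	/* empty ring: everything but the reserved slot is free */
	printf("%u\n", free_count(1024, 100, 100)); /* 1023 */
	/* put has wrapped around behind get */
	printf("%u\n", free_count(1024, 10, 1000)); /* 989 */
	return 0;
}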
1224
1225bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
1226 u32 timeout_delta_ms)
1227{
1228 u32 gpfifo_get = update_gp_get(ch->g, ch);
1229 /* Count consecutive timeout ISRs */
1230 if (gpfifo_get == ch->timeout_gpfifo_get) {
1231 /* we didn't advance since previous channel timeout check */
1232 ch->timeout_accumulated_ms += timeout_delta_ms;
1233 } else {
1234 /* first timeout isr encountered */
1235 ch->timeout_accumulated_ms = timeout_delta_ms;
1236 }
1237
1238 ch->timeout_gpfifo_get = gpfifo_get;
1239
1240 return ch->g->timeouts_enabled &&
1241 ch->timeout_accumulated_ms > ch->timeout_ms_max;
1242}
1243
1244
1245/* Issue a syncpoint increment *preceded* by a wait-for-idle
1246 * command. All commands on the channel will have been
1247 * consumed at the time the fence syncpoint increment occurs.
1248 */
1249static int gk20a_channel_submit_wfi(struct channel_gk20a *c)
1250{
1251 struct priv_cmd_entry *cmd = NULL;
1252 struct gk20a *g = c->g;
1253 u32 free_count;
1254 int err;
1255
1256 if (c->has_timedout)
1257 return -ETIMEDOUT;
1258
1259 if (!c->sync) {
1260 c->sync = gk20a_channel_sync_create(c);
1261 if (!c->sync)
1262 return -ENOMEM;
1263 }
1264
1265 update_gp_get(g, c);
1266 free_count = gp_free_count(c);
1267 if (unlikely(!free_count)) {
1268 gk20a_err(dev_from_gk20a(g),
1269 "not enough gpfifo space");
1270 return -EAGAIN;
1271 }
1272
1273 err = c->sync->incr_wfi(c->sync, &cmd, &c->last_submit_fence);
1274 if (unlikely(err))
1275 return err;
1276
1277 WARN_ON(!c->last_submit_fence.wfi);
1278
1279 c->gpfifo.cpu_va[c->gpfifo.put].entry0 = u64_lo32(cmd->gva);
1280 c->gpfifo.cpu_va[c->gpfifo.put].entry1 = u64_hi32(cmd->gva) |
1281 pbdma_gp_entry1_length_f(cmd->size);
1282
1283 c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
1284
1285 /* save gp_put */
1286 cmd->gp_put = c->gpfifo.put;
1287
1288 gk20a_bar1_writel(g,
1289 c->userd_gpu_va + 4 * ram_userd_gp_put_w(),
1290 c->gpfifo.put);
1291
1292 gk20a_dbg_info("post-submit put %d, get %d, size %d",
1293 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
1294
1295 return 0;
1296}
1297
1298static u32 get_gp_free_count(struct channel_gk20a *c)
1299{
1300 update_gp_get(c->g, c);
1301 return gp_free_count(c);
1302}
1303
1304static void trace_write_pushbuffer(struct channel_gk20a *c, struct gpfifo *g)
1305{
1306 void *mem = NULL;
1307 unsigned int words;
1308 u64 offset;
1309 struct dma_buf *dmabuf = NULL;
1310
1311 if (gk20a_debug_trace_cmdbuf) {
1312 u64 gpu_va = (u64)g->entry0 |
1313 (u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
1314 int err;
1315
1316 words = pbdma_gp_entry1_length_v(g->entry1);
1317 err = gk20a_vm_find_buffer(c->vm, gpu_va, &dmabuf, &offset);
1318 if (!err)
1319 mem = dma_buf_vmap(dmabuf);
1320 }
1321
1322 if (mem) {
1323 u32 i;
1324 /*
1325 * Write in batches of 128 as there seems to be a limit
1326 * of how much you can output to ftrace at once.
1327 */
1328 for (i = 0; i < words; i += 128U) {
1329 trace_gk20a_push_cmdbuf(
1330 c->g->dev->name,
1331 0,
1332 min(words - i, 128U),
1333 offset + i * sizeof(u32),
1334 mem);
1335 }
1336 dma_buf_vunmap(dmabuf, mem);
1337 }
1338}
1339
1340static int gk20a_channel_add_job(struct channel_gk20a *c,
1341 struct gk20a_channel_fence *fence)
1342{
1343 struct vm_gk20a *vm = c->vm;
1344 struct channel_gk20a_job *job = NULL;
1345 struct mapped_buffer_node **mapped_buffers = NULL;
1346 int err = 0, num_mapped_buffers;
1347
1348 /* job needs reference to this vm */
1349 gk20a_vm_get(vm);
1350
1351 err = gk20a_vm_get_buffers(vm, &mapped_buffers, &num_mapped_buffers);
1352 if (err) {
1353 gk20a_vm_put(vm);
1354 return err;
1355 }
1356
1357 job = kzalloc(sizeof(*job), GFP_KERNEL);
1358 if (!job) {
1359 gk20a_vm_put_buffers(vm, mapped_buffers, num_mapped_buffers);
1360 gk20a_vm_put(vm);
1361 return -ENOMEM;
1362 }
1363
1364 job->num_mapped_buffers = num_mapped_buffers;
1365 job->mapped_buffers = mapped_buffers;
1366 job->fence = *fence;
1367
1368 mutex_lock(&c->jobs_lock);
1369 list_add_tail(&job->list, &c->jobs);
1370 mutex_unlock(&c->jobs_lock);
1371
1372 return 0;
1373}
1374
1375void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
1376{
1377 struct gk20a *g = c->g;
1378 struct vm_gk20a *vm = c->vm;
1379 struct channel_gk20a_job *job, *n;
1380 int i;
1381
1382 wake_up(&c->submit_wq);
1383
1384 mutex_lock(&c->jobs_lock);
1385 list_for_each_entry_safe(job, n, &c->jobs, list) {
1386 bool completed = WARN_ON(!c->sync) ||
1387 c->sync->is_expired(c->sync, &job->fence);
1388 if (!completed)
1389 break;
1390
1391 gk20a_vm_put_buffers(vm, job->mapped_buffers,
1392 job->num_mapped_buffers);
1393
1394 /* job is done. release its reference to vm */
1395 gk20a_vm_put(vm);
1396
1397 list_del_init(&job->list);
1398 kfree(job);
1399 gk20a_channel_idle(g->dev);
1400 }
1401 mutex_unlock(&c->jobs_lock);
1402
1403 for (i = 0; i < nr_completed; i++)
1404 gk20a_channel_idle(c->g->dev);
1405}
1406
1407static int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
1408 struct nvhost_gpfifo *gpfifo,
1409 u32 num_entries,
1410 struct nvhost_fence *fence,
1411 u32 flags)
1412{
1413 struct gk20a *g = c->g;
1414 struct device *d = dev_from_gk20a(g);
1415 u32 err = 0;
1416 int i;
1417 struct priv_cmd_entry *wait_cmd = NULL;
1418 struct priv_cmd_entry *incr_cmd = NULL;
1419 /* we might need two extra gpfifo entries - one for pre fence
1420 * and one for post fence. */
1421 const int extra_entries = 2;
1422
1423 if (c->has_timedout)
1424 return -ETIMEDOUT;
1425
1426 if ((flags & (NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
1427 NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
1428 !fence)
1429 return -EINVAL;
1430
1431 if (!c->sync) {
1432 c->sync = gk20a_channel_sync_create(c);
1433 if (!c->sync)
1434 return -ENOMEM;
1435 }
1436
1437#ifdef CONFIG_DEBUG_FS
1438 /* update debug settings */
1439 if (g->ops.ltc.sync_debugfs)
1440 g->ops.ltc.sync_debugfs(g);
1441#endif
1442
1443 gk20a_dbg_info("channel %d", c->hw_chid);
1444
1445 /* gk20a_channel_update releases this ref. */
1446 gk20a_channel_busy(g->dev);
1447
1448 trace_gk20a_channel_submit_gpfifo(c->g->dev->name,
1449 c->hw_chid,
1450 num_entries,
1451 flags,
1452 fence->syncpt_id, fence->value);
1453 check_gp_put(g, c);
1454 update_gp_get(g, c);
1455
1456 gk20a_dbg_info("pre-submit put %d, get %d, size %d",
1457 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
1458
1459 /* Invalidate tlb if it's dirty... */
1460 /* TBD: this should be done in the cmd stream, not with PRIs. */
1461 /* We don't know what context is currently running... */
1462 /* Note also: there can be more than one context associated with the */
1463 /* address space (vm). */
1464 gk20a_mm_tlb_invalidate(c->vm);
1465
1466 /* Make sure we have enough space for gpfifo entries. If not,
1467 * wait for signals from completed submits */
1468 if (gp_free_count(c) < num_entries + extra_entries) {
1469 err = wait_event_interruptible(c->submit_wq,
1470 get_gp_free_count(c) >= num_entries + extra_entries ||
1471 c->has_timedout);
1472 }
1473
1474 if (c->has_timedout) {
1475 err = -ETIMEDOUT;
1476 goto clean_up;
1477 }
1478
1479 if (err) {
1480 gk20a_err(d, "not enough gpfifo space");
1481 err = -EAGAIN;
1482 goto clean_up;
1483 }
1484
1485	/*
1486	 * Optionally insert a syncpt wait at the beginning of the gpfifo
1487	 * submission when the user requested one and the wait hasn't expired.
1488	 * Validate that the id makes sense and elide the wait if it doesn't;
1489	 * the only reason an invalid id isn't rejected outright is to keep
1490	 * running some tests which trigger this condition.
1491	 */
1492 if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
1493 if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE)
1494 err = c->sync->wait_fd(c->sync, fence->syncpt_id,
1495 &wait_cmd);
1496 else
1497 err = c->sync->wait_syncpt(c->sync, fence->syncpt_id,
1498 fence->value, &wait_cmd);
1499 }
1500 if (err)
1501 goto clean_up;
1502
1503
1504 /* always insert syncpt increment at end of gpfifo submission
1505 to keep track of method completion for idle railgating */
1506 if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET &&
1507 flags & NVHOST_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE)
1508 err = c->sync->incr_user_fd(c->sync, &incr_cmd,
1509 &c->last_submit_fence,
1510 &fence->syncpt_id);
1511 else if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
1512 err = c->sync->incr_user_syncpt(c->sync, &incr_cmd,
1513 &c->last_submit_fence,
1514 &fence->syncpt_id,
1515 &fence->value);
1516 else
1517 err = c->sync->incr(c->sync, &incr_cmd,
1518 &c->last_submit_fence);
1519 if (err)
1520 goto clean_up;
1521
1522 if (wait_cmd) {
1523 c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
1524 u64_lo32(wait_cmd->gva);
1525 c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
1526 u64_hi32(wait_cmd->gva) |
1527 pbdma_gp_entry1_length_f(wait_cmd->size);
1528 trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]);
1529
1530 c->gpfifo.put = (c->gpfifo.put + 1) &
1531 (c->gpfifo.entry_num - 1);
1532
1533 /* save gp_put */
1534 wait_cmd->gp_put = c->gpfifo.put;
1535 }
1536
1537 for (i = 0; i < num_entries; i++) {
1538 c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
1539 gpfifo[i].entry0; /* cmd buf va low 32 */
1540 c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
1541 gpfifo[i].entry1; /* cmd buf va high 32 | words << 10 */
1542 trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]);
1543 c->gpfifo.put = (c->gpfifo.put + 1) &
1544 (c->gpfifo.entry_num - 1);
1545 }
1546
1547 if (incr_cmd) {
1548 c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
1549 u64_lo32(incr_cmd->gva);
1550 c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
1551 u64_hi32(incr_cmd->gva) |
1552 pbdma_gp_entry1_length_f(incr_cmd->size);
1553 trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]);
1554
1555 c->gpfifo.put = (c->gpfifo.put + 1) &
1556 (c->gpfifo.entry_num - 1);
1557
1558 /* save gp_put */
1559 incr_cmd->gp_put = c->gpfifo.put;
1560 }
1561
1562 /* Invalidate tlb if it's dirty... */
1563 /* TBD: this should be done in the cmd stream, not with PRIs. */
1564 /* We don't know what context is currently running... */
1565 /* Note also: there can be more than one context associated with the */
1566 /* address space (vm). */
1567 gk20a_mm_tlb_invalidate(c->vm);
1568
1569 trace_gk20a_channel_submitted_gpfifo(c->g->dev->name,
1570 c->hw_chid,
1571 num_entries,
1572 flags,
1573 fence->syncpt_id, fence->value);
1574
1575 /* TODO! Check for errors... */
1576 gk20a_channel_add_job(c, &c->last_submit_fence);
1577
1578 c->cmds_pending = true;
1579 gk20a_bar1_writel(g,
1580 c->userd_gpu_va + 4 * ram_userd_gp_put_w(),
1581 c->gpfifo.put);
1582
1583 gk20a_dbg_info("post-submit put %d, get %d, size %d",
1584 c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
1585
1586 gk20a_dbg_fn("done");
1587 return err;
1588
1589clean_up:
1590 gk20a_err(d, "fail");
1591 free_priv_cmdbuf(c, wait_cmd);
1592 free_priv_cmdbuf(c, incr_cmd);
1593 gk20a_channel_idle(g->dev);
1594 return err;
1595}
1596
1597void gk20a_remove_channel_support(struct channel_gk20a *c)
1598{
1599
1600}
1601
1602int gk20a_init_channel_support(struct gk20a *g, u32 chid)
1603{
1604 struct channel_gk20a *c = g->fifo.channel+chid;
1605 c->g = g;
1606 c->in_use = false;
1607 c->hw_chid = chid;
1608 c->bound = false;
1609 c->remove_support = gk20a_remove_channel_support;
1610 mutex_init(&c->jobs_lock);
1611 INIT_LIST_HEAD(&c->jobs);
1612#if defined(CONFIG_GK20A_CYCLE_STATS)
1613 mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
1614#endif
1615 INIT_LIST_HEAD(&c->dbg_s_list);
1616 mutex_init(&c->dbg_s_lock);
1617
1618 return 0;
1619}
1620
1621int gk20a_channel_finish(struct channel_gk20a *ch, unsigned long timeout)
1622{
1623 int err = 0;
1624
1625 if (!ch->cmds_pending)
1626 return 0;
1627
1628 /* Do not wait for a timedout channel */
1629 if (ch->has_timedout)
1630 return -ETIMEDOUT;
1631
1632 if (!(ch->last_submit_fence.valid && ch->last_submit_fence.wfi)) {
1633 gk20a_dbg_fn("issuing wfi, incr to finish the channel");
1634 err = gk20a_channel_submit_wfi(ch);
1635 }
1636 if (err)
1637 return err;
1638
1639 BUG_ON(!(ch->last_submit_fence.valid && ch->last_submit_fence.wfi));
1640
1641 gk20a_dbg_fn("waiting for channel to finish thresh:%d",
1642 ch->last_submit_fence.thresh);
1643
1644 err = ch->sync->wait_cpu(ch->sync, &ch->last_submit_fence, timeout);
1645 if (WARN_ON(err))
1646 dev_warn(dev_from_gk20a(ch->g),
1647 "timed out waiting for gk20a channel to finish");
1648 else
1649 ch->cmds_pending = false;
1650
1651 return err;
1652}
1653
1654static int gk20a_channel_wait_semaphore(struct channel_gk20a *ch,
1655 ulong id, u32 offset,
1656 u32 payload, long timeout)
1657{
1658 struct platform_device *pdev = ch->g->dev;
1659 struct dma_buf *dmabuf;
1660 void *data;
1661 u32 *semaphore;
1662 int ret = 0;
1663 long remain;
1664
1665 /* do not wait if channel has timed out */
1666 if (ch->has_timedout)
1667 return -ETIMEDOUT;
1668
1669 dmabuf = dma_buf_get(id);
1670 if (IS_ERR(dmabuf)) {
1671 gk20a_err(&pdev->dev, "invalid notifier nvmap handle 0x%lx",
1672 id);
1673 return -EINVAL;
1674 }
1675
1676 data = dma_buf_kmap(dmabuf, offset >> PAGE_SHIFT);
1677 if (!data) {
1678 gk20a_err(&pdev->dev, "failed to map notifier memory");
1679 ret = -EINVAL;
1680 goto cleanup_put;
1681 }
1682
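	/* kmap maps the page containing the semaphore; add the offset within
	 * that page to reach the 32-bit semaphore word itself. */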
1683 semaphore = data + (offset & ~PAGE_MASK);
1684
1685 remain = wait_event_interruptible_timeout(
1686 ch->semaphore_wq,
1687 *semaphore == payload || ch->has_timedout,
1688 timeout);
1689
1690 if (remain == 0 && *semaphore != payload)
1691 ret = -ETIMEDOUT;
1692 else if (remain < 0)
1693 ret = remain;
1694
1695 dma_buf_kunmap(dmabuf, offset >> PAGE_SHIFT, data);
1696cleanup_put:
1697 dma_buf_put(dmabuf);
1698 return ret;
1699}
1700
1701static int gk20a_channel_wait(struct channel_gk20a *ch,
1702 struct nvhost_wait_args *args)
1703{
1704 struct device *d = dev_from_gk20a(ch->g);
1705 struct dma_buf *dmabuf;
1706 struct notification *notif;
1707 struct timespec tv;
1708 u64 jiffies;
1709 ulong id;
1710 u32 offset;
1711 unsigned long timeout;
1712 int remain, ret = 0;
1713
1714 gk20a_dbg_fn("");
1715
1716 if (ch->has_timedout)
1717 return -ETIMEDOUT;
1718
1719 if (args->timeout == NVHOST_NO_TIMEOUT)
1720 timeout = MAX_SCHEDULE_TIMEOUT;
1721 else
1722 timeout = (u32)msecs_to_jiffies(args->timeout);
1723
1724 switch (args->type) {
1725 case NVHOST_WAIT_TYPE_NOTIFIER:
1726 id = args->condition.notifier.nvmap_handle;
1727 offset = args->condition.notifier.offset;
1728
1729 dmabuf = dma_buf_get(id);
1730 if (IS_ERR(dmabuf)) {
1731 gk20a_err(d, "invalid notifier nvmap handle 0x%lx",
1732 id);
1733 return -EINVAL;
1734 }
1735
1736 notif = dma_buf_vmap(dmabuf);
1737 if (!notif) {
1738 gk20a_err(d, "failed to map notifier memory");
1739 return -ENOMEM;
1740 }
1741
1742 notif = (struct notification *)((uintptr_t)notif + offset);
1743
1744 /* user should set status pending before
1745 * calling this ioctl */
1746 remain = wait_event_interruptible_timeout(
1747 ch->notifier_wq,
1748 notif->status == 0 || ch->has_timedout,
1749 timeout);
1750
1751 if (remain == 0 && notif->status != 0) {
1752 ret = -ETIMEDOUT;
1753 goto notif_clean_up;
1754 } else if (remain < 0) {
1755 ret = -EINTR;
1756 goto notif_clean_up;
1757 }
1758
1759 /* TBD: fill in correct information */
1760 jiffies = get_jiffies_64();
1761 jiffies_to_timespec(jiffies, &tv);
1762 notif->timestamp.nanoseconds[0] = tv.tv_nsec;
1763 notif->timestamp.nanoseconds[1] = tv.tv_sec;
1764 notif->info32 = 0xDEADBEEF; /* should be object name */
1765 notif->info16 = ch->hw_chid; /* should be method offset */
1766
1767notif_clean_up:
1768 dma_buf_vunmap(dmabuf, notif);
1769 return ret;
1770
1771 case NVHOST_WAIT_TYPE_SEMAPHORE:
1772 ret = gk20a_channel_wait_semaphore(ch,
1773 args->condition.semaphore.nvmap_handle,
1774 args->condition.semaphore.offset,
1775 args->condition.semaphore.payload,
1776 timeout);
1777
1778 break;
1779
1780 default:
1781 ret = -EINVAL;
1782 break;
1783 }
1784
1785 return ret;
1786}
1787
1788static int gk20a_channel_set_priority(struct channel_gk20a *ch,
1789 u32 priority)
1790{
1791 u32 timeslice_timeout;
1792 /* set priority of graphics channel */
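	/* The raw timeslice value is scaled by 8 (value << 3) to yield the
	 * slice length in microseconds, as the per-case comments note. */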
1793 switch (priority) {
1794 case NVHOST_PRIORITY_LOW:
1795 /* 64 << 3 = 512us */
1796 timeslice_timeout = 64;
1797 break;
1798 case NVHOST_PRIORITY_MEDIUM:
1799 /* 128 << 3 = 1024us */
1800 timeslice_timeout = 128;
1801 break;
1802 case NVHOST_PRIORITY_HIGH:
1803		/* 255 << 3 = 2040us */
1804 timeslice_timeout = 255;
1805 break;
1806 default:
1807 pr_err("Unsupported priority");
1808 return -EINVAL;
1809 }
1810 channel_gk20a_set_schedule_params(ch,
1811 timeslice_timeout);
1812 return 0;
1813}
1814
1815static int gk20a_channel_zcull_bind(struct channel_gk20a *ch,
1816 struct nvhost_zcull_bind_args *args)
1817{
1818 struct gk20a *g = ch->g;
1819 struct gr_gk20a *gr = &g->gr;
1820
1821 gk20a_dbg_fn("");
1822
1823 return gr_gk20a_bind_ctxsw_zcull(g, gr, ch,
1824 args->gpu_va, args->mode);
1825}
1826
1827/* in this context the "channel" is the host1x channel which
1828 * maps to *all* gk20a channels */
1829int gk20a_channel_suspend(struct gk20a *g)
1830{
1831 struct fifo_gk20a *f = &g->fifo;
1832 u32 chid;
1833 bool channels_in_use = false;
1834 struct device *d = dev_from_gk20a(g);
1835 int err;
1836
1837 gk20a_dbg_fn("");
1838
1839 /* idle the engine by submitting WFI on non-KEPLER_C channel */
1840 for (chid = 0; chid < f->num_channels; chid++) {
1841 struct channel_gk20a *c = &f->channel[chid];
1842 if (c->in_use && c->obj_class != KEPLER_C) {
1843 err = gk20a_channel_submit_wfi(c);
1844 if (err) {
1845 gk20a_err(d, "cannot idle channel %d\n",
1846 chid);
1847 return err;
1848 }
1849
1850 c->sync->wait_cpu(c->sync, &c->last_submit_fence,
1851 500000);
1852 break;
1853 }
1854 }
1855
1856 for (chid = 0; chid < f->num_channels; chid++) {
1857 if (f->channel[chid].in_use) {
1858
1859 gk20a_dbg_info("suspend channel %d", chid);
1860 /* disable channel */
1861 gk20a_writel(g, ccsr_channel_r(chid),
1862 gk20a_readl(g, ccsr_channel_r(chid)) |
1863 ccsr_channel_enable_clr_true_f());
1864 /* preempt the channel */
1865 gk20a_fifo_preempt_channel(g, chid);
1866
1867 channels_in_use = true;
1868 }
1869 }
1870
1871 if (channels_in_use) {
1872 gk20a_fifo_update_runlist(g, 0, ~0, false, true);
1873
1874 for (chid = 0; chid < f->num_channels; chid++) {
1875 if (f->channel[chid].in_use)
1876 channel_gk20a_unbind(&f->channel[chid]);
1877 }
1878 }
1879
1880 gk20a_dbg_fn("done");
1881 return 0;
1882}
1883
1884/* in this context the "channel" is the host1x channel which
1885 * maps to *all* gk20a channels */
1886int gk20a_channel_resume(struct gk20a *g)
1887{
1888 struct fifo_gk20a *f = &g->fifo;
1889 u32 chid;
1890 bool channels_in_use = false;
1891
1892 gk20a_dbg_fn("");
1893
1894 for (chid = 0; chid < f->num_channels; chid++) {
1895 if (f->channel[chid].in_use) {
1896 gk20a_dbg_info("resume channel %d", chid);
1897 g->ops.fifo.bind_channel(&f->channel[chid]);
1898 channels_in_use = true;
1899 }
1900 }
1901
1902 if (channels_in_use)
1903 gk20a_fifo_update_runlist(g, 0, ~0, true, true);
1904
1905 gk20a_dbg_fn("done");
1906 return 0;
1907}
1908
1909void gk20a_channel_semaphore_wakeup(struct gk20a *g)
1910{
1911 struct fifo_gk20a *f = &g->fifo;
1912 u32 chid;
1913
1914 gk20a_dbg_fn("");
1915
1916 for (chid = 0; chid < f->num_channels; chid++) {
1917 struct channel_gk20a *c = g->fifo.channel+chid;
1918 if (c->in_use)
1919 wake_up_interruptible_all(&c->semaphore_wq);
1920 }
1921}
1922
1923static int gk20a_ioctl_channel_submit_gpfifo(
1924 struct channel_gk20a *ch,
1925 struct nvhost_submit_gpfifo_args *args)
1926{
1927 void *gpfifo;
1928 u32 size;
1929 int ret = 0;
1930
1931 gk20a_dbg_fn("");
1932
1933 if (ch->has_timedout)
1934 return -ETIMEDOUT;
1935
1936 size = args->num_entries * sizeof(struct nvhost_gpfifo);
1937
1938 gpfifo = kzalloc(size, GFP_KERNEL);
1939 if (!gpfifo)
1940 return -ENOMEM;
1941
1942 if (copy_from_user(gpfifo,
1943 (void __user *)(uintptr_t)args->gpfifo, size)) {
1944 ret = -EINVAL;
1945 goto clean_up;
1946 }
1947
1948 ret = gk20a_submit_channel_gpfifo(ch, gpfifo, args->num_entries,
1949 &args->fence, args->flags);
1950
1951clean_up:
1952 kfree(gpfifo);
1953 return ret;
1954}
1955
1956void gk20a_init_fifo(struct gpu_ops *gops)
1957{
1958 gops->fifo.bind_channel = channel_gk20a_bind;
1959}
1960
1961long gk20a_channel_ioctl(struct file *filp,
1962 unsigned int cmd, unsigned long arg)
1963{
1964 struct channel_gk20a *ch = filp->private_data;
1965 struct platform_device *dev = ch->g->dev;
1966 u8 buf[NVHOST_IOCTL_CHANNEL_MAX_ARG_SIZE];
1967 int err = 0;
1968
1969 if ((_IOC_TYPE(cmd) != NVHOST_IOCTL_MAGIC) ||
1970 (_IOC_NR(cmd) == 0) ||
1971 (_IOC_NR(cmd) > NVHOST_IOCTL_CHANNEL_LAST) ||
1972 (_IOC_SIZE(cmd) > NVHOST_IOCTL_CHANNEL_MAX_ARG_SIZE))
1973 return -EFAULT;
1974
1975 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1976 if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
1977 return -EFAULT;
1978 }
1979
1980 switch (cmd) {
1981 case NVHOST_IOCTL_CHANNEL_OPEN:
1982 {
1983 int fd;
1984 struct file *file;
1985 char *name;
1986
1987 err = get_unused_fd_flags(O_RDWR);
1988 if (err < 0)
1989 break;
1990 fd = err;
1991
1992 name = kasprintf(GFP_KERNEL, "nvhost-%s-fd%d",
1993 dev_name(&dev->dev), fd);
1994 if (!name) {
1995 err = -ENOMEM;
1996 put_unused_fd(fd);
1997 break;
1998 }
1999
2000 file = anon_inode_getfile(name, filp->f_op, NULL, O_RDWR);
2001 kfree(name);
2002 if (IS_ERR(file)) {
2003 err = PTR_ERR(file);
2004 put_unused_fd(fd);
2005 break;
2006 }
2007 fd_install(fd, file);
2008
2009 err = __gk20a_channel_open(ch->g, file);
2010 if (err) {
2011 put_unused_fd(fd);
2012 fput(file);
2013 break;
2014 }
2015
2016 ((struct nvhost_channel_open_args *)buf)->channel_fd = fd;
2017 break;
2018 }
2019 case NVHOST_IOCTL_CHANNEL_SET_NVMAP_FD:
2020 break;
2021 case NVHOST_IOCTL_CHANNEL_ALLOC_OBJ_CTX:
2022 gk20a_channel_busy(dev);
2023 err = gk20a_alloc_obj_ctx(ch,
2024 (struct nvhost_alloc_obj_ctx_args *)buf);
2025 gk20a_channel_idle(dev);
2026 break;
2027 case NVHOST_IOCTL_CHANNEL_FREE_OBJ_CTX:
2028 gk20a_channel_busy(dev);
2029 err = gk20a_free_obj_ctx(ch,
2030 (struct nvhost_free_obj_ctx_args *)buf);
2031 gk20a_channel_idle(dev);
2032 break;
2033 case NVHOST_IOCTL_CHANNEL_ALLOC_GPFIFO:
2034 gk20a_channel_busy(dev);
2035 err = gk20a_alloc_channel_gpfifo(ch,
2036 (struct nvhost_alloc_gpfifo_args *)buf);
2037 gk20a_channel_idle(dev);
2038 break;
2039 case NVHOST_IOCTL_CHANNEL_SUBMIT_GPFIFO:
2040 err = gk20a_ioctl_channel_submit_gpfifo(ch,
2041 (struct nvhost_submit_gpfifo_args *)buf);
2042 break;
2043 case NVHOST_IOCTL_CHANNEL_WAIT:
2044 gk20a_channel_busy(dev);
2045 err = gk20a_channel_wait(ch,
2046 (struct nvhost_wait_args *)buf);
2047 gk20a_channel_idle(dev);
2048 break;
2049 case NVHOST_IOCTL_CHANNEL_ZCULL_BIND:
2050 gk20a_channel_busy(dev);
2051 err = gk20a_channel_zcull_bind(ch,
2052 (struct nvhost_zcull_bind_args *)buf);
2053 gk20a_channel_idle(dev);
2054 break;
2055 case NVHOST_IOCTL_CHANNEL_SET_ERROR_NOTIFIER:
2056 gk20a_channel_busy(dev);
2057 err = gk20a_init_error_notifier(ch,
2058 (struct nvhost_set_error_notifier *)buf);
2059 gk20a_channel_idle(dev);
2060 break;
2061#ifdef CONFIG_GK20A_CYCLE_STATS
2062 case NVHOST_IOCTL_CHANNEL_CYCLE_STATS:
2063 gk20a_channel_busy(dev);
2064 err = gk20a_channel_cycle_stats(ch,
2065 (struct nvhost_cycle_stats_args *)buf);
2066 gk20a_channel_idle(dev);
2067 break;
2068#endif
2069 case NVHOST_IOCTL_CHANNEL_SET_TIMEOUT:
2070 {
2071 u32 timeout =
2072 (u32)((struct nvhost_set_timeout_args *)buf)->timeout;
2073 gk20a_dbg(gpu_dbg_gpu_dbg, "setting timeout (%d ms) for chid %d",
2074 timeout, ch->hw_chid);
2075 ch->timeout_ms_max = timeout;
2076 break;
2077 }
2078 case NVHOST_IOCTL_CHANNEL_SET_TIMEOUT_EX:
2079 {
2080 u32 timeout =
2081 (u32)((struct nvhost_set_timeout_args *)buf)->timeout;
2082 bool timeout_debug_dump = !((u32)
2083 ((struct nvhost_set_timeout_ex_args *)buf)->flags &
2084 (1 << NVHOST_TIMEOUT_FLAG_DISABLE_DUMP));
2085 gk20a_dbg(gpu_dbg_gpu_dbg, "setting timeout (%d ms) for chid %d",
2086 timeout, ch->hw_chid);
2087 ch->timeout_ms_max = timeout;
2088 ch->timeout_debug_dump = timeout_debug_dump;
2089 break;
2090 }
2091 case NVHOST_IOCTL_CHANNEL_GET_TIMEDOUT:
2092 ((struct nvhost_get_param_args *)buf)->value =
2093 ch->has_timedout;
2094 break;
2095 case NVHOST_IOCTL_CHANNEL_SET_PRIORITY:
2096 gk20a_channel_busy(dev);
2097 gk20a_channel_set_priority(ch,
2098 ((struct nvhost_set_priority_args *)buf)->priority);
2099 gk20a_channel_idle(dev);
2100 break;
2101 default:
2102 dev_err(&dev->dev, "unrecognized ioctl cmd: 0x%x", cmd);
2103 err = -ENOTTY;
2104 break;
2105 }
2106
2107 if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
2108 err = copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd));
2109
2110 return err;
2111}
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
new file mode 100644
index 00000000..429db85d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -0,0 +1,172 @@
1/*
2 * drivers/video/tegra/host/gk20a/channel_gk20a.h
3 *
4 * GK20A graphics channel
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef __CHANNEL_GK20A_H__
22#define __CHANNEL_GK20A_H__
23
24#include <linux/log2.h>
25#include <linux/slab.h>
26#include <linux/wait.h>
27#include <linux/mutex.h>
28#include <linux/nvhost_ioctl.h>
29struct gk20a;
30struct gr_gk20a;
31struct dbg_session_gk20a;
32
33#include "channel_sync_gk20a.h"
34
35#include "mm_gk20a.h"
36#include "gr_gk20a.h"
37
38struct gpfifo {
39 u32 entry0;
40 u32 entry1;
41};
42
43struct notification {
44 struct {
45 u32 nanoseconds[2];
46 } timestamp;
47 u32 info32;
48 u16 info16;
49 u16 status;
50};
51
52struct fence {
53 u32 hw_chid;
54 u32 syncpt_val;
55};
56
57/* contexts associated with a channel */
58struct channel_ctx_gk20a {
59 struct gr_ctx_desc gr_ctx;
60 struct pm_ctx_desc pm_ctx;
61 struct patch_desc patch_ctx;
62 struct zcull_ctx_desc zcull_ctx;
63 u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA];
64 u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA];
65 bool global_ctx_buffer_mapped;
66};
67
68struct channel_gk20a_job {
69 struct mapped_buffer_node **mapped_buffers;
70 int num_mapped_buffers;
71 struct gk20a_channel_fence fence;
72 struct list_head list;
73};
74
75/* this is the priv element of struct nvhost_channel */
76struct channel_gk20a {
77 struct gk20a *g;
78 bool in_use;
79 int hw_chid;
80 bool bound;
81 bool first_init;
82 bool vpr;
83 pid_t pid;
84
85 struct list_head jobs;
86 struct mutex jobs_lock;
87
88 struct vm_gk20a *vm;
89
90 struct gpfifo_desc gpfifo;
91
92 struct channel_ctx_gk20a ch_ctx;
93
94 struct inst_desc inst_block;
95 struct mem_desc_sub ramfc;
96
97 void *userd_cpu_va;
98 u64 userd_iova;
99 u64 userd_gpu_va;
100
101 s32 num_objects;
102 u32 obj_class; /* we support only one obj per channel */
103
104 struct priv_cmd_queue priv_cmd_q;
105
106 wait_queue_head_t notifier_wq;
107 wait_queue_head_t semaphore_wq;
108 wait_queue_head_t submit_wq;
109
110 u32 timeout_accumulated_ms;
111 u32 timeout_gpfifo_get;
112
113 bool cmds_pending;
114 struct gk20a_channel_fence last_submit_fence;
115
116 void (*remove_support)(struct channel_gk20a *);
117#if defined(CONFIG_GK20A_CYCLE_STATS)
118 struct {
119 void *cyclestate_buffer;
120 u32 cyclestate_buffer_size;
121 struct dma_buf *cyclestate_buffer_handler;
122 struct mutex cyclestate_buffer_mutex;
123 } cyclestate;
124#endif
125 struct mutex dbg_s_lock;
126 struct list_head dbg_s_list;
127
128 bool has_timedout;
129 u32 timeout_ms_max;
130 bool timeout_debug_dump;
131
132 struct dma_buf *error_notifier_ref;
133 struct nvhost_notification *error_notifier;
134 void *error_notifier_va;
135
136 struct gk20a_channel_sync *sync;
137};
138
139static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
140{
141 return !!ch->vm;
142}
143int channel_gk20a_commit_va(struct channel_gk20a *c);
144int gk20a_init_channel_support(struct gk20a *, u32 chid);
145void gk20a_free_channel(struct channel_gk20a *ch, bool finish);
146bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
147 u32 timeout_delta_ms);
148void gk20a_disable_channel(struct channel_gk20a *ch,
149 bool wait_for_finish,
150 unsigned long finish_timeout);
151void gk20a_disable_channel_no_update(struct channel_gk20a *ch);
152int gk20a_channel_finish(struct channel_gk20a *ch, unsigned long timeout);
153void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error);
154void gk20a_channel_semaphore_wakeup(struct gk20a *g);
155int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 size,
156 struct priv_cmd_entry **entry);
157
158int gk20a_channel_suspend(struct gk20a *g);
159int gk20a_channel_resume(struct gk20a *g);
160
161/* Channel file operations */
162int gk20a_channel_open(struct inode *inode, struct file *filp);
163long gk20a_channel_ioctl(struct file *filp,
164 unsigned int cmd,
165 unsigned long arg);
166int gk20a_channel_release(struct inode *inode, struct file *filp);
167struct channel_gk20a *gk20a_get_channel_from_file(int fd);
168void gk20a_channel_update(struct channel_gk20a *c, int nr_completed);
169
170void gk20a_init_fifo(struct gpu_ops *gops);
171
172#endif /*__CHANNEL_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
new file mode 100644
index 00000000..9f9c3ba7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -0,0 +1,356 @@
1/*
2 * drivers/video/tegra/host/gk20a/channel_sync_gk20a.c
3 *
4 * GK20A Channel Synchronization Abstraction
5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17
18#include <linux/gk20a.h>
19
20#include "channel_sync_gk20a.h"
21#include "gk20a.h"
22
23#ifdef CONFIG_SYNC
24#include "../../../staging/android/sync.h"
25#endif
26
27#ifdef CONFIG_TEGRA_GK20A
28#include <linux/nvhost.h>
29#endif
30
31#ifdef CONFIG_TEGRA_GK20A
32
33struct gk20a_channel_syncpt {
34 struct gk20a_channel_sync ops;
35 struct channel_gk20a *c;
36 struct platform_device *host1x_pdev;
37 u32 id;
38};
39
40static void add_wait_cmd(u32 *ptr, u32 id, u32 thresh)
41{
42 /* syncpoint_a */
43 ptr[0] = 0x2001001C;
44 /* payload */
45 ptr[1] = thresh;
46 /* syncpoint_b */
47 ptr[2] = 0x2001001D;
48 /* syncpt_id, switch_en, wait */
49 ptr[3] = (id << 8) | 0x10;
50}
51
52int gk20a_channel_syncpt_wait_cpu(struct gk20a_channel_sync *s,
53 struct gk20a_channel_fence *fence,
54 int timeout)
55{
56 struct gk20a_channel_syncpt *sp =
57 container_of(s, struct gk20a_channel_syncpt, ops);
58 if (!fence->valid)
59 return 0;
60 return nvhost_syncpt_wait_timeout_ext(
61 sp->host1x_pdev, sp->id, fence->thresh,
62 timeout, NULL, NULL);
63}
64
65bool gk20a_channel_syncpt_is_expired(struct gk20a_channel_sync *s,
66 struct gk20a_channel_fence *fence)
67{
68 struct gk20a_channel_syncpt *sp =
69 container_of(s, struct gk20a_channel_syncpt, ops);
70 if (!fence->valid)
71 return true;
72 return nvhost_syncpt_is_expired_ext(sp->host1x_pdev, sp->id,
73 fence->thresh);
74}
75
76int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s, u32 id,
77 u32 thresh, struct priv_cmd_entry **entry)
78{
79 struct gk20a_channel_syncpt *sp =
80 container_of(s, struct gk20a_channel_syncpt, ops);
81 struct priv_cmd_entry *wait_cmd = NULL;
82
83 if (id >= nvhost_syncpt_nb_pts_ext(sp->host1x_pdev)) {
84 dev_warn(dev_from_gk20a(sp->c->g),
85 "invalid wait id in gpfifo submit, elided");
86 return 0;
87 }
88
89 if (nvhost_syncpt_is_expired_ext(sp->host1x_pdev, id, thresh))
90 return 0;
91
92 gk20a_channel_alloc_priv_cmdbuf(sp->c, 4, &wait_cmd);
93 if (wait_cmd == NULL) {
94 gk20a_err(dev_from_gk20a(sp->c->g),
95 "not enough priv cmd buffer space");
96 return -EAGAIN;
97 }
98
99 add_wait_cmd(&wait_cmd->ptr[0], id, thresh);
100
101 *entry = wait_cmd;
102 return 0;
103}
104
105int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
106 struct priv_cmd_entry **entry)
107{
108#ifdef CONFIG_SYNC
109 int i;
110 int num_wait_cmds;
111 struct sync_pt *pt;
112 struct sync_fence *sync_fence;
113 struct priv_cmd_entry *wait_cmd = NULL;
114 struct gk20a_channel_syncpt *sp =
115 container_of(s, struct gk20a_channel_syncpt, ops);
116 struct channel_gk20a *c = sp->c;
117
118 sync_fence = nvhost_sync_fdget(fd);
119 if (!sync_fence)
120 return -EINVAL;
121
122 num_wait_cmds = nvhost_sync_num_pts(sync_fence);
123 gk20a_channel_alloc_priv_cmdbuf(c, 4 * num_wait_cmds, &wait_cmd);
124 if (wait_cmd == NULL) {
125 gk20a_err(dev_from_gk20a(c->g),
126 "not enough priv cmd buffer space");
127 sync_fence_put(sync_fence);
128 return -EAGAIN;
129 }
130
131 i = 0;
132 list_for_each_entry(pt, &sync_fence->pt_list_head, pt_list) {
133 u32 wait_id = nvhost_sync_pt_id(pt);
134 u32 wait_value = nvhost_sync_pt_thresh(pt);
135
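		/* For an already-signalled point, fill its slot with zero
		 * words so the preallocated 4-words-per-point layout of the
		 * wait command buffer is preserved. */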
136 if (nvhost_syncpt_is_expired_ext(sp->host1x_pdev,
137 wait_id, wait_value)) {
138 wait_cmd->ptr[i * 4 + 0] = 0;
139 wait_cmd->ptr[i * 4 + 1] = 0;
140 wait_cmd->ptr[i * 4 + 2] = 0;
141 wait_cmd->ptr[i * 4 + 3] = 0;
142 } else
143 add_wait_cmd(&wait_cmd->ptr[i * 4], wait_id,
144 wait_value);
145 i++;
146 }
147 WARN_ON(i != num_wait_cmds);
148 sync_fence_put(sync_fence);
149
150 *entry = wait_cmd;
151 return 0;
152#else
153 return -ENODEV;
154#endif
155}
156
157static void gk20a_channel_syncpt_update(void *priv, int nr_completed)
158{
159 struct channel_gk20a *ch20a = priv;
160 gk20a_channel_update(ch20a, nr_completed);
161}
162
163static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
164 bool gfx_class, bool wfi_cmd,
165 struct priv_cmd_entry **entry,
166 struct gk20a_channel_fence *fence)
167{
168 u32 thresh;
169 int incr_cmd_size;
170 int j = 0;
171 int err;
172 struct priv_cmd_entry *incr_cmd = NULL;
173 struct gk20a_channel_syncpt *sp =
174 container_of(s, struct gk20a_channel_syncpt, ops);
175 struct channel_gk20a *c = sp->c;
176
177 /* nvhost action_gpfifo_submit_complete releases this ref. */
178 err = gk20a_channel_busy(c->g->dev);
179 if (err)
180 return err;
181
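	/* 4 words cover the syncpoint_a/b increment sequence; the wfi method
	 * and its (ignored) handle add 2 more words when requested. */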
182 incr_cmd_size = 4;
183 if (wfi_cmd)
184 incr_cmd_size += 2;
185
186 gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, &incr_cmd);
187 if (incr_cmd == NULL) {
188 gk20a_channel_idle(c->g->dev);
189 gk20a_err(dev_from_gk20a(c->g),
190 "not enough priv cmd buffer space");
191 return -EAGAIN;
192 }
193
194 if (gfx_class) {
195 WARN_ON(wfi_cmd); /* No sense to use gfx class + wfi. */
196 /* setobject KEPLER_C */
197 incr_cmd->ptr[j++] = 0x20010000;
198 incr_cmd->ptr[j++] = KEPLER_C;
199 /* syncpt incr */
200 incr_cmd->ptr[j++] = 0x200100B2;
201 incr_cmd->ptr[j++] = sp->id |
202 (0x1 << 20) | (0x1 << 16);
203 } else {
204 if (wfi_cmd) {
205 /* wfi */
206 incr_cmd->ptr[j++] = 0x2001001E;
207 /* handle, ignored */
208 incr_cmd->ptr[j++] = 0x00000000;
209 }
210 /* syncpoint_a */
211 incr_cmd->ptr[j++] = 0x2001001C;
212 /* payload, ignored */
213 incr_cmd->ptr[j++] = 0;
214 /* syncpoint_b */
215 incr_cmd->ptr[j++] = 0x2001001D;
216 /* syncpt_id, incr */
217 incr_cmd->ptr[j++] = (sp->id << 8) | 0x1;
218 }
219 WARN_ON(j != incr_cmd_size);
220
221 thresh = nvhost_syncpt_incr_max_ext(sp->host1x_pdev, sp->id, 1);
222
223 err = nvhost_intr_register_notifier(sp->host1x_pdev, sp->id, thresh,
224 gk20a_channel_syncpt_update, c);
225
226	/* Adding the interrupt action should never fail. Proper error handling
227	 * here would require us to decrement the syncpt max back to its
228	 * original value. */
229 if (WARN(err, "failed to set submit complete interrupt")) {
230 gk20a_channel_idle(c->g->dev);
231 err = 0; /* Ignore this error. */
232 }
233
234 fence->thresh = thresh;
235 fence->valid = true;
236 fence->wfi = wfi_cmd;
237 *entry = incr_cmd;
238 return 0;
239}
240
241int gk20a_channel_syncpt_incr_wfi(struct gk20a_channel_sync *s,
242 struct priv_cmd_entry **entry,
243 struct gk20a_channel_fence *fence)
244{
245 return __gk20a_channel_syncpt_incr(s,
246 false /* use host class */,
247 true /* wfi */,
248 entry, fence);
249}
250
251int gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
252 struct priv_cmd_entry **entry,
253 struct gk20a_channel_fence *fence)
254{
255 struct gk20a_channel_syncpt *sp =
256 container_of(s, struct gk20a_channel_syncpt, ops);
257	/* Don't add a wfi cmd to this one since we're not returning
258	 * a fence to user space. */
259 return __gk20a_channel_syncpt_incr(s,
260 sp->c->obj_class == KEPLER_C /* may use gfx class */,
261 false /* no wfi */,
262 entry, fence);
263}
264
265int gk20a_channel_syncpt_incr_user_syncpt(struct gk20a_channel_sync *s,
266 struct priv_cmd_entry **entry,
267 struct gk20a_channel_fence *fence,
268 u32 *id, u32 *thresh)
269{
270 struct gk20a_channel_syncpt *sp =
271 container_of(s, struct gk20a_channel_syncpt, ops);
272 /* Need to do 'host incr + wfi' or 'gfx incr' since we return the fence
273 * to user space. */
274 int err = __gk20a_channel_syncpt_incr(s,
275 sp->c->obj_class == KEPLER_C /* use gfx class? */,
276 sp->c->obj_class != KEPLER_C /* wfi if host class */,
277 entry, fence);
278 if (err)
279 return err;
280 *id = sp->id;
281 *thresh = fence->thresh;
282 return 0;
283}
284
285int gk20a_channel_syncpt_incr_user_fd(struct gk20a_channel_sync *s,
286 struct priv_cmd_entry **entry,
287 struct gk20a_channel_fence *fence,
288 int *fd)
289{
290#ifdef CONFIG_SYNC
291 int err;
292 struct nvhost_ctrl_sync_fence_info pt;
293 struct gk20a_channel_syncpt *sp =
294 container_of(s, struct gk20a_channel_syncpt, ops);
295 err = gk20a_channel_syncpt_incr_user_syncpt(s, entry, fence,
296 &pt.id, &pt.thresh);
297 if (err)
298 return err;
299 return nvhost_sync_create_fence_fd(sp->host1x_pdev, &pt, 1,
300 "fence", fd);
301#else
302 return -ENODEV;
303#endif
304}
305
306void gk20a_channel_syncpt_set_min_eq_max(struct gk20a_channel_sync *s)
307{
308 struct gk20a_channel_syncpt *sp =
309 container_of(s, struct gk20a_channel_syncpt, ops);
310 nvhost_syncpt_set_min_eq_max_ext(sp->host1x_pdev, sp->id);
311}
312
313static void gk20a_channel_syncpt_destroy(struct gk20a_channel_sync *s)
314{
315 struct gk20a_channel_syncpt *sp =
316 container_of(s, struct gk20a_channel_syncpt, ops);
317 nvhost_free_syncpt(sp->id);
318 kfree(sp);
319}
320
321static struct gk20a_channel_sync *
322gk20a_channel_syncpt_create(struct channel_gk20a *c)
323{
324 struct gk20a_channel_syncpt *sp;
325
326 sp = kzalloc(sizeof(*sp), GFP_KERNEL);
327 if (!sp)
328 return NULL;
329
330 sp->c = c;
331 sp->host1x_pdev = to_platform_device(c->g->dev->dev.parent);
332 sp->id = nvhost_get_syncpt_host_managed(sp->host1x_pdev, c->hw_chid);
333
334 sp->ops.wait_cpu = gk20a_channel_syncpt_wait_cpu;
335 sp->ops.is_expired = gk20a_channel_syncpt_is_expired;
336 sp->ops.wait_syncpt = gk20a_channel_syncpt_wait_syncpt;
337 sp->ops.wait_fd = gk20a_channel_syncpt_wait_fd;
338 sp->ops.incr = gk20a_channel_syncpt_incr;
339 sp->ops.incr_wfi = gk20a_channel_syncpt_incr_wfi;
340 sp->ops.incr_user_syncpt = gk20a_channel_syncpt_incr_user_syncpt;
341 sp->ops.incr_user_fd = gk20a_channel_syncpt_incr_user_fd;
342 sp->ops.set_min_eq_max = gk20a_channel_syncpt_set_min_eq_max;
343 sp->ops.destroy = gk20a_channel_syncpt_destroy;
344 return &sp->ops;
345}
346#endif /* CONFIG_TEGRA_GK20A */
347
348struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c)
349{
350#ifdef CONFIG_TEGRA_GK20A
351 if (gk20a_platform_has_syncpoints(c->g->dev))
352 return gk20a_channel_syncpt_create(c);
353#endif
354 WARN_ON(1);
355 return NULL;
356}
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h
new file mode 100644
index 00000000..69feb89f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h
@@ -0,0 +1,102 @@
1/*
2 * drivers/video/tegra/host/gk20a/channel_sync_gk20a.h
3 *
4 * GK20A Channel Synchronization Abstraction
5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17
18#ifndef _GK20A_CHANNEL_SYNC_H_
19#define _GK20A_CHANNEL_SYNC_H_
20
21#include <linux/types.h>
22
23struct gk20a_channel_sync;
24struct priv_cmd_entry;
25struct channel_gk20a;
26
27struct gk20a_channel_fence {
28 bool valid;
29 bool wfi; /* was issued with preceding wfi */
30 u32 thresh; /* either semaphore or syncpoint value */
31};
32
33struct gk20a_channel_sync {
34 /* CPU wait for a fence returned by incr_syncpt() or incr_fd(). */
35 int (*wait_cpu)(struct gk20a_channel_sync *s,
36 struct gk20a_channel_fence *fence,
37 int timeout);
38
39 /* Test whether a fence returned by incr_syncpt() or incr_fd() is
40 * expired. */
41 bool (*is_expired)(struct gk20a_channel_sync *s,
42 struct gk20a_channel_fence *fence);
43
44 /* Generate a gpu wait cmdbuf from syncpoint. */
45 int (*wait_syncpt)(struct gk20a_channel_sync *s, u32 id, u32 thresh,
46 struct priv_cmd_entry **entry);
47
48 /* Generate a gpu wait cmdbuf from sync fd. */
49 int (*wait_fd)(struct gk20a_channel_sync *s, int fd,
50 struct priv_cmd_entry **entry);
51
52 /* Increment syncpoint/semaphore.
53 * Returns
54 * - a gpu cmdbuf that performs the increment when executed,
55 * - a fence that can be passed to wait_cpu() and is_expired().
56 */
57 int (*incr)(struct gk20a_channel_sync *s,
58 struct priv_cmd_entry **entry,
59 struct gk20a_channel_fence *fence);
60
61 /* Increment syncpoint/semaphore, preceded by a wfi.
62 * Returns
63 * - a gpu cmdbuf that performs the increment when executed,
64 * - a fence that can be passed to wait_cpu() and is_expired().
65 */
66 int (*incr_wfi)(struct gk20a_channel_sync *s,
67 struct priv_cmd_entry **entry,
68 struct gk20a_channel_fence *fence);
69
70 /* Increment syncpoint, so that the returned fence represents
71 * work completion (may need wfi) and can be returned to user space.
72 * Returns
73 * - a gpu cmdbuf that performs the increment when executed,
74 * - a fence that can be passed to wait_cpu() and is_expired(),
75 * - a syncpoint id/value pair that can be returned to user space.
76 */
77 int (*incr_user_syncpt)(struct gk20a_channel_sync *s,
78 struct priv_cmd_entry **entry,
79 struct gk20a_channel_fence *fence,
80 u32 *id, u32 *thresh);
81
82 /* Increment syncpoint/semaphore, so that the returned fence represents
83 * work completion (may need wfi) and can be returned to user space.
84 * Returns
85 * - a gpu cmdbuf that performs the increment when executed,
86 * - a fence that can be passed to wait_cpu() and is_expired(),
87 * - a sync fd that can be returned to user space.
88 */
89 int (*incr_user_fd)(struct gk20a_channel_sync *s,
90 struct priv_cmd_entry **entry,
91 struct gk20a_channel_fence *fence,
92 int *fd);
93
94 /* Reset the channel syncpoint/semaphore. */
95 void (*set_min_eq_max)(struct gk20a_channel_sync *s);
96
97 /* Free the resources allocated by gk20a_channel_sync_create. */
98 void (*destroy)(struct gk20a_channel_sync *s);
99};
100
101struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c);
102#endif
diff --git a/drivers/gpu/nvgpu/gk20a/clk_gk20a.c b/drivers/gpu/nvgpu/gk20a/clk_gk20a.c
new file mode 100644
index 00000000..151a332b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/clk_gk20a.c
@@ -0,0 +1,865 @@
1/*
2 * drivers/video/tegra/host/gk20a/clk_gk20a.c
3 *
4 * GK20A Clocks
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include <linux/clk.h>
22#include <linux/delay.h> /* for mdelay */
23#include <linux/module.h>
24#include <linux/debugfs.h>
25#include <linux/clk/tegra.h>
26#include <mach/thermal.h>
27
28#include "gk20a.h"
29#include "hw_trim_gk20a.h"
30#include "hw_timer_gk20a.h"
31
32#define gk20a_dbg_clk(fmt, arg...) \
33 gk20a_dbg(gpu_dbg_clk, fmt, ##arg)
34
35/* from vbios PLL info table */
36struct pll_parms gpc_pll_params = {
37 144, 2064, /* freq */
38 1000, 2064, /* vco */
39 12, 38, /* u */
40 1, 255, /* M */
41 8, 255, /* N */
42 1, 32, /* PL */
43};
44
45static int num_gpu_cooling_freq;
46static struct gpufreq_table_data *gpu_cooling_freq;
47
48struct gpufreq_table_data *tegra_gpufreq_table_get(void)
49{
50 return gpu_cooling_freq;
51}
52
53unsigned int tegra_gpufreq_table_size_get(void)
54{
55 return num_gpu_cooling_freq;
56}
57
58static u8 pl_to_div[] = {
59/* PL: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 */
60/* p: */ 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 12, 16, 20, 24, 32 };
61
62/* Calculate and update M/N/PL as well as pll->freq
63 ref_clk_f = clk_in_f / src_div = clk_in_f; (src_div = 1 on gk20a)
64 u_f = ref_clk_f / M;
65 PLL output = vco_f = u_f * N = ref_clk_f * N / M;
66 gpc2clk = target clock frequency = vco_f / PL;
67 gpcclk = gpc2clk / 2; */
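/* For illustration only: with clk_in = 12 MHz, M = 1, N = 172 and a PL
   divider of 2, vco_f = 12 * 172 / 1 = 2064 MHz, so gpc2clk = 1032 MHz
   and gpcclk = 516 MHz. */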
68static int clk_config_pll(struct clk_gk20a *clk, struct pll *pll,
69 struct pll_parms *pll_params, u32 *target_freq, bool best_fit)
70{
71 u32 min_vco_f, max_vco_f;
72 u32 best_M, best_N;
73 u32 low_PL, high_PL, best_PL;
74 u32 m, n, n2;
75 u32 target_vco_f, vco_f;
76 u32 ref_clk_f, target_clk_f, u_f;
77 u32 delta, lwv, best_delta = ~0;
78 int pl;
79
80 BUG_ON(target_freq == NULL);
81
82 gk20a_dbg_fn("request target freq %d MHz", *target_freq);
83
84 ref_clk_f = pll->clk_in;
85 target_clk_f = *target_freq;
86 max_vco_f = pll_params->max_vco;
87 min_vco_f = pll_params->min_vco;
88 best_M = pll_params->max_M;
89 best_N = pll_params->min_N;
90 best_PL = pll_params->min_PL;
91
92 target_vco_f = target_clk_f + target_clk_f / 50;
93 if (max_vco_f < target_vco_f)
94 max_vco_f = target_vco_f;
95
96 high_PL = (max_vco_f + target_vco_f - 1) / target_vco_f;
97 high_PL = min(high_PL, pll_params->max_PL);
98 high_PL = max(high_PL, pll_params->min_PL);
99
100 low_PL = min_vco_f / target_vco_f;
101 low_PL = min(low_PL, pll_params->max_PL);
102 low_PL = max(low_PL, pll_params->min_PL);
103
104 /* Find Indices of high_PL and low_PL */
105 for (pl = 0; pl < 14; pl++) {
106 if (pl_to_div[pl] >= low_PL) {
107 low_PL = pl;
108 break;
109 }
110 }
111 for (pl = 0; pl < 14; pl++) {
112 if (pl_to_div[pl] >= high_PL) {
113 high_PL = pl;
114 break;
115 }
116 }
117 gk20a_dbg_info("low_PL %d(div%d), high_PL %d(div%d)",
118 low_PL, pl_to_div[low_PL], high_PL, pl_to_div[high_PL]);
119
120 for (pl = low_PL; pl <= high_PL; pl++) {
121 target_vco_f = target_clk_f * pl_to_div[pl];
122
123 for (m = pll_params->min_M; m <= pll_params->max_M; m++) {
124 u_f = ref_clk_f / m;
125
126 if (u_f < pll_params->min_u)
127 break;
128 if (u_f > pll_params->max_u)
129 continue;
130
131 n = (target_vco_f * m) / ref_clk_f;
132 n2 = ((target_vco_f * m) + (ref_clk_f - 1)) / ref_clk_f;
133
134 if (n > pll_params->max_N)
135 break;
136
137 for (; n <= n2; n++) {
138 if (n < pll_params->min_N)
139 continue;
140 if (n > pll_params->max_N)
141 break;
142
143 vco_f = ref_clk_f * n / m;
144
145 if (vco_f >= min_vco_f && vco_f <= max_vco_f) {
146 lwv = (vco_f + (pl_to_div[pl] / 2))
147 / pl_to_div[pl];
148 delta = abs(lwv - target_clk_f);
149
150 if (delta < best_delta) {
151 best_delta = delta;
152 best_M = m;
153 best_N = n;
154 best_PL = pl;
155
156 if (best_delta == 0 ||
157 /* 0.45% for non best fit */
158 (!best_fit && (vco_f / best_delta > 218))) {
159 goto found_match;
160 }
161
162 gk20a_dbg_info("delta %d @ M %d, N %d, PL %d",
163 delta, m, n, pl);
164 }
165 }
166 }
167 }
168 }
169
170found_match:
171 BUG_ON(best_delta == ~0);
172
173 if (best_fit && best_delta != 0)
174 gk20a_dbg_clk("no best match for target @ %dMHz on gpc_pll",
175 target_clk_f);
176
177 pll->M = best_M;
178 pll->N = best_N;
179 pll->PL = best_PL;
180
181 /* save current frequency */
182 pll->freq = ref_clk_f * pll->N / (pll->M * pl_to_div[pll->PL]);
183
184 *target_freq = pll->freq;
185
186 gk20a_dbg_clk("actual target freq %d MHz, M %d, N %d, PL %d(div%d)",
187 *target_freq, pll->M, pll->N, pll->PL, pl_to_div[pll->PL]);
188
189 gk20a_dbg_fn("done");
190
191 return 0;
192}
193
194static int clk_slide_gpc_pll(struct gk20a *g, u32 n)
195{
196 u32 data, coeff;
197 u32 nold;
198 int ramp_timeout = 500;
199
200 /* get old coefficients */
201 coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
202 nold = trim_sys_gpcpll_coeff_ndiv_v(coeff);
203
204 /* do nothing if NDIV is same */
205 if (n == nold)
206 return 0;
207
208 /* setup */
209 data = gk20a_readl(g, trim_sys_gpcpll_cfg2_r());
210 data = set_field(data, trim_sys_gpcpll_cfg2_pll_stepa_m(),
211 trim_sys_gpcpll_cfg2_pll_stepa_f(0x2b));
212 gk20a_writel(g, trim_sys_gpcpll_cfg2_r(), data);
213 data = gk20a_readl(g, trim_sys_gpcpll_cfg3_r());
214 data = set_field(data, trim_sys_gpcpll_cfg3_pll_stepb_m(),
215 trim_sys_gpcpll_cfg3_pll_stepb_f(0xb));
216 gk20a_writel(g, trim_sys_gpcpll_cfg3_r(), data);
217
218 /* pll slowdown mode */
219 data = gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
220 data = set_field(data,
221 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_m(),
222 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_yes_f());
223 gk20a_writel(g, trim_sys_gpcpll_ndiv_slowdown_r(), data);
224
225 /* new ndiv ready for ramp */
226 coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
227 coeff = set_field(coeff, trim_sys_gpcpll_coeff_ndiv_m(),
228 trim_sys_gpcpll_coeff_ndiv_f(n));
229 udelay(1);
230 gk20a_writel(g, trim_sys_gpcpll_coeff_r(), coeff);
231
232 /* dynamic ramp to new ndiv */
233 data = gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
234 data = set_field(data,
235 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_m(),
236 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_yes_f());
237 udelay(1);
238 gk20a_writel(g, trim_sys_gpcpll_ndiv_slowdown_r(), data);
239
240 do {
241 udelay(1);
242 ramp_timeout--;
243 data = gk20a_readl(
244 g, trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_r());
245 if (trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_pll_dynramp_done_synced_v(data))
246 break;
247 } while (ramp_timeout > 0);
248
249 /* exit slowdown mode */
250 data = gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
251 data = set_field(data,
252 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_m(),
253 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_no_f());
254 data = set_field(data,
255 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_m(),
256 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_no_f());
257 gk20a_writel(g, trim_sys_gpcpll_ndiv_slowdown_r(), data);
258 gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
259
260 if (ramp_timeout <= 0) {
261 gk20a_err(dev_from_gk20a(g), "gpcpll dynamic ramp timeout");
262 return -ETIMEDOUT;
263 }
264 return 0;
265}
266
267static int clk_program_gpc_pll(struct gk20a *g, struct clk_gk20a *clk,
268 int allow_slide)
269{
270 u32 data, cfg, coeff, timeout;
271 u32 m, n, pl;
272 u32 nlo;
273
274 gk20a_dbg_fn("");
275
276 if (!tegra_platform_is_silicon())
277 return 0;
278
279 /* get old coefficients */
280 coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
281 m = trim_sys_gpcpll_coeff_mdiv_v(coeff);
282 n = trim_sys_gpcpll_coeff_ndiv_v(coeff);
283 pl = trim_sys_gpcpll_coeff_pldiv_v(coeff);
284
285 /* do NDIV slide if there is no change in M and PL */
286 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
287 if (allow_slide && clk->gpc_pll.M == m && clk->gpc_pll.PL == pl
288 && trim_sys_gpcpll_cfg_enable_v(cfg)) {
289 return clk_slide_gpc_pll(g, clk->gpc_pll.N);
290 }
291
292 /* slide down to NDIV_LO */
293 nlo = DIV_ROUND_UP(m * gpc_pll_params.min_vco, clk->gpc_pll.clk_in);
294 if (allow_slide && trim_sys_gpcpll_cfg_enable_v(cfg)) {
295 int ret = clk_slide_gpc_pll(g, nlo);
296 if (ret)
297 return ret;
298 }
299
300	/* split FO-to-bypass jump in halves by setting out divider 1:2 */
301 data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
302 data = set_field(data, trim_sys_gpc2clk_out_vcodiv_m(),
303 trim_sys_gpc2clk_out_vcodiv_f(2));
304 gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
305
306 /* put PLL in bypass before programming it */
307 data = gk20a_readl(g, trim_sys_sel_vco_r());
308 data = set_field(data, trim_sys_sel_vco_gpc2clk_out_m(),
309 trim_sys_sel_vco_gpc2clk_out_bypass_f());
310 udelay(2);
311 gk20a_writel(g, trim_sys_sel_vco_r(), data);
312
313 /* get out from IDDQ */
314 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
315 if (trim_sys_gpcpll_cfg_iddq_v(cfg)) {
316 cfg = set_field(cfg, trim_sys_gpcpll_cfg_iddq_m(),
317 trim_sys_gpcpll_cfg_iddq_power_on_v());
318 gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
319 gk20a_readl(g, trim_sys_gpcpll_cfg_r());
320 udelay(2);
321 }
322
323 /* disable PLL before changing coefficients */
324 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
325 cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
326 trim_sys_gpcpll_cfg_enable_no_f());
327 gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
328 gk20a_readl(g, trim_sys_gpcpll_cfg_r());
329
330 /* change coefficients */
331 nlo = DIV_ROUND_UP(clk->gpc_pll.M * gpc_pll_params.min_vco,
332 clk->gpc_pll.clk_in);
333 coeff = trim_sys_gpcpll_coeff_mdiv_f(clk->gpc_pll.M) |
334 trim_sys_gpcpll_coeff_ndiv_f(allow_slide ?
335 nlo : clk->gpc_pll.N) |
336 trim_sys_gpcpll_coeff_pldiv_f(clk->gpc_pll.PL);
337 gk20a_writel(g, trim_sys_gpcpll_coeff_r(), coeff);
338
339 /* enable PLL after changing coefficients */
340 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
341 cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
342 trim_sys_gpcpll_cfg_enable_yes_f());
343 gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
344
345 /* lock pll */
346 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
347 if (cfg & trim_sys_gpcpll_cfg_enb_lckdet_power_off_f()){
348 cfg = set_field(cfg, trim_sys_gpcpll_cfg_enb_lckdet_m(),
349 trim_sys_gpcpll_cfg_enb_lckdet_power_on_f());
350 gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
351 }
352
353 /* wait pll lock */
354 timeout = clk->pll_delay / 2 + 1;
355 do {
356 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
357 if (cfg & trim_sys_gpcpll_cfg_pll_lock_true_f())
358 goto pll_locked;
359 udelay(2);
360 } while (--timeout > 0);
361
362 /* PLL is messed up. What can we do here? */
363 BUG();
364 return -EBUSY;
365
366pll_locked:
367 /* put PLL back on vco */
368 data = gk20a_readl(g, trim_sys_sel_vco_r());
369 data = set_field(data, trim_sys_sel_vco_gpc2clk_out_m(),
370 trim_sys_sel_vco_gpc2clk_out_vco_f());
371 gk20a_writel(g, trim_sys_sel_vco_r(), data);
372 clk->gpc_pll.enabled = true;
373
374 /* restore out divider 1:1 */
375 data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
376 data = set_field(data, trim_sys_gpc2clk_out_vcodiv_m(),
377 trim_sys_gpc2clk_out_vcodiv_by1_f());
378 udelay(2);
379 gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
380
381 /* slide up to target NDIV */
382 return clk_slide_gpc_pll(g, clk->gpc_pll.N);
383}
384
385static int clk_disable_gpcpll(struct gk20a *g, int allow_slide)
386{
387 u32 cfg, coeff, m, nlo;
388 struct clk_gk20a *clk = &g->clk;
389
390 /* slide to VCO min */
391 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
392 if (allow_slide && trim_sys_gpcpll_cfg_enable_v(cfg)) {
393 coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
394 m = trim_sys_gpcpll_coeff_mdiv_v(coeff);
395 nlo = DIV_ROUND_UP(m * gpc_pll_params.min_vco,
396 clk->gpc_pll.clk_in);
397 clk_slide_gpc_pll(g, nlo);
398 }
399
400 /* put PLL in bypass before disabling it */
401 cfg = gk20a_readl(g, trim_sys_sel_vco_r());
402 cfg = set_field(cfg, trim_sys_sel_vco_gpc2clk_out_m(),
403 trim_sys_sel_vco_gpc2clk_out_bypass_f());
404 gk20a_writel(g, trim_sys_sel_vco_r(), cfg);
405
406 /* disable PLL */
407 cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
408 cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
409 trim_sys_gpcpll_cfg_enable_no_f());
410 gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
411 gk20a_readl(g, trim_sys_gpcpll_cfg_r());
412
413 clk->gpc_pll.enabled = false;
414 return 0;
415}
416
417static int gk20a_init_clk_reset_enable_hw(struct gk20a *g)
418{
419 gk20a_dbg_fn("");
420 return 0;
421}
422
423struct clk *gk20a_clk_get(struct gk20a *g)
424{
425 if (!g->clk.tegra_clk) {
426 struct clk *clk;
427
428 clk = clk_get_sys("tegra_gk20a", "gpu");
429 if (IS_ERR(clk)) {
430 gk20a_err(dev_from_gk20a(g),
431 "fail to get tegra gpu clk tegra_gk20a/gpu");
432 return NULL;
433 }
434 g->clk.tegra_clk = clk;
435 }
436
437 return g->clk.tegra_clk;
438}
439
440static int gk20a_init_clk_setup_sw(struct gk20a *g)
441{
442 struct clk_gk20a *clk = &g->clk;
443 static int initialized;
444 unsigned long *freqs;
445 int err, num_freqs;
446 struct clk *ref;
447 unsigned long ref_rate;
448
449 gk20a_dbg_fn("");
450
451 if (clk->sw_ready) {
452 gk20a_dbg_fn("skip init");
453 return 0;
454 }
455
456 if (!gk20a_clk_get(g))
457 return -EINVAL;
458
459 ref = clk_get_parent(clk_get_parent(clk->tegra_clk));
460 if (IS_ERR(ref)) {
461 gk20a_err(dev_from_gk20a(g),
462 "failed to get GPCPLL reference clock");
463 return -EINVAL;
464 }
465 ref_rate = clk_get_rate(ref);
466
467 clk->pll_delay = 300; /* usec */
468
469 clk->gpc_pll.id = GK20A_GPC_PLL;
470 clk->gpc_pll.clk_in = ref_rate / 1000000; /* MHz */
471
472 /* Decide initial frequency */
473 if (!initialized) {
474 initialized = 1;
475 clk->gpc_pll.M = 1;
476 clk->gpc_pll.N = DIV_ROUND_UP(gpc_pll_params.min_vco,
477 clk->gpc_pll.clk_in);
478 clk->gpc_pll.PL = 1;
479 clk->gpc_pll.freq = clk->gpc_pll.clk_in * clk->gpc_pll.N;
480 clk->gpc_pll.freq /= pl_to_div[clk->gpc_pll.PL];
481 }
482
483 err = tegra_dvfs_get_freqs(clk_get_parent(clk->tegra_clk),
484 &freqs, &num_freqs);
485 if (!err) {
486 int i, j;
487
488 /* init j for inverse traversal of frequencies */
489 j = num_freqs - 1;
490
491 gpu_cooling_freq = kzalloc(
492 (1 + num_freqs) * sizeof(*gpu_cooling_freq),
493 GFP_KERNEL);
494
495 /* store frequencies in inverse order */
496 for (i = 0; i < num_freqs; ++i, --j) {
497 gpu_cooling_freq[i].index = i;
498 gpu_cooling_freq[i].frequency = freqs[j];
499 }
500
501 /* add 'end of table' marker */
502 gpu_cooling_freq[i].index = i;
503 gpu_cooling_freq[i].frequency = GPUFREQ_TABLE_END;
504
505 /* store number of frequencies */
506 num_gpu_cooling_freq = num_freqs + 1;
507 }
508
509 mutex_init(&clk->clk_mutex);
510
511 clk->sw_ready = true;
512
513 gk20a_dbg_fn("done");
514 return 0;
515}
516
517static int gk20a_init_clk_setup_hw(struct gk20a *g)
518{
519 u32 data;
520
521 gk20a_dbg_fn("");
522
523 data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
524 data = set_field(data,
525 trim_sys_gpc2clk_out_sdiv14_m() |
526 trim_sys_gpc2clk_out_vcodiv_m() |
527 trim_sys_gpc2clk_out_bypdiv_m(),
528 trim_sys_gpc2clk_out_sdiv14_indiv4_mode_f() |
529 trim_sys_gpc2clk_out_vcodiv_by1_f() |
530 trim_sys_gpc2clk_out_bypdiv_f(0));
531 gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
532
533 return 0;
534}
535
536static int set_pll_target(struct gk20a *g, u32 freq, u32 old_freq)
537{
538 struct clk_gk20a *clk = &g->clk;
539
540 if (freq > gpc_pll_params.max_freq)
541 freq = gpc_pll_params.max_freq;
542 else if (freq < gpc_pll_params.min_freq)
543 freq = gpc_pll_params.min_freq;
544
545 if (freq != old_freq) {
546 /* gpc_pll.freq is changed to new value here */
547 if (clk_config_pll(clk, &clk->gpc_pll, &gpc_pll_params,
548 &freq, true)) {
549 gk20a_err(dev_from_gk20a(g),
550 "failed to set pll target for %d", freq);
551 return -EINVAL;
552 }
553 }
554 return 0;
555}
556
557static int set_pll_freq(struct gk20a *g, u32 freq, u32 old_freq)
558{
559 struct clk_gk20a *clk = &g->clk;
560 int err = 0;
561
562 gk20a_dbg_fn("curr freq: %dMHz, target freq %dMHz", old_freq, freq);
563
564 if ((freq == old_freq) && clk->gpc_pll.enabled)
565 return 0;
566
567 /* change frequency only if power is on */
568 if (g->clk.clk_hw_on) {
569 err = clk_program_gpc_pll(g, clk, 1);
570 if (err)
571 err = clk_program_gpc_pll(g, clk, 0);
572 }
573
574	/* Just report the error but do not restore the PLL, since dvfs could have
575	   already changed the voltage even when it returns an error. */
576 if (err)
577 gk20a_err(dev_from_gk20a(g),
578 "failed to set pll to %d", freq);
579 return err;
580}
581
582static int gk20a_clk_export_set_rate(void *data, unsigned long *rate)
583{
584 u32 old_freq;
585 int ret = -ENODATA;
586 struct gk20a *g = data;
587 struct clk_gk20a *clk = &g->clk;
588
589 if (rate) {
590 mutex_lock(&clk->clk_mutex);
591 old_freq = clk->gpc_pll.freq;
592 ret = set_pll_target(g, rate_gpu_to_gpc2clk(*rate), old_freq);
593 if (!ret && clk->gpc_pll.enabled)
594 ret = set_pll_freq(g, clk->gpc_pll.freq, old_freq);
595 if (!ret)
596 *rate = rate_gpc2clk_to_gpu(clk->gpc_pll.freq);
597 mutex_unlock(&clk->clk_mutex);
598 }
599 return ret;
600}
601
602static int gk20a_clk_export_enable(void *data)
603{
604 int ret;
605 struct gk20a *g = data;
606 struct clk_gk20a *clk = &g->clk;
607
608 mutex_lock(&clk->clk_mutex);
609 ret = set_pll_freq(g, clk->gpc_pll.freq, clk->gpc_pll.freq);
610 mutex_unlock(&clk->clk_mutex);
611 return ret;
612}
613
614static void gk20a_clk_export_disable(void *data)
615{
616 struct gk20a *g = data;
617 struct clk_gk20a *clk = &g->clk;
618
619 mutex_lock(&clk->clk_mutex);
620 if (g->clk.clk_hw_on)
621 clk_disable_gpcpll(g, 1);
622 mutex_unlock(&clk->clk_mutex);
623}
624
625static void gk20a_clk_export_init(void *data, unsigned long *rate, bool *state)
626{
627 struct gk20a *g = data;
628 struct clk_gk20a *clk = &g->clk;
629
630 mutex_lock(&clk->clk_mutex);
631 if (state)
632 *state = clk->gpc_pll.enabled;
633 if (rate)
634 *rate = rate_gpc2clk_to_gpu(clk->gpc_pll.freq);
635 mutex_unlock(&clk->clk_mutex);
636}
637
638static struct tegra_clk_export_ops gk20a_clk_export_ops = {
639 .init = gk20a_clk_export_init,
640 .enable = gk20a_clk_export_enable,
641 .disable = gk20a_clk_export_disable,
642 .set_rate = gk20a_clk_export_set_rate,
643};
644
645static int gk20a_clk_register_export_ops(struct gk20a *g)
646{
647 int ret;
648 struct clk *c;
649
650 if (gk20a_clk_export_ops.data)
651 return 0;
652
653 gk20a_clk_export_ops.data = (void *)g;
654 c = g->clk.tegra_clk;
655 if (!c || !clk_get_parent(c))
656 return -ENOSYS;
657
658 ret = tegra_clk_register_export_ops(clk_get_parent(c),
659 &gk20a_clk_export_ops);
660
661 return ret;
662}
663
664int gk20a_init_clk_support(struct gk20a *g)
665{
666 struct clk_gk20a *clk = &g->clk;
667	int err;
668
669 gk20a_dbg_fn("");
670
671 clk->g = g;
672
673 err = gk20a_init_clk_reset_enable_hw(g);
674 if (err)
675 return err;
676
677 err = gk20a_init_clk_setup_sw(g);
678 if (err)
679 return err;
680
681 mutex_lock(&clk->clk_mutex);
682 clk->clk_hw_on = true;
683
684 err = gk20a_init_clk_setup_hw(g);
685 mutex_unlock(&clk->clk_mutex);
686 if (err)
687 return err;
688
689 err = gk20a_clk_register_export_ops(g);
690 if (err)
691 return err;
692
693 /* FIXME: this effectively prevents host level clock gating */
694 err = clk_enable(g->clk.tegra_clk);
695 if (err)
696 return err;
697
698 /* The prev call may not enable PLL if gbus is unbalanced - force it */
699 mutex_lock(&clk->clk_mutex);
700 err = set_pll_freq(g, clk->gpc_pll.freq, clk->gpc_pll.freq);
701 mutex_unlock(&clk->clk_mutex);
702 if (err)
703 return err;
704
705 return err;
706}
707
708unsigned long gk20a_clk_get_rate(struct gk20a *g)
709{
710 struct clk_gk20a *clk = &g->clk;
711 return rate_gpc2clk_to_gpu(clk->gpc_pll.freq);
712}
713
714long gk20a_clk_round_rate(struct gk20a *g, unsigned long rate)
715{
716 /* make sure the clock is available */
717 if (!gk20a_clk_get(g))
718 return rate;
719
720 return clk_round_rate(clk_get_parent(g->clk.tegra_clk), rate);
721}
722
723int gk20a_clk_set_rate(struct gk20a *g, unsigned long rate)
724{
725 return clk_set_rate(g->clk.tegra_clk, rate);
726}
727
728int gk20a_suspend_clk_support(struct gk20a *g)
729{
730 int ret;
731
732 clk_disable(g->clk.tegra_clk);
733
734 /* The prev call may not disable PLL if gbus is unbalanced - force it */
735 mutex_lock(&g->clk.clk_mutex);
736 ret = clk_disable_gpcpll(g, 1);
737 g->clk.clk_hw_on = false;
738 mutex_unlock(&g->clk.clk_mutex);
739 return ret;
740}
741
742#ifdef CONFIG_DEBUG_FS
743
744static int rate_get(void *data, u64 *val)
745{
746 struct gk20a *g = (struct gk20a *)data;
747 *val = (u64)gk20a_clk_get_rate(g);
748 return 0;
749}
750static int rate_set(void *data, u64 val)
751{
752 struct gk20a *g = (struct gk20a *)data;
753 return gk20a_clk_set_rate(g, (u32)val);
754}
755DEFINE_SIMPLE_ATTRIBUTE(rate_fops, rate_get, rate_set, "%llu\n");
756
757static int pll_reg_show(struct seq_file *s, void *data)
758{
759 struct gk20a *g = s->private;
760 u32 reg, m, n, pl, f;
761
762 mutex_lock(&g->clk.clk_mutex);
763 if (!g->clk.clk_hw_on) {
764 seq_printf(s, "gk20a powered down - no access to registers\n");
765 mutex_unlock(&g->clk.clk_mutex);
766 return 0;
767 }
768
769 reg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
770 seq_printf(s, "cfg = 0x%x : %s : %s\n", reg,
771 trim_sys_gpcpll_cfg_enable_v(reg) ? "enabled" : "disabled",
772 trim_sys_gpcpll_cfg_pll_lock_v(reg) ? "locked" : "unlocked");
773
774 reg = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
775 m = trim_sys_gpcpll_coeff_mdiv_v(reg);
776 n = trim_sys_gpcpll_coeff_ndiv_v(reg);
777 pl = trim_sys_gpcpll_coeff_pldiv_v(reg);
778 f = g->clk.gpc_pll.clk_in * n / (m * pl_to_div[pl]);
779 seq_printf(s, "coef = 0x%x : m = %u : n = %u : pl = %u", reg, m, n, pl);
780 seq_printf(s, " : pll_f(gpu_f) = %u(%u) MHz\n", f, f/2);
781 mutex_unlock(&g->clk.clk_mutex);
782 return 0;
783}
784
785static int pll_reg_open(struct inode *inode, struct file *file)
786{
787 return single_open(file, pll_reg_show, inode->i_private);
788}
789
790static const struct file_operations pll_reg_fops = {
791 .open = pll_reg_open,
792 .read = seq_read,
793 .llseek = seq_lseek,
794 .release = single_release,
795};
796
797static int monitor_get(void *data, u64 *val)
798{
799 struct gk20a *g = (struct gk20a *)data;
800 struct clk_gk20a *clk = &g->clk;
801 int err;
802
803 u32 ncycle = 100; /* count GPCCLK for ncycle of clkin */
804 u32 clkin = clk->gpc_pll.clk_in;
805 u32 count1, count2;
806
807 err = gk20a_busy(g->dev);
808 if (err)
809 return err;
810
811 gk20a_writel(g, trim_gpc_clk_cntr_ncgpcclk_cfg_r(0),
812 trim_gpc_clk_cntr_ncgpcclk_cfg_reset_asserted_f());
813 gk20a_writel(g, trim_gpc_clk_cntr_ncgpcclk_cfg_r(0),
814 trim_gpc_clk_cntr_ncgpcclk_cfg_enable_asserted_f() |
815 trim_gpc_clk_cntr_ncgpcclk_cfg_write_en_asserted_f() |
816 trim_gpc_clk_cntr_ncgpcclk_cfg_noofipclks_f(ncycle));
817 /* start */
818
819	/* It should take about 8us to count 100 cycles of the 12MHz input,
820	   but a delay well above 100us is required here. */
821 gk20a_readl(g, trim_gpc_clk_cntr_ncgpcclk_cfg_r(0));
822 udelay(2000);
823
824 count1 = gk20a_readl(g, trim_gpc_clk_cntr_ncgpcclk_cnt_r(0));
825 udelay(100);
826 count2 = gk20a_readl(g, trim_gpc_clk_cntr_ncgpcclk_cnt_r(0));
827 *val = (u64)(trim_gpc_clk_cntr_ncgpcclk_cnt_value_v(count2) * clkin / ncycle);
828 gk20a_idle(g->dev);
829
830 if (count1 != count2)
831 return -EBUSY;
832 return 0;
833}
834DEFINE_SIMPLE_ATTRIBUTE(monitor_fops, monitor_get, NULL, "%llu\n");
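
To make the counter arithmetic in monitor_get() concrete: the hardware counts GPCCLK edges over ncycle cycles of the reference clock, so the measured rate is count * clkin / ncycle. A back-of-the-envelope sketch with made-up numbers (both the 12 MHz reference and the 6000-edge count are illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned int clkin  = 12;	/* MHz reference, assumed */
	unsigned int ncycle = 100;	/* reference cycles in the count window */
	unsigned int count  = 6000;	/* hypothetical GPCCLK edges counted */

	/* edges per reference cycle, scaled back to MHz */
	printf("gpcclk ~= %u MHz\n", count * clkin / ncycle);	/* 720 MHz */
	return 0;
}
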
835
836int clk_gk20a_debugfs_init(struct platform_device *dev)
837{
838 struct dentry *d;
839 struct gk20a_platform *platform = platform_get_drvdata(dev);
840 struct gk20a *g = get_gk20a(dev);
841
842 d = debugfs_create_file(
843 "rate", S_IRUGO|S_IWUSR, platform->debugfs, g, &rate_fops);
844 if (!d)
845 goto err_out;
846
847 d = debugfs_create_file(
848 "pll_reg", S_IRUGO, platform->debugfs, g, &pll_reg_fops);
849 if (!d)
850 goto err_out;
851
852 d = debugfs_create_file(
853 "monitor", S_IRUGO, platform->debugfs, g, &monitor_fops);
854 if (!d)
855 goto err_out;
856
857 return 0;
858
859err_out:
860 pr_err("%s: Failed to make debugfs node\n", __func__);
861 debugfs_remove_recursive(platform->debugfs);
862 return -ENOMEM;
863}
864
865#endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/gpu/nvgpu/gk20a/clk_gk20a.h b/drivers/gpu/nvgpu/gk20a/clk_gk20a.h
new file mode 100644
index 00000000..d2665259
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/clk_gk20a.h
@@ -0,0 +1,94 @@
1/*
2 * drivers/video/tegra/host/gk20a/clk_gk20a.h
3 *
4 * GK20A Graphics
5 *
6 * Copyright (c) 2011 - 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef _NVHOST_CLK_GK20A_H_
22#define _NVHOST_CLK_GK20A_H_
23
24#include <linux/mutex.h>
25
26#define GPUFREQ_TABLE_END ~(u32)1
27enum {
28 /* only one PLL for gk20a */
29 GK20A_GPC_PLL = 0,
30};
31
32struct pll {
33 u32 id;
34 u32 clk_in; /* MHz */
35 u32 M;
36 u32 N;
37 u32 PL;
38 u32 freq; /* MHz */
39 bool enabled;
40};
41
42struct pll_parms {
43 u32 min_freq, max_freq; /* MHz */
44 u32 min_vco, max_vco; /* MHz */
45 u32 min_u, max_u; /* MHz */
46 u32 min_M, max_M;
47 u32 min_N, max_N;
48 u32 min_PL, max_PL;
49};
50
51struct clk_gk20a {
52 struct gk20a *g;
53 struct clk *tegra_clk;
54 struct pll gpc_pll;
55 u32 pll_delay; /* default PLL settle time */
56 struct mutex clk_mutex;
57 bool sw_ready;
58 bool clk_hw_on;
59};
60
61struct gpufreq_table_data {
62 unsigned int index;
63 unsigned int frequency; /* MHz */
64};
65
66struct gpufreq_table_data *tegra_gpufreq_table_get(void);
67
68unsigned int tegra_gpufreq_table_size_get(void);
69
70int gk20a_init_clk_support(struct gk20a *g);
71
72unsigned long gk20a_clk_get_rate(struct gk20a *g);
73int gk20a_clk_set_rate(struct gk20a *g, unsigned long rate);
74int gk20a_suspend_clk_support(struct gk20a *g);
75struct clk *gk20a_clk_get(struct gk20a *g);
76long gk20a_clk_round_rate(struct gk20a *g, unsigned long rate);
77
78extern struct pll_parms gpc_pll_params;
79
80#define KHZ 1000
81#define MHZ 1000000
82
83static inline unsigned long rate_gpc2clk_to_gpu(unsigned long rate)
84{
85 /* convert the MHz gpc2clk frequency to Hz gpcpll frequency */
86 return (rate * MHZ) / 2;
87}
88static inline unsigned long rate_gpu_to_gpc2clk(unsigned long rate)
89{
90 /* convert the Hz gpcpll frequency to MHz gpc2clk frequency */
91 return (rate * 2) / MHZ;
92}
93
94#endif /* _NVHOST_CLK_GK20A_H_ */
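
A quick user-space round trip through the two conversion helpers above; gpc2clk runs at twice the GPU clock, so the helpers halve or double the value while switching between MHz and Hz. The 504 MHz input is an arbitrary example.

#include <stdio.h>

#define MHZ 1000000UL

static unsigned long rate_gpc2clk_to_gpu(unsigned long rate)
{
	return (rate * MHZ) / 2;	/* MHz gpc2clk -> Hz gpu */
}

static unsigned long rate_gpu_to_gpc2clk(unsigned long rate)
{
	return (rate * 2) / MHZ;	/* Hz gpu -> MHz gpc2clk */
}

int main(void)
{
	unsigned long gpc2clk_mhz = 504;	/* example value only */
	unsigned long gpu_hz = rate_gpc2clk_to_gpu(gpc2clk_mhz);

	printf("gpc2clk %lu MHz -> gpu %lu Hz -> gpc2clk %lu MHz\n",
	       gpc2clk_mhz, gpu_hz, rate_gpu_to_gpc2clk(gpu_hz));
	return 0;
}
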
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
new file mode 100644
index 00000000..9128959f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -0,0 +1,240 @@
1/*
2 * GK20A Ctrl
3 *
4 * Copyright (c) 2011-2014, NVIDIA Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#include <linux/highmem.h>
20#include <linux/cdev.h>
21#include <linux/nvhost_gpu_ioctl.h>
22
23#include "gk20a.h"
24
25int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
26{
27 int err;
28 struct gk20a *g;
29
30 gk20a_dbg_fn("");
31
32 g = container_of(inode->i_cdev,
33 struct gk20a, ctrl.cdev);
34
35 filp->private_data = g->dev;
36
37 err = gk20a_get_client(g);
38 if (err) {
39		gk20a_dbg_fn("failed to get client!");
40 return err;
41 }
42
43 return 0;
44}
45
46int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp)
47{
48 struct platform_device *dev = filp->private_data;
49
50 gk20a_dbg_fn("");
51
52 gk20a_put_client(get_gk20a(dev));
53 return 0;
54}
55
56static long
57gk20a_ctrl_ioctl_gpu_characteristics(
58 struct gk20a *g,
59 struct nvhost_gpu_get_characteristics *request)
60{
61 struct nvhost_gpu_characteristics *pgpu = &g->gpu_characteristics;
62 long err = 0;
63
64 if (request->gpu_characteristics_buf_size > 0) {
65 size_t write_size = sizeof(*pgpu);
66
67 if (write_size > request->gpu_characteristics_buf_size)
68 write_size = request->gpu_characteristics_buf_size;
69
70 err = copy_to_user((void __user *)(uintptr_t)
71 request->gpu_characteristics_buf_addr,
72 pgpu, write_size);
73 }
74
75 if (err == 0)
76 request->gpu_characteristics_buf_size = sizeof(*pgpu);
77
78 return err;
79}
80
81long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
82{
83 struct platform_device *dev = filp->private_data;
84 struct gk20a *g = get_gk20a(dev);
85 struct nvhost_gpu_zcull_get_ctx_size_args *get_ctx_size_args;
86 struct nvhost_gpu_zcull_get_info_args *get_info_args;
87 struct nvhost_gpu_zbc_set_table_args *set_table_args;
88 struct nvhost_gpu_zbc_query_table_args *query_table_args;
89 u8 buf[NVHOST_GPU_IOCTL_MAX_ARG_SIZE];
90 struct gr_zcull_info *zcull_info;
91 struct zbc_entry *zbc_val;
92 struct zbc_query_params *zbc_tbl;
93 int i, err = 0;
94
95 gk20a_dbg_fn("");
96
97 if ((_IOC_TYPE(cmd) != NVHOST_GPU_IOCTL_MAGIC) ||
98 (_IOC_NR(cmd) == 0) ||
99 (_IOC_NR(cmd) > NVHOST_GPU_IOCTL_LAST))
100 return -EFAULT;
101
102 BUG_ON(_IOC_SIZE(cmd) > NVHOST_GPU_IOCTL_MAX_ARG_SIZE);
103
104 if (_IOC_DIR(cmd) & _IOC_WRITE) {
105 if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
106 return -EFAULT;
107 }
108
109 if (!g->gr.sw_ready) {
110 err = gk20a_busy(g->dev);
111 if (err)
112 return err;
113
114 gk20a_idle(g->dev);
115 }
116
117 switch (cmd) {
118 case NVHOST_GPU_IOCTL_ZCULL_GET_CTX_SIZE:
119 get_ctx_size_args = (struct nvhost_gpu_zcull_get_ctx_size_args *)buf;
120
121 get_ctx_size_args->size = gr_gk20a_get_ctxsw_zcull_size(g, &g->gr);
122
123 break;
124 case NVHOST_GPU_IOCTL_ZCULL_GET_INFO:
125 get_info_args = (struct nvhost_gpu_zcull_get_info_args *)buf;
126
127 memset(get_info_args, 0, sizeof(struct nvhost_gpu_zcull_get_info_args));
128
129 zcull_info = kzalloc(sizeof(struct gr_zcull_info), GFP_KERNEL);
130 if (zcull_info == NULL)
131 return -ENOMEM;
132
133 err = gr_gk20a_get_zcull_info(g, &g->gr, zcull_info);
134 if (err) {
135 kfree(zcull_info);
136 break;
137 }
138
139 get_info_args->width_align_pixels = zcull_info->width_align_pixels;
140 get_info_args->height_align_pixels = zcull_info->height_align_pixels;
141 get_info_args->pixel_squares_by_aliquots = zcull_info->pixel_squares_by_aliquots;
142 get_info_args->aliquot_total = zcull_info->aliquot_total;
143 get_info_args->region_byte_multiplier = zcull_info->region_byte_multiplier;
144 get_info_args->region_header_size = zcull_info->region_header_size;
145 get_info_args->subregion_header_size = zcull_info->subregion_header_size;
146 get_info_args->subregion_width_align_pixels = zcull_info->subregion_width_align_pixels;
147 get_info_args->subregion_height_align_pixels = zcull_info->subregion_height_align_pixels;
148 get_info_args->subregion_count = zcull_info->subregion_count;
149
150 kfree(zcull_info);
151 break;
152 case NVHOST_GPU_IOCTL_ZBC_SET_TABLE:
153 set_table_args = (struct nvhost_gpu_zbc_set_table_args *)buf;
154
155 zbc_val = kzalloc(sizeof(struct zbc_entry), GFP_KERNEL);
156 if (zbc_val == NULL)
157 return -ENOMEM;
158
159 zbc_val->format = set_table_args->format;
160 zbc_val->type = set_table_args->type;
161
162 switch (zbc_val->type) {
163 case GK20A_ZBC_TYPE_COLOR:
164 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
165 zbc_val->color_ds[i] = set_table_args->color_ds[i];
166 zbc_val->color_l2[i] = set_table_args->color_l2[i];
167 }
168 break;
169 case GK20A_ZBC_TYPE_DEPTH:
170 zbc_val->depth = set_table_args->depth;
171 break;
172 default:
173 err = -EINVAL;
174 }
175
176 if (!err) {
177 gk20a_busy(dev);
178 err = gk20a_gr_zbc_set_table(g, &g->gr, zbc_val);
179 gk20a_idle(dev);
180 }
181
182 if (zbc_val)
183 kfree(zbc_val);
184 break;
185 case NVHOST_GPU_IOCTL_ZBC_QUERY_TABLE:
186 query_table_args = (struct nvhost_gpu_zbc_query_table_args *)buf;
187
188 zbc_tbl = kzalloc(sizeof(struct zbc_query_params), GFP_KERNEL);
189 if (zbc_tbl == NULL)
190 return -ENOMEM;
191
192 zbc_tbl->type = query_table_args->type;
193 zbc_tbl->index_size = query_table_args->index_size;
194
195 err = gr_gk20a_query_zbc(g, &g->gr, zbc_tbl);
196
197 if (!err) {
198 switch (zbc_tbl->type) {
199 case GK20A_ZBC_TYPE_COLOR:
200 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
201 query_table_args->color_ds[i] = zbc_tbl->color_ds[i];
202 query_table_args->color_l2[i] = zbc_tbl->color_l2[i];
203 }
204 break;
205 case GK20A_ZBC_TYPE_DEPTH:
206 query_table_args->depth = zbc_tbl->depth;
207 break;
208 case GK20A_ZBC_TYPE_INVALID:
209 query_table_args->index_size = zbc_tbl->index_size;
210 break;
211 default:
212 err = -EINVAL;
213 }
214 if (!err) {
215 query_table_args->format = zbc_tbl->format;
216 query_table_args->ref_cnt = zbc_tbl->ref_cnt;
217 }
218 }
219
220 if (zbc_tbl)
221 kfree(zbc_tbl);
222 break;
223
224 case NVHOST_GPU_IOCTL_GET_CHARACTERISTICS:
225 err = gk20a_ctrl_ioctl_gpu_characteristics(
226 g, (struct nvhost_gpu_get_characteristics *)buf);
227 break;
228
229 default:
230 gk20a_err(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", cmd);
231 err = -ENOTTY;
232 break;
233 }
234
235 if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
236 err = copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd));
237
238 return err;
239}
240
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h
new file mode 100644
index 00000000..ac9c253e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h
@@ -0,0 +1,28 @@
1/*
2 * drivers/video/tegra/host/gk20a/gk20a_ctrl.h
3 *
4 * GK20A Ctrl
5 *
6 * Copyright (c) 2011-2012, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef _NVHOST_GK20A_CTRL_H_
22#define _NVHOST_GK20A_CTRL_H_
23
24int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp);
25int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp);
26long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
27
28#endif /* _NVHOST_GK20A_CTRL_H_ */
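
A hedged user-space sketch of the GET_CHARACTERISTICS handshake implemented by gk20a_ctrl_ioctl_gpu_characteristics() above: a first call with gpu_characteristics_buf_size set to zero only reports the required size, and a second call fills the caller's buffer. The /dev/nvhost-ctrl-gpu node path is an assumption; the ioctl number and argument struct come from <linux/nvhost_gpu_ioctl.h>.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvhost_gpu_ioctl.h>

int main(void)
{
	struct nvhost_gpu_get_characteristics req = { 0 };
	void *buf;
	int fd = open("/dev/nvhost-ctrl-gpu", O_RDWR);	/* node path assumed */

	if (fd < 0)
		return 1;

	/* first pass: buf_size == 0, the driver just reports the size it wants */
	if (ioctl(fd, NVHOST_GPU_IOCTL_GET_CHARACTERISTICS, &req) < 0)
		goto out;

	buf = calloc(1, req.gpu_characteristics_buf_size);
	if (!buf)
		goto out;

	/* second pass: the driver copies up to buf_size bytes into our buffer */
	req.gpu_characteristics_buf_addr = (uintptr_t)buf;
	if (ioctl(fd, NVHOST_GPU_IOCTL_GET_CHARACTERISTICS, &req) == 0)
		printf("got %llu bytes of GPU characteristics\n",
		       (unsigned long long)req.gpu_characteristics_buf_size);

	free(buf);
out:
	close(fd);
	return 0;
}
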
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
new file mode 100644
index 00000000..da7d733e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -0,0 +1,699 @@
1/*
2 * Tegra GK20A GPU Debugger/Profiler Driver
3 *
4 * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#include <linux/fs.h>
20#include <linux/file.h>
21#include <linux/cdev.h>
22#include <linux/uaccess.h>
23#include <linux/nvhost.h>
24#include <linux/nvhost_dbg_gpu_ioctl.h>
25
26#include "gk20a.h"
27#include "gr_gk20a.h"
28#include "dbg_gpu_gk20a.h"
29#include "regops_gk20a.h"
30#include "hw_therm_gk20a.h"
31
32struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = {
33 .exec_reg_ops = exec_regops_gk20a,
34};
35
36/* silly allocator - just increment session id */
37static atomic_t session_id = ATOMIC_INIT(0);
38static int generate_session_id(void)
39{
40 return atomic_add_return(1, &session_id);
41}
42
43static int alloc_session(struct dbg_session_gk20a **_dbg_s)
44{
45 struct dbg_session_gk20a *dbg_s;
46 *_dbg_s = NULL;
47
48 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
49
50 dbg_s = kzalloc(sizeof(*dbg_s), GFP_KERNEL);
51 if (!dbg_s)
52 return -ENOMEM;
53
54 dbg_s->id = generate_session_id();
55 dbg_s->ops = &dbg_gpu_session_ops_gk20a;
56 *_dbg_s = dbg_s;
57 return 0;
58}
59
60int gk20a_dbg_gpu_do_dev_open(struct inode *inode, struct file *filp, bool is_profiler)
61{
62 struct dbg_session_gk20a *dbg_session;
63 struct gk20a *g;
64
65 struct platform_device *pdev;
66 struct device *dev;
67
68 int err;
69
70 if (!is_profiler)
71 g = container_of(inode->i_cdev,
72 struct gk20a, dbg.cdev);
73 else
74 g = container_of(inode->i_cdev,
75 struct gk20a, prof.cdev);
76 pdev = g->dev;
77 dev = &pdev->dev;
78
79 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "dbg session: %s", dev_name(dev));
80
81 err = alloc_session(&dbg_session);
82 if (err)
83 return err;
84
85 filp->private_data = dbg_session;
86 dbg_session->pdev = pdev;
87 dbg_session->dev = dev;
88 dbg_session->g = g;
89 dbg_session->is_profiler = is_profiler;
90 dbg_session->is_pg_disabled = false;
91
92 INIT_LIST_HEAD(&dbg_session->dbg_s_list_node);
93 init_waitqueue_head(&dbg_session->dbg_events.wait_queue);
94 dbg_session->dbg_events.events_enabled = false;
95 dbg_session->dbg_events.num_pending_events = 0;
96
97 return 0;
98}
99
100/* used in scenarios where the debugger session can take just the per-channel
101 * dbg_s lock for performance, but the profiler session must take the per-gpu
102 * lock since it might not have an associated channel. */
103static void gk20a_dbg_session_mutex_lock(struct dbg_session_gk20a *dbg_s)
104{
105 if (dbg_s->is_profiler)
106 mutex_lock(&dbg_s->g->dbg_sessions_lock);
107 else
108 mutex_lock(&dbg_s->ch->dbg_s_lock);
109}
110
111static void gk20a_dbg_session_mutex_unlock(struct dbg_session_gk20a *dbg_s)
112{
113 if (dbg_s->is_profiler)
114 mutex_unlock(&dbg_s->g->dbg_sessions_lock);
115 else
116 mutex_unlock(&dbg_s->ch->dbg_s_lock);
117}
118
119static void gk20a_dbg_gpu_events_enable(struct dbg_session_gk20a *dbg_s)
120{
121 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
122
123 gk20a_dbg_session_mutex_lock(dbg_s);
124
125 dbg_s->dbg_events.events_enabled = true;
126 dbg_s->dbg_events.num_pending_events = 0;
127
128 gk20a_dbg_session_mutex_unlock(dbg_s);
129}
130
131static void gk20a_dbg_gpu_events_disable(struct dbg_session_gk20a *dbg_s)
132{
133 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
134
135 gk20a_dbg_session_mutex_lock(dbg_s);
136
137 dbg_s->dbg_events.events_enabled = false;
138 dbg_s->dbg_events.num_pending_events = 0;
139
140 gk20a_dbg_session_mutex_unlock(dbg_s);
141}
142
143static void gk20a_dbg_gpu_events_clear(struct dbg_session_gk20a *dbg_s)
144{
145 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
146
147 gk20a_dbg_session_mutex_lock(dbg_s);
148
149 if (dbg_s->dbg_events.events_enabled &&
150 dbg_s->dbg_events.num_pending_events > 0)
151 dbg_s->dbg_events.num_pending_events--;
152
153 gk20a_dbg_session_mutex_unlock(dbg_s);
154}
155
156static int gk20a_dbg_gpu_events_ctrl(struct dbg_session_gk20a *dbg_s,
157 struct nvhost_dbg_gpu_events_ctrl_args *args)
158{
159 int ret = 0;
160
161 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "dbg events ctrl cmd %d", args->cmd);
162
163 if (!dbg_s->ch) {
164 gk20a_err(dev_from_gk20a(dbg_s->g),
165 "no channel bound to dbg session\n");
166 return -EINVAL;
167 }
168
169 switch (args->cmd) {
170 case NVHOST_DBG_GPU_EVENTS_CTRL_CMD_ENABLE:
171 gk20a_dbg_gpu_events_enable(dbg_s);
172 break;
173
174 case NVHOST_DBG_GPU_EVENTS_CTRL_CMD_DISABLE:
175 gk20a_dbg_gpu_events_disable(dbg_s);
176 break;
177
178 case NVHOST_DBG_GPU_EVENTS_CTRL_CMD_CLEAR:
179 gk20a_dbg_gpu_events_clear(dbg_s);
180 break;
181
182 default:
183 gk20a_err(dev_from_gk20a(dbg_s->g),
184 "unrecognized dbg gpu events ctrl cmd: 0x%x",
185 args->cmd);
186 ret = -EINVAL;
187 break;
188 }
189
190 return ret;
191}
192
193unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait)
194{
195 unsigned int mask = 0;
196 struct dbg_session_gk20a *dbg_s = filep->private_data;
197
198 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
199
200 poll_wait(filep, &dbg_s->dbg_events.wait_queue, wait);
201
202 gk20a_dbg_session_mutex_lock(dbg_s);
203
204 if (dbg_s->dbg_events.events_enabled &&
205 dbg_s->dbg_events.num_pending_events > 0) {
206 gk20a_dbg(gpu_dbg_gpu_dbg, "found pending event on session id %d",
207 dbg_s->id);
208 gk20a_dbg(gpu_dbg_gpu_dbg, "%d events pending",
209 dbg_s->dbg_events.num_pending_events);
210 mask = (POLLPRI | POLLIN);
211 }
212
213 gk20a_dbg_session_mutex_unlock(dbg_s);
214
215 return mask;
216}
217
218int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp)
219{
220 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
221 return gk20a_dbg_gpu_do_dev_open(inode, filp, false /* not profiler */);
222}
223
224int gk20a_prof_gpu_dev_open(struct inode *inode, struct file *filp)
225{
226 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
227 return gk20a_dbg_gpu_do_dev_open(inode, filp, true /* is profiler */);
228}
229
230void gk20a_dbg_gpu_post_events(struct channel_gk20a *ch)
231{
232 struct dbg_session_gk20a *dbg_s;
233
234 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
235
236 /* guard against the session list being modified */
237 mutex_lock(&ch->dbg_s_lock);
238
239 list_for_each_entry(dbg_s, &ch->dbg_s_list, dbg_s_list_node) {
240 if (dbg_s->dbg_events.events_enabled) {
241 gk20a_dbg(gpu_dbg_gpu_dbg, "posting event on session id %d",
242 dbg_s->id);
243 gk20a_dbg(gpu_dbg_gpu_dbg, "%d events pending",
244 dbg_s->dbg_events.num_pending_events);
245
246 dbg_s->dbg_events.num_pending_events++;
247
248 wake_up_interruptible_all(&dbg_s->dbg_events.wait_queue);
249 }
250 }
251
252 mutex_unlock(&ch->dbg_s_lock);
253}
254
255
256static int dbg_set_powergate(struct dbg_session_gk20a *dbg_s,
257 __u32 powermode);
258
259static int dbg_unbind_channel_gk20a(struct dbg_session_gk20a *dbg_s)
260{
261 struct channel_gk20a *ch_gk20a = dbg_s->ch;
262 struct gk20a *g = dbg_s->g;
263
264 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
265
266 /* wasn't bound to start with ? */
267 if (!ch_gk20a) {
268 gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "not bound already?");
269 return -ENODEV;
270 }
271
272 mutex_lock(&g->dbg_sessions_lock);
273 mutex_lock(&ch_gk20a->dbg_s_lock);
274
275 --g->dbg_sessions;
276
277	/* Powergate enable is called here because a dbg_session that issued
278	 * the powergate disable ioctl may be killed without ever issuing the
279	 * matching powergate enable ioctl
280	 */
281 dbg_set_powergate(dbg_s, NVHOST_DBG_GPU_POWERGATE_MODE_ENABLE);
282
283 dbg_s->ch = NULL;
284 fput(dbg_s->ch_f);
285 dbg_s->ch_f = NULL;
286
287 list_del_init(&dbg_s->dbg_s_list_node);
288
289 mutex_unlock(&ch_gk20a->dbg_s_lock);
290 mutex_unlock(&g->dbg_sessions_lock);
291
292 return 0;
293}
294
295int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
296{
297 struct dbg_session_gk20a *dbg_s = filp->private_data;
298
299 gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "%s", dev_name(dbg_s->dev));
300
301 /* unbind if it was bound */
302 if (!dbg_s->ch)
303 return 0;
304 dbg_unbind_channel_gk20a(dbg_s);
305
306 kfree(dbg_s);
307 return 0;
308}
309
310static int dbg_bind_channel_gk20a(struct dbg_session_gk20a *dbg_s,
311 struct nvhost_dbg_gpu_bind_channel_args *args)
312{
313 struct file *f;
314 struct gk20a *g;
315 struct channel_gk20a *ch;
316
317 gk20a_dbg(gpu_dbg_fn|gpu_dbg_gpu_dbg, "%s fd=%d",
318 dev_name(dbg_s->dev), args->channel_fd);
319
320 if (args->channel_fd == ~0)
321 return dbg_unbind_channel_gk20a(dbg_s);
322
323	/* even though the channel lookup below takes and drops its own file
324	 * reference, hold one here as well so the channel can't disappear
325	 * while the debugger session is bound to it */
326 f = fget(args->channel_fd);
327 if (!f)
328 return -ENODEV;
329
330 ch = gk20a_get_channel_from_file(args->channel_fd);
331 if (!ch) {
332 gk20a_dbg_fn("no channel found for fd");
333 fput(f);
334 return -EINVAL;
335 }
336
337 g = dbg_s->g;
338 gk20a_dbg_fn("%s hwchid=%d", dev_name(dbg_s->dev), ch->hw_chid);
339
340 mutex_lock(&g->dbg_sessions_lock);
341 mutex_lock(&ch->dbg_s_lock);
342
343 dbg_s->ch_f = f;
344 dbg_s->ch = ch;
345 list_add(&dbg_s->dbg_s_list_node, &dbg_s->ch->dbg_s_list);
346
347 g->dbg_sessions++;
348
349 mutex_unlock(&ch->dbg_s_lock);
350 mutex_unlock(&g->dbg_sessions_lock);
351 return 0;
352}
353
354static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
355 struct nvhost_dbg_gpu_exec_reg_ops_args *args);
356
357static int nvhost_ioctl_powergate_gk20a(struct dbg_session_gk20a *dbg_s,
358 struct nvhost_dbg_gpu_powergate_args *args);
359
360static int nvhost_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s,
361 struct nvhost_dbg_gpu_smpc_ctxsw_mode_args *args);
362
363long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
364 unsigned long arg)
365{
366 struct dbg_session_gk20a *dbg_s = filp->private_data;
367 struct gk20a *g = get_gk20a(dbg_s->pdev);
368 u8 buf[NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE];
369 int err = 0;
370
371 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
372
373 if ((_IOC_TYPE(cmd) != NVHOST_DBG_GPU_IOCTL_MAGIC) ||
374 (_IOC_NR(cmd) == 0) ||
375 (_IOC_NR(cmd) > NVHOST_DBG_GPU_IOCTL_LAST))
376 return -EFAULT;
377
378 BUG_ON(_IOC_SIZE(cmd) > NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE);
379
380 if (_IOC_DIR(cmd) & _IOC_WRITE) {
381 if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
382 return -EFAULT;
383 }
384
385 if (!g->gr.sw_ready) {
386 err = gk20a_busy(g->dev);
387 if (err)
388 return err;
389
390 gk20a_idle(g->dev);
391 }
392
393 switch (cmd) {
394 case NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL:
395 err = dbg_bind_channel_gk20a(dbg_s,
396 (struct nvhost_dbg_gpu_bind_channel_args *)buf);
397 gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
398 break;
399
400 case NVHOST_DBG_GPU_IOCTL_REG_OPS:
401 err = nvhost_ioctl_channel_reg_ops(dbg_s,
402 (struct nvhost_dbg_gpu_exec_reg_ops_args *)buf);
403 gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
404 break;
405
406 case NVHOST_DBG_GPU_IOCTL_POWERGATE:
407 err = nvhost_ioctl_powergate_gk20a(dbg_s,
408 (struct nvhost_dbg_gpu_powergate_args *)buf);
409 gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
410 break;
411
412 case NVHOST_DBG_GPU_IOCTL_EVENTS_CTRL:
413 err = gk20a_dbg_gpu_events_ctrl(dbg_s,
414 (struct nvhost_dbg_gpu_events_ctrl_args *)buf);
415 break;
416
417 case NVHOST_DBG_GPU_IOCTL_SMPC_CTXSW_MODE:
418 err = nvhost_dbg_gpu_ioctl_smpc_ctxsw_mode(dbg_s,
419 (struct nvhost_dbg_gpu_smpc_ctxsw_mode_args *)buf);
420 break;
421
422 default:
423 gk20a_err(dev_from_gk20a(g),
424 "unrecognized dbg gpu ioctl cmd: 0x%x",
425 cmd);
426 err = -ENOTTY;
427 break;
428 }
429
430 if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
431 err = copy_to_user((void __user *)arg,
432 buf, _IOC_SIZE(cmd));
433
434 return err;
435}
436
437/* In order to perform a context relative op the context has
438 * to be created already... which would imply that the
439 * context switch mechanism has already been put in place.
440 * So by the time we perform such an operation it should always
441 * be possible to query for the appropriate context offsets, etc.
442 *
443 * But note: while the dbg_gpu bind requires a channel fd,
444 * it doesn't require an allocated gr/compute obj at that point...
445 */
446static bool gr_context_info_available(struct dbg_session_gk20a *dbg_s,
447 struct gr_gk20a *gr)
448{
449 int err;
450
451 mutex_lock(&gr->ctx_mutex);
452 err = !gr->ctx_vars.golden_image_initialized;
453 mutex_unlock(&gr->ctx_mutex);
454 if (err)
455 return false;
456 return true;
457
458}
459
460static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
461 struct nvhost_dbg_gpu_exec_reg_ops_args *args)
462{
463 int err;
464 struct device *dev = dbg_s->dev;
465 struct gk20a *g = get_gk20a(dbg_s->pdev);
466 struct nvhost_dbg_gpu_reg_op *ops;
467 u64 ops_size = sizeof(ops[0]) * args->num_ops;
468
469 gk20a_dbg_fn("%d ops, total size %llu", args->num_ops, ops_size);
470
471 if (!dbg_s->ops) {
472 gk20a_err(dev, "can't call reg_ops on an unbound debugger session");
473 return -EINVAL;
474 }
475
476 if (!dbg_s->is_profiler && !dbg_s->ch) {
477 gk20a_err(dev, "bind a channel before regops for a debugging session");
478 return -EINVAL;
479 }
480
481 /* be sure that ctx info is in place */
482 if (!gr_context_info_available(dbg_s, &g->gr)) {
483 gk20a_err(dev, "gr context data not available\n");
484 return -ENODEV;
485 }
486
487 ops = kzalloc(ops_size, GFP_KERNEL);
488 if (!ops) {
489 gk20a_err(dev, "Allocating memory failed!");
490 return -ENOMEM;
491 }
492
493 gk20a_dbg_fn("Copying regops from userspace");
494
495 if (copy_from_user(ops, (void *)(uintptr_t)args->ops, ops_size)) {
496 dev_err(dev, "copy_from_user failed!");
497 err = -EFAULT;
498 goto clean_up;
499 }
500
501 /* since exec_reg_ops sends methods to the ucode, it must take the
502 * global gpu lock to protect against mixing methods from debug sessions
503 * on other channels */
504 mutex_lock(&g->dbg_sessions_lock);
505
506 err = dbg_s->ops->exec_reg_ops(dbg_s, ops, args->num_ops);
507
508 mutex_unlock(&g->dbg_sessions_lock);
509
510 if (err) {
511 gk20a_err(dev, "dbg regops failed");
512 goto clean_up;
513 }
514
515 gk20a_dbg_fn("Copying result to userspace");
516
517 if (copy_to_user((void *)(uintptr_t)args->ops, ops, ops_size)) {
518 dev_err(dev, "copy_to_user failed!");
519 err = -EFAULT;
520 goto clean_up;
521 }
522 return 0;
523 clean_up:
524 kfree(ops);
525 return err;
526}
527
528static int dbg_set_powergate(struct dbg_session_gk20a *dbg_s,
529 __u32 powermode)
530{
531 int err = 0;
532 struct gk20a *g = get_gk20a(dbg_s->pdev);
533
534 /* This function must be called with g->dbg_sessions_lock held */
535
536 gk20a_dbg(gpu_dbg_fn|gpu_dbg_gpu_dbg, "%s powergate mode = %d",
537 dev_name(dbg_s->dev), powermode);
538
539 switch (powermode) {
540 case NVHOST_DBG_GPU_POWERGATE_MODE_DISABLE:
541 /* save off current powergate, clk state.
542 * set gpu module's can_powergate = 0.
543 * set gpu module's clk to max.
544 * while *a* debug session is active there will be no power or
545 * clocking state changes allowed from mainline code (but they
546 * should be saved).
547 */
548		/* Actually disable powergating only if this dbg_session has not
549		 * already requested it and the global
550		 * powergating_disabled_refcount is making its 0 -> 1 transition
551		 */
552
553 if ((dbg_s->is_pg_disabled == false) &&
554 (g->dbg_powergating_disabled_refcount++ == 0)) {
555
556 gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "module busy");
557 gk20a_busy(g->dev);
558 gk20a_channel_busy(dbg_s->pdev);
559
560 g->ops.clock_gating.slcg_gr_load_gating_prod(g,
561 false);
562 g->ops.clock_gating.slcg_perf_load_gating_prod(g,
563 false);
564 gr_gk20a_init_blcg_mode(g, BLCG_RUN, ENGINE_GR_GK20A);
565
566 g->elcg_enabled = false;
567 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
568 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
569
570 gk20a_pmu_disable_elpg(g);
571 }
572
573 dbg_s->is_pg_disabled = true;
574 break;
575
576 case NVHOST_DBG_GPU_POWERGATE_MODE_ENABLE:
577 /* restore (can) powergate, clk state */
578 /* release pending exceptions to fault/be handled as usual */
579 /*TBD: ordering of these? */
580
581		/* Re-enable powergating only when this dbg_session had requested
582		 * the disable through the ioctl and no other session still wants
583		 * powergating disabled (i.e. the refcount drops back to zero)
584		 */
585 if (dbg_s->is_pg_disabled &&
586 --g->dbg_powergating_disabled_refcount == 0) {
587
588 g->elcg_enabled = true;
589 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
590 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
591 gr_gk20a_init_blcg_mode(g, BLCG_AUTO, ENGINE_GR_GK20A);
592
593 g->ops.clock_gating.slcg_gr_load_gating_prod(g,
594 g->slcg_enabled);
595 g->ops.clock_gating.slcg_perf_load_gating_prod(g,
596 g->slcg_enabled);
597
598 gk20a_pmu_enable_elpg(g);
599
600 gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "module idle");
601 gk20a_channel_idle(dbg_s->pdev);
602 gk20a_idle(g->dev);
603 }
604
605 dbg_s->is_pg_disabled = false;
606 break;
607
608 default:
609 gk20a_err(dev_from_gk20a(g),
610 "unrecognized dbg gpu powergate mode: 0x%x",
611 powermode);
612 err = -ENOTTY;
613 break;
614 }
615
616 return err;
617}
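
The two branches above only touch hardware on the edges of g->dbg_powergating_disabled_refcount; every other request just moves the count. A standalone illustration of that rule (not driver code, names are made up):

#include <stdio.h>

static int disable_refcount;	/* stands in for dbg_powergating_disabled_refcount */

static void request_disable(void)
{
	if (disable_refcount++ == 0)
		printf("0 -> 1: really disable power gating\n");
}

static void request_enable(void)
{
	if (--disable_refcount == 0)
		printf("1 -> 0: really re-enable power gating\n");
}

int main(void)
{
	request_disable();	/* session A: hardware actually changes */
	request_disable();	/* session B: refcount only */
	request_enable();	/* session B: refcount only */
	request_enable();	/* session A: hardware actually changes */
	return 0;
}
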
618
619static int nvhost_ioctl_powergate_gk20a(struct dbg_session_gk20a *dbg_s,
620 struct nvhost_dbg_gpu_powergate_args *args)
621{
622 int err;
623 struct gk20a *g = get_gk20a(dbg_s->pdev);
624 gk20a_dbg_fn("%s powergate mode = %d",
625 dev_name(dbg_s->dev), args->mode);
626
627 mutex_lock(&g->dbg_sessions_lock);
628 err = dbg_set_powergate(dbg_s, args->mode);
629 mutex_unlock(&g->dbg_sessions_lock);
630 return err;
631}
632
633static int nvhost_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s,
634 struct nvhost_dbg_gpu_smpc_ctxsw_mode_args *args)
635{
636 int err;
637 struct gk20a *g = get_gk20a(dbg_s->pdev);
638 struct channel_gk20a *ch_gk20a;
639
640 gk20a_dbg_fn("%s smpc ctxsw mode = %d",
641 dev_name(dbg_s->dev), args->mode);
642
643 /* Take the global lock, since we'll be doing global regops */
644 mutex_lock(&g->dbg_sessions_lock);
645
646 ch_gk20a = dbg_s->ch;
647
648 if (!ch_gk20a) {
649 gk20a_err(dev_from_gk20a(dbg_s->g),
650 "no bound channel for smpc ctxsw mode update\n");
651 err = -EINVAL;
652 goto clean_up;
653 }
654
655 err = gr_gk20a_update_smpc_ctxsw_mode(g, ch_gk20a,
656 args->mode == NVHOST_DBG_GPU_SMPC_CTXSW_MODE_CTXSW);
657 if (err) {
658 gk20a_err(dev_from_gk20a(dbg_s->g),
659 "error (%d) during smpc ctxsw mode update\n", err);
660 goto clean_up;
661 }
662	/* The following regops are a hack/workaround to make up for the fact
663	 * that we just scribbled into the ctxsw image without really knowing
664	 * whether it has already been swapped in/out once or not, etc.
665	 */
666 {
667 struct nvhost_dbg_gpu_reg_op ops[4];
668 int i;
669 for (i = 0; i < ARRAY_SIZE(ops); i++) {
670 ops[i].op = NVHOST_DBG_GPU_REG_OP_WRITE_32;
671 ops[i].type = NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX;
672 ops[i].status = NVHOST_DBG_GPU_REG_OP_STATUS_SUCCESS;
673 ops[i].value_hi = 0;
674 ops[i].and_n_mask_lo = 0;
675 ops[i].and_n_mask_hi = 0;
676 }
677 /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter_control_sel1_r();*/
678 ops[0].offset = 0x00419e08;
679 ops[0].value_lo = 0x1d;
680
681 /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter_control5_r(); */
682 ops[1].offset = 0x00419e58;
683 ops[1].value_lo = 0x1;
684
685 /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter_control3_r(); */
686 ops[2].offset = 0x00419e68;
687 ops[2].value_lo = 0xaaaa;
688
689 /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter4_control_r(); */
690 ops[3].offset = 0x00419f40;
691 ops[3].value_lo = 0x18;
692
693 err = dbg_s->ops->exec_reg_ops(dbg_s, ops, ARRAY_SIZE(ops));
694 }
695
696 clean_up:
697 mutex_unlock(&g->dbg_sessions_lock);
698 return err;
699}
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
new file mode 100644
index 00000000..49827608
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
@@ -0,0 +1,83 @@
1/*
2 * Tegra GK20A GPU Debugger Driver
3 *
4 * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18#ifndef __DBG_GPU_GK20A_H_
19#define __DBG_GPU_GK20A_H_
20#include <linux/poll.h>
21
22/* module debug driver interface */
23int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp);
24int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp);
25long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
26unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait);
27
28/* used by profiler driver interface */
29int gk20a_prof_gpu_dev_open(struct inode *inode, struct file *filp);
30
31/* used by the interrupt handler to post events */
32void gk20a_dbg_gpu_post_events(struct channel_gk20a *fault_ch);
33
34struct dbg_gpu_session_ops {
35 int (*exec_reg_ops)(struct dbg_session_gk20a *dbg_s,
36 struct nvhost_dbg_gpu_reg_op *ops,
37 u64 num_ops);
38};
39
40struct dbg_gpu_session_events {
41 wait_queue_head_t wait_queue;
42 bool events_enabled;
43 int num_pending_events;
44};
45
46struct dbg_session_gk20a {
47 /* dbg session id used for trace/prints */
48 int id;
49
50 /* profiler session, if any */
51 bool is_profiler;
52
53	/* has this session disabled powergating? */
54 bool is_pg_disabled;
55
56 /*
57 * There can be different versions of the whitelists
58 * between both global and per-context sets; as well
59 * as between debugger and profiler interfaces.
60 */
61 struct regops_whitelist *global;
62 struct regops_whitelist *per_context;
63
64 /* gpu module vagaries */
65 struct device *dev;
66 struct platform_device *pdev;
67 struct gk20a *g;
68
69 /* bound channel, if any */
70 struct file *ch_f;
71 struct channel_gk20a *ch;
72
73 /* session operations */
74 struct dbg_gpu_session_ops *ops;
75
76 /* event support */
77 struct dbg_gpu_session_events dbg_events;
78 struct list_head dbg_s_list_node;
79};
80
81extern struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a;
82
83#endif /* __DBG_GPU_GK20A_H_ */
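
For context, a hedged user-space sketch of how the event interface above is typically consumed: enable events on a debug session, then poll() until gk20a_dbg_gpu_post_events() wakes the queue and the poll handler reports POLLPRI | POLLIN. The /dev/nvhost-dbg-gpu node path is an assumption, and the required channel bind (NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL) is omitted for brevity.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <linux/nvhost_dbg_gpu_ioctl.h>

int main(void)
{
	struct nvhost_dbg_gpu_events_ctrl_args ctrl = {
		.cmd = NVHOST_DBG_GPU_EVENTS_CTRL_CMD_ENABLE,
	};
	struct pollfd pfd;
	int fd = open("/dev/nvhost-dbg-gpu", O_RDWR);	/* node path assumed */

	if (fd < 0)
		return 1;

	/* a channel must already be bound for EVENTS_CTRL to succeed */
	if (ioctl(fd, NVHOST_DBG_GPU_IOCTL_EVENTS_CTRL, &ctrl) == 0) {
		pfd.fd = fd;
		pfd.events = POLLPRI | POLLIN;	/* what the poll handler raises */
		if (poll(&pfd, 1, 1000) > 0)
			printf("debug event pending\n");
	}

	close(fd);
	return 0;
}
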
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
new file mode 100644
index 00000000..c5b6953c
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
@@ -0,0 +1,295 @@
1/*
2 * drivers/video/tegra/host/t20/debug_gk20a.c
3 *
4 * Copyright (C) 2011-2014 NVIDIA Corporation. All rights reserved.
5 *
6 * This software is licensed under the terms of the GNU General Public
7 * License version 2, as published by the Free Software Foundation, and
8 * may be copied, distributed, and modified under those terms.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 */
16
17#include <linux/nvhost.h>
18#include <linux/debugfs.h>
19#include <linux/seq_file.h>
20
21#include <linux/io.h>
22
23#include "gk20a.h"
24#include "debug_gk20a.h"
25
26#include "hw_ram_gk20a.h"
27#include "hw_fifo_gk20a.h"
28#include "hw_ccsr_gk20a.h"
29#include "hw_pbdma_gk20a.h"
30
31unsigned int gk20a_debug_trace_cmdbuf;
32struct platform_device *gk20a_device;
33
34struct gk20a_debug_output {
35 void (*fn)(void *ctx, const char *str, size_t len);
36 void *ctx;
37 char buf[256];
38};
39
40static const char * const ccsr_chan_status_str[] = {
41 "idle",
42 "pending",
43 "pending_ctx_reload",
44 "pending_acquire",
45 "pending_acq_ctx_reload",
46 "on_pbdma",
47 "on_pbdma_and_eng",
48 "on_eng",
49 "on_eng_pending_acquire",
50 "on_eng_pending",
51 "on_pbdma_ctx_reload",
52 "on_pbdma_and_eng_ctx_reload",
53 "on_eng_ctx_reload",
54 "on_eng_pending_ctx_reload",
55 "on_eng_pending_acq_ctx_reload",
56};
57
58static const char * const chan_status_str[] = {
59 "invalid",
60 "valid",
61 "chsw_load",
62 "chsw_save",
63 "chsw_switch",
64};
65
66static const char * const ctx_status_str[] = {
67 "invalid",
68 "valid",
69 NULL,
70 NULL,
71 NULL,
72 "ctxsw_load",
73 "ctxsw_save",
74 "ctxsw_switch",
75};
76
77static inline void gk20a_debug_write_printk(void *ctx, const char *str,
78 size_t len)
79{
80 pr_info("%s", str);
81}
82
83static inline void gk20a_debug_write_to_seqfile(void *ctx, const char *str,
84 size_t len)
85{
86 seq_write((struct seq_file *)ctx, str, len);
87}
88
89void gk20a_debug_output(struct gk20a_debug_output *o, const char *fmt, ...)
90{
91 va_list args;
92 int len;
93
94 va_start(args, fmt);
95 len = vsnprintf(o->buf, sizeof(o->buf), fmt, args);
96 va_end(args);
97 o->fn(o->ctx, o->buf, len);
98}
99
100static void gk20a_debug_show_channel(struct gk20a *g,
101 struct gk20a_debug_output *o,
102 struct channel_gk20a *ch)
103{
104 u32 channel = gk20a_readl(g, ccsr_channel_r(ch->hw_chid));
105 u32 status = ccsr_channel_status_v(channel);
106 u32 syncpointa, syncpointb;
107 void *inst_ptr;
108
109 inst_ptr = ch->inst_block.cpuva;
110 if (!inst_ptr)
111 return;
112
113 syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w());
114 syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w());
115
116 gk20a_debug_output(o, "%d-%s, pid %d: ", ch->hw_chid,
117 ch->g->dev->name,
118 ch->pid);
119 gk20a_debug_output(o, "%s in use %s %s\n",
120 ccsr_channel_enable_v(channel) ? "" : "not",
121 ccsr_chan_status_str[status],
122 ccsr_channel_busy_v(channel) ? "busy" : "not busy");
123 gk20a_debug_output(o, "TOP: %016llx PUT: %016llx GET: %016llx "
124 "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n"
125 "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n",
126 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_top_level_get_w()) +
127 ((u64)gk20a_mem_rd32(inst_ptr,
128 ram_fc_pb_top_level_get_hi_w()) << 32ULL),
129 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_w()) +
130 ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_hi_w()) << 32ULL),
131 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_w()) +
132 ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_hi_w()) << 32ULL),
133 (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_w()) +
134 ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_hi_w()) << 32ULL),
135 gk20a_mem_rd32(inst_ptr, ram_fc_pb_header_w()),
136 gk20a_mem_rd32(inst_ptr, ram_fc_pb_count_w()),
137 syncpointa,
138 syncpointb,
139 gk20a_mem_rd32(inst_ptr, ram_fc_semaphorea_w()),
140 gk20a_mem_rd32(inst_ptr, ram_fc_semaphoreb_w()),
141 gk20a_mem_rd32(inst_ptr, ram_fc_semaphorec_w()),
142 gk20a_mem_rd32(inst_ptr, ram_fc_semaphored_w()));
143
144 if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v())
145 && (pbdma_syncpointb_wait_switch_v(syncpointb) ==
146 pbdma_syncpointb_wait_switch_en_v()))
147 gk20a_debug_output(o, "Waiting on syncpt %u (%s) val %u\n",
148 pbdma_syncpointb_syncpt_index_v(syncpointb),
149 nvhost_syncpt_get_name(
150 to_platform_device(g->dev->dev.parent),
151 pbdma_syncpointb_syncpt_index_v(syncpointb)),
152 pbdma_syncpointa_payload_v(syncpointa));
153
154 gk20a_debug_output(o, "\n");
155}
156
157void gk20a_debug_show_dump(struct platform_device *pdev,
158 struct gk20a_debug_output *o)
159{
160 struct gk20a_platform *platform = gk20a_get_platform(pdev);
161 struct gk20a *g = platform->g;
162 struct fifo_gk20a *f = &g->fifo;
163 u32 chid;
164 int i;
165
166 gk20a_busy(g->dev);
167 for (i = 0; i < fifo_pbdma_status__size_1_v(); i++) {
168 u32 status = gk20a_readl(g, fifo_pbdma_status_r(i));
169 u32 chan_status = fifo_pbdma_status_chan_status_v(status);
170
171 gk20a_debug_output(o, "%s pbdma %d: ", g->dev->name, i);
172 gk20a_debug_output(o,
173 "id: %d (%s), next_id: %d (%s) status: %s\n",
174 fifo_pbdma_status_id_v(status),
175 fifo_pbdma_status_id_type_v(status) ?
176 "tsg" : "channel",
177 fifo_pbdma_status_next_id_v(status),
178 fifo_pbdma_status_next_id_type_v(status) ?
179 "tsg" : "channel",
180 chan_status_str[chan_status]);
181 gk20a_debug_output(o, "PUT: %016llx GET: %016llx "
182 "FETCH: %08x HEADER: %08x\n",
183 (u64)gk20a_readl(g, pbdma_put_r(i)) +
184 ((u64)gk20a_readl(g, pbdma_put_hi_r(i)) << 32ULL),
185 (u64)gk20a_readl(g, pbdma_get_r(i)) +
186 ((u64)gk20a_readl(g, pbdma_get_hi_r(i)) << 32ULL),
187 gk20a_readl(g, pbdma_gp_fetch_r(i)),
188 gk20a_readl(g, pbdma_pb_header_r(i)));
189 }
190 gk20a_debug_output(o, "\n");
191
192 for (i = 0; i < fifo_engine_status__size_1_v(); i++) {
193 u32 status = gk20a_readl(g, fifo_engine_status_r(i));
194 u32 ctx_status = fifo_engine_status_ctx_status_v(status);
195
196 gk20a_debug_output(o, "%s eng %d: ", g->dev->name, i);
197 gk20a_debug_output(o,
198 "id: %d (%s), next_id: %d (%s), ctx: %s ",
199 fifo_engine_status_id_v(status),
200 fifo_engine_status_id_type_v(status) ?
201 "tsg" : "channel",
202 fifo_engine_status_next_id_v(status),
203 fifo_engine_status_next_id_type_v(status) ?
204 "tsg" : "channel",
205 ctx_status_str[ctx_status]);
206
207 if (fifo_engine_status_faulted_v(status))
208 gk20a_debug_output(o, "faulted ");
209 if (fifo_engine_status_engine_v(status))
210 gk20a_debug_output(o, "busy ");
211 gk20a_debug_output(o, "\n");
212 }
213 gk20a_debug_output(o, "\n");
214
215 for (chid = 0; chid < f->num_channels; chid++) {
216 if (f->channel[chid].in_use) {
217 struct channel_gk20a *gpu_ch = &f->channel[chid];
218 gk20a_debug_show_channel(g, o, gpu_ch);
219 }
220 }
221 gk20a_idle(g->dev);
222}
223
224void gk20a_debug_dump(struct platform_device *pdev)
225{
226 struct gk20a_platform *platform = gk20a_get_platform(pdev);
227 struct gk20a_debug_output o = {
228 .fn = gk20a_debug_write_printk
229 };
230
231 if (platform->dump_platform_dependencies)
232 platform->dump_platform_dependencies(pdev);
233
234 gk20a_debug_show_dump(pdev, &o);
235}
236
237void gk20a_debug_dump_device(struct platform_device *pdev)
238{
239 struct gk20a_debug_output o = {
240 .fn = gk20a_debug_write_printk
241 };
242
243 /* Dump the first device if no info is provided */
244 if (!pdev && gk20a_device)
245 pdev = gk20a_device;
246
247 gk20a_debug_show_dump(pdev, &o);
248}
249EXPORT_SYMBOL(gk20a_debug_dump_device);
250
251static int gk20a_debug_show(struct seq_file *s, void *unused)
252{
253 struct platform_device *pdev = s->private;
254 struct gk20a_debug_output o = {
255 .fn = gk20a_debug_write_to_seqfile,
256 .ctx = s,
257 };
258 gk20a_debug_show_dump(pdev, &o);
259 return 0;
260}
261
262static int gk20a_debug_open(struct inode *inode, struct file *file)
263{
264 return single_open(file, gk20a_debug_show, inode->i_private);
265}
266
267static const struct file_operations gk20a_debug_fops = {
268 .open = gk20a_debug_open,
269 .read = seq_read,
270 .llseek = seq_lseek,
271 .release = single_release,
272};
273
274void gk20a_debug_init(struct platform_device *pdev)
275{
276 struct gk20a_platform *platform = platform_get_drvdata(pdev);
277
278 /* Store the first device */
279 if (!gk20a_device)
280 gk20a_device = pdev;
281
282 platform->debugfs = debugfs_create_dir(pdev->name, NULL);
283
284 debugfs_create_file("status", S_IRUGO, platform->debugfs,
285 pdev, &gk20a_debug_fops);
286 debugfs_create_u32("trace_cmdbuf", S_IRUGO|S_IWUSR, platform->debugfs,
287 &gk20a_debug_trace_cmdbuf);
288
289#if defined(GK20A_DEBUG)
290 debugfs_create_u32("dbg_mask", S_IRUGO|S_IWUSR, platform->debugfs,
291 &gk20a_dbg_mask);
292 debugfs_create_u32("dbg_ftrace", S_IRUGO|S_IWUSR, platform->debugfs,
293 &gk20a_dbg_ftrace);
294#endif
295}
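
The gk20a_debug_output() helper above decouples formatting from the destination: each sink is just a callback plus a context pointer, which is how the same dump code feeds both printk and a debugfs seq_file. A miniature user-space re-creation of the pattern (all names here are hypothetical):

#include <stdio.h>
#include <stdarg.h>

/* miniature re-creation of the sink pattern; all names here are ours */
struct demo_output {
	void (*fn)(void *ctx, const char *str, size_t len);
	void *ctx;
	char buf[256];
};

static void demo_output(struct demo_output *o, const char *fmt, ...)
{
	va_list args;
	int len;

	va_start(args, fmt);
	len = vsnprintf(o->buf, sizeof(o->buf), fmt, args);
	va_end(args);
	if (len > (int)sizeof(o->buf) - 1)	/* guard against truncation */
		len = sizeof(o->buf) - 1;
	o->fn(o->ctx, o->buf, len);
}

static void sink_stdout(void *ctx, const char *str, size_t len)
{
	fwrite(str, 1, len, stdout);	/* stand-in for printk/seq_write */
}

int main(void)
{
	struct demo_output o = { .fn = sink_stdout };

	demo_output(&o, "channel %d: %s\n", 3, "idle");
	return 0;
}
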
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.h b/drivers/gpu/nvgpu/gk20a/debug_gk20a.h
new file mode 100644
index 00000000..cd2e09c3
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.h
@@ -0,0 +1,25 @@
1/*
2 * GK20A Debug functionality
3 *
4 * Copyright (C) 2011-2014 NVIDIA CORPORATION. All rights reserved.
5 *
6 * This software is licensed under the terms of the GNU General Public
7 * License version 2, as published by the Free Software Foundation, and
8 * may be copied, distributed, and modified under those terms.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 */
16
17#ifndef _DEBUG_GK20A_H_
18#define _DEBUG_GK20A_H_
19
20extern unsigned int gk20a_debug_trace_cmdbuf;
21
22void gk20a_debug_dump(struct platform_device *pdev);
23void gk20a_debug_init(struct platform_device *pdev);
24
25#endif
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
new file mode 100644
index 00000000..52f2db4d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
@@ -0,0 +1,37 @@
1/*
2 * GK20A memory interface
3 *
4 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#include <linux/types.h>
17
18#include "gk20a.h"
19#include "kind_gk20a.h"
20#include "hw_mc_gk20a.h"
21
22static void fb_gk20a_reset(struct gk20a *g)
23{
24 gk20a_dbg_info("reset gk20a fb");
25
26 gk20a_reset(g, mc_enable_pfb_enabled_f()
27 | mc_enable_l2_enabled_f()
28 | mc_enable_xbar_enabled_f()
29 | mc_enable_hub_enabled_f());
30}
31
32void gk20a_init_fb(struct gpu_ops *gops)
33{
34 gops->fb.reset = fb_gk20a_reset;
35 gk20a_init_uncompressed_kind_map();
36 gk20a_init_kind_attr();
37}
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.h b/drivers/gpu/nvgpu/gk20a/fb_gk20a.h
new file mode 100644
index 00000000..34c21c9b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.h
@@ -0,0 +1,21 @@
1/*
2 * GK20A FB
3 *
4 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef _NVHOST_GK20A_FB
17#define _NVHOST_GK20A_FB
18struct gk20a;
19
20void gk20a_init_fb(struct gpu_ops *gops);
21#endif
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
new file mode 100644
index 00000000..5575b995
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -0,0 +1,1836 @@
1/*
2 * drivers/video/tegra/host/gk20a/fifo_gk20a.c
3 *
4 * GK20A Graphics FIFO (gr host)
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#include <linux/delay.h>
22#include <linux/slab.h>
23#include <linux/scatterlist.h>
24#include <trace/events/gk20a.h>
25#include <linux/dma-mapping.h>
26#include <linux/nvhost.h>
27
28#include "gk20a.h"
29#include "debug_gk20a.h"
30#include "hw_fifo_gk20a.h"
31#include "hw_pbdma_gk20a.h"
32#include "hw_ccsr_gk20a.h"
33#include "hw_ram_gk20a.h"
34#include "hw_proj_gk20a.h"
35#include "hw_top_gk20a.h"
36#include "hw_mc_gk20a.h"
37#include "hw_gr_gk20a.h"
38
39static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
40 u32 hw_chid, bool add,
41 bool wait_for_finish);
42static void gk20a_fifo_handle_mmu_fault_thread(struct work_struct *work);
43
44/*
45 * Link engine IDs to MMU IDs and vice versa.
46 */
47
48static inline u32 gk20a_engine_id_to_mmu_id(u32 engine_id)
49{
50 switch (engine_id) {
51 case ENGINE_GR_GK20A:
52 return 0x00;
53 case ENGINE_CE2_GK20A:
54 return 0x1b;
55 default:
56 return ~0;
57 }
58}
59
60static inline u32 gk20a_mmu_id_to_engine_id(u32 engine_id)
61{
62 switch (engine_id) {
63 case 0x00:
64 return ENGINE_GR_GK20A;
65 case 0x1b:
66 return ENGINE_CE2_GK20A;
67 default:
68 return ~0;
69 }
70}
71
72
73static int init_engine_info(struct fifo_gk20a *f)
74{
75 struct gk20a *g = f->g;
76 struct device *d = dev_from_gk20a(g);
77 struct fifo_engine_info_gk20a *gr_info;
78 const u32 gr_sw_id = ENGINE_GR_GK20A;
79 u32 i;
80 u32 max_info_entries = top_device_info__size_1_v();
81
82 gk20a_dbg_fn("");
83
84 /* all we really care about finding is the graphics entry */
85 /* especially early on in sim it probably thinks it has more */
86 f->num_engines = 1;
87
88 gr_info = f->engine_info + gr_sw_id;
89
90 gr_info->sw_id = gr_sw_id;
91 gr_info->name = "gr";
92 gr_info->dev_info_id = top_device_info_type_enum_graphics_v();
93 gr_info->mmu_fault_id = fifo_intr_mmu_fault_eng_id_graphics_v();
94 gr_info->runlist_id = ~0;
95 gr_info->pbdma_id = ~0;
96 gr_info->engine_id = ~0;
97
98 for (i = 0; i < max_info_entries; i++) {
99 u32 table_entry = gk20a_readl(f->g, top_device_info_r(i));
100 u32 entry = top_device_info_entry_v(table_entry);
101 u32 engine_enum = top_device_info_type_enum_v(table_entry);
102 u32 table_entry2 = 0;
103
104 if (entry == top_device_info_entry_not_valid_v())
105 continue;
106
107 if (top_device_info_chain_v(table_entry) ==
108 top_device_info_chain_enable_v()) {
109
110 table_entry2 = gk20a_readl(f->g,
111 top_device_info_r(++i));
112
113 engine_enum = top_device_info_type_enum_v(table_entry2);
114 }
115
116 /* we only care about GR engine here */
117 if (entry == top_device_info_entry_enum_v() &&
118 engine_enum == gr_info->dev_info_id) {
119 int pbdma_id;
120 u32 runlist_bit;
121
122 gr_info->runlist_id =
123 top_device_info_runlist_enum_v(table_entry);
124 gk20a_dbg_info("gr info: runlist_id %d", gr_info->runlist_id);
125
126 gr_info->engine_id =
127 top_device_info_engine_enum_v(table_entry);
128 gk20a_dbg_info("gr info: engine_id %d", gr_info->engine_id);
129
130 runlist_bit = 1 << gr_info->runlist_id;
131
132 for (pbdma_id = 0; pbdma_id < f->num_pbdma; pbdma_id++) {
133 gk20a_dbg_info("gr info: pbdma_map[%d]=%d",
134 pbdma_id, f->pbdma_map[pbdma_id]);
135 if (f->pbdma_map[pbdma_id] & runlist_bit)
136 break;
137 }
138
139 if (pbdma_id == f->num_pbdma) {
140				gk20a_err(d, "busted pbdma map");
141 return -EINVAL;
142 }
143 gr_info->pbdma_id = pbdma_id;
144
145 break;
146 }
147 }
148
149 if (gr_info->runlist_id == ~0) {
150 gk20a_err(d, "busted device info");
151 return -EINVAL;
152 }
153
154 return 0;
155}
156
157void gk20a_remove_fifo_support(struct fifo_gk20a *f)
158{
159 struct gk20a *g = f->g;
160 struct device *d = dev_from_gk20a(g);
161 struct fifo_engine_info_gk20a *engine_info;
162 struct fifo_runlist_info_gk20a *runlist;
163 u32 runlist_id;
164 u32 i;
165
166 gk20a_dbg_fn("");
167
168 if (f->channel) {
169 int c;
170 for (c = 0; c < f->num_channels; c++) {
171 if (f->channel[c].remove_support)
172 f->channel[c].remove_support(f->channel+c);
173 }
174 kfree(f->channel);
175 }
176 if (f->userd.gpu_va)
177 gk20a_gmmu_unmap(&g->mm.bar1.vm,
178 f->userd.gpu_va,
179 f->userd.size,
180 gk20a_mem_flag_none);
181
182 if (f->userd.sgt)
183 gk20a_free_sgtable(&f->userd.sgt);
184
185 if (f->userd.cpuva)
186 dma_free_coherent(d,
187 f->userd_total_size,
188 f->userd.cpuva,
189 f->userd.iova);
190 f->userd.cpuva = NULL;
191 f->userd.iova = 0;
192
193 engine_info = f->engine_info + ENGINE_GR_GK20A;
194 runlist_id = engine_info->runlist_id;
195 runlist = &f->runlist_info[runlist_id];
196
197 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
198 if (runlist->mem[i].cpuva)
199 dma_free_coherent(d,
200 runlist->mem[i].size,
201 runlist->mem[i].cpuva,
202 runlist->mem[i].iova);
203 runlist->mem[i].cpuva = NULL;
204 runlist->mem[i].iova = 0;
205 }
206
207 kfree(runlist->active_channels);
208
209 kfree(f->runlist_info);
210 kfree(f->pbdma_map);
211 kfree(f->engine_info);
212}
213
214/* reads info from hardware and fills in pbdma exception info record */
215static inline void get_exception_pbdma_info(
216 struct gk20a *g,
217 struct fifo_engine_info_gk20a *eng_info)
218{
219 struct fifo_pbdma_exception_info_gk20a *e =
220 &eng_info->pbdma_exception_info;
221
222 u32 pbdma_status_r = e->status_r = gk20a_readl(g,
223 fifo_pbdma_status_r(eng_info->pbdma_id));
224 e->id = fifo_pbdma_status_id_v(pbdma_status_r); /* vs. id_hw_v()? */
225 e->id_is_chid = fifo_pbdma_status_id_type_v(pbdma_status_r) ==
226 fifo_pbdma_status_id_type_chid_v();
227 e->chan_status_v = fifo_pbdma_status_chan_status_v(pbdma_status_r);
228 e->next_id_is_chid =
229 fifo_pbdma_status_next_id_type_v(pbdma_status_r) ==
230 fifo_pbdma_status_next_id_type_chid_v();
231 e->next_id = fifo_pbdma_status_next_id_v(pbdma_status_r);
232 e->chsw_in_progress =
233 fifo_pbdma_status_chsw_v(pbdma_status_r) ==
234 fifo_pbdma_status_chsw_in_progress_v();
235}
236
237static void fifo_pbdma_exception_status(struct gk20a *g,
238 struct fifo_engine_info_gk20a *eng_info)
239{
240 struct fifo_pbdma_exception_info_gk20a *e;
241 get_exception_pbdma_info(g, eng_info);
242 e = &eng_info->pbdma_exception_info;
243
244 gk20a_dbg_fn("pbdma_id %d, "
245 "id_type %s, id %d, chan_status %d, "
246 "next_id_type %s, next_id %d, "
247 "chsw_in_progress %d",
248 eng_info->pbdma_id,
249 e->id_is_chid ? "chid" : "tsgid", e->id, e->chan_status_v,
250 e->next_id_is_chid ? "chid" : "tsgid", e->next_id,
251 e->chsw_in_progress);
252}
253
254/* reads info from hardware and fills in engine exception info record */
255static inline void get_exception_engine_info(
256 struct gk20a *g,
257 struct fifo_engine_info_gk20a *eng_info)
258{
259 struct fifo_engine_exception_info_gk20a *e =
260 &eng_info->engine_exception_info;
261 u32 engine_status_r = e->status_r =
262 gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
263 e->id = fifo_engine_status_id_v(engine_status_r); /* vs. id_hw_v()? */
264 e->id_is_chid = fifo_engine_status_id_type_v(engine_status_r) ==
265 fifo_engine_status_id_type_chid_v();
266 e->ctx_status_v = fifo_engine_status_ctx_status_v(engine_status_r);
267 e->faulted =
268 fifo_engine_status_faulted_v(engine_status_r) ==
269 fifo_engine_status_faulted_true_v();
270 e->idle =
271 fifo_engine_status_engine_v(engine_status_r) ==
272 fifo_engine_status_engine_idle_v();
273 e->ctxsw_in_progress =
274 fifo_engine_status_ctxsw_v(engine_status_r) ==
275 fifo_engine_status_ctxsw_in_progress_v();
276}
277
278static void fifo_engine_exception_status(struct gk20a *g,
279 struct fifo_engine_info_gk20a *eng_info)
280{
281 struct fifo_engine_exception_info_gk20a *e;
282 get_exception_engine_info(g, eng_info);
283 e = &eng_info->engine_exception_info;
284
285 gk20a_dbg_fn("engine_id %d, id_type %s, id %d, ctx_status %d, "
286 "faulted %d, idle %d, ctxsw_in_progress %d, ",
287 eng_info->engine_id, e->id_is_chid ? "chid" : "tsgid",
288 e->id, e->ctx_status_v,
289 e->faulted, e->idle, e->ctxsw_in_progress);
290}
291
292static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
293{
294 struct fifo_engine_info_gk20a *engine_info;
295 struct fifo_runlist_info_gk20a *runlist;
296 struct device *d = dev_from_gk20a(g);
297 u32 runlist_id;
298 u32 i;
299 u64 runlist_size;
300
301 gk20a_dbg_fn("");
302
303 f->max_runlists = fifo_eng_runlist_base__size_1_v();
304 f->runlist_info = kzalloc(sizeof(struct fifo_runlist_info_gk20a) *
305 f->max_runlists, GFP_KERNEL);
306 if (!f->runlist_info)
307 goto clean_up;
308
309 engine_info = f->engine_info + ENGINE_GR_GK20A;
310 runlist_id = engine_info->runlist_id;
311 runlist = &f->runlist_info[runlist_id];
312
313 runlist->active_channels =
314 kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
315 GFP_KERNEL);
316 if (!runlist->active_channels)
317 goto clean_up_runlist_info;
318
319 runlist_size = ram_rl_entry_size_v() * f->num_channels;
320 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
321 dma_addr_t iova;
322
323 runlist->mem[i].cpuva =
324 dma_alloc_coherent(d,
325 runlist_size,
326 &iova,
327 GFP_KERNEL);
328 if (!runlist->mem[i].cpuva) {
329 dev_err(d, "memory allocation failed\n");
330 goto clean_up_runlist;
331 }
332 runlist->mem[i].iova = iova;
333 runlist->mem[i].size = runlist_size;
334 }
335 mutex_init(&runlist->mutex);
336 init_waitqueue_head(&runlist->runlist_wq);
337
338	/* None of the buffers is pinned if this value doesn't change.
339 Otherwise, one of them (cur_buffer) must have been pinned. */
340 runlist->cur_buffer = MAX_RUNLIST_BUFFERS;
341
342 gk20a_dbg_fn("done");
343 return 0;
344
345clean_up_runlist:
346 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
347 if (runlist->mem[i].cpuva)
348 dma_free_coherent(d,
349 runlist->mem[i].size,
350 runlist->mem[i].cpuva,
351 runlist->mem[i].iova);
352 runlist->mem[i].cpuva = NULL;
353 runlist->mem[i].iova = 0;
354 }
355
356 kfree(runlist->active_channels);
357 runlist->active_channels = NULL;
358
359clean_up_runlist_info:
360 kfree(f->runlist_info);
361 f->runlist_info = NULL;
362
363clean_up:
364 gk20a_dbg_fn("fail");
365 return -ENOMEM;
366}
367
368#define GRFIFO_TIMEOUT_CHECK_PERIOD_US 100000
369
370int gk20a_init_fifo_reset_enable_hw(struct gk20a *g)
371{
372 u32 intr_stall;
373 u32 mask;
374 u32 timeout;
375 int i;
376
377 gk20a_dbg_fn("");
378 /* enable pmc pfifo */
379 gk20a_reset(g, mc_enable_pfifo_enabled_f()
380 | mc_enable_ce2_enabled_f());
381
382 /* enable pbdma */
383 mask = 0;
384 for (i = 0; i < proj_host_num_pbdma_v(); ++i)
385 mask |= mc_enable_pb_sel_f(mc_enable_pb_0_enabled_v(), i);
386 gk20a_writel(g, mc_enable_pb_r(), mask);
387
388 /* enable pfifo interrupt */
389 gk20a_writel(g, fifo_intr_0_r(), 0xFFFFFFFF);
390 gk20a_writel(g, fifo_intr_en_0_r(), 0x7FFFFFFF);
391 gk20a_writel(g, fifo_intr_en_1_r(), 0x80000000);
392
393 /* enable pbdma interrupt */
394 mask = 0;
395 for (i = 0; i < proj_host_num_pbdma_v(); i++) {
396 intr_stall = gk20a_readl(g, pbdma_intr_stall_r(i));
397 intr_stall &= ~pbdma_intr_stall_lbreq_enabled_f();
398 gk20a_writel(g, pbdma_intr_stall_r(i), intr_stall);
399 gk20a_writel(g, pbdma_intr_0_r(i), 0xFFFFFFFF);
400 gk20a_writel(g, pbdma_intr_en_0_r(i),
401 (~0) & ~pbdma_intr_en_0_lbreq_enabled_f());
402 gk20a_writel(g, pbdma_intr_1_r(i), 0xFFFFFFFF);
403 gk20a_writel(g, pbdma_intr_en_1_r(i), 0xFFFFFFFF);
404 }
405
406 /* TBD: apply overrides */
407
408 /* TBD: BLCG prod */
409
410 /* reset runlist interrupts */
411 gk20a_writel(g, fifo_intr_runlist_r(), ~0);
412
413 /* TBD: do we need those? */
414 timeout = gk20a_readl(g, fifo_fb_timeout_r());
415 timeout = set_field(timeout, fifo_fb_timeout_period_m(),
416 fifo_fb_timeout_period_max_f());
417 gk20a_writel(g, fifo_fb_timeout_r(), timeout);
418
419 if (tegra_platform_is_silicon()) {
420 timeout = gk20a_readl(g, fifo_pb_timeout_r());
421 timeout &= ~fifo_pb_timeout_detection_enabled_f();
422 gk20a_writel(g, fifo_pb_timeout_r(), timeout);
423 }
424
425 timeout = GRFIFO_TIMEOUT_CHECK_PERIOD_US |
426 fifo_eng_timeout_detection_enabled_f();
427 gk20a_writel(g, fifo_eng_timeout_r(), timeout);
428
429 gk20a_dbg_fn("done");
430
431 return 0;
432}
433
434static void gk20a_init_fifo_pbdma_intr_descs(struct fifo_gk20a *f)
435{
436	/* These are all errors which indicate something really wrong is
437 * going on in the device. */
438 f->intr.pbdma.device_fatal_0 =
439 pbdma_intr_0_memreq_pending_f() |
440 pbdma_intr_0_memack_timeout_pending_f() |
441 pbdma_intr_0_memack_extra_pending_f() |
442 pbdma_intr_0_memdat_timeout_pending_f() |
443 pbdma_intr_0_memdat_extra_pending_f() |
444 pbdma_intr_0_memflush_pending_f() |
445 pbdma_intr_0_memop_pending_f() |
446 pbdma_intr_0_lbconnect_pending_f() |
447 pbdma_intr_0_lbreq_pending_f() |
448 pbdma_intr_0_lback_timeout_pending_f() |
449 pbdma_intr_0_lback_extra_pending_f() |
450 pbdma_intr_0_lbdat_timeout_pending_f() |
451 pbdma_intr_0_lbdat_extra_pending_f() |
452 pbdma_intr_0_xbarconnect_pending_f() |
453 pbdma_intr_0_pri_pending_f();
454
455	/* These are data parsing or framing errors, or others which can be
456	 * recovered from with intervention... or by just resetting the
457 * channel. */
458 f->intr.pbdma.channel_fatal_0 =
459 pbdma_intr_0_gpfifo_pending_f() |
460 pbdma_intr_0_gpptr_pending_f() |
461 pbdma_intr_0_gpentry_pending_f() |
462 pbdma_intr_0_gpcrc_pending_f() |
463 pbdma_intr_0_pbptr_pending_f() |
464 pbdma_intr_0_pbentry_pending_f() |
465 pbdma_intr_0_pbcrc_pending_f() |
466 pbdma_intr_0_method_pending_f() |
467 pbdma_intr_0_methodcrc_pending_f() |
468 pbdma_intr_0_pbseg_pending_f() |
469 pbdma_intr_0_signature_pending_f();
470
471 /* Can be used for sw-methods, or represents
472 * a recoverable timeout. */
473 f->intr.pbdma.restartable_0 =
474 pbdma_intr_0_device_pending_f() |
475 pbdma_intr_0_acquire_pending_f();
476}
477
478static int gk20a_init_fifo_setup_sw(struct gk20a *g)
479{
480 struct fifo_gk20a *f = &g->fifo;
481 struct device *d = dev_from_gk20a(g);
482 int chid, i, err = 0;
483 dma_addr_t iova;
484
485 gk20a_dbg_fn("");
486
487 if (f->sw_ready) {
488 gk20a_dbg_fn("skip init");
489 return 0;
490 }
491
492 f->g = g;
493
494 INIT_WORK(&f->fault_restore_thread,
495 gk20a_fifo_handle_mmu_fault_thread);
496 mutex_init(&f->intr.isr.mutex);
497 gk20a_init_fifo_pbdma_intr_descs(f); /* just filling in data/tables */
498
499 f->num_channels = ccsr_channel__size_1_v();
500 f->num_pbdma = proj_host_num_pbdma_v();
501 f->max_engines = ENGINE_INVAL_GK20A;
502
503 f->userd_entry_size = 1 << ram_userd_base_shift_v();
504 f->userd_total_size = f->userd_entry_size * f->num_channels;
505
506 f->userd.cpuva = dma_alloc_coherent(d,
507 f->userd_total_size,
508 &iova,
509 GFP_KERNEL);
510 if (!f->userd.cpuva) {
511 dev_err(d, "memory allocation failed\n");
512 goto clean_up;
513 }
514
515 f->userd.iova = iova;
516 err = gk20a_get_sgtable(d, &f->userd.sgt,
517 f->userd.cpuva, f->userd.iova,
518 f->userd_total_size);
519 if (err) {
520 dev_err(d, "failed to create sg table\n");
521 goto clean_up;
522 }
523
524 /* bar1 va */
525 f->userd.gpu_va = gk20a_gmmu_map(&g->mm.bar1.vm,
526 &f->userd.sgt,
527 f->userd_total_size,
528 0, /* flags */
529 gk20a_mem_flag_none);
530 if (!f->userd.gpu_va) {
531 dev_err(d, "gmmu mapping failed\n");
532 goto clean_up;
533 }
534
535 gk20a_dbg(gpu_dbg_map, "userd bar1 va = 0x%llx", f->userd.gpu_va);
536
537 f->userd.size = f->userd_total_size;
538
539 f->channel = kzalloc(f->num_channels * sizeof(*f->channel),
540 GFP_KERNEL);
541 f->pbdma_map = kzalloc(f->num_pbdma * sizeof(*f->pbdma_map),
542 GFP_KERNEL);
543 f->engine_info = kzalloc(f->max_engines * sizeof(*f->engine_info),
544 GFP_KERNEL);
545
546 if (!(f->channel && f->pbdma_map && f->engine_info)) {
547 err = -ENOMEM;
548 goto clean_up;
549 }
550
551 /* pbdma map needs to be in place before calling engine info init */
552 for (i = 0; i < f->num_pbdma; ++i)
553 f->pbdma_map[i] = gk20a_readl(g, fifo_pbdma_map_r(i));
554
555 init_engine_info(f);
556
557 init_runlist(g, f);
558
559 for (chid = 0; chid < f->num_channels; chid++) {
560 f->channel[chid].userd_cpu_va =
561 f->userd.cpuva + chid * f->userd_entry_size;
562 f->channel[chid].userd_iova =
563 NV_MC_SMMU_VADDR_TRANSLATE(f->userd.iova)
564 + chid * f->userd_entry_size;
565 f->channel[chid].userd_gpu_va =
566 f->userd.gpu_va + chid * f->userd_entry_size;
567
568 gk20a_init_channel_support(g, chid);
569 }
570 mutex_init(&f->ch_inuse_mutex);
571
572 f->remove_support = gk20a_remove_fifo_support;
573
574 f->deferred_reset_pending = false;
575 mutex_init(&f->deferred_reset_mutex);
576
577 f->sw_ready = true;
578
579 gk20a_dbg_fn("done");
580 return 0;
581
582clean_up:
583 gk20a_dbg_fn("fail");
584 if (f->userd.gpu_va)
585 gk20a_gmmu_unmap(&g->mm.bar1.vm,
586 f->userd.gpu_va,
587 f->userd.size,
588 gk20a_mem_flag_none);
589 if (f->userd.sgt)
590 gk20a_free_sgtable(&f->userd.sgt);
591 if (f->userd.cpuva)
592 dma_free_coherent(d,
593 f->userd_total_size,
594 f->userd.cpuva,
595 f->userd.iova);
596 f->userd.cpuva = NULL;
597 f->userd.iova = 0;
598
599 memset(&f->userd, 0, sizeof(struct userd_desc));
600
601 kfree(f->channel);
602 f->channel = NULL;
603 kfree(f->pbdma_map);
604 f->pbdma_map = NULL;
605 kfree(f->engine_info);
606 f->engine_info = NULL;
607
608 return err;
609}
610
611static void gk20a_fifo_handle_runlist_event(struct gk20a *g)
612{
613 struct fifo_gk20a *f = &g->fifo;
614 struct fifo_runlist_info_gk20a *runlist;
615 unsigned long runlist_event;
616 u32 runlist_id;
617
618 runlist_event = gk20a_readl(g, fifo_intr_runlist_r());
619 gk20a_writel(g, fifo_intr_runlist_r(), runlist_event);
620
621 for_each_set_bit(runlist_id, &runlist_event, f->max_runlists) {
622 runlist = &f->runlist_info[runlist_id];
623 wake_up(&runlist->runlist_wq);
624 }
625
626}
627
628static int gk20a_init_fifo_setup_hw(struct gk20a *g)
629{
630 struct fifo_gk20a *f = &g->fifo;
631
632 gk20a_dbg_fn("");
633
634 /* test write, read through bar1 @ userd region before
635 * turning on the snooping */
636 {
637 struct fifo_gk20a *f = &g->fifo;
638 u32 v, v1 = 0x33, v2 = 0x55;
639
640 u32 bar1_vaddr = f->userd.gpu_va;
641 volatile u32 *cpu_vaddr = f->userd.cpuva;
642
643 gk20a_dbg_info("test bar1 @ vaddr 0x%x",
644 bar1_vaddr);
645
646 v = gk20a_bar1_readl(g, bar1_vaddr);
647
648 *cpu_vaddr = v1;
649 smp_mb();
650
651 if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
652 gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
653 return -EINVAL;
654 }
655
656 gk20a_bar1_writel(g, bar1_vaddr, v2);
657
658 if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
659 gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
660 return -EINVAL;
661 }
662
663 /* is it visible to the cpu? */
664 if (*cpu_vaddr != v2) {
665 gk20a_err(dev_from_gk20a(g),
666 "cpu didn't see bar1 write @ %p!",
667 cpu_vaddr);
668 }
669
670 /* put it back */
671 gk20a_bar1_writel(g, bar1_vaddr, v);
672 }
673
674 /*XXX all manner of flushes and caching worries, etc */
675
676 /* set the base for the userd region now */
677 gk20a_writel(g, fifo_bar1_base_r(),
678 fifo_bar1_base_ptr_f(f->userd.gpu_va >> 12) |
679 fifo_bar1_base_valid_true_f());
680
681 gk20a_dbg_fn("done");
682
683 return 0;
684}
685
686int gk20a_init_fifo_support(struct gk20a *g)
687{
688 u32 err;
689
690 err = gk20a_init_fifo_setup_sw(g);
691 if (err)
692 return err;
693
694 err = gk20a_init_fifo_setup_hw(g);
695 if (err)
696 return err;
697
698 return err;
699}
700
701static struct channel_gk20a *
702channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr)
703{
704 int ci;
705 if (unlikely(!f->channel))
706 return NULL;
707 for (ci = 0; ci < f->num_channels; ci++) {
708 struct channel_gk20a *c = f->channel+ci;
709 if (c->inst_block.cpuva &&
710 (inst_ptr == c->inst_block.cpu_pa))
711 return f->channel+ci;
712 }
713 return NULL;
714}
715
716/* fault info/descriptions.
717 * tbd: move to setup
718 */
719static const char * const fault_type_descs[] = {
720 "pde", /*fifo_intr_mmu_fault_info_type_pde_v() == 0 */
721 "pde size",
722 "pte",
723 "va limit viol",
724 "unbound inst",
725 "priv viol",
726 "ro viol",
727 "wo viol",
728 "pitch mask",
729 "work creation",
730 "bad aperture",
731 "compression failure",
732 "bad kind",
733 "region viol",
734 "dual ptes",
735 "poisoned",
736};
737/* engine descriptions */
738static const char * const engine_subid_descs[] = {
739 "gpc",
740 "hub",
741};
742
743static const char * const hub_client_descs[] = {
744 "vip", "ce0", "ce1", "dniso", "fe", "fecs", "host", "host cpu",
745 "host cpu nb", "iso", "mmu", "mspdec", "msppp", "msvld",
746 "niso", "p2p", "pd", "perf", "pmu", "raster twod", "scc",
747 "scc nb", "sec", "ssync", "gr copy", "ce2", "xv", "mmu nb",
748 "msenc", "d falcon", "sked", "a falcon", "n/a",
749};
750
751static const char * const gpc_client_descs[] = {
752 "l1 0", "t1 0", "pe 0",
753 "l1 1", "t1 1", "pe 1",
754 "l1 2", "t1 2", "pe 2",
755 "l1 3", "t1 3", "pe 3",
756 "rast", "gcc", "gpccs",
757 "prop 0", "prop 1", "prop 2", "prop 3",
758 "l1 4", "t1 4", "pe 4",
759 "l1 5", "t1 5", "pe 5",
760 "l1 6", "t1 6", "pe 6",
761 "l1 7", "t1 7", "pe 7",
762 "gpm",
763 "ltp utlb 0", "ltp utlb 1", "ltp utlb 2", "ltp utlb 3",
764 "rgg utlb",
765};
766
767/* reads info from hardware and fills in mmu fault info record */
768static inline void get_exception_mmu_fault_info(
769 struct gk20a *g, u32 engine_id,
770 struct fifo_mmu_fault_info_gk20a *f)
771{
772 u32 fault_info_v;
773
774 gk20a_dbg_fn("engine_id %d", engine_id);
775
776 memset(f, 0, sizeof(*f));
777
778 f->fault_info_v = fault_info_v = gk20a_readl(g,
779 fifo_intr_mmu_fault_info_r(engine_id));
780 f->fault_type_v =
781 fifo_intr_mmu_fault_info_type_v(fault_info_v);
782 f->engine_subid_v =
783 fifo_intr_mmu_fault_info_engine_subid_v(fault_info_v);
784 f->client_v = fifo_intr_mmu_fault_info_client_v(fault_info_v);
785
786 BUG_ON(f->fault_type_v >= ARRAY_SIZE(fault_type_descs));
787 f->fault_type_desc = fault_type_descs[f->fault_type_v];
788
789 BUG_ON(f->engine_subid_v >= ARRAY_SIZE(engine_subid_descs));
790 f->engine_subid_desc = engine_subid_descs[f->engine_subid_v];
791
792 if (f->engine_subid_v ==
793 fifo_intr_mmu_fault_info_engine_subid_hub_v()) {
794
795 BUG_ON(f->client_v >= ARRAY_SIZE(hub_client_descs));
796 f->client_desc = hub_client_descs[f->client_v];
797 } else if (f->engine_subid_v ==
798 fifo_intr_mmu_fault_info_engine_subid_gpc_v()) {
799 BUG_ON(f->client_v >= ARRAY_SIZE(gpc_client_descs));
800 f->client_desc = gpc_client_descs[f->client_v];
801 } else {
802 BUG_ON(1);
803 }
804
805 f->fault_hi_v = gk20a_readl(g, fifo_intr_mmu_fault_hi_r(engine_id));
806 f->fault_lo_v = gk20a_readl(g, fifo_intr_mmu_fault_lo_r(engine_id));
807	/* note: ignoring aperture on gk20a... */
808 f->inst_ptr = fifo_intr_mmu_fault_inst_ptr_v(
809 gk20a_readl(g, fifo_intr_mmu_fault_inst_r(engine_id)));
810 /* note: inst_ptr is a 40b phys addr. */
811 f->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v();
812}
813
814static void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
815{
816 gk20a_dbg_fn("");
817
818 if (engine_id == top_device_info_type_enum_graphics_v()) {
819 /* resetting engine using mc_enable_r() is not enough,
820 * we do full init sequence */
821 gk20a_gr_reset(g);
822 }
823 if (engine_id == top_device_info_type_enum_copy0_v())
824 gk20a_reset(g, mc_enable_ce2_m());
825}
826
827static void gk20a_fifo_handle_mmu_fault_thread(struct work_struct *work)
828{
829 struct fifo_gk20a *f = container_of(work, struct fifo_gk20a,
830 fault_restore_thread);
831 struct gk20a *g = f->g;
832 int i;
833
834 /* Reinitialise FECS and GR */
835 gk20a_init_pmu_setup_hw2(g);
836
837 /* It is safe to enable ELPG again. */
838 gk20a_pmu_enable_elpg(g);
839
840 /* Restore the runlist */
841 for (i = 0; i < g->fifo.max_runlists; i++)
842 gk20a_fifo_update_runlist_locked(g, i, ~0, true, true);
843
844 /* unlock all runlists */
845 for (i = 0; i < g->fifo.max_runlists; i++)
846 mutex_unlock(&g->fifo.runlist_info[i].mutex);
847
848}
849
850static void gk20a_fifo_handle_chsw_fault(struct gk20a *g)
851{
852 u32 intr;
853
854 intr = gk20a_readl(g, fifo_intr_chsw_error_r());
855 gk20a_err(dev_from_gk20a(g), "chsw: %08x\n", intr);
856 gk20a_fecs_dump_falcon_stats(g);
857 gk20a_writel(g, fifo_intr_chsw_error_r(), intr);
858}
859
860static void gk20a_fifo_handle_dropped_mmu_fault(struct gk20a *g)
861{
862 struct device *dev = dev_from_gk20a(g);
863 u32 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
864 gk20a_err(dev, "dropped mmu fault (0x%08x)", fault_id);
865}
866
867static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
868 struct fifo_mmu_fault_info_gk20a *f, bool fake_fault)
869{
870 /* channel recovery is only deferred if an sm debugger
871	   is attached and MMU debug mode is enabled */
872 if (!gk20a_gr_sm_debugger_attached(g) ||
873 !gk20a_mm_mmu_debug_mode_enabled(g))
874 return false;
875
876 /* if this fault is fake (due to RC recovery), don't defer recovery */
877 if (fake_fault)
878 return false;
879
880 if (engine_id != ENGINE_GR_GK20A ||
881 f->engine_subid_v != fifo_intr_mmu_fault_info_engine_subid_gpc_v())
882 return false;
883
884 return true;
885}
886
887void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
888 unsigned long fault_id) {
889 u32 engine_mmu_id;
890 int i;
891
892 /* reset engines */
893 for_each_set_bit(engine_mmu_id, &fault_id, 32) {
894 u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id);
895 if (engine_id != ~0)
896 gk20a_fifo_reset_engine(g, engine_id);
897 }
898
899 /* CLEAR the runlists. Do not wait for runlist to start as
900 * some engines may not be available right now */
901 for (i = 0; i < g->fifo.max_runlists; i++)
902 gk20a_fifo_update_runlist_locked(g, i, ~0, false, false);
903
904 /* clear interrupt */
905 gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id);
906
907 /* resume scheduler */
908 gk20a_writel(g, fifo_error_sched_disable_r(),
909 gk20a_readl(g, fifo_error_sched_disable_r()));
910
911 /* Spawn a work to enable PMU and restore runlists */
912 schedule_work(&g->fifo.fault_restore_thread);
913}
914
915static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
916 struct channel_gk20a *ch) {
917 bool verbose = true;
918 if (!ch)
919 return verbose;
920
921 gk20a_err(dev_from_gk20a(g),
922 "channel %d generated a mmu fault",
923 ch->hw_chid);
924 if (ch->error_notifier) {
925 u32 err = ch->error_notifier->info32;
926 if (ch->error_notifier->status == 0xffff) {
927 /* If error code is already set, this mmu fault
928 * was triggered as part of recovery from other
929 * error condition.
930 * Don't overwrite error flag. */
931 /* Fifo timeout debug spew is controlled by user */
932 if (err == NVHOST_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT)
933 verbose = ch->timeout_debug_dump;
934 } else {
935 gk20a_set_error_notifier(ch,
936 NVHOST_CHANNEL_FIFO_ERROR_MMU_ERR_FLT);
937 }
938 }
939 /* mark channel as faulted */
940 ch->has_timedout = true;
941 wmb();
942 /* unblock pending waits */
943 wake_up(&ch->semaphore_wq);
944 wake_up(&ch->notifier_wq);
945 wake_up(&ch->submit_wq);
946 return verbose;
947}
948
949
950static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
951{
952 bool fake_fault;
953 unsigned long fault_id;
954 unsigned long engine_mmu_id;
955 int i;
956 bool verbose = true;
957 gk20a_dbg_fn("");
958
959 g->fifo.deferred_reset_pending = false;
960
961 /* Disable ELPG */
962 gk20a_pmu_disable_elpg(g);
963
964 /* If we have recovery in progress, MMU fault id is invalid */
965 if (g->fifo.mmu_fault_engines) {
966 fault_id = g->fifo.mmu_fault_engines;
967 g->fifo.mmu_fault_engines = 0;
968 fake_fault = true;
969 } else {
970 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
971 fake_fault = false;
972 gk20a_debug_dump(g->dev);
973 }
974
975	/* lock all runlists. Note that locks are released in
976 * gk20a_fifo_handle_mmu_fault_thread() */
977 for (i = 0; i < g->fifo.max_runlists; i++)
978 mutex_lock(&g->fifo.runlist_info[i].mutex);
979
980 /* go through all faulted engines */
981 for_each_set_bit(engine_mmu_id, &fault_id, 32) {
982 /* bits in fifo_intr_mmu_fault_id_r do not correspond 1:1 to
983 * engines. Convert engine_mmu_id to engine_id */
984 u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id);
985 struct fifo_runlist_info_gk20a *runlist = g->fifo.runlist_info;
986 struct fifo_mmu_fault_info_gk20a f;
987 struct channel_gk20a *ch = NULL;
988
989 get_exception_mmu_fault_info(g, engine_mmu_id, &f);
990 trace_gk20a_mmu_fault(f.fault_hi_v,
991 f.fault_lo_v,
992 f.fault_info_v,
993 f.inst_ptr,
994 engine_id,
995 f.engine_subid_desc,
996 f.client_desc,
997 f.fault_type_desc);
998 gk20a_err(dev_from_gk20a(g), "mmu fault on engine %d, "
999 "engine subid %d (%s), client %d (%s), "
1000			"addr 0x%08x:0x%08x, type %d (%s), info 0x%08x, "
1001 "inst_ptr 0x%llx\n",
1002 engine_id,
1003 f.engine_subid_v, f.engine_subid_desc,
1004 f.client_v, f.client_desc,
1005 f.fault_hi_v, f.fault_lo_v,
1006 f.fault_type_v, f.fault_type_desc,
1007 f.fault_info_v, f.inst_ptr);
1008
1009 /* get the channel */
1010 if (fake_fault) {
1011 /* read and parse engine status */
1012 u32 status = gk20a_readl(g,
1013 fifo_engine_status_r(engine_id));
1014 u32 ctx_status =
1015 fifo_engine_status_ctx_status_v(status);
1016 bool type_ch = fifo_pbdma_status_id_type_v(status) ==
1017 fifo_pbdma_status_id_type_chid_v();
1018
1019 /* use next_id if context load is failing */
1020 u32 id = (ctx_status ==
1021 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1022 fifo_engine_status_next_id_v(status) :
1023 fifo_engine_status_id_v(status);
1024
1025 if (type_ch) {
1026 ch = g->fifo.channel + id;
1027 } else {
1028 gk20a_err(dev_from_gk20a(g), "non-chid type not supported");
1029 WARN_ON(1);
1030 }
1031 } else {
1032 /* read channel based on instruction pointer */
1033 ch = channel_from_inst_ptr(&g->fifo, f.inst_ptr);
1034 }
1035
1036 if (ch) {
1037 if (ch->in_use) {
1038 /* disable the channel from hw and increment
1039 * syncpoints */
1040 gk20a_disable_channel_no_update(ch);
1041
1042 /* remove the channel from runlist */
1043 clear_bit(ch->hw_chid,
1044 runlist->active_channels);
1045 }
1046
1047 /* check if engine reset should be deferred */
1048 if (gk20a_fifo_should_defer_engine_reset(g, engine_id, &f, fake_fault)) {
1049 g->fifo.mmu_fault_engines = fault_id;
1050
1051 /* handled during channel free */
1052 g->fifo.deferred_reset_pending = true;
1053 } else
1054 verbose = gk20a_fifo_set_ctx_mmu_error(g, ch);
1055
1056 } else if (f.inst_ptr ==
1057 g->mm.bar1.inst_block.cpu_pa) {
1058 gk20a_err(dev_from_gk20a(g), "mmu fault from bar1");
1059 } else if (f.inst_ptr ==
1060 g->mm.pmu.inst_block.cpu_pa) {
1061 gk20a_err(dev_from_gk20a(g), "mmu fault from pmu");
1062 } else
1063 gk20a_err(dev_from_gk20a(g), "couldn't locate channel for mmu fault");
1064 }
1065
1066 if (g->fifo.deferred_reset_pending) {
1067 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "sm debugger attached,"
1068 " deferring channel recovery to channel free");
1069 /* clear interrupt */
1070 gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id);
1071 return verbose;
1072 }
1073
1074 /* resetting the engines and clearing the runlists is done in
1075 a separate function to allow deferred reset. */
1076 fifo_gk20a_finish_mmu_fault_handling(g, fault_id);
1077 return verbose;
1078}
1079
1080static void gk20a_fifo_get_faulty_channel(struct gk20a *g, int engine_id,
1081 u32 *chid, bool *type_ch)
1082{
1083 u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
1084 u32 ctx_status = fifo_engine_status_ctx_status_v(status);
1085
1086 *type_ch = fifo_pbdma_status_id_type_v(status) ==
1087 fifo_pbdma_status_id_type_chid_v();
1088 /* use next_id if context load is failing */
1089 *chid = (ctx_status ==
1090 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1091 fifo_engine_status_next_id_v(status) :
1092 fifo_engine_status_id_v(status);
1093}
1094
1095void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
1096 bool verbose)
1097{
1098 unsigned long end_jiffies = jiffies +
1099 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
1100 unsigned long delay = GR_IDLE_CHECK_DEFAULT;
1101 unsigned long engine_id, i;
1102 unsigned long _engine_ids = __engine_ids;
1103 unsigned long engine_ids = 0;
1104 int ret;
1105
1106 if (verbose)
1107 gk20a_debug_dump(g->dev);
1108
1109 /* store faulted engines in advance */
1110 g->fifo.mmu_fault_engines = 0;
1111 for_each_set_bit(engine_id, &_engine_ids, 32) {
1112 bool ref_type_ch;
1113 int ref_chid;
1114 gk20a_fifo_get_faulty_channel(g, engine_id, &ref_chid,
1115 &ref_type_ch);
1116
1117 /* Reset *all* engines that use the
1118		 * same channel as the faulty engine */
1119 for (i = 0; i < g->fifo.max_engines; i++) {
1120 bool type_ch;
1121 u32 chid;
1122 gk20a_fifo_get_faulty_channel(g, i, &chid, &type_ch);
1123 if (ref_type_ch == type_ch && ref_chid == chid) {
1124 engine_ids |= BIT(i);
1125 g->fifo.mmu_fault_engines |=
1126 BIT(gk20a_engine_id_to_mmu_id(i));
1127 }
1128 }
1129
1130 }
1131
1132 /* trigger faults for all bad engines */
1133 for_each_set_bit(engine_id, &engine_ids, 32) {
1134 if (engine_id > g->fifo.max_engines) {
1135 WARN_ON(true);
1136 break;
1137 }
1138
1139 gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id),
1140 fifo_trigger_mmu_fault_id_f(
1141 gk20a_engine_id_to_mmu_id(engine_id)) |
1142 fifo_trigger_mmu_fault_enable_f(1));
1143 }
1144
1145 /* Wait for MMU fault to trigger */
1146 ret = -EBUSY;
1147 do {
1148 if (gk20a_readl(g, fifo_intr_0_r()) &
1149 fifo_intr_0_mmu_fault_pending_f()) {
1150 ret = 0;
1151 break;
1152 }
1153
1154 usleep_range(delay, delay * 2);
1155 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
1156 } while (time_before(jiffies, end_jiffies) ||
1157 !tegra_platform_is_silicon());
1158
1159 if (ret)
1160 gk20a_err(dev_from_gk20a(g), "mmu fault timeout");
1161
1162 /* release mmu fault trigger */
1163 for_each_set_bit(engine_id, &engine_ids, 32)
1164 gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id), 0);
1165}
1166
1167
1168static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
1169{
1170 u32 sched_error;
1171 u32 engine_id;
1172 int id = -1;
1173 bool non_chid = false;
1174
1175 /* read and reset the scheduler error register */
1176 sched_error = gk20a_readl(g, fifo_intr_sched_error_r());
1177 gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());
1178
1179 for (engine_id = 0; engine_id < g->fifo.max_engines; engine_id++) {
1180 u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
1181 u32 ctx_status = fifo_engine_status_ctx_status_v(status);
1182 bool failing_engine;
1183
1184 /* we are interested in busy engines */
1185 failing_engine = fifo_engine_status_engine_v(status) ==
1186 fifo_engine_status_engine_busy_v();
1187
1188 /* ..that are doing context switch */
1189 failing_engine = failing_engine &&
1190 (ctx_status ==
1191 fifo_engine_status_ctx_status_ctxsw_switch_v()
1192 || ctx_status ==
1193 fifo_engine_status_ctx_status_ctxsw_save_v()
1194 || ctx_status ==
1195 fifo_engine_status_ctx_status_ctxsw_load_v());
1196
1197 if (failing_engine) {
1198 id = (ctx_status ==
1199 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1200 fifo_engine_status_next_id_v(status) :
1201 fifo_engine_status_id_v(status);
1202 non_chid = fifo_pbdma_status_id_type_v(status) !=
1203 fifo_pbdma_status_id_type_chid_v();
1204 break;
1205 }
1206 }
1207
1208 /* could not find the engine - should never happen */
1209 if (unlikely(engine_id >= g->fifo.max_engines))
1210 goto err;
1211
1212 if (fifo_intr_sched_error_code_f(sched_error) ==
1213 fifo_intr_sched_error_code_ctxsw_timeout_v()) {
1214 struct fifo_gk20a *f = &g->fifo;
1215 struct channel_gk20a *ch = &f->channel[id];
1216
1217 if (non_chid) {
1218 gk20a_fifo_recover(g, BIT(engine_id), true);
1219 goto err;
1220 }
1221
1222 if (gk20a_channel_update_and_check_timeout(ch,
1223 GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) {
1224 gk20a_set_error_notifier(ch,
1225 NVHOST_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
1226 gk20a_err(dev_from_gk20a(g),
1227				"fifo sched ctxsw timeout error: "
1228 "engine = %u, ch = %d", engine_id, id);
1229 gk20a_fifo_recover(g, BIT(engine_id),
1230 ch->timeout_debug_dump);
1231 } else {
1232 gk20a_warn(dev_from_gk20a(g),
1233				"fifo is waiting for ctx switch for %d ms, "
1234 "ch = %d\n",
1235 ch->timeout_accumulated_ms,
1236 id);
1237 }
1238 return ch->timeout_debug_dump;
1239 }
1240err:
1241	gk20a_err(dev_from_gk20a(g), "fifo sched error: 0x%08x, engine=%u, %s=%d",
1242 sched_error, engine_id, non_chid ? "non-ch" : "ch", id);
1243
1244 return true;
1245}
1246
1247static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr)
1248{
1249 bool print_channel_reset_log = false, reset_engine = false;
1250 struct device *dev = dev_from_gk20a(g);
1251 u32 handled = 0;
1252
1253 gk20a_dbg_fn("");
1254
1255 if (fifo_intr & fifo_intr_0_pio_error_pending_f()) {
1256 /* pio mode is unused. this shouldn't happen, ever. */
1257 /* should we clear it or just leave it pending? */
1258 gk20a_err(dev, "fifo pio error!\n");
1259 BUG_ON(1);
1260 }
1261
1262 if (fifo_intr & fifo_intr_0_bind_error_pending_f()) {
1263 u32 bind_error = gk20a_readl(g, fifo_intr_bind_error_r());
1264 gk20a_err(dev, "fifo bind error: 0x%08x", bind_error);
1265 print_channel_reset_log = true;
1266 handled |= fifo_intr_0_bind_error_pending_f();
1267 }
1268
1269 if (fifo_intr & fifo_intr_0_sched_error_pending_f()) {
1270 print_channel_reset_log = gk20a_fifo_handle_sched_error(g);
1271 handled |= fifo_intr_0_sched_error_pending_f();
1272 }
1273
1274 if (fifo_intr & fifo_intr_0_chsw_error_pending_f()) {
1275 gk20a_fifo_handle_chsw_fault(g);
1276 handled |= fifo_intr_0_chsw_error_pending_f();
1277 }
1278
1279 if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) {
1280 print_channel_reset_log = gk20a_fifo_handle_mmu_fault(g);
1281 reset_engine = true;
1282 handled |= fifo_intr_0_mmu_fault_pending_f();
1283 }
1284
1285 if (fifo_intr & fifo_intr_0_dropped_mmu_fault_pending_f()) {
1286 gk20a_fifo_handle_dropped_mmu_fault(g);
1287 handled |= fifo_intr_0_dropped_mmu_fault_pending_f();
1288 }
1289
1290 print_channel_reset_log = !g->fifo.deferred_reset_pending
1291 && print_channel_reset_log;
1292
1293 if (print_channel_reset_log) {
1294 int engine_id;
1295 gk20a_err(dev_from_gk20a(g),
1296			"channel reset initiated from %s", __func__);
1297 for (engine_id = 0;
1298 engine_id < g->fifo.max_engines;
1299 engine_id++) {
1300 gk20a_dbg_fn("enum:%d -> engine_id:%d", engine_id,
1301 g->fifo.engine_info[engine_id].engine_id);
1302 fifo_pbdma_exception_status(g,
1303 &g->fifo.engine_info[engine_id]);
1304 fifo_engine_exception_status(g,
1305 &g->fifo.engine_info[engine_id]);
1306 }
1307 }
1308
1309 return handled;
1310}
1311
1312
1313static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev,
1314 struct gk20a *g,
1315 struct fifo_gk20a *f,
1316 u32 pbdma_id)
1317{
1318 u32 pbdma_intr_0 = gk20a_readl(g, pbdma_intr_0_r(pbdma_id));
1319 u32 pbdma_intr_1 = gk20a_readl(g, pbdma_intr_1_r(pbdma_id));
1320 u32 handled = 0;
1321 bool reset_device = false;
1322 bool reset_channel = false;
1323
1324 gk20a_dbg_fn("");
1325
1326 gk20a_dbg(gpu_dbg_intr, "pbdma id intr pending %d %08x %08x", pbdma_id,
1327 pbdma_intr_0, pbdma_intr_1);
1328 if (pbdma_intr_0) {
1329 if (f->intr.pbdma.device_fatal_0 & pbdma_intr_0) {
1330 dev_err(dev, "unrecoverable device error: "
1331 "pbdma_intr_0(%d):0x%08x", pbdma_id, pbdma_intr_0);
1332 reset_device = true;
1333 /* TODO: disable pbdma intrs */
1334 handled |= f->intr.pbdma.device_fatal_0 & pbdma_intr_0;
1335 }
1336 if (f->intr.pbdma.channel_fatal_0 & pbdma_intr_0) {
1337 dev_warn(dev, "channel error: "
1338 "pbdma_intr_0(%d):0x%08x", pbdma_id, pbdma_intr_0);
1339 reset_channel = true;
1340 /* TODO: clear pbdma channel errors */
1341 handled |= f->intr.pbdma.channel_fatal_0 & pbdma_intr_0;
1342 }
1343 if (f->intr.pbdma.restartable_0 & pbdma_intr_0) {
1344 dev_warn(dev, "sw method: %08x %08x",
1345 gk20a_readl(g, pbdma_method0_r(0)),
1346 gk20a_readl(g, pbdma_method0_r(0)+4));
1347 gk20a_writel(g, pbdma_method0_r(0), 0);
1348 gk20a_writel(g, pbdma_method0_r(0)+4, 0);
1349 handled |= f->intr.pbdma.restartable_0 & pbdma_intr_0;
1350 }
1351
1352 gk20a_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
1353 }
1354
1355 /* all intrs in _intr_1 are "host copy engine" related,
1356 * which gk20a doesn't have. for now just make them channel fatal. */
1357 if (pbdma_intr_1) {
1358 dev_err(dev, "channel hce error: pbdma_intr_1(%d): 0x%08x",
1359 pbdma_id, pbdma_intr_1);
1360 reset_channel = true;
1361 gk20a_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
1362 }
1363
1364
1365
1366 return handled;
1367}
1368
1369static u32 fifo_channel_isr(struct gk20a *g, u32 fifo_intr)
1370{
1371 gk20a_channel_semaphore_wakeup(g);
1372 return fifo_intr_0_channel_intr_pending_f();
1373}
1374
1375
1376static u32 fifo_pbdma_isr(struct gk20a *g, u32 fifo_intr)
1377{
1378 struct device *dev = dev_from_gk20a(g);
1379 struct fifo_gk20a *f = &g->fifo;
1380 u32 clear_intr = 0, i;
1381 u32 pbdma_pending = gk20a_readl(g, fifo_intr_pbdma_id_r());
1382
1383 for (i = 0; i < fifo_intr_pbdma_id_status__size_1_v(); i++) {
1384 if (fifo_intr_pbdma_id_status_f(pbdma_pending, i)) {
1385 gk20a_dbg(gpu_dbg_intr, "pbdma id %d intr pending", i);
1386 clear_intr |=
1387 gk20a_fifo_handle_pbdma_intr(dev, g, f, i);
1388 }
1389 }
1390 return fifo_intr_0_pbdma_intr_pending_f();
1391}
1392
1393void gk20a_fifo_isr(struct gk20a *g)
1394{
1395 u32 error_intr_mask =
1396 fifo_intr_0_bind_error_pending_f() |
1397 fifo_intr_0_sched_error_pending_f() |
1398 fifo_intr_0_chsw_error_pending_f() |
1399 fifo_intr_0_fb_flush_timeout_pending_f() |
1400 fifo_intr_0_dropped_mmu_fault_pending_f() |
1401 fifo_intr_0_mmu_fault_pending_f() |
1402 fifo_intr_0_lb_error_pending_f() |
1403 fifo_intr_0_pio_error_pending_f();
1404
1405 u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
1406 u32 clear_intr = 0;
1407
1408 /* note we're not actually in an "isr", but rather
1409 * in a threaded interrupt context... */
1410 mutex_lock(&g->fifo.intr.isr.mutex);
1411
1412 gk20a_dbg(gpu_dbg_intr, "fifo isr %08x\n", fifo_intr);
1413
1414 /* handle runlist update */
1415 if (fifo_intr & fifo_intr_0_runlist_event_pending_f()) {
1416 gk20a_fifo_handle_runlist_event(g);
1417 clear_intr |= fifo_intr_0_runlist_event_pending_f();
1418 }
1419 if (fifo_intr & fifo_intr_0_pbdma_intr_pending_f())
1420 clear_intr |= fifo_pbdma_isr(g, fifo_intr);
1421
1422 if (unlikely(fifo_intr & error_intr_mask))
1423 clear_intr = fifo_error_isr(g, fifo_intr);
1424
1425 gk20a_writel(g, fifo_intr_0_r(), clear_intr);
1426
1427 mutex_unlock(&g->fifo.intr.isr.mutex);
1428
1429 return;
1430}
1431
1432void gk20a_fifo_nonstall_isr(struct gk20a *g)
1433{
1434 u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
1435 u32 clear_intr = 0;
1436
1437 gk20a_dbg(gpu_dbg_intr, "fifo nonstall isr %08x\n", fifo_intr);
1438
1439 if (fifo_intr & fifo_intr_0_channel_intr_pending_f())
1440 clear_intr |= fifo_channel_isr(g, fifo_intr);
1441
1442 gk20a_writel(g, fifo_intr_0_r(), clear_intr);
1443
1444 return;
1445}
1446
1447int gk20a_fifo_preempt_channel(struct gk20a *g, u32 hw_chid)
1448{
1449 struct fifo_gk20a *f = &g->fifo;
1450 unsigned long end_jiffies = jiffies
1451 + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
1452 u32 delay = GR_IDLE_CHECK_DEFAULT;
1453 u32 ret = 0;
1454 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
1455 u32 elpg_off = 0;
1456 u32 i;
1457
1458 gk20a_dbg_fn("%d", hw_chid);
1459
1460 /* we have no idea which runlist we are using. lock all */
1461 for (i = 0; i < g->fifo.max_runlists; i++)
1462 mutex_lock(&f->runlist_info[i].mutex);
1463
1464 /* disable elpg if failed to acquire pmu mutex */
1465 elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1466 if (elpg_off)
1467 gk20a_pmu_disable_elpg(g);
1468
1469 /* issue preempt */
1470 gk20a_writel(g, fifo_preempt_r(),
1471 fifo_preempt_chid_f(hw_chid) |
1472 fifo_preempt_type_channel_f());
1473
1474 /* wait for preempt */
1475 ret = -EBUSY;
1476 do {
1477 if (!(gk20a_readl(g, fifo_preempt_r()) &
1478 fifo_preempt_pending_true_f())) {
1479 ret = 0;
1480 break;
1481 }
1482
1483 usleep_range(delay, delay * 2);
1484 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
1485 } while (time_before(jiffies, end_jiffies) ||
1486 !tegra_platform_is_silicon());
1487
1488 if (ret) {
1489 int i;
1490 u32 engines = 0;
1491 struct fifo_gk20a *f = &g->fifo;
1492 struct channel_gk20a *ch = &f->channel[hw_chid];
1493
1494 gk20a_err(dev_from_gk20a(g), "preempt channel %d timeout\n",
1495 hw_chid);
1496
1497 /* forcefully reset all busy engines using this channel */
1498 for (i = 0; i < g->fifo.max_engines; i++) {
1499 u32 status = gk20a_readl(g, fifo_engine_status_r(i));
1500 u32 ctx_status =
1501 fifo_engine_status_ctx_status_v(status);
1502 bool type_ch = fifo_pbdma_status_id_type_v(status) ==
1503 fifo_pbdma_status_id_type_chid_v();
1504 bool busy = fifo_engine_status_engine_v(status) ==
1505 fifo_engine_status_engine_busy_v();
1506 u32 id = (ctx_status ==
1507 fifo_engine_status_ctx_status_ctxsw_load_v()) ?
1508 fifo_engine_status_next_id_v(status) :
1509 fifo_engine_status_id_v(status);
1510
1511 if (type_ch && busy && id == hw_chid)
1512 engines |= BIT(i);
1513 }
1514 gk20a_set_error_notifier(ch,
1515 NVHOST_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
1516 gk20a_fifo_recover(g, engines, true);
1517 }
1518
1519 /* re-enable elpg or release pmu mutex */
1520 if (elpg_off)
1521 gk20a_pmu_enable_elpg(g);
1522 else
1523 pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1524
1525 for (i = 0; i < g->fifo.max_runlists; i++)
1526 mutex_unlock(&f->runlist_info[i].mutex);
1527
1528 return ret;
1529}
1530
1531int gk20a_fifo_enable_engine_activity(struct gk20a *g,
1532 struct fifo_engine_info_gk20a *eng_info)
1533{
1534 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
1535 u32 elpg_off;
1536 u32 enable;
1537
1538 gk20a_dbg_fn("");
1539
1540 /* disable elpg if failed to acquire pmu mutex */
1541 elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1542 if (elpg_off)
1543 gk20a_pmu_disable_elpg(g);
1544
1545 enable = gk20a_readl(g, fifo_sched_disable_r());
1546 enable &= ~(fifo_sched_disable_true_v() >> eng_info->runlist_id);
1547 gk20a_writel(g, fifo_sched_disable_r(), enable);
1548
1549 /* re-enable elpg or release pmu mutex */
1550 if (elpg_off)
1551 gk20a_pmu_enable_elpg(g);
1552 else
1553 pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1554
1555 gk20a_dbg_fn("done");
1556 return 0;
1557}
1558
1559int gk20a_fifo_disable_engine_activity(struct gk20a *g,
1560 struct fifo_engine_info_gk20a *eng_info,
1561 bool wait_for_idle)
1562{
1563 u32 gr_stat, pbdma_stat, chan_stat, eng_stat, ctx_stat;
1564 u32 pbdma_chid = ~0, engine_chid = ~0, disable;
1565 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
1566 u32 elpg_off;
1567 u32 err = 0;
1568
1569 gk20a_dbg_fn("");
1570
1571 gr_stat =
1572 gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
1573 if (fifo_engine_status_engine_v(gr_stat) ==
1574 fifo_engine_status_engine_busy_v() && !wait_for_idle)
1575 return -EBUSY;
1576
1577 /* disable elpg if failed to acquire pmu mutex */
1578 elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1579 if (elpg_off)
1580 gk20a_pmu_disable_elpg(g);
1581
1582 disable = gk20a_readl(g, fifo_sched_disable_r());
1583 disable = set_field(disable,
1584 fifo_sched_disable_runlist_m(eng_info->runlist_id),
1585 fifo_sched_disable_runlist_f(fifo_sched_disable_true_v(),
1586 eng_info->runlist_id));
1587 gk20a_writel(g, fifo_sched_disable_r(), disable);
1588
1589 /* chid from pbdma status */
1590 pbdma_stat = gk20a_readl(g, fifo_pbdma_status_r(eng_info->pbdma_id));
1591 chan_stat = fifo_pbdma_status_chan_status_v(pbdma_stat);
1592 if (chan_stat == fifo_pbdma_status_chan_status_valid_v() ||
1593 chan_stat == fifo_pbdma_status_chan_status_chsw_save_v())
1594 pbdma_chid = fifo_pbdma_status_id_v(pbdma_stat);
1595 else if (chan_stat == fifo_pbdma_status_chan_status_chsw_load_v() ||
1596 chan_stat == fifo_pbdma_status_chan_status_chsw_switch_v())
1597 pbdma_chid = fifo_pbdma_status_next_id_v(pbdma_stat);
1598
1599 if (pbdma_chid != ~0) {
1600 err = gk20a_fifo_preempt_channel(g, pbdma_chid);
1601 if (err)
1602 goto clean_up;
1603 }
1604
1605 /* chid from engine status */
1606 eng_stat = gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
1607 ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
1608 if (ctx_stat == fifo_engine_status_ctx_status_valid_v() ||
1609 ctx_stat == fifo_engine_status_ctx_status_ctxsw_save_v())
1610 engine_chid = fifo_engine_status_id_v(eng_stat);
1611 else if (ctx_stat == fifo_engine_status_ctx_status_ctxsw_load_v() ||
1612 ctx_stat == fifo_engine_status_ctx_status_ctxsw_switch_v())
1613 engine_chid = fifo_engine_status_next_id_v(eng_stat);
1614
1615 if (engine_chid != ~0 && engine_chid != pbdma_chid) {
1616 err = gk20a_fifo_preempt_channel(g, engine_chid);
1617 if (err)
1618 goto clean_up;
1619 }
1620
1621clean_up:
1622 /* re-enable elpg or release pmu mutex */
1623 if (elpg_off)
1624 gk20a_pmu_enable_elpg(g);
1625 else
1626 pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1627
1628 if (err) {
1629 gk20a_dbg_fn("failed");
1630 if (gk20a_fifo_enable_engine_activity(g, eng_info))
1631 gk20a_err(dev_from_gk20a(g),
1632 "failed to enable gr engine activity\n");
1633 } else {
1634 gk20a_dbg_fn("done");
1635 }
1636 return err;
1637}
1638
1639static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id)
1640{
1641 struct fifo_gk20a *f = &g->fifo;
1642 u32 engines = 0;
1643 int i;
1644
1645 for (i = 0; i < f->max_engines; i++) {
1646 u32 status = gk20a_readl(g, fifo_engine_status_r(i));
1647 bool engine_busy = fifo_engine_status_engine_v(status) ==
1648 fifo_engine_status_engine_busy_v();
1649
1650 if (engine_busy &&
1651 (f->engine_info[i].runlist_id == runlist_id))
1652 engines |= BIT(i);
1653 }
1654 gk20a_fifo_recover(g, engines, true);
1655}
1656
1657static int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id)
1658{
1659 struct fifo_runlist_info_gk20a *runlist;
1660 u32 remain;
1661 bool pending;
1662
1663 runlist = &g->fifo.runlist_info[runlist_id];
1664 remain = wait_event_timeout(runlist->runlist_wq,
1665 ((pending = gk20a_readl(g, fifo_eng_runlist_r(runlist_id)) &
1666 fifo_eng_runlist_pending_true_f()) == 0),
1667 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
1668
1669 if (remain == 0 && pending != 0)
1670 return -ETIMEDOUT;
1671
1672 return 0;
1673}
1674
1675static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
1676 u32 hw_chid, bool add,
1677 bool wait_for_finish)
1678{
1679 u32 ret = 0;
1680 struct device *d = dev_from_gk20a(g);
1681 struct fifo_gk20a *f = &g->fifo;
1682 struct fifo_runlist_info_gk20a *runlist = NULL;
1683 u32 *runlist_entry_base = NULL;
1684 u32 *runlist_entry = NULL;
1685 phys_addr_t runlist_pa;
1686 u32 old_buf, new_buf;
1687 u32 chid;
1688 u32 count = 0;
1689 runlist = &f->runlist_info[runlist_id];
1690
1691 /* valid channel, add/remove it from active list.
1692 Otherwise, keep active list untouched for suspend/resume. */
1693 if (hw_chid != ~0) {
1694 if (add) {
1695 if (test_and_set_bit(hw_chid,
1696 runlist->active_channels) == 1)
1697 return 0;
1698 } else {
1699 if (test_and_clear_bit(hw_chid,
1700 runlist->active_channels) == 0)
1701 return 0;
1702 }
1703 }
1704
1705 old_buf = runlist->cur_buffer;
1706 new_buf = !runlist->cur_buffer;
1707
1708 gk20a_dbg_info("runlist_id : %d, switch to new buffer 0x%16llx",
1709 runlist_id, runlist->mem[new_buf].iova);
1710
1711 runlist_pa = gk20a_get_phys_from_iova(d, runlist->mem[new_buf].iova);
1712 if (!runlist_pa) {
1713 ret = -EINVAL;
1714 goto clean_up;
1715 }
1716
1717 runlist_entry_base = runlist->mem[new_buf].cpuva;
1718 if (!runlist_entry_base) {
1719 ret = -ENOMEM;
1720 goto clean_up;
1721 }
1722
1723 if (hw_chid != ~0 || /* add/remove a valid channel */
1724 add /* resume to add all channels back */) {
1725 runlist_entry = runlist_entry_base;
1726 for_each_set_bit(chid,
1727 runlist->active_channels, f->num_channels) {
1728 gk20a_dbg_info("add channel %d to runlist", chid);
1729 runlist_entry[0] = chid;
1730 runlist_entry[1] = 0;
1731 runlist_entry += 2;
1732 count++;
1733 }
1734 } else /* suspend to remove all channels */
1735 count = 0;
1736
1737 if (count != 0) {
1738 gk20a_writel(g, fifo_runlist_base_r(),
1739 fifo_runlist_base_ptr_f(u64_lo32(runlist_pa >> 12)) |
1740 fifo_runlist_base_target_vid_mem_f());
1741 }
1742
1743 gk20a_writel(g, fifo_runlist_r(),
1744 fifo_runlist_engine_f(runlist_id) |
1745 fifo_eng_runlist_length_f(count));
1746
1747 if (wait_for_finish) {
1748 ret = gk20a_fifo_runlist_wait_pending(g, runlist_id);
1749
1750 if (ret == -ETIMEDOUT) {
1751 gk20a_err(dev_from_gk20a(g),
1752 "runlist update timeout");
1753
1754 gk20a_fifo_runlist_reset_engines(g, runlist_id);
1755
1756 /* engine reset needs the lock. drop it */
1757 mutex_unlock(&runlist->mutex);
1758 /* wait until the runlist is active again */
1759 ret = gk20a_fifo_runlist_wait_pending(g, runlist_id);
1760 /* get the lock back. at this point everything should
1761			 * be fine */
1762 mutex_lock(&runlist->mutex);
1763
1764 if (ret)
1765 gk20a_err(dev_from_gk20a(g),
1766 "runlist update failed: %d", ret);
1767 } else if (ret == -EINTR)
1768 gk20a_err(dev_from_gk20a(g),
1769 "runlist update interrupted");
1770 }
1771
1772 runlist->cur_buffer = new_buf;
1773
1774clean_up:
1775 return ret;
1776}
1777
1778/* add/remove a channel from runlist
1779 special cases below: runlist->active_channels will NOT be changed.
1780 (hw_chid == ~0 && !add) means remove all active channels from runlist.
1781 (hw_chid == ~0 && add) means restore all active channels on runlist. */
1782int gk20a_fifo_update_runlist(struct gk20a *g, u32 runlist_id, u32 hw_chid,
1783 bool add, bool wait_for_finish)
1784{
1785 struct fifo_runlist_info_gk20a *runlist = NULL;
1786 struct fifo_gk20a *f = &g->fifo;
1787 u32 token = PMU_INVALID_MUTEX_OWNER_ID;
1788 u32 elpg_off;
1789 u32 ret = 0;
1790
1791 runlist = &f->runlist_info[runlist_id];
1792
1793 mutex_lock(&runlist->mutex);
1794
1795 /* disable elpg if failed to acquire pmu mutex */
1796 elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1797 if (elpg_off)
1798 gk20a_pmu_disable_elpg(g);
1799
1800 ret = gk20a_fifo_update_runlist_locked(g, runlist_id, hw_chid, add,
1801 wait_for_finish);
1802
1803 /* re-enable elpg or release pmu mutex */
1804 if (elpg_off)
1805 gk20a_pmu_enable_elpg(g);
1806 else
1807 pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
1808
1809 mutex_unlock(&runlist->mutex);
1810 return ret;
1811}
1812
1813int gk20a_fifo_suspend(struct gk20a *g)
1814{
1815 gk20a_dbg_fn("");
1816
1817 /* stop bar1 snooping */
1818 gk20a_writel(g, fifo_bar1_base_r(),
1819 fifo_bar1_base_valid_false_f());
1820
1821 /* disable fifo intr */
1822 gk20a_writel(g, fifo_intr_en_0_r(), 0);
1823 gk20a_writel(g, fifo_intr_en_1_r(), 0);
1824
1825 gk20a_dbg_fn("done");
1826 return 0;
1827}
1828
1829bool gk20a_fifo_mmu_fault_pending(struct gk20a *g)
1830{
1831 if (gk20a_readl(g, fifo_intr_0_r()) &
1832 fifo_intr_0_mmu_fault_pending_f())
1833 return true;
1834 else
1835 return false;
1836}
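
As the comment above gk20a_fifo_update_runlist() notes, hw_chid == ~0 acts as a wildcard for suspend/resume. For the single-channel case, a hedged caller sketch using only the interfaces defined in this file could look like the following; the helper name is hypothetical and not part of this patch, and chid/runlist_id are assumed to have been looked up from the channel and its engine info:

/*
 * Sketch only: take one channel off the hardware scheduler.
 */
static int example_remove_channel(struct gk20a *g, u32 runlist_id, u32 chid)
{
	int err;

	/* preempt the channel off the engines first */
	err = gk20a_fifo_preempt_channel(g, chid);
	if (err)
		return err;

	/* then drop it from the runlist and wait for the update to land */
	return gk20a_fifo_update_runlist(g, runlist_id, chid, false, true);
}
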
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
new file mode 100644
index 00000000..051acda2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -0,0 +1,164 @@
1/*
2 * drivers/video/tegra/host/gk20a/fifo_gk20a.h
3 *
4 * GK20A graphics fifo (gr host)
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef __FIFO_GK20A_H__
22#define __FIFO_GK20A_H__
23
24#include "channel_gk20a.h"
25
26#define MAX_RUNLIST_BUFFERS 2
27
28/* generally corresponds to the "pbdma" engine */
29
30struct fifo_runlist_info_gk20a {
31 unsigned long *active_channels;
32 /* Each engine has its own SW and HW runlist buffer.*/
33 struct runlist_mem_desc mem[MAX_RUNLIST_BUFFERS];
34 u32 cur_buffer;
35 u32 total_entries;
36 bool stopped;
37 bool support_tsg;
38	struct mutex mutex; /* protect channel preempt and runlist update */
39 wait_queue_head_t runlist_wq;
40};
41
42/* so far gk20a has two engines: gr and ce2(gr_copy) */
43enum {
44 ENGINE_GR_GK20A = 0,
45 ENGINE_CE2_GK20A = 1,
46 ENGINE_INVAL_GK20A
47};
48
49struct fifo_pbdma_exception_info_gk20a {
50 u32 status_r; /* raw register value from hardware */
51 u32 id, next_id;
52 u32 chan_status_v; /* raw value from hardware */
53 bool id_is_chid, next_id_is_chid;
54 bool chsw_in_progress;
55};
56
57struct fifo_engine_exception_info_gk20a {
58 u32 status_r; /* raw register value from hardware */
59 u32 id, next_id;
60 u32 ctx_status_v; /* raw value from hardware */
61 bool id_is_chid, next_id_is_chid;
62 bool faulted, idle, ctxsw_in_progress;
63};
64
65struct fifo_mmu_fault_info_gk20a {
66 u32 fault_info_v;
67 u32 fault_type_v;
68 u32 engine_subid_v;
69 u32 client_v;
70 u32 fault_hi_v;
71 u32 fault_lo_v;
72 u64 inst_ptr;
73 const char *fault_type_desc;
74 const char *engine_subid_desc;
75 const char *client_desc;
76};
77
78struct fifo_engine_info_gk20a {
79 u32 sw_id;
80 const char *name;
81 u32 dev_info_id;
82 u32 engine_id;
83 u32 runlist_id;
84 u32 pbdma_id;
85 u32 mmu_fault_id;
86 u32 rc_mask;
87 struct fifo_pbdma_exception_info_gk20a pbdma_exception_info;
88 struct fifo_engine_exception_info_gk20a engine_exception_info;
89 struct fifo_mmu_fault_info_gk20a mmu_fault_info;
90
91};
92
93struct fifo_gk20a {
94 struct gk20a *g;
95 int num_channels;
96
97 int num_pbdma;
98 u32 *pbdma_map;
99
100 struct fifo_engine_info_gk20a *engine_info;
101 u32 max_engines;
102 u32 num_engines;
103
104 struct fifo_runlist_info_gk20a *runlist_info;
105 u32 max_runlists;
106
107 struct userd_desc userd;
108 u32 userd_entry_size;
109 u32 userd_total_size;
110
111 struct channel_gk20a *channel;
112 struct mutex ch_inuse_mutex; /* protect unused chid look up */
113
114 void (*remove_support)(struct fifo_gk20a *);
115 bool sw_ready;
116 struct {
117 /* share info between isrs and non-isr code */
118 struct {
119 struct mutex mutex;
120 } isr;
121 struct {
122 u32 device_fatal_0;
123 u32 channel_fatal_0;
124 u32 restartable_0;
125 } pbdma;
126 struct {
127
128 } engine;
129
130
131 } intr;
132
133 u32 mmu_fault_engines;
134 bool deferred_reset_pending;
135 struct mutex deferred_reset_mutex;
136
137 struct work_struct fault_restore_thread;
138};
139
140int gk20a_init_fifo_support(struct gk20a *g);
141
142void gk20a_fifo_isr(struct gk20a *g);
143void gk20a_fifo_nonstall_isr(struct gk20a *g);
144
145int gk20a_fifo_preempt_channel(struct gk20a *g, u32 hw_chid);
146
147int gk20a_fifo_enable_engine_activity(struct gk20a *g,
148 struct fifo_engine_info_gk20a *eng_info);
149int gk20a_fifo_disable_engine_activity(struct gk20a *g,
150 struct fifo_engine_info_gk20a *eng_info,
151 bool wait_for_idle);
152
153int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 hw_chid,
154 bool add, bool wait_for_finish);
155
156int gk20a_fifo_suspend(struct gk20a *g);
157
158bool gk20a_fifo_mmu_fault_pending(struct gk20a *g);
159void gk20a_fifo_recover(struct gk20a *g, u32 engine_ids, bool verbose);
160int gk20a_init_fifo_reset_enable_hw(struct gk20a *g);
161
162void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
163 unsigned long fault_id);
164#endif /*__FIFO_GK20A_H__*/
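
As a rough, hypothetical sketch of how the fifo entry points declared above might be driven (none of this code is part of the patch; the function name and the choice of runlist are illustrative only), a bring-up path could look like:

static int example_fifo_bringup(struct gk20a *g, u32 hw_chid)
{
	int err;

	/* reset the fifo unit and re-enable its hardware */
	err = gk20a_init_fifo_reset_enable_hw(g);
	if (err)
		return err;

	/* initialise fifo software and hardware state */
	err = gk20a_init_fifo_support(g);
	if (err)
		return err;

	/* add hw_chid to the gr runlist (id 0 on gk20a) and wait for the update */
	return gk20a_fifo_update_runlist(g, ENGINE_GR_GK20A, hw_chid,
					 true, true);
}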
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
new file mode 100644
index 00000000..4cc500de
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -0,0 +1,1681 @@
1/*
2 * drivers/video/tegra/host/gk20a/gk20a.c
3 *
4 * GK20A Graphics
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21#define CREATE_TRACE_POINTS
22#include <trace/events/gk20a.h>
23
24#include <linux/dma-mapping.h>
25#include <linux/highmem.h>
26#include <linux/string.h>
27#include <linux/cdev.h>
28#include <linux/delay.h>
29#include <linux/firmware.h>
30#include <linux/interrupt.h>
31#include <linux/irq.h>
32#include <linux/export.h>
33#include <linux/file.h>
34#include <linux/of.h>
35#include <linux/of_device.h>
36#include <linux/of_platform.h>
37#include <linux/pm_runtime.h>
38#include <linux/thermal.h>
39#include <asm/cacheflush.h>
40#include <linux/debugfs.h>
41#include <linux/spinlock.h>
42#include <linux/tegra-powergate.h>
43
44#include <linux/sched.h>
45#include <linux/input-cfboost.h>
46
47#include <mach/pm_domains.h>
48
49#include "gk20a.h"
50#include "debug_gk20a.h"
51#include "ctrl_gk20a.h"
52#include "hw_mc_gk20a.h"
53#include "hw_timer_gk20a.h"
54#include "hw_bus_gk20a.h"
55#include "hw_sim_gk20a.h"
56#include "hw_top_gk20a.h"
57#include "hw_ltc_gk20a.h"
58#include "gk20a_scale.h"
59#include "dbg_gpu_gk20a.h"
60#include "hal.h"
61
62#ifdef CONFIG_ARM64
63#define __cpuc_flush_dcache_area __flush_dcache_area
64#endif
65
66#define CLASS_NAME "nvidia-gpu"
67/* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */
68#define INTERFACE_NAME "nvhost%s-gpu"
69
70#define GK20A_NUM_CDEVS 5
71
72#if defined(GK20A_DEBUG)
73u32 gk20a_dbg_mask = GK20A_DEFAULT_DBG_MASK;
74u32 gk20a_dbg_ftrace;
75#endif
76
77static int gk20a_pm_finalize_poweron(struct device *dev);
78static int gk20a_pm_prepare_poweroff(struct device *dev);
79
80static inline void set_gk20a(struct platform_device *dev, struct gk20a *gk20a)
81{
82 gk20a_get_platform(dev)->g = gk20a;
83}
84
85static const struct file_operations gk20a_channel_ops = {
86 .owner = THIS_MODULE,
87 .release = gk20a_channel_release,
88 .open = gk20a_channel_open,
89#ifdef CONFIG_COMPAT
90 .compat_ioctl = gk20a_channel_ioctl,
91#endif
92 .unlocked_ioctl = gk20a_channel_ioctl,
93};
94
95static const struct file_operations gk20a_ctrl_ops = {
96 .owner = THIS_MODULE,
97 .release = gk20a_ctrl_dev_release,
98 .open = gk20a_ctrl_dev_open,
99 .unlocked_ioctl = gk20a_ctrl_dev_ioctl,
100#ifdef CONFIG_COMPAT
101 .compat_ioctl = gk20a_ctrl_dev_ioctl,
102#endif
103};
104
105static const struct file_operations gk20a_dbg_ops = {
106 .owner = THIS_MODULE,
107 .release = gk20a_dbg_gpu_dev_release,
108 .open = gk20a_dbg_gpu_dev_open,
109 .unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl,
110 .poll = gk20a_dbg_gpu_dev_poll,
111#ifdef CONFIG_COMPAT
112 .compat_ioctl = gk20a_dbg_gpu_dev_ioctl,
113#endif
114};
115
116static const struct file_operations gk20a_as_ops = {
117 .owner = THIS_MODULE,
118 .release = gk20a_as_dev_release,
119 .open = gk20a_as_dev_open,
120#ifdef CONFIG_COMPAT
121 .compat_ioctl = gk20a_as_dev_ioctl,
122#endif
123 .unlocked_ioctl = gk20a_as_dev_ioctl,
124};
125
126/*
127 * Note: the profiler session uses a different 'open' to trigger its
128 * handling. Most of the code is shared with the debugger path; if handling
129 * both in the same path gets too tangled, the two can be separated
130 * cleanly later.
131 */
132static const struct file_operations gk20a_prof_ops = {
133 .owner = THIS_MODULE,
134 .release = gk20a_dbg_gpu_dev_release,
135 .open = gk20a_prof_gpu_dev_open,
136 .unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl,
137 /* .mmap = gk20a_prof_gpu_dev_mmap,*/
138 /*int (*mmap) (struct file *, struct vm_area_struct *);*/
140#ifdef CONFIG_COMPAT
141 .compat_ioctl = gk20a_dbg_gpu_dev_ioctl,
142#endif
143};
144
145static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
146{
147 writel(v, g->sim.regs+r);
148}
149
150static inline u32 sim_readl(struct gk20a *g, u32 r)
151{
152 return readl(g->sim.regs+r);
153}
154
155static void kunmap_and_free_iopage(void **kvaddr, struct page **page)
156{
157 if (*kvaddr) {
158 kunmap(*kvaddr);
159 *kvaddr = 0;
160 }
161 if (*page) {
162 __free_page(*page);
163 *page = 0;
164 }
165}
166
167static void gk20a_free_sim_support(struct gk20a *g)
168{
169 /* free sim mappings, bfrs */
170 kunmap_and_free_iopage(&g->sim.send_bfr.kvaddr,
171 &g->sim.send_bfr.page);
172
173 kunmap_and_free_iopage(&g->sim.recv_bfr.kvaddr,
174 &g->sim.recv_bfr.page);
175
176 kunmap_and_free_iopage(&g->sim.msg_bfr.kvaddr,
177 &g->sim.msg_bfr.page);
178}
179
180static void gk20a_remove_sim_support(struct sim_gk20a *s)
181{
182 struct gk20a *g = s->g;
183 if (g->sim.regs)
184 sim_writel(g, sim_config_r(), sim_config_mode_disabled_v());
185 gk20a_free_sim_support(g);
186}
187
188static int alloc_and_kmap_iopage(struct device *d,
189 void **kvaddr,
190 phys_addr_t *phys,
191 struct page **page)
192{
193 int err = 0;
194 *page = alloc_page(GFP_KERNEL);
195
196 if (!*page) {
197 err = -ENOMEM;
198 dev_err(d, "couldn't allocate io page\n");
199 goto fail;
200 }
201
202 *kvaddr = kmap(*page);
203 if (!*kvaddr) {
204 err = -ENOMEM;
205 dev_err(d, "couldn't kmap io page\n");
206 goto fail;
207 }
208 *phys = page_to_phys(*page);
209 return 0;
210
211 fail:
212 kunmap_and_free_iopage(kvaddr, page);
213 return err;
214
215}
216
217static void __iomem *gk20a_ioremap_resource(struct platform_device *dev, int i,
218 struct resource **out)
219{
220 struct resource *r = platform_get_resource(dev, IORESOURCE_MEM, i);
221 if (!r)
222 return NULL;
223 if (out)
224 *out = r;
225 return devm_request_and_ioremap(&dev->dev, r);
226}
227
228/* TBD: strip from released */
229static int gk20a_init_sim_support(struct platform_device *dev)
230{
231 int err = 0;
232 struct gk20a *g = get_gk20a(dev);
233 struct device *d = &dev->dev;
234 phys_addr_t phys;
235
236 g->sim.g = g;
237 g->sim.regs = gk20a_ioremap_resource(dev, GK20A_SIM_IORESOURCE_MEM,
238 &g->sim.reg_mem);
239 if (!g->sim.regs) {
240 dev_err(d, "failed to remap gk20a sim regs\n");
241 err = -ENXIO;
242 goto fail;
243 }
244
245 /* allocate sim event/msg buffers */
246 err = alloc_and_kmap_iopage(d, &g->sim.send_bfr.kvaddr,
247 &g->sim.send_bfr.phys,
248 &g->sim.send_bfr.page);
249
250 err = err || alloc_and_kmap_iopage(d, &g->sim.recv_bfr.kvaddr,
251 &g->sim.recv_bfr.phys,
252 &g->sim.recv_bfr.page);
253
254 err = err || alloc_and_kmap_iopage(d, &g->sim.msg_bfr.kvaddr,
255 &g->sim.msg_bfr.phys,
256 &g->sim.msg_bfr.page);
257
258 if (!(g->sim.send_bfr.kvaddr && g->sim.recv_bfr.kvaddr &&
259 g->sim.msg_bfr.kvaddr)) {
260 dev_err(d, "couldn't allocate all sim buffers\n");
261 goto fail;
262 }
263
264 /*mark send ring invalid*/
265 sim_writel(g, sim_send_ring_r(), sim_send_ring_status_invalid_f());
266
267 /*read get pointer and make equal to put*/
268 g->sim.send_ring_put = sim_readl(g, sim_send_get_r());
269 sim_writel(g, sim_send_put_r(), g->sim.send_ring_put);
270
271 /*write send ring address and make it valid*/
272 /*TBD: work for >32b physmem*/
273 phys = g->sim.send_bfr.phys;
274 sim_writel(g, sim_send_ring_hi_r(), 0);
275 sim_writel(g, sim_send_ring_r(),
276 sim_send_ring_status_valid_f() |
277 sim_send_ring_target_phys_pci_coherent_f() |
278 sim_send_ring_size_4kb_f() |
279 sim_send_ring_addr_lo_f(phys >> PAGE_SHIFT));
280
281 /*repeat for recv ring (but swap put,get as roles are opposite) */
282 sim_writel(g, sim_recv_ring_r(), sim_recv_ring_status_invalid_f());
283
284 /*read put pointer and make equal to get*/
285 g->sim.recv_ring_get = sim_readl(g, sim_recv_put_r());
286 sim_writel(g, sim_recv_get_r(), g->sim.recv_ring_get);
287
288	/*write recv ring address and make it valid*/
289 /*TBD: work for >32b physmem*/
290 phys = g->sim.recv_bfr.phys;
291 sim_writel(g, sim_recv_ring_hi_r(), 0);
292 sim_writel(g, sim_recv_ring_r(),
293 sim_recv_ring_status_valid_f() |
294 sim_recv_ring_target_phys_pci_coherent_f() |
295 sim_recv_ring_size_4kb_f() |
296 sim_recv_ring_addr_lo_f(phys >> PAGE_SHIFT));
297
298 g->sim.remove_support = gk20a_remove_sim_support;
299 return 0;
300
301 fail:
302 gk20a_free_sim_support(g);
303 return err;
304}
305
306static inline u32 sim_msg_header_size(void)
307{
308	return 24;/*TBD: fix the header to get this from NV_VGPU_MSG_HEADER*/
309}
310
311static inline u32 *sim_msg_bfr(struct gk20a *g, u32 byte_offset)
312{
313 return (u32 *)(g->sim.msg_bfr.kvaddr + byte_offset);
314}
315
316static inline u32 *sim_msg_hdr(struct gk20a *g, u32 byte_offset)
317{
318 return sim_msg_bfr(g, byte_offset); /*starts at 0*/
319}
320
321static inline u32 *sim_msg_param(struct gk20a *g, u32 byte_offset)
322{
323 /*starts after msg header/cmn*/
324 return sim_msg_bfr(g, byte_offset + sim_msg_header_size());
325}
326
327static inline void sim_write_hdr(struct gk20a *g, u32 func, u32 size)
328{
329 /*memset(g->sim.msg_bfr.kvaddr,0,min(PAGE_SIZE,size));*/
330 *sim_msg_hdr(g, sim_msg_signature_r()) = sim_msg_signature_valid_v();
331 *sim_msg_hdr(g, sim_msg_result_r()) = sim_msg_result_rpc_pending_v();
332 *sim_msg_hdr(g, sim_msg_spare_r()) = sim_msg_spare__init_v();
333 *sim_msg_hdr(g, sim_msg_function_r()) = func;
334 *sim_msg_hdr(g, sim_msg_length_r()) = size + sim_msg_header_size();
335}
336
337static inline u32 sim_escape_read_hdr_size(void)
338{
339 return 12; /*TBD: fix NV_VGPU_SIM_ESCAPE_READ_HEADER*/
340}
341
342static u32 *sim_send_ring_bfr(struct gk20a *g, u32 byte_offset)
343{
344 return (u32 *)(g->sim.send_bfr.kvaddr + byte_offset);
345}
346
347static int rpc_send_message(struct gk20a *g)
348{
349 /* calculations done in units of u32s */
350 u32 send_base = sim_send_put_pointer_v(g->sim.send_ring_put) * 2;
351 u32 dma_offset = send_base + sim_dma_r()/sizeof(u32);
352 u32 dma_hi_offset = send_base + sim_dma_hi_r()/sizeof(u32);
353
354 *sim_send_ring_bfr(g, dma_offset*sizeof(u32)) =
355 sim_dma_target_phys_pci_coherent_f() |
356 sim_dma_status_valid_f() |
357 sim_dma_size_4kb_f() |
358 sim_dma_addr_lo_f(g->sim.msg_bfr.phys >> PAGE_SHIFT);
359
360 *sim_send_ring_bfr(g, dma_hi_offset*sizeof(u32)) = 0; /*TBD >32b phys*/
361
362 *sim_msg_hdr(g, sim_msg_sequence_r()) = g->sim.sequence_base++;
363
364 g->sim.send_ring_put = (g->sim.send_ring_put + 2 * sizeof(u32)) %
365 PAGE_SIZE;
366
367 __cpuc_flush_dcache_area(g->sim.msg_bfr.kvaddr, PAGE_SIZE);
368 __cpuc_flush_dcache_area(g->sim.send_bfr.kvaddr, PAGE_SIZE);
369 __cpuc_flush_dcache_area(g->sim.recv_bfr.kvaddr, PAGE_SIZE);
370
371 /* Update the put pointer. This will trap into the host. */
372 sim_writel(g, sim_send_put_r(), g->sim.send_ring_put);
373
374 return 0;
375}
376
377static inline u32 *sim_recv_ring_bfr(struct gk20a *g, u32 byte_offset)
378{
379 return (u32 *)(g->sim.recv_bfr.kvaddr + byte_offset);
380}
381
382static int rpc_recv_poll(struct gk20a *g)
383{
384 phys_addr_t recv_phys_addr;
385
386 /* XXX This read is not required (?) */
387 /*pVGpu->recv_ring_get = VGPU_REG_RD32(pGpu, NV_VGPU_RECV_GET);*/
388
389 /* Poll the recv ring get pointer in an infinite loop*/
390 do {
391 g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
392 } while (g->sim.recv_ring_put == g->sim.recv_ring_get);
393
394 /* process all replies */
395 while (g->sim.recv_ring_put != g->sim.recv_ring_get) {
396 /* these are in u32 offsets*/
397 u32 dma_lo_offset =
398 sim_recv_put_pointer_v(g->sim.recv_ring_get)*2 + 0;
399 /*u32 dma_hi_offset = dma_lo_offset + 1;*/
400 u32 recv_phys_addr_lo = sim_dma_addr_lo_v(*sim_recv_ring_bfr(g, dma_lo_offset*4));
401
402 /*u32 recv_phys_addr_hi = sim_dma_hi_addr_v(
403 (phys_addr_t)sim_recv_ring_bfr(g,dma_hi_offset*4));*/
404
405 /*TBD >32b phys addr */
406 recv_phys_addr = recv_phys_addr_lo << PAGE_SHIFT;
407
408 if (recv_phys_addr != g->sim.msg_bfr.phys) {
409 dev_err(dev_from_gk20a(g), "%s Error in RPC reply\n",
410 __func__);
411 return -1;
412 }
413
414 /* Update GET pointer */
415 g->sim.recv_ring_get = (g->sim.recv_ring_get + 2*sizeof(u32)) %
416 PAGE_SIZE;
417
418 __cpuc_flush_dcache_area(g->sim.msg_bfr.kvaddr, PAGE_SIZE);
419 __cpuc_flush_dcache_area(g->sim.send_bfr.kvaddr, PAGE_SIZE);
420 __cpuc_flush_dcache_area(g->sim.recv_bfr.kvaddr, PAGE_SIZE);
421
422 sim_writel(g, sim_recv_get_r(), g->sim.recv_ring_get);
423
424 g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
425 }
426
427 return 0;
428}
429
430static int issue_rpc_and_wait(struct gk20a *g)
431{
432 int err;
433
434 err = rpc_send_message(g);
435 if (err) {
436 dev_err(dev_from_gk20a(g), "%s failed rpc_send_message\n",
437 __func__);
438 return err;
439 }
440
441 err = rpc_recv_poll(g);
442 if (err) {
443 dev_err(dev_from_gk20a(g), "%s failed rpc_recv_poll\n",
444 __func__);
445 return err;
446 }
447
448 /* Now check if RPC really succeeded */
449 if (*sim_msg_hdr(g, sim_msg_result_r()) != sim_msg_result_success_v()) {
450 dev_err(dev_from_gk20a(g), "%s received failed status!\n",
451 __func__);
452 return -(*sim_msg_hdr(g, sim_msg_result_r()));
453 }
454 return 0;
455}
456
457int gk20a_sim_esc_read(struct gk20a *g, char *path, u32 index, u32 count, u32 *data)
458{
459 int err;
460 size_t pathlen = strlen(path);
461 u32 data_offset;
462
463 sim_write_hdr(g, sim_msg_function_sim_escape_read_v(),
464 sim_escape_read_hdr_size());
465 *sim_msg_param(g, 0) = index;
466 *sim_msg_param(g, 4) = count;
467 data_offset = roundup(0xc + pathlen + 1, sizeof(u32));
468 *sim_msg_param(g, 8) = data_offset;
469 strcpy((char *)sim_msg_param(g, 0xc), path);
470
471 err = issue_rpc_and_wait(g);
472
473 if (!err)
474 memcpy(data, sim_msg_param(g, data_offset), count);
475 return err;
476}
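
A hypothetical use of the escape-read RPC above (not part of the patch; the escape path string below is a made-up placeholder, purely for illustration):

static void example_sim_esc_read(struct gk20a *g)
{
	u32 value = 0;
	int err;

	/* read one 32-bit word; "sim/example_reg" is a placeholder path */
	err = gk20a_sim_esc_read(g, "sim/example_reg", 0,
				 sizeof(value), &value);
	if (err)
		gk20a_err(dev_from_gk20a(g), "escape read failed (%d)", err);
	else
		gk20a_dbg_info("escape read returned 0x%08x", value);
}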
477
478static irqreturn_t gk20a_intr_isr_stall(int irq, void *dev_id)
479{
480 struct gk20a *g = dev_id;
481 u32 mc_intr_0;
482
483 if (!g->power_on)
484 return IRQ_NONE;
485
486 /* not from gpu when sharing irq with others */
487 mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
488 if (unlikely(!mc_intr_0))
489 return IRQ_NONE;
490
491 gk20a_writel(g, mc_intr_en_0_r(),
492 mc_intr_en_0_inta_disabled_f());
493
494 /* flush previous write */
495 gk20a_readl(g, mc_intr_en_0_r());
496
497 return IRQ_WAKE_THREAD;
498}
499
500static irqreturn_t gk20a_intr_isr_nonstall(int irq, void *dev_id)
501{
502 struct gk20a *g = dev_id;
503 u32 mc_intr_1;
504
505 if (!g->power_on)
506 return IRQ_NONE;
507
508 /* not from gpu when sharing irq with others */
509 mc_intr_1 = gk20a_readl(g, mc_intr_1_r());
510 if (unlikely(!mc_intr_1))
511 return IRQ_NONE;
512
513 gk20a_writel(g, mc_intr_en_1_r(),
514 mc_intr_en_1_inta_disabled_f());
515
516 /* flush previous write */
517 gk20a_readl(g, mc_intr_en_1_r());
518
519 return IRQ_WAKE_THREAD;
520}
521
522static void gk20a_pbus_isr(struct gk20a *g)
523{
524 u32 val;
525 val = gk20a_readl(g, bus_intr_0_r());
526 if (val & (bus_intr_0_pri_squash_m() |
527 bus_intr_0_pri_fecserr_m() |
528 bus_intr_0_pri_timeout_m())) {
529 gk20a_err(dev_from_gk20a(g), "top_fs_status_r : 0x%x",
530 gk20a_readl(g, top_fs_status_r()));
531 gk20a_err(dev_from_gk20a(g), "pmc_enable : 0x%x",
532 gk20a_readl(g, mc_enable_r()));
533 gk20a_err(&g->dev->dev,
534 "NV_PTIMER_PRI_TIMEOUT_SAVE_0: 0x%x\n",
535 gk20a_readl(g, timer_pri_timeout_save_0_r()));
536 gk20a_err(&g->dev->dev,
537 "NV_PTIMER_PRI_TIMEOUT_SAVE_1: 0x%x\n",
538 gk20a_readl(g, timer_pri_timeout_save_1_r()));
539 gk20a_err(&g->dev->dev,
540 "NV_PTIMER_PRI_TIMEOUT_FECS_ERRCODE: 0x%x\n",
541 gk20a_readl(g, timer_pri_timeout_fecs_errcode_r()));
542 }
543
544 if (val)
545 gk20a_err(&g->dev->dev,
546 "Unhandled pending pbus interrupt\n");
547
548 gk20a_writel(g, bus_intr_0_r(), val);
549}
550
551static irqreturn_t gk20a_intr_thread_stall(int irq, void *dev_id)
552{
553 struct gk20a *g = dev_id;
554 u32 mc_intr_0;
555
556 gk20a_dbg(gpu_dbg_intr, "interrupt thread launched");
557
558 mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
559
560 gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0);
561
562 if (mc_intr_0 & mc_intr_0_pgraph_pending_f())
563 gr_gk20a_elpg_protected_call(g, gk20a_gr_isr(g));
564 if (mc_intr_0 & mc_intr_0_pfifo_pending_f())
565 gk20a_fifo_isr(g);
566 if (mc_intr_0 & mc_intr_0_pmu_pending_f())
567 gk20a_pmu_isr(g);
568 if (mc_intr_0 & mc_intr_0_priv_ring_pending_f())
569 gk20a_priv_ring_isr(g);
570 if (mc_intr_0 & mc_intr_0_ltc_pending_f())
571 gk20a_mm_ltc_isr(g);
572 if (mc_intr_0 & mc_intr_0_pbus_pending_f())
573 gk20a_pbus_isr(g);
574
575 gk20a_writel(g, mc_intr_en_0_r(),
576 mc_intr_en_0_inta_hardware_f());
577
578 /* flush previous write */
579 gk20a_readl(g, mc_intr_en_0_r());
580
581 return IRQ_HANDLED;
582}
583
584static irqreturn_t gk20a_intr_thread_nonstall(int irq, void *dev_id)
585{
586 struct gk20a *g = dev_id;
587 u32 mc_intr_1;
588
589 gk20a_dbg(gpu_dbg_intr, "interrupt thread launched");
590
591 mc_intr_1 = gk20a_readl(g, mc_intr_1_r());
592
593 gk20a_dbg(gpu_dbg_intr, "non-stall intr %08x\n", mc_intr_1);
594
595 if (mc_intr_1 & mc_intr_0_pfifo_pending_f())
596 gk20a_fifo_nonstall_isr(g);
597 if (mc_intr_1 & mc_intr_0_pgraph_pending_f())
598 gk20a_gr_nonstall_isr(g);
599
600 gk20a_writel(g, mc_intr_en_1_r(),
601 mc_intr_en_1_inta_hardware_f());
602
603 /* flush previous write */
604 gk20a_readl(g, mc_intr_en_1_r());
605
606 return IRQ_HANDLED;
607}
608
609static void gk20a_remove_support(struct platform_device *dev)
610{
611 struct gk20a *g = get_gk20a(dev);
612
613 /* pmu support should already be removed when driver turns off
614	   gpu power rail in prepare_poweroff */
615 if (g->gk20a_cdev.gk20a_cooling_dev)
616 thermal_cooling_device_unregister(g->gk20a_cdev.gk20a_cooling_dev);
617
618 if (g->gr.remove_support)
619 g->gr.remove_support(&g->gr);
620
621 if (g->fifo.remove_support)
622 g->fifo.remove_support(&g->fifo);
623
624 if (g->mm.remove_support)
625 g->mm.remove_support(&g->mm);
626
627 if (g->sim.remove_support)
628 g->sim.remove_support(&g->sim);
629
630 release_firmware(g->pmu_fw);
631
632 if (g->irq_requested) {
633 free_irq(g->irq_stall, g);
634 free_irq(g->irq_nonstall, g);
635 g->irq_requested = false;
636 }
637
638 /* free mappings to registers, etc*/
639
640 if (g->regs) {
641 iounmap(g->regs);
642 g->regs = 0;
643 }
644 if (g->bar1) {
645 iounmap(g->bar1);
646 g->bar1 = 0;
647 }
648}
649
650static int gk20a_init_support(struct platform_device *dev)
651{
652 int err = 0;
653 struct gk20a *g = get_gk20a(dev);
654
655 g->regs = gk20a_ioremap_resource(dev, GK20A_BAR0_IORESOURCE_MEM,
656 &g->reg_mem);
657 if (!g->regs) {
658 dev_err(dev_from_gk20a(g), "failed to remap gk20a registers\n");
659 err = -ENXIO;
660 goto fail;
661 }
662
663 g->bar1 = gk20a_ioremap_resource(dev, GK20A_BAR1_IORESOURCE_MEM,
664 &g->bar1_mem);
665 if (!g->bar1) {
666 dev_err(dev_from_gk20a(g), "failed to remap gk20a bar1\n");
667 err = -ENXIO;
668 goto fail;
669 }
670
671 /* Get interrupt numbers */
672 g->irq_stall = platform_get_irq(dev, 0);
673 g->irq_nonstall = platform_get_irq(dev, 1);
674 if (g->irq_stall < 0 || g->irq_nonstall < 0) {
675 err = -ENXIO;
676 goto fail;
677 }
678
679 if (tegra_cpu_is_asim()) {
680 err = gk20a_init_sim_support(dev);
681 if (err)
682 goto fail;
683 }
684
685 mutex_init(&g->dbg_sessions_lock);
686 mutex_init(&g->client_lock);
687
688 g->remove_support = gk20a_remove_support;
689 return 0;
690
691 fail:
692 gk20a_remove_support(dev);
693 return err;
694}
695
696static int gk20a_init_client(struct platform_device *dev)
697{
698 struct gk20a *g = get_gk20a(dev);
699 int err;
700
701 gk20a_dbg_fn("");
702
703#ifndef CONFIG_PM_RUNTIME
704 gk20a_pm_finalize_poweron(&dev->dev);
705#endif
706
707 err = gk20a_init_mm_setup_sw(g);
708 if (err)
709 return err;
710
711 if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
712 gk20a_scale_hw_init(dev);
713 return 0;
714}
715
716static void gk20a_deinit_client(struct platform_device *dev)
717{
718 gk20a_dbg_fn("");
719#ifndef CONFIG_PM_RUNTIME
720 gk20a_pm_prepare_poweroff(&dev->dev);
721#endif
722}
723
724int gk20a_get_client(struct gk20a *g)
725{
726 int err = 0;
727
728 mutex_lock(&g->client_lock);
729 if (g->client_refcount == 0)
730 err = gk20a_init_client(g->dev);
731 if (!err)
732 g->client_refcount++;
733 mutex_unlock(&g->client_lock);
734 return err;
735}
736
737void gk20a_put_client(struct gk20a *g)
738{
739 mutex_lock(&g->client_lock);
740 if (g->client_refcount == 1)
741 gk20a_deinit_client(g->dev);
742 g->client_refcount--;
743 mutex_unlock(&g->client_lock);
744 WARN_ON(g->client_refcount < 0);
745}
746
747static int gk20a_pm_prepare_poweroff(struct device *_dev)
748{
749 struct platform_device *dev = to_platform_device(_dev);
750 struct gk20a *g = get_gk20a(dev);
751 int ret = 0;
752
753 gk20a_dbg_fn("");
754
755 if (!g->power_on)
756 return 0;
757
758 ret |= gk20a_channel_suspend(g);
759
760 /* disable elpg before gr or fifo suspend */
761 ret |= gk20a_pmu_destroy(g);
762 ret |= gk20a_gr_suspend(g);
763 ret |= gk20a_mm_suspend(g);
764 ret |= gk20a_fifo_suspend(g);
765
766 /*
767 * After this point, gk20a interrupts should not get
768 * serviced.
769 */
770 if (g->irq_requested) {
771 free_irq(g->irq_stall, g);
772 free_irq(g->irq_nonstall, g);
773 g->irq_requested = false;
774 }
775
776 /* Disable GPCPLL */
777 ret |= gk20a_suspend_clk_support(g);
778 g->power_on = false;
779
780 return ret;
781}
782
783static void gk20a_detect_chip(struct gk20a *g)
784{
785 struct nvhost_gpu_characteristics *gpu = &g->gpu_characteristics;
786
787 u32 mc_boot_0_value = gk20a_readl(g, mc_boot_0_r());
788 gpu->arch = mc_boot_0_architecture_v(mc_boot_0_value) <<
789 NVHOST_GPU_ARCHITECTURE_SHIFT;
790 gpu->impl = mc_boot_0_implementation_v(mc_boot_0_value);
791 gpu->rev =
792 (mc_boot_0_major_revision_v(mc_boot_0_value) << 4) |
793 mc_boot_0_minor_revision_v(mc_boot_0_value);
794
795 gk20a_dbg_info("arch: %x, impl: %x, rev: %x\n",
796 g->gpu_characteristics.arch,
797 g->gpu_characteristics.impl,
798 g->gpu_characteristics.rev);
799}
800
801static int gk20a_pm_finalize_poweron(struct device *_dev)
802{
803 struct platform_device *dev = to_platform_device(_dev);
804 struct gk20a *g = get_gk20a(dev);
805 int err, nice_value;
806
807 gk20a_dbg_fn("");
808
809 if (g->power_on)
810 return 0;
811
812 nice_value = task_nice(current);
813 set_user_nice(current, -20);
814
815 if (!g->irq_requested) {
816 err = request_threaded_irq(g->irq_stall,
817 gk20a_intr_isr_stall,
818 gk20a_intr_thread_stall,
819 0, "gk20a_stall", g);
820 if (err) {
821 dev_err(dev_from_gk20a(g),
822 "failed to request stall intr irq @ %lld\n",
823 (u64)g->irq_stall);
824 goto done;
825 }
826 err = request_threaded_irq(g->irq_nonstall,
827 gk20a_intr_isr_nonstall,
828 gk20a_intr_thread_nonstall,
829 0, "gk20a_nonstall", g);
830 if (err) {
831 dev_err(dev_from_gk20a(g),
832 "failed to request non-stall intr irq @ %lld\n",
833 (u64)g->irq_nonstall);
834 goto done;
835 }
836 g->irq_requested = true;
837 }
838
839 g->power_on = true;
840
841 gk20a_writel(g, mc_intr_mask_1_r(),
842 mc_intr_0_pfifo_pending_f()
843 | mc_intr_0_pgraph_pending_f());
844 gk20a_writel(g, mc_intr_en_1_r(),
845 mc_intr_en_1_inta_hardware_f());
846
847 gk20a_writel(g, mc_intr_mask_0_r(),
848 mc_intr_0_pgraph_pending_f()
849 | mc_intr_0_pfifo_pending_f()
850 | mc_intr_0_priv_ring_pending_f()
851 | mc_intr_0_ltc_pending_f()
852 | mc_intr_0_pbus_pending_f());
853 gk20a_writel(g, mc_intr_en_0_r(),
854 mc_intr_en_0_inta_hardware_f());
855
856 if (!tegra_platform_is_silicon())
857 gk20a_writel(g, bus_intr_en_0_r(), 0x0);
858 else
859 gk20a_writel(g, bus_intr_en_0_r(),
860 bus_intr_en_0_pri_squash_m() |
861 bus_intr_en_0_pri_fecserr_m() |
862 bus_intr_en_0_pri_timeout_m());
863 gk20a_reset_priv_ring(g);
864
865 gk20a_detect_chip(g);
866 err = gpu_init_hal(g);
867 if (err)
868 goto done;
869
870 /* TBD: move this after graphics init in which blcg/slcg is enabled.
871 This function removes SlowdownOnBoot which applies 32x divider
872 on gpcpll bypass path. The purpose of slowdown is to save power
873 during boot but it also significantly slows down gk20a init on
874 simulation and emulation. We should remove SOB after graphics power
875 saving features (blcg/slcg) are enabled. For now, do it here. */
876 err = gk20a_init_clk_support(g);
877 if (err) {
878 gk20a_err(&dev->dev, "failed to init gk20a clk");
879 goto done;
880 }
881
882 /* enable pri timeout only on silicon */
883 if (tegra_platform_is_silicon()) {
884 gk20a_writel(g,
885 timer_pri_timeout_r(),
886 timer_pri_timeout_period_f(0x186A0) |
887 timer_pri_timeout_en_en_enabled_f());
888 } else {
889 gk20a_writel(g,
890 timer_pri_timeout_r(),
891 timer_pri_timeout_period_f(0x186A0) |
892 timer_pri_timeout_en_en_disabled_f());
893 }
894
895 err = gk20a_init_fifo_reset_enable_hw(g);
896 if (err) {
897 gk20a_err(&dev->dev, "failed to reset gk20a fifo");
898 goto done;
899 }
900
901 err = gk20a_init_mm_support(g);
902 if (err) {
903 gk20a_err(&dev->dev, "failed to init gk20a mm");
904 goto done;
905 }
906
907 err = gk20a_init_pmu_support(g);
908 if (err) {
909 gk20a_err(&dev->dev, "failed to init gk20a pmu");
910 goto done;
911 }
912
913 err = gk20a_init_fifo_support(g);
914 if (err) {
915 gk20a_err(&dev->dev, "failed to init gk20a fifo");
916 goto done;
917 }
918
919 err = gk20a_init_gr_support(g);
920 if (err) {
921 gk20a_err(&dev->dev, "failed to init gk20a gr");
922 goto done;
923 }
924
925 err = gk20a_init_pmu_setup_hw2(g);
926 if (err) {
927 gk20a_err(&dev->dev, "failed to init gk20a pmu_hw2");
928 goto done;
929 }
930
931 err = gk20a_init_therm_support(g);
932 if (err) {
933 gk20a_err(&dev->dev, "failed to init gk20a therm");
934 goto done;
935 }
936
937 err = gk20a_init_gpu_characteristics(g);
938 if (err) {
939 gk20a_err(&dev->dev, "failed to init gk20a gpu characteristics");
940 goto done;
941 }
942
943 gk20a_channel_resume(g);
944 set_user_nice(current, nice_value);
945
946done:
947 return err;
948}
949
950static struct of_device_id tegra_gk20a_of_match[] = {
951#ifdef CONFIG_TEGRA_GK20A
952 { .compatible = "nvidia,tegra124-gk20a",
953 .data = &gk20a_tegra_platform },
954#endif
955 { .compatible = "nvidia,generic-gk20a",
956 .data = &gk20a_generic_platform },
957 { },
958};
959
960int tegra_gpu_get_max_state(struct thermal_cooling_device *cdev,
961 unsigned long *max_state)
962{
963 struct cooling_device_gk20a *gk20a_gpufreq_device = cdev->devdata;
964
965 *max_state = gk20a_gpufreq_device->gk20a_freq_table_size - 1;
966 return 0;
967}
968
969int tegra_gpu_get_cur_state(struct thermal_cooling_device *cdev,
970 unsigned long *cur_state)
971{
972 struct cooling_device_gk20a *gk20a_gpufreq_device = cdev->devdata;
973
974 *cur_state = gk20a_gpufreq_device->gk20a_freq_state;
975 return 0;
976}
977
978int tegra_gpu_set_cur_state(struct thermal_cooling_device *c_dev,
979 unsigned long cur_state)
980{
981 u32 target_freq;
982 struct gk20a *g;
983 struct gpufreq_table_data *gpu_cooling_table;
984 struct cooling_device_gk20a *gk20a_gpufreq_device = c_dev->devdata;
985
986 BUG_ON(cur_state >= gk20a_gpufreq_device->gk20a_freq_table_size);
987
988 g = container_of(gk20a_gpufreq_device, struct gk20a, gk20a_cdev);
989
990 gpu_cooling_table = tegra_gpufreq_table_get();
991 target_freq = gpu_cooling_table[cur_state].frequency;
992
993 /* ensure a query for state will get the proper value */
994 gk20a_gpufreq_device->gk20a_freq_state = cur_state;
995
996 gk20a_clk_set_rate(g, target_freq);
997
998 return 0;
999}
1000
1001static struct thermal_cooling_device_ops tegra_gpu_cooling_ops = {
1002 .get_max_state = tegra_gpu_get_max_state,
1003 .get_cur_state = tegra_gpu_get_cur_state,
1004 .set_cur_state = tegra_gpu_set_cur_state,
1005};
1006
1007static int gk20a_create_device(
1008 struct platform_device *pdev, int devno, const char *cdev_name,
1009 struct cdev *cdev, struct device **out,
1010 const struct file_operations *ops)
1011{
1012 struct device *dev;
1013 int err;
1014 struct gk20a *g = get_gk20a(pdev);
1015
1016 gk20a_dbg_fn("");
1017
1018 cdev_init(cdev, ops);
1019 cdev->owner = THIS_MODULE;
1020
1021 err = cdev_add(cdev, devno, 1);
1022 if (err) {
1023 dev_err(&pdev->dev,
1024 "failed to add %s cdev\n", cdev_name);
1025 return err;
1026 }
1027
1028 dev = device_create(g->class, NULL, devno, NULL,
1029 (pdev->id <= 0) ? INTERFACE_NAME : INTERFACE_NAME ".%d",
1030 cdev_name, pdev->id);
1031
1032 if (IS_ERR(dev)) {
1033 err = PTR_ERR(dev);
1034 cdev_del(cdev);
1035 dev_err(&pdev->dev,
1036 "failed to create %s device for %s\n",
1037 cdev_name, pdev->name);
1038 return err;
1039 }
1040
1041 *out = dev;
1042 return 0;
1043}
1044
1045static void gk20a_user_deinit(struct platform_device *dev)
1046{
1047 struct gk20a *g = get_gk20a(dev);
1048
1049 if (g->channel.node) {
1050 device_destroy(g->class, g->channel.cdev.dev);
1051 cdev_del(&g->channel.cdev);
1052 }
1053
1054 if (g->as.node) {
1055 device_destroy(g->class, g->as.cdev.dev);
1056 cdev_del(&g->as.cdev);
1057 }
1058
1059 if (g->ctrl.node) {
1060 device_destroy(g->class, g->ctrl.cdev.dev);
1061 cdev_del(&g->ctrl.cdev);
1062 }
1063
1064 if (g->dbg.node) {
1065 device_destroy(g->class, g->dbg.cdev.dev);
1066 cdev_del(&g->dbg.cdev);
1067 }
1068
1069 if (g->prof.node) {
1070 device_destroy(g->class, g->prof.cdev.dev);
1071 cdev_del(&g->prof.cdev);
1072 }
1073
1074 if (g->cdev_region)
1075 unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS);
1076
1077 if (g->class)
1078 class_destroy(g->class);
1079}
1080
1081static int gk20a_user_init(struct platform_device *dev)
1082{
1083 int err;
1084 dev_t devno;
1085 struct gk20a *g = get_gk20a(dev);
1086
1087 g->class = class_create(THIS_MODULE, CLASS_NAME);
1088 if (IS_ERR(g->class)) {
1089 err = PTR_ERR(g->class);
1090 g->class = NULL;
1091 dev_err(&dev->dev,
1092 "failed to create " CLASS_NAME " class\n");
1093 goto fail;
1094 }
1095
1096 err = alloc_chrdev_region(&devno, 0, GK20A_NUM_CDEVS, CLASS_NAME);
1097 if (err) {
1098 dev_err(&dev->dev, "failed to allocate devno\n");
1099 goto fail;
1100 }
1101 g->cdev_region = devno;
1102
1103 err = gk20a_create_device(dev, devno++, "",
1104 &g->channel.cdev, &g->channel.node,
1105 &gk20a_channel_ops);
1106 if (err)
1107 goto fail;
1108
1109 err = gk20a_create_device(dev, devno++, "-as",
1110 &g->as.cdev, &g->as.node,
1111 &gk20a_as_ops);
1112 if (err)
1113 goto fail;
1114
1115 err = gk20a_create_device(dev, devno++, "-ctrl",
1116 &g->ctrl.cdev, &g->ctrl.node,
1117 &gk20a_ctrl_ops);
1118 if (err)
1119 goto fail;
1120
1121 err = gk20a_create_device(dev, devno++, "-dbg",
1122 &g->dbg.cdev, &g->dbg.node,
1123 &gk20a_dbg_ops);
1124 if (err)
1125 goto fail;
1126
1127 err = gk20a_create_device(dev, devno++, "-prof",
1128 &g->prof.cdev, &g->prof.node,
1129 &gk20a_prof_ops);
1130 if (err)
1131 goto fail;
1132
1133 return 0;
1134fail:
1135 gk20a_user_deinit(dev);
1136 return err;
1137}
1138
1139struct channel_gk20a *gk20a_get_channel_from_file(int fd)
1140{
1141 struct channel_gk20a *ch;
1142 struct file *f = fget(fd);
1143 if (!f)
1144 return 0;
1145
1146 if (f->f_op != &gk20a_channel_ops) {
1147 fput(f);
1148 return 0;
1149 }
1150
1151 ch = (struct channel_gk20a *)f->private_data;
1152 fput(f);
1153 return ch;
1154}
1155
1156static int gk20a_pm_enable_clk(struct device *dev)
1157{
1158 int index = 0;
1159 struct gk20a_platform *platform;
1160
1161 platform = dev_get_drvdata(dev);
1162 if (!platform)
1163 return -EINVAL;
1164
1165 for (index = 0; index < platform->num_clks; index++) {
1166 int err = clk_prepare_enable(platform->clk[index]);
1167 if (err)
1168 return -EINVAL;
1169 }
1170
1171 return 0;
1172}
1173
1174static int gk20a_pm_disable_clk(struct device *dev)
1175{
1176 int index = 0;
1177 struct gk20a_platform *platform;
1178
1179 platform = dev_get_drvdata(dev);
1180 if (!platform)
1181 return -EINVAL;
1182
1183 for (index = 0; index < platform->num_clks; index++)
1184 clk_disable_unprepare(platform->clk[index]);
1185
1186 return 0;
1187}
1188
1189#ifdef CONFIG_PM
1190const struct dev_pm_ops gk20a_pm_ops = {
1191#if defined(CONFIG_PM_RUNTIME) && !defined(CONFIG_PM_GENERIC_DOMAINS)
1192 .runtime_resume = gk20a_pm_enable_clk,
1193 .runtime_suspend = gk20a_pm_disable_clk,
1194#endif
1195};
1196#endif
1197
1198static int gk20a_pm_railgate(struct generic_pm_domain *domain)
1199{
1200 struct gk20a *g = container_of(domain, struct gk20a, pd);
1201 struct gk20a_platform *platform = platform_get_drvdata(g->dev);
1202 int ret = 0;
1203
1204 if (platform->railgate)
1205 ret = platform->railgate(platform->g->dev);
1206
1207 return ret;
1208}
1209
1210static int gk20a_pm_unrailgate(struct generic_pm_domain *domain)
1211{
1212 struct gk20a *g = container_of(domain, struct gk20a, pd);
1213 struct gk20a_platform *platform = platform_get_drvdata(g->dev);
1214 int ret = 0;
1215
1216 if (platform->unrailgate)
1217 ret = platform->unrailgate(platform->g->dev);
1218
1219 return ret;
1220}
1221
1222static int gk20a_pm_suspend(struct device *dev)
1223{
1224 struct gk20a_platform *platform = dev_get_drvdata(dev);
1225 int ret = 0;
1226
1227 if (atomic_read(&dev->power.usage_count) > 1)
1228 return -EBUSY;
1229
1230 ret = gk20a_pm_prepare_poweroff(dev);
1231 if (ret)
1232 return ret;
1233
1234 gk20a_scale_suspend(to_platform_device(dev));
1235
1236 if (platform->suspend)
1237 platform->suspend(dev);
1238
1239 return 0;
1240}
1241
1242static int gk20a_pm_resume(struct device *dev)
1243{
1244 int ret = 0;
1245
1246 ret = gk20a_pm_finalize_poweron(dev);
1247 if (ret)
1248 return ret;
1249
1250 gk20a_scale_resume(to_platform_device(dev));
1251
1252 return 0;
1253}
1254
1255static int gk20a_pm_initialise_domain(struct platform_device *pdev)
1256{
1257 struct gk20a_platform *platform = platform_get_drvdata(pdev);
1258 struct dev_power_governor *pm_domain_gov = NULL;
1259 struct generic_pm_domain *domain = &platform->g->pd;
1260 int ret = 0;
1261
1262 domain->name = kstrdup(pdev->name, GFP_KERNEL);
1263
1264 if (!platform->can_railgate)
1265 pm_domain_gov = &pm_domain_always_on_gov;
1266
1267 pm_genpd_init(domain, pm_domain_gov, true);
1268
1269 domain->power_off = gk20a_pm_railgate;
1270 domain->power_on = gk20a_pm_unrailgate;
1271 domain->dev_ops.start = gk20a_pm_enable_clk;
1272 domain->dev_ops.stop = gk20a_pm_disable_clk;
1273 domain->dev_ops.save_state = gk20a_pm_prepare_poweroff;
1274 domain->dev_ops.restore_state = gk20a_pm_finalize_poweron;
1275 domain->dev_ops.suspend = gk20a_pm_suspend;
1276 domain->dev_ops.resume = gk20a_pm_resume;
1277
1278 device_set_wakeup_capable(&pdev->dev, 0);
1279 ret = pm_genpd_add_device(domain, &pdev->dev);
1280
1281 if (platform->railgate_delay)
1282 pm_genpd_set_poweroff_delay(domain, platform->railgate_delay);
1283
1284 return ret;
1285}
1286
1287static int gk20a_pm_init(struct platform_device *dev)
1288{
1289 struct gk20a_platform *platform = platform_get_drvdata(dev);
1290 int err = 0;
1291
1292 /* Initialise pm runtime */
1293 if (platform->clockgate_delay) {
1294 pm_runtime_set_autosuspend_delay(&dev->dev,
1295 platform->clockgate_delay);
1296 pm_runtime_use_autosuspend(&dev->dev);
1297 }
1298
1299 pm_runtime_enable(&dev->dev);
1300 if (!pm_runtime_enabled(&dev->dev))
1301 gk20a_pm_enable_clk(&dev->dev);
1302
1303 /* Enable runtime railgating if possible. If not,
1304 * turn on the rail now. */
1305 if (platform->can_railgate && IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
1306 platform->railgate(dev);
1307 else
1308 platform->unrailgate(dev);
1309
1310 /* genpd will take care of runtime power management if it is enabled */
1311 if (IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
1312 err = gk20a_pm_initialise_domain(dev);
1313
1314 return err;
1315}
1316
1317static int gk20a_probe(struct platform_device *dev)
1318{
1319 struct gk20a *gk20a;
1320 int err;
1321 struct gk20a_platform *platform = NULL;
1322 struct cooling_device_gk20a *gpu_cdev = NULL;
1323
1324 if (dev->dev.of_node) {
1325 const struct of_device_id *match;
1326
1327 match = of_match_device(tegra_gk20a_of_match, &dev->dev);
1328 if (match)
1329 platform = (struct gk20a_platform *)match->data;
1330 } else
1331 platform = (struct gk20a_platform *)dev->dev.platform_data;
1332
1333 if (!platform) {
1334 dev_err(&dev->dev, "no platform data\n");
1335 return -ENODATA;
1336 }
1337
1338 gk20a_dbg_fn("");
1339
1340 platform_set_drvdata(dev, platform);
1341
1342 gk20a = kzalloc(sizeof(struct gk20a), GFP_KERNEL);
1343 if (!gk20a) {
1344 dev_err(&dev->dev, "couldn't allocate gk20a support");
1345 return -ENOMEM;
1346 }
1347
1348 set_gk20a(dev, gk20a);
1349 gk20a->dev = dev;
1350
1351 err = gk20a_user_init(dev);
1352 if (err)
1353 return err;
1354
1355 gk20a_init_support(dev);
1356
1357 spin_lock_init(&gk20a->mc_enable_lock);
1358
1359 /* Initialize the platform interface. */
1360 err = platform->probe(dev);
1361 if (err) {
1362 dev_err(&dev->dev, "platform probe failed");
1363 return err;
1364 }
1365
1366 err = gk20a_pm_init(dev);
1367 if (err) {
1368 dev_err(&dev->dev, "pm init failed");
1369 return err;
1370 }
1371
1372 /* Initialise scaling */
1373 if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
1374 gk20a_scale_init(dev);
1375
1376 if (platform->late_probe) {
1377 err = platform->late_probe(dev);
1378 if (err) {
1379 dev_err(&dev->dev, "late probe failed");
1380 return err;
1381 }
1382 }
1383
1384 gk20a_debug_init(dev);
1385
1386 /* Set DMA parameters to allow larger sgt lists */
1387 dev->dev.dma_parms = &gk20a->dma_parms;
1388 dma_set_max_seg_size(&dev->dev, UINT_MAX);
1389
1390 gpu_cdev = &gk20a->gk20a_cdev;
1391 gpu_cdev->gk20a_freq_table_size = tegra_gpufreq_table_size_get();
1392 gpu_cdev->gk20a_freq_state = 0;
1393 gpu_cdev->g = gk20a;
1394 gpu_cdev->gk20a_cooling_dev = thermal_cooling_device_register("gk20a_cdev", gpu_cdev,
1395 &tegra_gpu_cooling_ops);
1396
1397 gk20a->gr_idle_timeout_default =
1398 CONFIG_GK20A_DEFAULT_TIMEOUT;
1399 gk20a->timeouts_enabled = true;
1400
1401 /* Set up initial clock gating settings */
1402 if (tegra_platform_is_silicon()) {
1403 gk20a->slcg_enabled = true;
1404 gk20a->blcg_enabled = true;
1405 gk20a->elcg_enabled = true;
1406 gk20a->elpg_enabled = true;
1407 gk20a->aelpg_enabled = true;
1408 }
1409
1410 gk20a_create_sysfs(dev);
1411
1412#ifdef CONFIG_DEBUG_FS
1413 clk_gk20a_debugfs_init(dev);
1414
1415 spin_lock_init(&gk20a->debugfs_lock);
1416 gk20a->mm.ltc_enabled = true;
1417 gk20a->mm.ltc_enabled_debug = true;
1418 gk20a->debugfs_ltc_enabled =
1419 debugfs_create_bool("ltc_enabled", S_IRUGO|S_IWUSR,
1420 platform->debugfs,
1421 &gk20a->mm.ltc_enabled_debug);
1422 gk20a->mm.ltc_enabled_debug = true;
1423 gk20a->debugfs_gr_idle_timeout_default =
1424 debugfs_create_u32("gr_idle_timeout_default_us",
1425 S_IRUGO|S_IWUSR, platform->debugfs,
1426 &gk20a->gr_idle_timeout_default);
1427 gk20a->debugfs_timeouts_enabled =
1428 debugfs_create_bool("timeouts_enabled",
1429 S_IRUGO|S_IWUSR,
1430 platform->debugfs,
1431 &gk20a->timeouts_enabled);
1432 gk20a_pmu_debugfs_init(dev);
1433#endif
1434
1435#ifdef CONFIG_INPUT_CFBOOST
1436 cfb_add_device(&dev->dev);
1437#endif
1438
1439 return 0;
1440}
1441
1442static int __exit gk20a_remove(struct platform_device *dev)
1443{
1444 struct gk20a *g = get_gk20a(dev);
1445 gk20a_dbg_fn("");
1446
1447#ifdef CONFIG_INPUT_CFBOOST
1448 cfb_remove_device(&dev->dev);
1449#endif
1450
1451 if (g->remove_support)
1452 g->remove_support(dev);
1453
1454 gk20a_user_deinit(dev);
1455
1456 set_gk20a(dev, 0);
1457#ifdef CONFIG_DEBUG_FS
1458 debugfs_remove(g->debugfs_ltc_enabled);
1459 debugfs_remove(g->debugfs_gr_idle_timeout_default);
1460 debugfs_remove(g->debugfs_timeouts_enabled);
1461#endif
1462
1463 kfree(g);
1464
1465#ifdef CONFIG_PM_RUNTIME
1466 pm_runtime_put(&dev->dev);
1467 pm_runtime_disable(&dev->dev);
1468#else
1469 nvhost_module_disable_clk(&dev->dev);
1470#endif
1471
1472 return 0;
1473}
1474
1475static struct platform_driver gk20a_driver = {
1476 .probe = gk20a_probe,
1477 .remove = __exit_p(gk20a_remove),
1478 .driver = {
1479 .owner = THIS_MODULE,
1480 .name = "gk20a",
1481#ifdef CONFIG_OF
1482 .of_match_table = tegra_gk20a_of_match,
1483#endif
1484#ifdef CONFIG_PM
1485 .pm = &gk20a_pm_ops,
1486#endif
1487 }
1488};
1489
1490static int __init gk20a_init(void)
1491{
1492 return platform_driver_register(&gk20a_driver);
1493}
1494
1495static void __exit gk20a_exit(void)
1496{
1497 platform_driver_unregister(&gk20a_driver);
1498}
1499
1500bool is_gk20a_module(struct platform_device *dev)
1501{
1502 return &gk20a_driver.driver == dev->dev.driver;
1503}
1504
1505void gk20a_busy_noresume(struct platform_device *pdev)
1506{
1507 pm_runtime_get_noresume(&pdev->dev);
1508}
1509
1510int gk20a_channel_busy(struct platform_device *pdev)
1511{
1512 int ret = 0;
1513
1514 ret = gk20a_platform_channel_busy(pdev);
1515 if (ret)
1516 return ret;
1517
1518 ret = gk20a_busy(pdev);
1519 if (ret)
1520 gk20a_platform_channel_idle(pdev);
1521
1522 return ret;
1523}
1524
1525void gk20a_channel_idle(struct platform_device *pdev)
1526{
1527 gk20a_idle(pdev);
1528 gk20a_platform_channel_idle(pdev);
1529}
1530
1531int gk20a_busy(struct platform_device *pdev)
1532{
1533 int ret = 0;
1534
1535#ifdef CONFIG_PM_RUNTIME
1536 ret = pm_runtime_get_sync(&pdev->dev);
1537#endif
1538 gk20a_scale_notify_busy(pdev);
1539
1540 return ret < 0 ? ret : 0;
1541}
1542
1543void gk20a_idle(struct platform_device *pdev)
1544{
1545#ifdef CONFIG_PM_RUNTIME
1546 if (atomic_read(&pdev->dev.power.usage_count) == 1)
1547 gk20a_scale_notify_idle(pdev);
1548 pm_runtime_mark_last_busy(&pdev->dev);
1549 pm_runtime_put_sync_autosuspend(&pdev->dev);
1550#else
1551 gk20a_scale_notify_idle(pdev);
1552#endif
1553}
1554
1555void gk20a_disable(struct gk20a *g, u32 units)
1556{
1557 u32 pmc;
1558
1559 gk20a_dbg(gpu_dbg_info, "pmc disable: %08x\n", units);
1560
1561 spin_lock(&g->mc_enable_lock);
1562 pmc = gk20a_readl(g, mc_enable_r());
1563 pmc &= ~units;
1564 gk20a_writel(g, mc_enable_r(), pmc);
1565 spin_unlock(&g->mc_enable_lock);
1566}
1567
1568void gk20a_enable(struct gk20a *g, u32 units)
1569{
1570 u32 pmc;
1571
1572 gk20a_dbg(gpu_dbg_info, "pmc enable: %08x\n", units);
1573
1574 spin_lock(&g->mc_enable_lock);
1575 pmc = gk20a_readl(g, mc_enable_r());
1576 pmc |= units;
1577 gk20a_writel(g, mc_enable_r(), pmc);
1578 spin_unlock(&g->mc_enable_lock);
1579 gk20a_readl(g, mc_enable_r());
1580
1581 udelay(20);
1582}
1583
1584void gk20a_reset(struct gk20a *g, u32 units)
1585{
1586 gk20a_disable(g, units);
1587 udelay(20);
1588 gk20a_enable(g, units);
1589}
1590
1591int gk20a_init_gpu_characteristics(struct gk20a *g)
1592{
1593 struct nvhost_gpu_characteristics *gpu = &g->gpu_characteristics;
1594
1595 gpu->L2_cache_size = g->ops.ltc.determine_L2_size_bytes(g);
1596 gpu->on_board_video_memory_size = 0; /* integrated GPU */
1597
1598 gpu->num_gpc = g->gr.gpc_count;
1599 gpu->num_tpc_per_gpc = g->gr.max_tpc_per_gpc_count;
1600
1601 gpu->bus_type = NVHOST_GPU_BUS_TYPE_AXI; /* always AXI for now */
1602
1603 gpu->big_page_size = g->mm.big_page_size;
1604 gpu->compression_page_size = g->mm.compression_page_size;
1605
1606 return 0;
1607}
1608
1609int nvhost_vpr_info_fetch(void)
1610{
1611 struct gk20a *g = get_gk20a(to_platform_device(
1612 bus_find_device_by_name(&platform_bus_type,
1613 NULL, "gk20a.0")));
1614
1615 if (!g) {
1616		pr_info("gk20a isn't ready yet\n");
1617 return 0;
1618 }
1619
1620 return gk20a_mm_mmu_vpr_info_fetch(g);
1621}
1622
1623static const struct firmware *
1624do_request_firmware(struct device *dev, const char *prefix, const char *fw_name)
1625{
1626 const struct firmware *fw;
1627 char *fw_path = NULL;
1628 int path_len, err;
1629
1630 if (prefix) {
1631 path_len = strlen(prefix) + strlen(fw_name);
1632 path_len += 2; /* for the path separator and zero terminator*/
1633
1634 fw_path = kzalloc(sizeof(*fw_path) * path_len, GFP_KERNEL);
1635 if (!fw_path)
1636 return NULL;
1637
1638 sprintf(fw_path, "%s/%s", prefix, fw_name);
1639 fw_name = fw_path;
1640 }
1641
1642 err = request_firmware(&fw, fw_name, dev);
1643 kfree(fw_path);
1644 if (err)
1645 return NULL;
1646 return fw;
1647}
1648
1649/* This is a simple wrapper around request_firmware that takes 'fw_name' and
1650 * applies an IP specific relative path prefix to it. The caller is
1651 * responsible for calling release_firmware later. */
1652const struct firmware *
1653gk20a_request_firmware(struct gk20a *g, const char *fw_name)
1654{
1655 struct device *dev = &g->dev->dev;
1656 const struct firmware *fw;
1657
1658 /* current->fs is NULL when calling from SYS_EXIT.
1659 Add a check here to prevent crash in request_firmware */
1660 if (!current->fs || !fw_name)
1661 return NULL;
1662
1663 BUG_ON(!g->ops.name);
1664 fw = do_request_firmware(dev, g->ops.name, fw_name);
1665
1666#ifdef CONFIG_TEGRA_GK20A
1667 /* TO BE REMOVED - Support loading from legacy SOC specific path. */
1668 if (!fw)
1669 fw = nvhost_client_request_firmware(g->dev, fw_name);
1670#endif
1671
1672 if (!fw) {
1673 dev_err(dev, "failed to get firmware\n");
1674 return NULL;
1675 }
1676
1677 return fw;
1678}
1679
1680module_init(gk20a_init);
1681module_exit(gk20a_exit);
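
The gk20a_busy()/gk20a_idle() helpers above define the runtime-PM contract for touching GPU registers; a hypothetical caller sketch (not part of the patch) that keeps the GPU powered across a register read could look like:

static int example_read_pmc_enable(struct platform_device *pdev, u32 *val)
{
	struct gk20a *g = get_gk20a(pdev);
	int err;

	err = gk20a_busy(pdev);		/* takes a runtime-PM reference */
	if (err)
		return err;

	*val = gk20a_readl(g, mc_enable_r());

	gk20a_idle(pdev);		/* drops the reference, may autosuspend */
	return 0;
}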
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
new file mode 100644
index 00000000..a9081a9d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -0,0 +1,559 @@
1/*
2 * drivers/video/tegra/host/gk20a/gk20a.h
3 *
4 * GK20A Graphics
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef _NVHOST_GK20A_H_
22#define _NVHOST_GK20A_H_
23
24
25struct gk20a;
26struct fifo_gk20a;
27struct channel_gk20a;
28struct gr_gk20a;
29struct sim_gk20a;
30
31#include <linux/sched.h>
32#include <linux/spinlock.h>
33#include <linux/nvhost_gpu_ioctl.h>
34#include <linux/tegra-soc.h>
35
36#include "../../../arch/arm/mach-tegra/iomap.h"
37
38#include "as_gk20a.h"
39#include "clk_gk20a.h"
40#include "fifo_gk20a.h"
41#include "gr_gk20a.h"
42#include "sim_gk20a.h"
43#include "pmu_gk20a.h"
44#include "priv_ring_gk20a.h"
45#include "therm_gk20a.h"
46#include "platform_gk20a.h"
47
48extern struct platform_device tegra_gk20a_device;
49
50bool is_gk20a_module(struct platform_device *dev);
51
52struct cooling_device_gk20a {
53 struct thermal_cooling_device *gk20a_cooling_dev;
54 unsigned int gk20a_freq_state;
55 unsigned int gk20a_freq_table_size;
56 struct gk20a *g;
57};
58
59struct gpu_ops {
60 struct {
61 int (*determine_L2_size_bytes)(struct gk20a *gk20a);
62 void (*set_max_ways_evict_last)(struct gk20a *g, u32 max_ways);
63 int (*init_comptags)(struct gk20a *g, struct gr_gk20a *gr);
64 int (*clear_comptags)(struct gk20a *g, u32 min, u32 max);
65 void (*set_zbc_color_entry)(struct gk20a *g,
66 struct zbc_entry *color_val,
67 u32 index);
68 void (*set_zbc_depth_entry)(struct gk20a *g,
69 struct zbc_entry *depth_val,
70 u32 index);
71 void (*clear_zbc_color_entry)(struct gk20a *g, u32 index);
72 void (*clear_zbc_depth_entry)(struct gk20a *g, u32 index);
73 int (*init_zbc)(struct gk20a *g, struct gr_gk20a *gr);
74 void (*init_cbc)(struct gk20a *g, struct gr_gk20a *gr);
75 void (*sync_debugfs)(struct gk20a *g);
76 void (*elpg_flush)(struct gk20a *g);
77 } ltc;
78 struct {
79 int (*init_fs_state)(struct gk20a *g);
80 void (*access_smpc_reg)(struct gk20a *g, u32 quad, u32 offset);
81 void (*bundle_cb_defaults)(struct gk20a *g);
82 void (*cb_size_default)(struct gk20a *g);
83 int (*calc_global_ctx_buffer_size)(struct gk20a *g);
84 void (*commit_global_attrib_cb)(struct gk20a *g,
85 struct channel_ctx_gk20a *ch_ctx,
86 u64 addr, bool patch);
87 void (*commit_global_bundle_cb)(struct gk20a *g,
88 struct channel_ctx_gk20a *ch_ctx,
89 u64 addr, u64 size, bool patch);
90 int (*commit_global_cb_manager)(struct gk20a *g,
91 struct channel_gk20a *ch,
92 bool patch);
93 void (*commit_global_pagepool)(struct gk20a *g,
94 struct channel_ctx_gk20a *ch_ctx,
95 u64 addr, u32 size, bool patch);
96 void (*init_gpc_mmu)(struct gk20a *g);
97 int (*handle_sw_method)(struct gk20a *g, u32 addr,
98 u32 class_num, u32 offset, u32 data);
99 void (*set_alpha_circular_buffer_size)(struct gk20a *g,
100 u32 data);
101 void (*set_circular_buffer_size)(struct gk20a *g, u32 data);
102 void (*enable_hww_exceptions)(struct gk20a *g);
103 bool (*is_valid_class)(struct gk20a *g, u32 class_num);
104 void (*get_sm_dsm_perf_regs)(struct gk20a *g,
105 u32 *num_sm_dsm_perf_regs,
106 u32 **sm_dsm_perf_regs,
107 u32 *perf_register_stride);
108 void (*get_sm_dsm_perf_ctrl_regs)(struct gk20a *g,
109 u32 *num_sm_dsm_perf_regs,
110 u32 **sm_dsm_perf_regs,
111 u32 *perf_register_stride);
112 void (*set_hww_esr_report_mask)(struct gk20a *g);
113 int (*setup_alpha_beta_tables)(struct gk20a *g,
114 struct gr_gk20a *gr);
115 } gr;
116 const char *name;
117 struct {
118 void (*init_fs_state)(struct gk20a *g);
119 void (*reset)(struct gk20a *g);
120 void (*init_uncompressed_kind_map)(struct gk20a *g);
121 void (*init_kind_attr)(struct gk20a *g);
122 } fb;
123 struct {
124 void (*slcg_gr_load_gating_prod)(struct gk20a *g, bool prod);
125 void (*slcg_perf_load_gating_prod)(struct gk20a *g, bool prod);
126 void (*blcg_gr_load_gating_prod)(struct gk20a *g, bool prod);
127 void (*pg_gr_load_gating_prod)(struct gk20a *g, bool prod);
128 void (*slcg_therm_load_gating_prod)(struct gk20a *g, bool prod);
129 } clock_gating;
130 struct {
131 void (*bind_channel)(struct channel_gk20a *ch_gk20a);
132 } fifo;
133 struct pmu_v {
134 /*used for change of enum zbc update cmd id from ver 0 to ver1*/
135 u32 cmd_id_zbc_table_update;
136 u32 (*get_pmu_cmdline_args_size)(struct pmu_gk20a *pmu);
137 void (*set_pmu_cmdline_args_cpu_freq)(struct pmu_gk20a *pmu,
138 u32 freq);
139 void * (*get_pmu_cmdline_args_ptr)(struct pmu_gk20a *pmu);
140 u32 (*get_pmu_allocation_struct_size)(struct pmu_gk20a *pmu);
141 void (*set_pmu_allocation_ptr)(struct pmu_gk20a *pmu,
142 void **pmu_alloc_ptr, void *assign_ptr);
143 void (*pmu_allocation_set_dmem_size)(struct pmu_gk20a *pmu,
144 void *pmu_alloc_ptr, u16 size);
145 u16 (*pmu_allocation_get_dmem_size)(struct pmu_gk20a *pmu,
146 void *pmu_alloc_ptr);
147 u32 (*pmu_allocation_get_dmem_offset)(struct pmu_gk20a *pmu,
148 void *pmu_alloc_ptr);
149 u32 * (*pmu_allocation_get_dmem_offset_addr)(
150 struct pmu_gk20a *pmu, void *pmu_alloc_ptr);
151 void (*pmu_allocation_set_dmem_offset)(struct pmu_gk20a *pmu,
152 void *pmu_alloc_ptr, u32 offset);
153 void (*get_pmu_init_msg_pmu_queue_params)(
154 struct pmu_queue *queue, u32 id,
155 void *pmu_init_msg);
156 void *(*get_pmu_msg_pmu_init_msg_ptr)(
157 struct pmu_init_msg *init);
158 u16 (*get_pmu_init_msg_pmu_sw_mg_off)(
159 union pmu_init_msg_pmu *init_msg);
160 u16 (*get_pmu_init_msg_pmu_sw_mg_size)(
161 union pmu_init_msg_pmu *init_msg);
162 u32 (*get_pmu_perfmon_cmd_start_size)(void);
163 int (*get_perfmon_cmd_start_offsetofvar)(
164 enum pmu_perfmon_cmd_start_fields field);
165 void (*perfmon_start_set_cmd_type)(struct pmu_perfmon_cmd *pc,
166 u8 value);
167 void (*perfmon_start_set_group_id)(struct pmu_perfmon_cmd *pc,
168 u8 value);
169 void (*perfmon_start_set_state_id)(struct pmu_perfmon_cmd *pc,
170 u8 value);
171 void (*perfmon_start_set_flags)(struct pmu_perfmon_cmd *pc,
172 u8 value);
173 u8 (*perfmon_start_get_flags)(struct pmu_perfmon_cmd *pc);
174 u32 (*get_pmu_perfmon_cmd_init_size)(void);
175 int (*get_perfmon_cmd_init_offsetofvar)(
176 enum pmu_perfmon_cmd_start_fields field);
177 void (*perfmon_cmd_init_set_sample_buffer)(
178 struct pmu_perfmon_cmd *pc, u16 value);
179 void (*perfmon_cmd_init_set_dec_cnt)(
180 struct pmu_perfmon_cmd *pc, u8 value);
181 void (*perfmon_cmd_init_set_base_cnt_id)(
182 struct pmu_perfmon_cmd *pc, u8 value);
183 void (*perfmon_cmd_init_set_samp_period_us)(
184 struct pmu_perfmon_cmd *pc, u32 value);
185 void (*perfmon_cmd_init_set_num_cnt)(struct pmu_perfmon_cmd *pc,
186 u8 value);
187 void (*perfmon_cmd_init_set_mov_avg)(struct pmu_perfmon_cmd *pc,
188 u8 value);
189 void *(*get_pmu_seq_in_a_ptr)(
190 struct pmu_sequence *seq);
191 void *(*get_pmu_seq_out_a_ptr)(
192 struct pmu_sequence *seq);
193 } pmu_ver;
194};
195
196struct gk20a {
197 struct platform_device *dev;
198
199 struct resource *reg_mem;
200 void __iomem *regs;
201
202 struct resource *bar1_mem;
203 void __iomem *bar1;
204
205 bool power_on;
206 bool irq_requested;
207
208 struct clk_gk20a clk;
209 struct fifo_gk20a fifo;
210 struct gr_gk20a gr;
211 struct sim_gk20a sim;
212 struct mm_gk20a mm;
213 struct pmu_gk20a pmu;
214 struct cooling_device_gk20a gk20a_cdev;
215
216	/* Save pmu fw here so that it lives across suspend/resume.
217 pmu suspend destroys all pmu sw/hw states. Loading pmu
218 fw in resume crashes when the resume is from sys_exit. */
219 const struct firmware *pmu_fw;
220
221 u32 gr_idle_timeout_default;
222 u32 timeouts_enabled;
223
224 bool slcg_enabled;
225 bool blcg_enabled;
226 bool elcg_enabled;
227 bool elpg_enabled;
228 bool aelpg_enabled;
229
230#ifdef CONFIG_DEBUG_FS
231 spinlock_t debugfs_lock;
232 struct dentry *debugfs_ltc_enabled;
233 struct dentry *debugfs_timeouts_enabled;
234 struct dentry *debugfs_gr_idle_timeout_default;
235#endif
236 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
237
238 /* held while manipulating # of debug/profiler sessions present */
239 /* also prevents debug sessions from attaching until released */
240 struct mutex dbg_sessions_lock;
241 int dbg_sessions; /* number attached */
242 int dbg_powergating_disabled_refcount; /*refcount for pg disable */
243
244 void (*remove_support)(struct platform_device *);
245
246 u64 pg_ingating_time_us;
247 u64 pg_ungating_time_us;
248 u32 pg_gating_cnt;
249
250 spinlock_t mc_enable_lock;
251
252 struct nvhost_gpu_characteristics gpu_characteristics;
253
254 struct {
255 struct cdev cdev;
256 struct device *node;
257 } channel;
258
259 struct gk20a_as as;
260
261 struct {
262 struct cdev cdev;
263 struct device *node;
264 } ctrl;
265
266 struct {
267 struct cdev cdev;
268 struct device *node;
269 } dbg;
270
271 struct {
272 struct cdev cdev;
273 struct device *node;
274 } prof;
275
276 struct mutex client_lock;
277 int client_refcount; /* open channels and ctrl nodes */
278
279 dev_t cdev_region;
280 struct class *class;
281
282 struct gpu_ops ops;
283
284 int irq_stall;
285 int irq_nonstall;
286
287 struct generic_pm_domain pd;
288
289 struct devfreq *devfreq;
290
291 struct gk20a_scale_profile *scale_profile;
292
293 struct device_dma_parameters dma_parms;
294};
295
296static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g)
297{
298 return g->timeouts_enabled ?
299 g->gr_idle_timeout_default : MAX_SCHEDULE_TIMEOUT;
300}
301
302static inline struct gk20a *get_gk20a(struct platform_device *dev)
303{
304 return gk20a_get_platform(dev)->g;
305}
306
307enum BAR0_DEBUG_OPERATION {
308 BARO_ZERO_NOP = 0,
309 OP_END = 'DONE',
310 BAR0_READ32 = '0R32',
311 BAR0_WRITE32 = '0W32',
312};
313
314struct share_buffer_head {
315 enum BAR0_DEBUG_OPERATION operation;
316/* size of the operation item */
317 u32 size;
318 u32 completed;
319 u32 failed;
320 u64 context;
321 u64 completion_callback;
322};
323
324struct gk20a_cyclestate_buffer_elem {
325 struct share_buffer_head head;
326/* in */
327 u64 p_data;
328 u64 p_done;
329 u32 offset_bar0;
330 u16 first_bit;
331 u16 last_bit;
332/* out */
333/* keep 64 bits to be consistent */
334 u64 data;
335};
336
337/* debug accessories */
338
339#ifdef CONFIG_DEBUG_FS
340 /* debug info, default is compiled-in but effectively disabled (0 mask) */
341 #define GK20A_DEBUG
342 /* e.g.: echo 1 > /d/tegra_host/dbg_mask */
343 #define GK20A_DEFAULT_DBG_MASK 0
344#else
345 /* manually enable by defining GK20A_DEBUG and setting the mask */
346 /*#define GK20A_DEBUG*/
347 #define GK20A_DEFAULT_DBG_MASK (gpu_dbg_info)
348#endif
349
350enum gk20a_dbg_categories {
351 gpu_dbg_info = BIT(0), /* lightly verbose info */
352 gpu_dbg_fn = BIT(2), /* fn name tracing */
353 gpu_dbg_reg = BIT(3), /* register accesses, very verbose */
354 gpu_dbg_pte = BIT(4), /* gmmu ptes */
355 gpu_dbg_intr = BIT(5), /* interrupts */
356 gpu_dbg_pmu = BIT(6), /* gk20a pmu */
357 gpu_dbg_clk = BIT(7), /* gk20a clk */
358 gpu_dbg_map = BIT(8), /* mem mappings */
359 gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
360 gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
361};
362
363#if defined(GK20A_DEBUG)
364extern u32 gk20a_dbg_mask;
365extern u32 gk20a_dbg_ftrace;
366#define gk20a_dbg(dbg_mask, format, arg...) \
367do { \
368 if (unlikely((dbg_mask) & gk20a_dbg_mask)) { \
369 if (gk20a_dbg_ftrace) \
370 trace_printk(format "\n", ##arg); \
371 else \
372 pr_info("gk20a %s: " format "\n", \
373 __func__, ##arg); \
374 } \
375} while (0)
376
377#else /* GK20A_DEBUG */
378#define gk20a_dbg(dbg_mask, format, arg...) \
379do { \
380 if (0) \
381 pr_info("gk20a %s: " format "\n", __func__, ##arg);\
382} while (0)
383
384#endif
385
386#define gk20a_err(d, fmt, arg...) \
387 dev_err(d, "%s: " fmt "\n", __func__, ##arg)
388
389#define gk20a_warn(d, fmt, arg...) \
390 dev_warn(d, "%s: " fmt "\n", __func__, ##arg)
391
392#define gk20a_dbg_fn(fmt, arg...) \
393 gk20a_dbg(gpu_dbg_fn, fmt, ##arg)
394
395#define gk20a_dbg_info(fmt, arg...) \
396 gk20a_dbg(gpu_dbg_info, fmt, ##arg)
397
398/* mem access with dbg_mem logging */
399static inline u8 gk20a_mem_rd08(void *ptr, int b)
400{
401 u8 _b = ((const u8 *)ptr)[b];
402#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
403 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, _b);
404#endif
405 return _b;
406}
407static inline u16 gk20a_mem_rd16(void *ptr, int s)
408{
409 u16 _s = ((const u16 *)ptr)[s];
410#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
411 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, _s);
412#endif
413 return _s;
414}
415static inline u32 gk20a_mem_rd32(void *ptr, int w)
416{
417 u32 _w = ((const u32 *)ptr)[w];
418#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
419 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + sizeof(u32)*w, _w);
420#endif
421 return _w;
422}
423static inline void gk20a_mem_wr08(void *ptr, int b, u8 data)
424{
425#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
426 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, data);
427#endif
428 ((u8 *)ptr)[b] = data;
429}
430static inline void gk20a_mem_wr16(void *ptr, int s, u16 data)
431{
432#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
433 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, data);
434#endif
435 ((u16 *)ptr)[s] = data;
436}
437static inline void gk20a_mem_wr32(void *ptr, int w, u32 data)
438{
439#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
440 gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u32)*w, data);
441#endif
442 ((u32 *)ptr)[w] = data;
443}
444
445/* register accessors */
446static inline void gk20a_writel(struct gk20a *g, u32 r, u32 v)
447{
448 gk20a_dbg(gpu_dbg_reg, " r=0x%x v=0x%x", r, v);
449 writel(v, g->regs + r);
450}
451static inline u32 gk20a_readl(struct gk20a *g, u32 r)
452{
453 u32 v = readl(g->regs + r);
454 gk20a_dbg(gpu_dbg_reg, " r=0x%x v=0x%x", r, v);
455 return v;
456}
457
458static inline void gk20a_bar1_writel(struct gk20a *g, u32 b, u32 v)
459{
460 gk20a_dbg(gpu_dbg_reg, " b=0x%x v=0x%x", b, v);
461 writel(v, g->bar1 + b);
462}
463
464static inline u32 gk20a_bar1_readl(struct gk20a *g, u32 b)
465{
466 u32 v = readl(g->bar1 + b);
467 gk20a_dbg(gpu_dbg_reg, " b=0x%x v=0x%x", b, v);
468 return v;
469}
470
471/* convenience */
472static inline struct device *dev_from_gk20a(struct gk20a *g)
473{
474 return &g->dev->dev;
475}
476static inline struct gk20a *gk20a_from_as(struct gk20a_as *as)
477{
478 return container_of(as, struct gk20a, as);
479}
480static inline u32 u64_hi32(u64 n)
481{
482 return (u32)((n >> 32) & ~(u32)0);
483}
484
485static inline u32 u64_lo32(u64 n)
486{
487 return (u32)(n & ~(u32)0);
488}
489
490static inline u32 set_field(u32 val, u32 mask, u32 field)
491{
492 return ((val & ~mask) | field);
493}
494
495/* invalidate channel lookup tlb */
496static inline void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr)
497{
498 spin_lock(&gr->ch_tlb_lock);
499 memset(gr->chid_tlb, 0,
500 sizeof(struct gr_channel_map_tlb_entry) *
501 GR_CHANNEL_MAP_TLB_SIZE);
502 spin_unlock(&gr->ch_tlb_lock);
503}
504
505/* classes that the device supports */
506/* TBD: get these from an open-sourced SDK? */
507enum {
508 KEPLER_C = 0xA297,
509 FERMI_TWOD_A = 0x902D,
510 KEPLER_COMPUTE_A = 0xA0C0,
511 KEPLER_INLINE_TO_MEMORY_A = 0xA040,
512 KEPLER_DMA_COPY_A = 0xA0B5, /*not sure about this one*/
513};
514
515#if defined(CONFIG_GK20A_PMU)
516static inline int support_gk20a_pmu(void)
517{
518 return 1;
519}
520#else
521static inline int support_gk20a_pmu(void){return 0;}
522#endif
523
524void gk20a_create_sysfs(struct platform_device *dev);
525
526#ifdef CONFIG_DEBUG_FS
527int clk_gk20a_debugfs_init(struct platform_device *dev);
528#endif
529
530#define GK20A_BAR0_IORESOURCE_MEM 0
531#define GK20A_BAR1_IORESOURCE_MEM 1
532#define GK20A_SIM_IORESOURCE_MEM 2
533
534void gk20a_busy_noresume(struct platform_device *pdev);
535int gk20a_busy(struct platform_device *pdev);
536void gk20a_idle(struct platform_device *pdev);
537int gk20a_channel_busy(struct platform_device *pdev);
538void gk20a_channel_idle(struct platform_device *pdev);
539void gk20a_disable(struct gk20a *g, u32 units);
540void gk20a_enable(struct gk20a *g, u32 units);
541void gk20a_reset(struct gk20a *g, u32 units);
542int gk20a_get_client(struct gk20a *g);
543void gk20a_put_client(struct gk20a *g);
544
545const struct firmware *
546gk20a_request_firmware(struct gk20a *g, const char *fw_name);
547
548#define NVHOST_GPU_ARCHITECTURE_SHIFT 4
549
550/* constructs unique and compact GPUID from nvhost_gpu_characteristics
551 * arch/impl fields */
552#define GK20A_GPUID(arch, impl) ((u32) ((arch) | (impl)))
553
554#define GK20A_GPUID_GK20A \
555 GK20A_GPUID(NVHOST_GPU_ARCH_GK100, NVHOST_GPU_IMPL_GK20A)
556
557int gk20a_init_gpu_characteristics(struct gk20a *g);
558
559#endif /* _NVHOST_GK20A_H_ */
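The register accessors and set_field() above are the building blocks for read-modify-write sequences throughout the driver. Below is a minimal sketch of that pattern; the EXAMPLE_* macros are illustrative placeholders only (real code uses the generated hw_*_gk20a.h field helpers), not definitions from this change.

/* illustrative placeholders -- not real gk20a register/field definitions */
#define EXAMPLE_REG_R		0x00000200U	/* hypothetical register offset */
#define EXAMPLE_FIELD_M		0x0000000fU	/* hypothetical field mask */
#define EXAMPLE_FIELD_F(v)	((v) & 0xfU)	/* hypothetical field value */

static void example_reg_rmw(struct gk20a *g, u32 val)
{
	/* read, update one field, write back -- gk20a_readl/gk20a_writel log
	   each access under gpu_dbg_reg when the debug mask enables it */
	u32 reg = gk20a_readl(g, EXAMPLE_REG_R);

	reg = set_field(reg, EXAMPLE_FIELD_M, EXAMPLE_FIELD_F(val));
	gk20a_writel(g, EXAMPLE_REG_R, reg);
}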
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c
new file mode 100644
index 00000000..32c003b6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c
@@ -0,0 +1,1247 @@
1/*
2 * gk20a allocator
3 *
4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#include "gk20a_allocator.h"
20
21static inline void link_block_list(struct gk20a_allocator *allocator,
22 struct gk20a_alloc_block *block,
23 struct gk20a_alloc_block *prev,
24 struct rb_node *rb_parent);
25static inline void link_block_rb(struct gk20a_allocator *allocator,
26 struct gk20a_alloc_block *block,
27 struct rb_node **rb_link,
28 struct rb_node *rb_parent);
29static void link_block(struct gk20a_allocator *allocator,
30 struct gk20a_alloc_block *block,
31 struct gk20a_alloc_block *prev, struct rb_node **rb_link,
32 struct rb_node *rb_parent);
33static void insert_block(struct gk20a_allocator *allocator,
34 struct gk20a_alloc_block *block);
35
36static void unlink_block(struct gk20a_allocator *allocator,
37 struct gk20a_alloc_block *block,
38 struct gk20a_alloc_block *prev);
39static struct gk20a_alloc_block *unlink_blocks(
40 struct gk20a_allocator *allocator,
41 struct gk20a_alloc_block *block,
42 struct gk20a_alloc_block *prev, u32 end);
43
44static struct gk20a_alloc_block *find_block(
45 struct gk20a_allocator *allocator, u32 addr);
46static struct gk20a_alloc_block *find_block_prev(
47 struct gk20a_allocator *allocator, u32 addr,
48 struct gk20a_alloc_block **pprev);
49static struct gk20a_alloc_block *find_block_prepare(
50 struct gk20a_allocator *allocator, u32 addr,
51 struct gk20a_alloc_block **pprev, struct rb_node ***rb_link,
52 struct rb_node **rb_parent);
53
54static u32 check_free_space(u32 addr, u32 limit, u32 len, u32 align);
55static void update_free_addr_cache(struct gk20a_allocator *allocator,
56 struct gk20a_alloc_block *block,
57 u32 addr, u32 len, bool free);
58static int find_free_area(struct gk20a_allocator *allocator,
59 u32 *addr, u32 len);
60static int find_free_area_nc(struct gk20a_allocator *allocator,
61 u32 *addr, u32 *len);
62
63static void adjust_block(struct gk20a_alloc_block *block,
64 u32 start, u32 end,
65 struct gk20a_alloc_block *insert);
66static struct gk20a_alloc_block *merge_block(
67 struct gk20a_allocator *allocator,
68 struct gk20a_alloc_block *block, u32 addr, u32 end);
69static int split_block(struct gk20a_allocator *allocator,
70 struct gk20a_alloc_block *block,
71 u32 addr, int new_below);
72
73static int block_alloc_single_locked(struct gk20a_allocator *allocator,
74 u32 *addr, u32 len);
75static int block_alloc_list_locked(struct gk20a_allocator *allocator,
76 u32 *addr, u32 len,
77 struct gk20a_alloc_block **pblock);
78static int block_free_locked(struct gk20a_allocator *allocator,
79 u32 addr, u32 len);
80static void block_free_list_locked(struct gk20a_allocator *allocator,
81 struct gk20a_alloc_block *list);
82
83/* link a block into allocator block list */
84static inline void link_block_list(struct gk20a_allocator *allocator,
85 struct gk20a_alloc_block *block,
86 struct gk20a_alloc_block *prev,
87 struct rb_node *rb_parent)
88{
89 struct gk20a_alloc_block *next;
90
91 block->prev = prev;
92 if (prev) {
93 next = prev->next;
94 prev->next = block;
95 } else {
96 allocator->block_first = block;
97 if (rb_parent)
98 next = rb_entry(rb_parent,
99 struct gk20a_alloc_block, rb);
100 else
101 next = NULL;
102 }
103 block->next = next;
104 if (next)
105 next->prev = block;
106}
107
108/* link a block into allocator rb tree */
109static inline void link_block_rb(struct gk20a_allocator *allocator,
110 struct gk20a_alloc_block *block, struct rb_node **rb_link,
111 struct rb_node *rb_parent)
112{
113 rb_link_node(&block->rb, rb_parent, rb_link);
114 rb_insert_color(&block->rb, &allocator->rb_root);
115}
116
117/* add a block to allocator with known location */
118static void link_block(struct gk20a_allocator *allocator,
119 struct gk20a_alloc_block *block,
120 struct gk20a_alloc_block *prev, struct rb_node **rb_link,
121 struct rb_node *rb_parent)
122{
123 struct gk20a_alloc_block *next;
124
125 link_block_list(allocator, block, prev, rb_parent);
126 link_block_rb(allocator, block, rb_link, rb_parent);
127 allocator->block_count++;
128
129 next = block->next;
130 allocator_dbg(allocator, "link new block %d:%d between block %d:%d and block %d:%d",
131 block->start, block->end,
132 prev ? prev->start : -1, prev ? prev->end : -1,
133 next ? next->start : -1, next ? next->end : -1);
134}
135
136/* add a block to allocator */
137static void insert_block(struct gk20a_allocator *allocator,
138 struct gk20a_alloc_block *block)
139{
140 struct gk20a_alloc_block *prev;
141 struct rb_node **rb_link, *rb_parent;
142
143 find_block_prepare(allocator, block->start,
144 &prev, &rb_link, &rb_parent);
145 link_block(allocator, block, prev, rb_link, rb_parent);
146}
147
148/* remove a block from allocator */
149static void unlink_block(struct gk20a_allocator *allocator,
150 struct gk20a_alloc_block *block,
151 struct gk20a_alloc_block *prev)
152{
153 struct gk20a_alloc_block *next = block->next;
154
155 allocator_dbg(allocator, "unlink block %d:%d between block %d:%d and block %d:%d",
156 block->start, block->end,
157 prev ? prev->start : -1, prev ? prev->end : -1,
158 next ? next->start : -1, next ? next->end : -1);
159
160 BUG_ON(block->start < allocator->base);
161 BUG_ON(block->end > allocator->limit);
162
163 if (prev)
164 prev->next = next;
165 else
166 allocator->block_first = next;
167
168 if (next)
169 next->prev = prev;
170 rb_erase(&block->rb, &allocator->rb_root);
171 if (allocator->block_recent == block)
172 allocator->block_recent = prev;
173
174 allocator->block_count--;
175}
176
177/* remove a list of blocks from the allocator. the list can contain both
178 regular blocks and non-contiguous blocks. skip the non-contiguous
179 blocks, move the regular blocks onto a separate list, and return its head */
180static struct gk20a_alloc_block *
181unlink_blocks(struct gk20a_allocator *allocator,
182 struct gk20a_alloc_block *block,
183 struct gk20a_alloc_block *prev,
184 u32 end)
185{
186 struct gk20a_alloc_block **insertion_point;
187 struct gk20a_alloc_block *last_unfreed_block = prev;
188 struct gk20a_alloc_block *last_freed_block = NULL;
189 struct gk20a_alloc_block *first_freed_block = NULL;
190
191 insertion_point = (prev ? &prev->next : &allocator->block_first);
192 *insertion_point = NULL;
193
194 do {
195 if (!block->nc_block) {
196 allocator_dbg(allocator, "unlink block %d:%d",
197 block->start, block->end);
198 if (last_freed_block)
199 last_freed_block->next = block;
200 block->prev = last_freed_block;
201 rb_erase(&block->rb, &allocator->rb_root);
202 last_freed_block = block;
203 allocator->block_count--;
204 if (!first_freed_block)
205 first_freed_block = block;
206 } else {
207 allocator_dbg(allocator, "skip nc block %d:%d",
208 block->start, block->end);
209 if (!*insertion_point)
210 *insertion_point = block;
211 if (last_unfreed_block)
212 last_unfreed_block->next = block;
213 block->prev = last_unfreed_block;
214 last_unfreed_block = block;
215 }
216 block = block->next;
217 } while (block && block->start < end);
218
219 if (!*insertion_point)
220 *insertion_point = block;
221
222 if (block)
223 block->prev = last_unfreed_block;
224 if (last_unfreed_block)
225 last_unfreed_block->next = block;
226 if (last_freed_block)
227 last_freed_block->next = NULL;
228
229 allocator->block_recent = NULL;
230
231 return first_freed_block;
232}
233
234/* Look up the first block which satisfies addr < block->end,
235 NULL if none */
236static struct gk20a_alloc_block *
237find_block(struct gk20a_allocator *allocator, u32 addr)
238{
239 struct gk20a_alloc_block *block = allocator->block_recent;
240
241 if (!(block && block->end > addr && block->start <= addr)) {
242 struct rb_node *rb_node;
243
244 rb_node = allocator->rb_root.rb_node;
245 block = NULL;
246
247 while (rb_node) {
248 struct gk20a_alloc_block *block_tmp;
249
250 block_tmp = rb_entry(rb_node,
251 struct gk20a_alloc_block, rb);
252
253 if (block_tmp->end > addr) {
254 block = block_tmp;
255 if (block_tmp->start <= addr)
256 break;
257 rb_node = rb_node->rb_left;
258 } else
259 rb_node = rb_node->rb_right;
260 if (block)
261 allocator->block_recent = block;
262 }
263 }
264 return block;
265}
266
267/* Same as find_block, but also return a pointer to the previous block */
268static struct gk20a_alloc_block *
269find_block_prev(struct gk20a_allocator *allocator, u32 addr,
270 struct gk20a_alloc_block **pprev)
271{
272 struct gk20a_alloc_block *block = NULL, *prev = NULL;
273 struct rb_node *rb_node;
274 if (!allocator)
275 goto out;
276
277 block = allocator->block_first;
278
279 rb_node = allocator->rb_root.rb_node;
280
281 while (rb_node) {
282 struct gk20a_alloc_block *block_tmp;
283 block_tmp = rb_entry(rb_node, struct gk20a_alloc_block, rb);
284
285 if (addr < block_tmp->end)
286 rb_node = rb_node->rb_left;
287 else {
288 prev = block_tmp;
289 if (!prev->next || addr < prev->next->end)
290 break;
291 rb_node = rb_node->rb_right;
292 }
293 }
294
295out:
296 *pprev = prev;
297 return prev ? prev->next : block;
298}
299
300/* Same as find_block, but also return a pointer to the previous block
301 and return rb_node to prepare for rbtree insertion */
302static struct gk20a_alloc_block *
303find_block_prepare(struct gk20a_allocator *allocator, u32 addr,
304 struct gk20a_alloc_block **pprev, struct rb_node ***rb_link,
305 struct rb_node **rb_parent)
306{
307 struct gk20a_alloc_block *block;
308 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
309
310 __rb_link = &allocator->rb_root.rb_node;
311 rb_prev = __rb_parent = NULL;
312 block = NULL;
313
314 while (*__rb_link) {
315 struct gk20a_alloc_block *block_tmp;
316
317 __rb_parent = *__rb_link;
318 block_tmp = rb_entry(__rb_parent,
319 struct gk20a_alloc_block, rb);
320
321 if (block_tmp->end > addr) {
322 block = block_tmp;
323 if (block_tmp->start <= addr)
324 break;
325 __rb_link = &__rb_parent->rb_left;
326 } else {
327 rb_prev = __rb_parent;
328 __rb_link = &__rb_parent->rb_right;
329 }
330 }
331
332 *pprev = NULL;
333 if (rb_prev)
334 *pprev = rb_entry(rb_prev, struct gk20a_alloc_block, rb);
335 *rb_link = __rb_link;
336 *rb_parent = __rb_parent;
337 return block;
338}
339
340/* return available space */
341static u32 check_free_space(u32 addr, u32 limit, u32 len, u32 align)
342{
343 if (addr >= limit)
344 return 0;
345 if (addr + len <= limit)
346 return len;
347 return (limit - addr) & ~(align - 1);
348}
349
350/* update first_free_addr/last_free_addr based on a newly freed or
351 allocated address range; called from both the free and alloc paths */
352static void update_free_addr_cache(struct gk20a_allocator *allocator,
353 struct gk20a_alloc_block *next,
354 u32 addr, u32 len, bool free)
355{
356 /* update from block free */
357 if (free) {
358 if (allocator->first_free_addr > addr)
359 allocator->first_free_addr = addr;
360 } else { /* update from block alloc */
361 if (allocator->last_free_addr < addr + len)
362 allocator->last_free_addr = addr + len;
363 if (allocator->first_free_addr == addr) {
364 if (!next || next->start > addr + len)
365 allocator->first_free_addr = addr + len;
366 else
367 allocator->first_free_addr = next->end;
368 }
369 }
370
371 if (allocator->first_free_addr > allocator->last_free_addr)
372 allocator->first_free_addr = allocator->last_free_addr;
373}
374
375/* find a free address range for a fixed len */
376static int find_free_area(struct gk20a_allocator *allocator,
377 u32 *addr, u32 len)
378{
379 struct gk20a_alloc_block *block;
380 u32 start_addr, search_base, search_limit;
381
382 /* fixed addr allocation */
383 /* note: constraints for fixed are handled by caller */
384 if (*addr) {
385 block = find_block(allocator, *addr);
386 if (allocator->limit - len >= *addr &&
387 (!block || *addr + len <= block->start)) {
388 update_free_addr_cache(allocator, block,
389 *addr, len, false);
390 return 0;
391 } else
392 return -ENOMEM;
393 }
394
395 if (!allocator->constraint.enable) {
396 search_base = allocator->base;
397 search_limit = allocator->limit;
398 } else {
399 start_addr = *addr = allocator->constraint.base;
400 search_base = allocator->constraint.base;
401 search_limit = allocator->constraint.limit;
402 }
403
404 /* cached_hole_size has max free space up to last_free_addr */
405 if (len > allocator->cached_hole_size)
406 start_addr = *addr = allocator->last_free_addr;
407 else {
408 start_addr = *addr = allocator->base;
409 allocator->cached_hole_size = 0;
410 }
411
412 allocator_dbg(allocator, "start search addr : %d", start_addr);
413
414full_search:
415 for (block = find_block(allocator, *addr);; block = block->next) {
416 if (search_limit - len < *addr) {
417 /* start a new search in case we missed any hole */
418 if (start_addr != search_base) {
419 start_addr = *addr = search_base;
420 allocator->cached_hole_size = 0;
421 allocator_dbg(allocator, "start a new search from base");
422 goto full_search;
423 }
424 return -ENOMEM;
425 }
426 if (!block || *addr + len <= block->start) {
427 update_free_addr_cache(allocator, block,
428 *addr, len, false);
429 allocator_dbg(allocator, "free space from %d, len %d",
430 *addr, len);
431 allocator_dbg(allocator, "next free addr: %d",
432 allocator->last_free_addr);
433 return 0;
434 }
435 if (*addr + allocator->cached_hole_size < block->start)
436 allocator->cached_hole_size = block->start - *addr;
437 *addr = block->end;
438 }
439}
440
441/* find a free address range; it may be shorter than len as long as it meets the alignment requirement */
442static int find_free_area_nc(struct gk20a_allocator *allocator,
443 u32 *addr, u32 *len)
444{
445 struct gk20a_alloc_block *block;
446 u32 start_addr;
447 u32 avail_len;
448
449 /* fixed addr allocation */
450 if (*addr) {
451 block = find_block(allocator, *addr);
452 if (allocator->limit - *len >= *addr) {
453 if (!block)
454 return 0;
455
456 avail_len = check_free_space(*addr, block->start,
457 *len, allocator->align);
458 if (avail_len != 0) {
459 update_free_addr_cache(allocator, block,
460 *addr, avail_len, false);
461 allocator_dbg(allocator,
462 "free space between %d, %d, len %d",
463 *addr, block->start, avail_len);
464 allocator_dbg(allocator, "next free addr: %d",
465 allocator->last_free_addr);
466 *len = avail_len;
467 return 0;
468 } else
469 return -ENOMEM;
470 } else
471 return -ENOMEM;
472 }
473
474 start_addr = *addr = allocator->first_free_addr;
475
476 allocator_dbg(allocator, "start search addr : %d", start_addr);
477
478 for (block = find_block(allocator, *addr);; block = block->next) {
479 if (allocator->limit - *len < *addr)
480 return -ENOMEM;
481 if (!block) {
482 update_free_addr_cache(allocator, block,
483 *addr, *len, false);
484 allocator_dbg(allocator, "free space from %d, len %d",
485 *addr, *len);
486 allocator_dbg(allocator, "next free addr: %d",
487 allocator->first_free_addr);
488 return 0;
489 }
490
491 avail_len = check_free_space(*addr, block->start,
492 *len, allocator->align);
493 if (avail_len != 0) {
494 update_free_addr_cache(allocator, block,
495 *addr, avail_len, false);
496 allocator_dbg(allocator, "free space between %d, %d, len %d",
497 *addr, block->start, avail_len);
498 allocator_dbg(allocator, "next free addr: %d",
499 allocator->first_free_addr);
500 *len = avail_len;
501 return 0;
502 }
503 if (*addr + allocator->cached_hole_size < block->start)
504 allocator->cached_hole_size = block->start - *addr;
505 *addr = block->end;
506 }
507}
508
509/* expand or shrink a block to a new start and end;
510 for a shrink, split_block() provides the block to insert */
511static void adjust_block(struct gk20a_alloc_block *block,
512 u32 start, u32 end, struct gk20a_alloc_block *insert)
513{
514 struct gk20a_allocator *allocator = block->allocator;
515
516 allocator_dbg(allocator, "curr block %d:%d, new start %d, new end %d",
517 block->start, block->end, start, end);
518
519 /* expand */
520 if (!insert) {
521 if (start == block->end) {
522 struct gk20a_alloc_block *next = block->next;
523
524 if (next && end == next->start) {
525 /* ....AAAA.... */
526 /* PPPP....NNNN */
527 /* PPPPPPPPPPPP */
528 unlink_block(allocator, next, block);
529 block->end = next->end;
530 kmem_cache_free(allocator->block_cache, next);
531 } else {
532 /* ....AAAA.... */
533 /* PPPP........ */
534 /* PPPPPPPP.... */
535 block->end = end;
536 }
537 }
538
539 if (end == block->start) {
540 /* ....AAAA.... */
541 /* ........NNNN */
542 /* PP..NNNNNNNN ....NNNNNNNN */
543 block->start = start;
544 }
545 } else { /* shrink */
546 /* BBBBBBBB -> BBBBIIII OR BBBBBBBB -> IIIIBBBB */
547 block->start = start;
548 block->end = end;
549 insert_block(allocator, insert);
550 }
551}
552
553/* given a range [addr, end], merge it with blocks before or after or both
554 if they can be combined into a contiguous block */
555static struct gk20a_alloc_block *
556merge_block(struct gk20a_allocator *allocator,
557 struct gk20a_alloc_block *prev, u32 addr, u32 end)
558{
559 struct gk20a_alloc_block *next;
560
561 if (prev)
562 next = prev->next;
563 else
564 next = allocator->block_first;
565
566 allocator_dbg(allocator, "curr block %d:%d", addr, end);
567 if (prev)
568 allocator_dbg(allocator, "prev block %d:%d",
569 prev->start, prev->end);
570 if (next)
571 allocator_dbg(allocator, "next block %d:%d",
572 next->start, next->end);
573
574 /* don't merge with non-contiguous allocation block */
575 if (prev && prev->end == addr && !prev->nc_block) {
576 adjust_block(prev, addr, end, NULL);
577 return prev;
578 }
579
580 /* don't merge with non-contiguous allocation block */
581 if (next && end == next->start && !next->nc_block) {
582 adjust_block(next, addr, end, NULL);
583 return next;
584 }
585
586 return NULL;
587}
588
589/* split a block based on addr. addr must be within (start, end).
590 if new_below == 1, link new block before adjusted current block */
591static int split_block(struct gk20a_allocator *allocator,
592 struct gk20a_alloc_block *block, u32 addr, int new_below)
593{
594 struct gk20a_alloc_block *new_block;
595
596 allocator_dbg(allocator, "start %d, split %d, end %d, new_below %d",
597 block->start, addr, block->end, new_below);
598
599 BUG_ON(!(addr > block->start && addr < block->end));
600
601 new_block = kmem_cache_alloc(allocator->block_cache, GFP_KERNEL);
602 if (!new_block)
603 return -ENOMEM;
604
605 *new_block = *block;
606
607 if (new_below)
608 new_block->end = addr;
609 else
610 new_block->start = addr;
611
612 if (new_below)
613 adjust_block(block, addr, block->end, new_block);
614 else
615 adjust_block(block, block->start, addr, new_block);
616
617 return 0;
618}
619
620/* free a list of blocks */
621static void free_blocks(struct gk20a_allocator *allocator,
622 struct gk20a_alloc_block *block)
623{
624 struct gk20a_alloc_block *curr_block;
625 while (block) {
626 curr_block = block;
627 block = block->next;
628 kmem_cache_free(allocator->block_cache, curr_block);
629 }
630}
631
632/* called with rw_sema acquired */
633static int block_alloc_single_locked(struct gk20a_allocator *allocator,
634 u32 *addr_req, u32 len)
635{
636 struct gk20a_alloc_block *block, *prev;
637 struct rb_node **rb_link, *rb_parent;
638 u32 addr = *addr_req;
639 int err;
640
641 *addr_req = ~0;
642
643 err = find_free_area(allocator, &addr, len);
644 if (err)
645 return err;
646
647 find_block_prepare(allocator, addr, &prev, &rb_link, &rb_parent);
648
649 /* merge requested free space with existing block(s)
650 if they can be combined into one contiguous block */
651 block = merge_block(allocator, prev, addr, addr + len);
652 if (block) {
653 *addr_req = addr;
654 return 0;
655 }
656
657 /* create a new block if cannot merge */
658 block = kmem_cache_zalloc(allocator->block_cache, GFP_KERNEL);
659 if (!block)
660 return -ENOMEM;
661
662 block->allocator = allocator;
663 block->start = addr;
664 block->end = addr + len;
665
666 link_block(allocator, block, prev, rb_link, rb_parent);
667
668 *addr_req = addr;
669
670 return 0;
671}
672
673static int block_alloc_list_locked(struct gk20a_allocator *allocator,
674 u32 *addr_req, u32 nc_len, struct gk20a_alloc_block **pblock)
675{
676 struct gk20a_alloc_block *block;
677 struct gk20a_alloc_block *nc_head = NULL, *nc_prev = NULL;
678 u32 addr = *addr_req, len = nc_len;
679 int err = 0;
680
681 *addr_req = ~0;
682
683 while (nc_len > 0) {
684 err = find_free_area_nc(allocator, &addr, &len);
685 if (err) {
686 allocator_dbg(allocator, "not enough free space");
687 goto clean_up;
688 }
689
690 /* never merge non-contiguous allocation block,
691 just create a new block */
692 block = kmem_cache_zalloc(allocator->block_cache,
693 GFP_KERNEL);
694 if (!block) {
695 err = -ENOMEM;
696 goto clean_up;
697 }
698
699 block->allocator = allocator;
700 block->start = addr;
701 block->end = addr + len;
702
703 insert_block(allocator, block);
704
705 block->nc_prev = nc_prev;
706 if (nc_prev)
707 nc_prev->nc_next = block;
708 nc_prev = block;
709 block->nc_block = true;
710
711 if (!nc_head)
712 nc_head = block;
713
714 if (*addr_req == ~0)
715 *addr_req = addr;
716
717 addr = 0;
718 nc_len -= len;
719 len = nc_len;
720 allocator_dbg(allocator, "remaining length %d", nc_len);
721 }
722
723clean_up:
724 if (err) {
725 while (nc_head) {
726 unlink_block(allocator, nc_head, nc_head->prev);
727 nc_prev = nc_head;
728 nc_head = nc_head->nc_next;
729 kmem_cache_free(allocator->block_cache, nc_prev);
730 }
731 *pblock = NULL;
732 *addr_req = ~0;
733 } else {
734 *pblock = nc_head;
735 }
736
737 return err;
738}
739
740/* called with rw_sema acquired */
741static int block_free_locked(struct gk20a_allocator *allocator,
742 u32 addr, u32 len)
743{
744 struct gk20a_alloc_block *block, *prev, *last;
745 u32 end;
746 int err;
747
748 /* no block has block->end > addr, already free */
749 block = find_block_prev(allocator, addr, &prev);
750 if (!block)
751 return 0;
752
753 allocator_dbg(allocator, "first block in free range %d:%d",
754 block->start, block->end);
755
756 end = addr + len;
757 /* not in any block, already free */
758 if (block->start >= end)
759 return 0;
760
761 /* don't touch nc_block in range free */
762 if (addr > block->start && !block->nc_block) {
763 int err = split_block(allocator, block, addr, 0);
764 if (err)
765 return err;
766 prev = block;
767 }
768
769 last = find_block(allocator, end);
770 if (last && end > last->start && !last->nc_block) {
771
772 allocator_dbg(allocator, "last block in free range %d:%d",
773 last->start, last->end);
774
775 err = split_block(allocator, last, end, 1);
776 if (err)
777 return err;
778 }
779
780 block = prev ? prev->next : allocator->block_first;
781
782 allocator_dbg(allocator, "first block for free %d:%d",
783 block->start, block->end);
784
785 /* remove blocks between [addr, addr + len) from rb tree
786 and put them in a list */
787 block = unlink_blocks(allocator, block, prev, end);
788 free_blocks(allocator, block);
789
790 update_free_addr_cache(allocator, NULL, addr, len, true);
791
792 return 0;
793}
794
795/* called with rw_sema acquired */
796static void block_free_list_locked(struct gk20a_allocator *allocator,
797 struct gk20a_alloc_block *list)
798{
799 struct gk20a_alloc_block *block;
800 u32 len;
801
802 update_free_addr_cache(allocator, NULL,
803 list->start, list->end - list->start, true);
804
805 while (list) {
806 block = list;
807 unlink_block(allocator, block, block->prev);
808
809 len = block->end - block->start;
810 if (allocator->cached_hole_size < len)
811 allocator->cached_hole_size = len;
812
813 list = block->nc_next;
814 kmem_cache_free(allocator->block_cache, block);
815 }
816}
817
818static int
819gk20a_allocator_constrain(struct gk20a_allocator *a,
820 bool enable, u32 base, u32 limit)
821{
822 if (enable) {
823 a->constraint.enable = (base >= a->base &&
824 limit <= a->limit);
825 if (!a->constraint.enable)
826 return -EINVAL;
827 a->constraint.base = base;
828 a->constraint.limit = limit;
829 a->first_free_addr = a->last_free_addr = base;
830
831 } else {
832 a->constraint.enable = false;
833 a->first_free_addr = a->last_free_addr = a->base;
834 }
835
836 a->cached_hole_size = 0;
837
838 return 0;
839}
840
841/* init allocator struct */
842int gk20a_allocator_init(struct gk20a_allocator *allocator,
843 const char *name, u32 start, u32 len, u32 align)
844{
845 memset(allocator, 0, sizeof(struct gk20a_allocator));
846
847 strncpy(allocator->name, name, 32);
848
849 allocator->block_cache =
850 kmem_cache_create(allocator->name,
851 sizeof(struct gk20a_alloc_block), 0,
852 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
853 if (!allocator->block_cache)
854 return -ENOMEM;
855
856 allocator->rb_root = RB_ROOT;
857
858 allocator->base = start;
859 allocator->limit = start + len - 1;
860 allocator->align = align;
861
862 allocator_dbg(allocator, "%s : base %d, limit %d, align %d",
863 allocator->name, allocator->base,
864 allocator->limit, allocator->align);
865
866 allocator->first_free_addr = allocator->last_free_addr = start;
867 allocator->cached_hole_size = len;
868
869 init_rwsem(&allocator->rw_sema);
870
871 allocator->alloc = gk20a_allocator_block_alloc;
872 allocator->alloc_nc = gk20a_allocator_block_alloc_nc;
873 allocator->free = gk20a_allocator_block_free;
874 allocator->free_nc = gk20a_allocator_block_free_nc;
875 allocator->constrain = gk20a_allocator_constrain;
876
877 return 0;
878}
879
880/* destroy allocator, free all remaining blocks if any */
881void gk20a_allocator_destroy(struct gk20a_allocator *allocator)
882{
883 struct gk20a_alloc_block *block, *next;
884 u32 free_count = 0;
885
886 down_write(&allocator->rw_sema);
887
888 for (block = allocator->block_first; block; ) {
889 allocator_dbg(allocator, "free remaining block %d:%d",
890 block->start, block->end);
891 next = block->next;
892 kmem_cache_free(allocator->block_cache, block);
893 free_count++;
894 block = next;
895 }
896
897 up_write(&allocator->rw_sema);
898
899 /* BUG if block_count doesn't match the real number of blocks */
900 BUG_ON(free_count != allocator->block_count);
901
902 kmem_cache_destroy(allocator->block_cache);
903
904 memset(allocator, 0, sizeof(struct gk20a_allocator));
905}
906
907/*
908 * Pass a non-zero *addr for a fixed-address allocation. If *addr == 0,
909 * the allocator chooses the range and returns its base address in *addr.
910 *
911 * Contiguous allocation: the request is satisfied with a single block of
912 * contiguous addresses.
913*/
914int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
915 u32 *addr, u32 len)
916{
917 int ret;
918#if defined(ALLOCATOR_DEBUG)
919 struct gk20a_alloc_block *block;
920 bool should_fail = false;
921#endif
922
923 allocator_dbg(allocator, "[in] addr %d, len %d", *addr, len);
924
925 if (*addr + len > allocator->limit || /* check addr range */
926 *addr & (allocator->align - 1) || /* check addr alignment */
927 len == 0) /* check len */
928 return -EINVAL;
929
930 if (allocator->constraint.enable &&
931 (*addr + len > allocator->constraint.limit ||
932 *addr > allocator->constraint.base))
933 return -EINVAL;
934
935 len = ALIGN(len, allocator->align);
936 if (!len)
937 return -ENOMEM;
938
939 down_write(&allocator->rw_sema);
940
941#if defined(ALLOCATOR_DEBUG)
942 if (*addr) {
943 for (block = allocator->block_first;
944 block; block = block->next) {
945 if (block->end > *addr && block->start < *addr + len) {
946 should_fail = true;
947 break;
948 }
949 }
950 }
951#endif
952
953 ret = block_alloc_single_locked(allocator, addr, len);
954
955#if defined(ALLOCATOR_DEBUG)
956 if (!ret) {
957 bool allocated = false;
958 BUG_ON(should_fail);
959 BUG_ON(*addr < allocator->base);
960 BUG_ON(*addr + len > allocator->limit);
961 for (block = allocator->block_first;
962 block; block = block->next) {
963 if (!block->nc_block &&
964 block->start <= *addr &&
965 block->end >= *addr + len) {
966 allocated = true;
967 break;
968 }
969 }
970 BUG_ON(!allocated);
971 }
972#endif
973
974 up_write(&allocator->rw_sema);
975
976 allocator_dbg(allocator, "[out] addr %d, len %d", *addr, len);
977
978 return ret;
979}
980
981/*
982 * Pass a non-zero *addr for a fixed-address allocation. If *addr == 0,
983 * the allocator chooses the ranges and returns the first block's base in *addr.
984 *
985 * non-contiguous allocation, which returns a list of blocks with aggregated
986 * size == len. Individual block size must meet alignment requirement.
987 */
988int gk20a_allocator_block_alloc_nc(struct gk20a_allocator *allocator,
989 u32 *addr, u32 len, struct gk20a_alloc_block **pblock)
990{
991 int ret;
992
993 allocator_dbg(allocator, "[in] addr %d, len %d", *addr, len);
994
995 BUG_ON(pblock == NULL);
996 *pblock = NULL;
997
998 if (*addr + len > allocator->limit || /* check addr range */
999 *addr & (allocator->align - 1) || /* check addr alignment */
1000 len == 0) /* check len */
1001 return -EINVAL;
1002
1003 len = ALIGN(len, allocator->align);
1004 if (!len)
1005 return -ENOMEM;
1006
1007 down_write(&allocator->rw_sema);
1008
1009 ret = block_alloc_list_locked(allocator, addr, len, pblock);
1010
1011#if defined(ALLOCATOR_DEBUG)
1012 if (!ret) {
1013 struct gk20a_alloc_block *block = *pblock;
1014 BUG_ON(!block);
1015 BUG_ON(block->start < allocator->base);
1016 while (block->nc_next) {
1017 BUG_ON(block->end > block->nc_next->start);
1018 block = block->nc_next;
1019 }
1020 BUG_ON(block->end > allocator->limit);
1021 }
1022#endif
1023
1024 up_write(&allocator->rw_sema);
1025
1026 allocator_dbg(allocator, "[out] addr %d, len %d", *addr, len);
1027
1028 return ret;
1029}
1030
1031/* free all blocks between start and end */
1032int gk20a_allocator_block_free(struct gk20a_allocator *allocator,
1033 u32 addr, u32 len)
1034{
1035 int ret;
1036
1037 allocator_dbg(allocator, "[in] addr %d, len %d", addr, len);
1038
1039 if (addr + len > allocator->limit || /* check addr range */
1040 addr < allocator->base ||
1041 addr & (allocator->align - 1)) /* check addr alignment */
1042 return -EINVAL;
1043
1044 len = ALIGN(len, allocator->align);
1045 if (!len)
1046 return -EINVAL;
1047
1048 down_write(&allocator->rw_sema);
1049
1050 ret = block_free_locked(allocator, addr, len);
1051
1052#if defined(ALLOCATOR_DEBUG)
1053 if (!ret) {
1054 struct gk20a_alloc_block *block;
1055 for (block = allocator->block_first;
1056 block; block = block->next) {
1057 if (!block->nc_block)
1058 BUG_ON(block->start >= addr &&
1059 block->end <= addr + len);
1060 }
1061 }
1062#endif
1063 up_write(&allocator->rw_sema);
1064
1065 allocator_dbg(allocator, "[out] addr %d, len %d", addr, len);
1066
1067 return ret;
1068}
1069
1070/* free non-contiguous allocation block list */
1071void gk20a_allocator_block_free_nc(struct gk20a_allocator *allocator,
1072 struct gk20a_alloc_block *block)
1073{
1074 /* nothing to free */
1075 if (!block)
1076 return;
1077
1078 down_write(&allocator->rw_sema);
1079 block_free_list_locked(allocator, block);
1080 up_write(&allocator->rw_sema);
1081}
1082
1083#if defined(ALLOCATOR_DEBUG)
1084
1085#include <linux/random.h>
1086
1087/* test suite */
1088void gk20a_allocator_test(void)
1089{
1090 struct gk20a_allocator allocator;
1091 struct gk20a_alloc_block *list[5];
1092 u32 addr, len;
1093 u32 count;
1094 int n;
1095
1096 gk20a_allocator_init(&allocator, "test", 0, 10, 1);
1097
1098 /* alloc/free a single block in the beginning */
1099 addr = 0;
1100 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1101 gk20a_allocator_dump(&allocator);
1102 gk20a_allocator_block_free(&allocator, addr, 2);
1103 gk20a_allocator_dump(&allocator);
1104 /* alloc/free a single block in the middle */
1105 addr = 4;
1106 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1107 gk20a_allocator_dump(&allocator);
1108 gk20a_allocator_block_free(&allocator, addr, 2);
1109 gk20a_allocator_dump(&allocator);
1110 /* alloc/free a single block in the end */
1111 addr = 8;
1112 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1113 gk20a_allocator_dump(&allocator);
1114 gk20a_allocator_block_free(&allocator, addr, 2);
1115 gk20a_allocator_dump(&allocator);
1116
1117 /* allocate contiguous blocks */
1118 addr = 0;
1119 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1120 gk20a_allocator_dump(&allocator);
1121 addr = 0;
1122 gk20a_allocator_block_alloc(&allocator, &addr, 4);
1123 gk20a_allocator_dump(&allocator);
1124 addr = 0;
1125 gk20a_allocator_block_alloc(&allocator, &addr, 4);
1126 gk20a_allocator_dump(&allocator);
1127
1128 /* no free space */
1129 addr = 0;
1130 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1131 gk20a_allocator_dump(&allocator);
1132
1133 /* free in the end */
1134 gk20a_allocator_block_free(&allocator, 8, 2);
1135 gk20a_allocator_dump(&allocator);
1136 /* free in the beginning */
1137 gk20a_allocator_block_free(&allocator, 0, 2);
1138 gk20a_allocator_dump(&allocator);
1139 /* free in the middle */
1140 gk20a_allocator_block_free(&allocator, 4, 2);
1141 gk20a_allocator_dump(&allocator);
1142
1143 /* merge case PPPPAAAANNNN */
1144 addr = 4;
1145 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1146 gk20a_allocator_dump(&allocator);
1147 /* merge case ....AAAANNNN */
1148 addr = 0;
1149 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1150 gk20a_allocator_dump(&allocator);
1151 /* merge case PPPPAAAA.... */
1152 addr = 8;
1153 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1154 gk20a_allocator_dump(&allocator);
1155
1156 /* test free across multiple blocks and split */
1157 gk20a_allocator_block_free(&allocator, 2, 2);
1158 gk20a_allocator_dump(&allocator);
1159 gk20a_allocator_block_free(&allocator, 6, 2);
1160 gk20a_allocator_dump(&allocator);
1161 gk20a_allocator_block_free(&allocator, 1, 8);
1162 gk20a_allocator_dump(&allocator);
1163
1164 /* test non-contiguous allocation */
1165 addr = 4;
1166 gk20a_allocator_block_alloc(&allocator, &addr, 2);
1167 gk20a_allocator_dump(&allocator);
1168 addr = 0;
1169 gk20a_allocator_block_alloc_nc(&allocator, &addr, 5, &list[0]);
1170 gk20a_allocator_dump(&allocator);
1171 gk20a_allocator_dump_nc_list(&allocator, list[0]);
1172
1173 /* test free a range overlapping non-contiguous blocks */
1174 gk20a_allocator_block_free(&allocator, 2, 6);
1175 gk20a_allocator_dump(&allocator);
1176
1177 /* test non-contiguous free */
1178 gk20a_allocator_block_free_nc(&allocator, list[0]);
1179 gk20a_allocator_dump(&allocator);
1180
1181 gk20a_allocator_destroy(&allocator);
1182
1183 /* random stress test */
1184 gk20a_allocator_init(&allocator, "test", 4096, 4096 * 1024, 4096);
1185 for (;;) {
1186 pr_debug("alloc tests...\n");
1187 for (count = 0; count < 50; count++) {
1188 addr = 0;
1189 len = random32() % (4096 * 1024 / 16);
1190 gk20a_allocator_block_alloc(&allocator, &addr, len);
1191 gk20a_allocator_dump(&allocator);
1192 }
1193
1194 pr_debug("free tests...\n");
1195 for (count = 0; count < 30; count++) {
1196 addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
1197 len = random32() % (4096 * 1024 / 16);
1198 gk20a_allocator_block_free(&allocator, addr, len);
1199 gk20a_allocator_dump(&allocator);
1200 }
1201
1202 pr_debug("non-contiguous alloc tests...\n");
1203 for (n = 0; n < 5; n++) {
1204 addr = 0;
1205 len = random32() % (4096 * 1024 / 8);
1206 gk20a_allocator_block_alloc_nc(&allocator, &addr,
1207 len, &list[n]);
1208 gk20a_allocator_dump(&allocator);
1209 gk20a_allocator_dump_nc_list(&allocator, list[n]);
1210 }
1211
1212 pr_debug("free tests...\n");
1213 for (count = 0; count < 10; count++) {
1214 addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
1215 len = random32() % (4096 * 1024 / 16);
1216 gk20a_allocator_block_free(&allocator, addr, len);
1217 gk20a_allocator_dump(&allocator);
1218 }
1219
1220 pr_debug("non-contiguous free tests...\n");
1221 for (n = 4; n >= 0; n--) {
1222 gk20a_allocator_dump_nc_list(&allocator, list[n]);
1223 gk20a_allocator_block_free_nc(&allocator, list[n]);
1224 gk20a_allocator_dump(&allocator);
1225 }
1226
1227 pr_debug("fixed addr alloc tests...\n");
1228 for (count = 0; count < 10; count++) {
1229 addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
1230 len = random32() % (4096 * 1024 / 32);
1231 gk20a_allocator_block_alloc(&allocator, &addr, len);
1232 gk20a_allocator_dump(&allocator);
1233 }
1234
1235 pr_debug("free tests...\n");
1236 for (count = 0; count < 10; count++) {
1237 addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
1238 len = random32() % (4096 * 1024 / 16);
1239 gk20a_allocator_block_free(&allocator, addr, len);
1240 gk20a_allocator_dump(&allocator);
1241 }
1242 }
1243 gk20a_allocator_destroy(&allocator);
1244}
1245
1246#endif /* ALLOCATOR_DEBUG */
1247
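For reference, a hedged usage sketch of the contiguous allocation API exercised by the test suite above; the function name and the sizes are illustrative, while the calls and signatures are the ones defined in this file.

static int example_allocator_usage(void)
{
	struct gk20a_allocator allocator;
	u32 addr = 0;	/* 0 lets the allocator pick the address */
	int err;

	/* 4 KB-aligned space of 64 pages starting at 4096 */
	err = gk20a_allocator_init(&allocator, "example", 4096, 64 * 4096, 4096);
	if (err)
		return err;

	/* one contiguous 8-page block; addr receives its base on success */
	err = gk20a_allocator_block_alloc(&allocator, &addr, 8 * 4096);
	if (!err)
		err = gk20a_allocator_block_free(&allocator, addr, 8 * 4096);

	gk20a_allocator_destroy(&allocator);
	return err;
}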
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
new file mode 100644
index 00000000..dba397e2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
@@ -0,0 +1,177 @@
1/*
2 * gk20a allocator
3 *
4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#ifndef __NVHOST_ALLOCATOR_H__
20#define __NVHOST_ALLOCATOR_H__
21
22#include <linux/rbtree.h>
23#include <linux/rwsem.h>
24#include <linux/slab.h>
25
26/* #define ALLOCATOR_DEBUG */
27
28struct allocator_block;
29
30/* main struct */
31struct gk20a_allocator {
32
33 char name[32]; /* name for allocator */
34 struct rb_root rb_root; /* rb tree root for blocks */
35
36 u32 base; /* min value of this linear space */
37 u32 limit; /* max value = limit - 1 */
38 u32 align; /* alignment size, power of 2 */
39
40 struct gk20a_alloc_block *block_first; /* first block in list */
41 struct gk20a_alloc_block *block_recent; /* last visited block */
42
43 u32 first_free_addr; /* first free addr, non-contiguous
44 allocation preferred start,
45 in order to pick up small holes */
46 u32 last_free_addr; /* last free addr, contiguous
47 allocation preferred start */
48 u32 cached_hole_size; /* max free hole size up to
49 last_free_addr */
50 u32 block_count; /* number of blocks */
51
52 struct rw_semaphore rw_sema; /* lock */
53 struct kmem_cache *block_cache; /* slab cache */
54
55 /* if enabled, constrain to [base, limit) */
56 struct {
57 bool enable;
58 u32 base;
59 u32 limit;
60 } constraint;
61
62 int (*alloc)(struct gk20a_allocator *allocator,
63 u32 *addr, u32 len);
64 int (*alloc_nc)(struct gk20a_allocator *allocator,
65 u32 *addr, u32 len,
66 struct gk20a_alloc_block **pblock);
67 int (*free)(struct gk20a_allocator *allocator,
68 u32 addr, u32 len);
69 void (*free_nc)(struct gk20a_allocator *allocator,
70 struct gk20a_alloc_block *block);
71
72 int (*constrain)(struct gk20a_allocator *a,
73 bool enable,
74 u32 base, u32 limit);
75};
76
77/* a block of linear space range [start, end) */
78struct gk20a_alloc_block {
79 struct gk20a_allocator *allocator; /* parent allocator */
80 struct rb_node rb; /* rb tree node */
81
82 u32 start; /* linear space range
83 [start, end) */
84 u32 end;
85
86 void *priv; /* backing structure for this
87 linear space block
88 page table, comp tag, etc */
89
90 struct gk20a_alloc_block *prev; /* prev block with lower address */
91 struct gk20a_alloc_block *next; /* next block with higher address */
92
93 bool nc_block;
94 struct gk20a_alloc_block *nc_prev; /* prev block for
95 non-contiguous allocation */
96 struct gk20a_alloc_block *nc_next; /* next block for
97 non-contiguous allocation */
98};
99
100int gk20a_allocator_init(struct gk20a_allocator *allocator,
101 const char *name, u32 base, u32 size, u32 align);
102void gk20a_allocator_destroy(struct gk20a_allocator *allocator);
103
104int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
105 u32 *addr, u32 len);
106int gk20a_allocator_block_alloc_nc(struct gk20a_allocator *allocator,
107 u32 *addr, u32 len,
108 struct gk20a_alloc_block **pblock);
109
110int gk20a_allocator_block_free(struct gk20a_allocator *allocator,
111 u32 addr, u32 len);
112void gk20a_allocator_block_free_nc(struct gk20a_allocator *allocator,
113 struct gk20a_alloc_block *block);
114
115#if defined(ALLOCATOR_DEBUG)
116
117#define allocator_dbg(allocator, format, arg...) \
118do { \
119 if (1) \
120 pr_debug("gk20a_allocator (%s) %s: " format "\n",\
121 allocator->name, __func__, ##arg);\
122} while (0)
123
124static inline void
125gk20a_allocator_dump(struct gk20a_allocator *allocator) {
126 struct gk20a_alloc_block *block;
127 u32 count = 0;
128
129 down_read(&allocator->rw_sema);
130 for (block = allocator->block_first; block; block = block->next) {
131 allocator_dbg(allocator, "block %d - %d:%d, nc %d",
132 count++, block->start, block->end, block->nc_block);
133
134 if (block->prev)
135 BUG_ON(block->prev->end > block->start);
136 if (block->next)
137 BUG_ON(block->next->start < block->end);
138 }
139 allocator_dbg(allocator, "tracked count %d, actual count %d",
140 allocator->block_count, count);
141 allocator_dbg(allocator, "first block %d:%d",
142 allocator->block_first ? allocator->block_first->start : -1,
143 allocator->block_first ? allocator->block_first->end : -1);
144 allocator_dbg(allocator, "first free addr %d",
145 allocator->first_free_addr);
146 allocator_dbg(allocator, "last free addr %d",
147 allocator->last_free_addr);
148 allocator_dbg(allocator, "cached hole size %d",
149 allocator->cached_hole_size);
150 up_read(&allocator->rw_sema);
151
152 BUG_ON(count != allocator->block_count);
153}
154
155static inline void
156gk20a_allocator_dump_nc_list(
157 struct gk20a_allocator *allocator,
158 struct gk20a_alloc_block *block)
159{
160 down_read(&allocator->rw_sema);
161 while (block) {
162 pr_debug("non-contiguous block %d:%d\n",
163 block->start, block->end);
164 block = block->nc_next;
165 }
166 up_read(&allocator->rw_sema);
167}
168
169void gk20a_allocator_test(void);
170
171#else /* ALLOCATOR_DEBUG */
172
173#define allocator_dbg(format, arg...)
174
175#endif /* ALLOCATOR_DEBUG */
176
177#endif /*__NVHOST_ALLOCATOR_H__ */
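A non-contiguous request can be satisfied from several holes. A hedged sketch of that path follows, using only the declarations above (the helper name is illustrative):

static int example_nc_alloc(struct gk20a_allocator *allocator, u32 len)
{
	struct gk20a_alloc_block *list = NULL;
	struct gk20a_alloc_block *block;
	u32 addr = 0;	/* 0 lets the allocator pick the ranges */
	int err;

	/* len is rounded up to allocator->align and may be split across
	   several blocks chained through nc_next */
	err = gk20a_allocator_block_alloc_nc(allocator, &addr, len, &list);
	if (err)
		return err;

	for (block = list; block; block = block->nc_next) {
		/* program each [block->start, block->end) range here */
	}

	/* freeing takes the list head rather than an address/length pair */
	gk20a_allocator_block_free_nc(allocator, list);
	return 0;
}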
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c
new file mode 100644
index 00000000..c6478a5e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c
@@ -0,0 +1,374 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 *
17 * This file is autogenerated. Do not edit.
18 */
19
20#ifndef __gk20a_gating_reglist_h__
21#define __gk20a_gating_reglist_h__
22
23#include <linux/types.h>
24#include "gk20a_gating_reglist.h"
25
26struct gating_desc {
27 u32 addr;
28 u32 prod;
29 u32 disable;
30};
31/* slcg gr */
32const struct gating_desc gk20a_slcg_gr[] = {
33 {.addr = 0x004041f4, .prod = 0x00000000, .disable = 0x03fffffe},
34 {.addr = 0x00409894, .prod = 0x00000040, .disable = 0x0003fffe},
35 {.addr = 0x004078c4, .prod = 0x00000000, .disable = 0x000001fe},
36 {.addr = 0x00406004, .prod = 0x00000000, .disable = 0x0001fffe},
37 {.addr = 0x00405864, .prod = 0x00000000, .disable = 0x000001fe},
38 {.addr = 0x00405910, .prod = 0x00000000, .disable = 0xfffffffe},
39 {.addr = 0x00408044, .prod = 0x00000000, .disable = 0x000007fe},
40 {.addr = 0x00407004, .prod = 0x00000000, .disable = 0x0000001e},
41 {.addr = 0x0041a894, .prod = 0x00000000, .disable = 0x0003fffe},
42 {.addr = 0x00418504, .prod = 0x00000000, .disable = 0x0001fffe},
43 {.addr = 0x0041860c, .prod = 0x00000000, .disable = 0x000001fe},
44 {.addr = 0x0041868c, .prod = 0x00000000, .disable = 0x0000001e},
45 {.addr = 0x0041871c, .prod = 0x00000000, .disable = 0x0000003e},
46 {.addr = 0x00418388, .prod = 0x00000000, .disable = 0x00000001},
47 {.addr = 0x0041882c, .prod = 0x00000000, .disable = 0x0001fffe},
48 {.addr = 0x00418bc0, .prod = 0x00000000, .disable = 0x000001fe},
49 {.addr = 0x00418974, .prod = 0x00000000, .disable = 0x0001fffe},
50 {.addr = 0x00418c74, .prod = 0x00000000, .disable = 0xfffffffe},
51 {.addr = 0x00418cf4, .prod = 0x00000000, .disable = 0xfffffffe},
52 {.addr = 0x00418d74, .prod = 0x00000000, .disable = 0xfffffffe},
53 {.addr = 0x00418f10, .prod = 0x00000000, .disable = 0xfffffffe},
54 {.addr = 0x00418e10, .prod = 0x00000000, .disable = 0xfffffffe},
55 {.addr = 0x00419024, .prod = 0x00000000, .disable = 0x000001fe},
56 {.addr = 0x00419a44, .prod = 0x00000000, .disable = 0x0000000e},
57 {.addr = 0x00419a4c, .prod = 0x00000000, .disable = 0x000001fe},
58 {.addr = 0x00419a54, .prod = 0x00000000, .disable = 0x0000003e},
59 {.addr = 0x00419a5c, .prod = 0x00000000, .disable = 0x0000000e},
60 {.addr = 0x00419a64, .prod = 0x00000000, .disable = 0x000001fe},
61 {.addr = 0x00419a6c, .prod = 0x00000000, .disable = 0x0000000e},
62 {.addr = 0x00419a74, .prod = 0x00000000, .disable = 0x0000000e},
63 {.addr = 0x00419a7c, .prod = 0x00000000, .disable = 0x0000003e},
64 {.addr = 0x00419a84, .prod = 0x00000000, .disable = 0x0000000e},
65 {.addr = 0x00419ad0, .prod = 0x00000000, .disable = 0x0000000e},
66 {.addr = 0x0041986c, .prod = 0x0000dfc0, .disable = 0x00fffffe},
67 {.addr = 0x00419cd8, .prod = 0x00000000, .disable = 0x001ffffe},
68 {.addr = 0x00419ce0, .prod = 0x00000000, .disable = 0x001ffffe},
69 {.addr = 0x00419c74, .prod = 0x00000000, .disable = 0x0000001e},
70 {.addr = 0x00419fd4, .prod = 0x00000000, .disable = 0x0003fffe},
71 {.addr = 0x00419fdc, .prod = 0x00000000, .disable = 0xfffffffe},
72 {.addr = 0x00419fe4, .prod = 0x00000000, .disable = 0x0000000e},
73 {.addr = 0x00419ff4, .prod = 0x00000000, .disable = 0x00003ffe},
74 {.addr = 0x00419ffc, .prod = 0x00000000, .disable = 0x0001fffe},
75 {.addr = 0x0041be2c, .prod = 0x020bbfc0, .disable = 0xfffffffe},
76 {.addr = 0x0041bfec, .prod = 0x00000000, .disable = 0xfffffffe},
77 {.addr = 0x0041bed4, .prod = 0x00000000, .disable = 0xfffffffe},
78 {.addr = 0x00408814, .prod = 0x00000000, .disable = 0x0001fffe},
79 {.addr = 0x0040881c, .prod = 0x00000000, .disable = 0x0001fffe},
80 {.addr = 0x00408a84, .prod = 0x00000000, .disable = 0x0001fffe},
81 {.addr = 0x00408a8c, .prod = 0x00000000, .disable = 0x0001fffe},
82 {.addr = 0x00408a94, .prod = 0x00000000, .disable = 0x0001fffe},
83 {.addr = 0x00408a9c, .prod = 0x00000000, .disable = 0x0001fffe},
84 {.addr = 0x00408aa4, .prod = 0x00000000, .disable = 0x0001fffe},
85 {.addr = 0x00408aac, .prod = 0x00000000, .disable = 0x0001fffe},
86 {.addr = 0x004089ac, .prod = 0x00000000, .disable = 0x0001fffe},
87 {.addr = 0x00408a24, .prod = 0x00000000, .disable = 0x000001ff},
88 {.addr = 0x0017e050, .prod = 0x00000000, .disable = 0x00fffffe},
89 {.addr = 0x001200a8, .prod = 0x00000000, .disable = 0x00000001},
90 {.addr = 0x0010e48c, .prod = 0x00000000, .disable = 0x0000003e},
91 {.addr = 0x00001c04, .prod = 0x00000000, .disable = 0x000000fe},
92 {.addr = 0x00106f28, .prod = 0x00000040, .disable = 0x000007fe},
93 {.addr = 0x000206b8, .prod = 0x00000000, .disable = 0x0000000f},
94 {.addr = 0x0017ea98, .prod = 0x00000000, .disable = 0xfffffffe},
95 {.addr = 0x00106f28, .prod = 0x00000040, .disable = 0x000007fe},
96 {.addr = 0x00120048, .prod = 0x00000000, .disable = 0x00000049},
97};
98
99/* slcg perf */
100const struct gating_desc gk20a_slcg_perf[] = {
101 {.addr = 0x001be018, .prod = 0x000001ff, .disable = 0x00000000},
102 {.addr = 0x001bc018, .prod = 0x000001ff, .disable = 0x00000000},
103 {.addr = 0x001b8018, .prod = 0x000001ff, .disable = 0x00000000},
104 {.addr = 0x001b4124, .prod = 0x00000001, .disable = 0x00000000},
105};
106
107/* blcg gr */
108const struct gating_desc gk20a_blcg_gr[] = {
109 {.addr = 0x004041f0, .prod = 0x00004046, .disable = 0x00000000},
110 {.addr = 0x00409890, .prod = 0x0000007f, .disable = 0x00000000},
111 {.addr = 0x004098b0, .prod = 0x0000007f, .disable = 0x00000000},
112 {.addr = 0x004078c0, .prod = 0x00000042, .disable = 0x00000000},
113 {.addr = 0x00406000, .prod = 0x00004044, .disable = 0x00000000},
114 {.addr = 0x00405860, .prod = 0x00004042, .disable = 0x00000000},
115 {.addr = 0x0040590c, .prod = 0x00004044, .disable = 0x00000000},
116 {.addr = 0x00408040, .prod = 0x00004044, .disable = 0x00000000},
117 {.addr = 0x00407000, .prod = 0x00004041, .disable = 0x00000000},
118 {.addr = 0x00405bf0, .prod = 0x00004044, .disable = 0x00000000},
119 {.addr = 0x0041a890, .prod = 0x0000007f, .disable = 0x00000000},
120 {.addr = 0x0041a8b0, .prod = 0x0000007f, .disable = 0x00000000},
121 {.addr = 0x00418500, .prod = 0x00004044, .disable = 0x00000000},
122 {.addr = 0x00418608, .prod = 0x00004042, .disable = 0x00000000},
123 {.addr = 0x00418688, .prod = 0x00004042, .disable = 0x00000000},
124 {.addr = 0x00418718, .prod = 0x00000042, .disable = 0x00000000},
125 {.addr = 0x00418828, .prod = 0x00000044, .disable = 0x00000000},
126 {.addr = 0x00418bbc, .prod = 0x00004042, .disable = 0x00000000},
127 {.addr = 0x00418970, .prod = 0x00004042, .disable = 0x00000000},
128 {.addr = 0x00418c70, .prod = 0x00004044, .disable = 0x00000000},
129 {.addr = 0x00418cf0, .prod = 0x00004044, .disable = 0x00000000},
130 {.addr = 0x00418d70, .prod = 0x00004044, .disable = 0x00000000},
131 {.addr = 0x00418f0c, .prod = 0x00004044, .disable = 0x00000000},
132 {.addr = 0x00418e0c, .prod = 0x00004044, .disable = 0x00000000},
133 {.addr = 0x00419020, .prod = 0x00004042, .disable = 0x00000000},
134 {.addr = 0x00419038, .prod = 0x00000042, .disable = 0x00000000},
135 {.addr = 0x00419a40, .prod = 0x00004042, .disable = 0x00000000},
136 {.addr = 0x00419a48, .prod = 0x00004042, .disable = 0x00000000},
137 {.addr = 0x00419a50, .prod = 0x00004042, .disable = 0x00000000},
138 {.addr = 0x00419a58, .prod = 0x00004042, .disable = 0x00000000},
139 {.addr = 0x00419a60, .prod = 0x00004042, .disable = 0x00000000},
140 {.addr = 0x00419a68, .prod = 0x00004042, .disable = 0x00000000},
141 {.addr = 0x00419a70, .prod = 0x00004042, .disable = 0x00000000},
142 {.addr = 0x00419a78, .prod = 0x00004042, .disable = 0x00000000},
143 {.addr = 0x00419a80, .prod = 0x00004042, .disable = 0x00000000},
144 {.addr = 0x00419acc, .prod = 0x00004047, .disable = 0x00000000},
145 {.addr = 0x00419868, .prod = 0x00000043, .disable = 0x00000000},
146 {.addr = 0x00419cd4, .prod = 0x00004042, .disable = 0x00000000},
147 {.addr = 0x00419cdc, .prod = 0x00004042, .disable = 0x00000000},
148 {.addr = 0x00419c70, .prod = 0x00004045, .disable = 0x00000000},
149 {.addr = 0x00419fd0, .prod = 0x00004043, .disable = 0x00000000},
150 {.addr = 0x00419fd8, .prod = 0x00004045, .disable = 0x00000000},
151 {.addr = 0x00419fe0, .prod = 0x00004042, .disable = 0x00000000},
152 {.addr = 0x00419fe8, .prod = 0x00004042, .disable = 0x00000000},
153 {.addr = 0x00419ff0, .prod = 0x00004044, .disable = 0x00000000},
154 {.addr = 0x00419ff8, .prod = 0x00004042, .disable = 0x00000000},
155 {.addr = 0x00419f90, .prod = 0x00004042, .disable = 0x00000000},
156 {.addr = 0x0041be28, .prod = 0x00000042, .disable = 0x00000000},
157 {.addr = 0x0041bfe8, .prod = 0x00004044, .disable = 0x00000000},
158 {.addr = 0x0041bed0, .prod = 0x00004044, .disable = 0x00000000},
159 {.addr = 0x00408810, .prod = 0x00004042, .disable = 0x00000000},
160 {.addr = 0x00408818, .prod = 0x00004042, .disable = 0x00000000},
161 {.addr = 0x00408a80, .prod = 0x00004042, .disable = 0x00000000},
162 {.addr = 0x00408a88, .prod = 0x00004042, .disable = 0x00000000},
163 {.addr = 0x00408a90, .prod = 0x00004042, .disable = 0x00000000},
164 {.addr = 0x00408a98, .prod = 0x00004042, .disable = 0x00000000},
165 {.addr = 0x00408aa0, .prod = 0x00004042, .disable = 0x00000000},
166 {.addr = 0x00408aa8, .prod = 0x00004042, .disable = 0x00000000},
167 {.addr = 0x004089a8, .prod = 0x00004042, .disable = 0x00000000},
168 {.addr = 0x004089b0, .prod = 0x00000042, .disable = 0x00000000},
169 {.addr = 0x004089b8, .prod = 0x00004042, .disable = 0x00000000},
170 {.addr = 0x0017ea60, .prod = 0x00000044, .disable = 0x00000000},
171 {.addr = 0x0017ea68, .prod = 0x00000044, .disable = 0x00000000},
172 {.addr = 0x00100d30, .prod = 0x0000c242, .disable = 0x00000000},
173 {.addr = 0x00100d48, .prod = 0x0000c242, .disable = 0x00000000},
174 {.addr = 0x00100d3c, .prod = 0x00000242, .disable = 0x00000000},
175 {.addr = 0x0017ea78, .prod = 0x00000044, .disable = 0x00000000},
176 {.addr = 0x0017e040, .prod = 0x00000044, .disable = 0x00000000},
177 {.addr = 0x00100d1c, .prod = 0x00000042, .disable = 0x00000000},
178 {.addr = 0x00106f24, .prod = 0x0000c242, .disable = 0x00000000},
179 {.addr = 0x0041be00, .prod = 0x00000004, .disable = 0x00000007},
180 {.addr = 0x00100d10, .prod = 0x0000c242, .disable = 0x00000000},
181 {.addr = 0x0017ea70, .prod = 0x00000044, .disable = 0x00000000},
182 {.addr = 0x00001c00, .prod = 0x00000042, .disable = 0x00000000},
183 {.addr = 0x00100c98, .prod = 0x00000242, .disable = 0x00000000},
184 {.addr = 0x0017e030, .prod = 0x00000044, .disable = 0x00000000},
185};
186
187/* pg gr */
188const struct gating_desc gk20a_pg_gr[] = {
189 {.addr = 0x004041f8, .prod = 0x10940000, .disable = 0x00000000},
190 {.addr = 0x004041fc, .prod = 0xff00a725, .disable = 0x00000000},
191 {.addr = 0x00409898, .prod = 0x10140000, .disable = 0x00000000},
192 {.addr = 0x0040989c, .prod = 0xff00000a, .disable = 0x00000000},
193 {.addr = 0x004078c8, .prod = 0x10940000, .disable = 0x00000000},
194 {.addr = 0x004078cc, .prod = 0xff00a725, .disable = 0x00000000},
195 {.addr = 0x00406008, .prod = 0x10940000, .disable = 0x00000000},
196 {.addr = 0x0040600c, .prod = 0xff00a725, .disable = 0x00000000},
197 {.addr = 0x00405868, .prod = 0x10940000, .disable = 0x00000000},
198 {.addr = 0x0040586c, .prod = 0xff00a725, .disable = 0x00000000},
199 {.addr = 0x00405914, .prod = 0x10940000, .disable = 0x00000000},
200 {.addr = 0x00405924, .prod = 0xff00a725, .disable = 0x00000000},
201 {.addr = 0x00408048, .prod = 0x10940000, .disable = 0x00000000},
202 {.addr = 0x0040804c, .prod = 0xff00a725, .disable = 0x00000000},
203 {.addr = 0x00407008, .prod = 0x10140000, .disable = 0x00000000},
204 {.addr = 0x0040700c, .prod = 0xff00000a, .disable = 0x00000000},
205 {.addr = 0x00405bf8, .prod = 0x10940000, .disable = 0x00000000},
206 {.addr = 0x00405bfc, .prod = 0xff00a725, .disable = 0x00000000},
207 {.addr = 0x0041a898, .prod = 0x10140000, .disable = 0x00000000},
208 {.addr = 0x0041a89c, .prod = 0xff00000a, .disable = 0x00000000},
209 {.addr = 0x00418510, .prod = 0x10940000, .disable = 0x00000000},
210 {.addr = 0x00418514, .prod = 0xff00a725, .disable = 0x00000000},
211 {.addr = 0x00418610, .prod = 0x10940000, .disable = 0x00000000},
212 {.addr = 0x00418614, .prod = 0xff00a725, .disable = 0x00000000},
213 {.addr = 0x00418690, .prod = 0x10940000, .disable = 0x00000000},
214 {.addr = 0x00418694, .prod = 0xff00a725, .disable = 0x00000000},
215 {.addr = 0x00418720, .prod = 0x10940000, .disable = 0x00000000},
216 {.addr = 0x00418724, .prod = 0xff00a725, .disable = 0x00000000},
217 {.addr = 0x00418840, .prod = 0x10940000, .disable = 0x00000000},
218 {.addr = 0x00418844, .prod = 0xff00a725, .disable = 0x00000000},
219 {.addr = 0x00418bc4, .prod = 0x10940000, .disable = 0x00000000},
220 {.addr = 0x00418bc8, .prod = 0xff00a725, .disable = 0x00000000},
221 {.addr = 0x00418978, .prod = 0x10940000, .disable = 0x00000000},
222 {.addr = 0x0041897c, .prod = 0xff00a725, .disable = 0x00000000},
223 {.addr = 0x00418c78, .prod = 0x10940000, .disable = 0x00000000},
224 {.addr = 0x00418c7c, .prod = 0xff00a725, .disable = 0x00000000},
225 {.addr = 0x00418cf8, .prod = 0x10940000, .disable = 0x00000000},
226 {.addr = 0x00418cfc, .prod = 0xff00a725, .disable = 0x00000000},
227 {.addr = 0x00418d78, .prod = 0x10940000, .disable = 0x00000000},
228 {.addr = 0x00418d7c, .prod = 0xff00a725, .disable = 0x00000000},
229 {.addr = 0x00418f14, .prod = 0x10940000, .disable = 0x00000000},
230 {.addr = 0x00418f18, .prod = 0xff00a725, .disable = 0x00000000},
231 {.addr = 0x00418e14, .prod = 0x10940000, .disable = 0x00000000},
232 {.addr = 0x00418e18, .prod = 0xff00a725, .disable = 0x00000000},
233 {.addr = 0x00419030, .prod = 0x10940000, .disable = 0x00000000},
234 {.addr = 0x00419050, .prod = 0xff00a725, .disable = 0x00000000},
235 {.addr = 0x00419a88, .prod = 0x10940000, .disable = 0x00000000},
236 {.addr = 0x00419a8c, .prod = 0xff00a725, .disable = 0x00000000},
237 {.addr = 0x00419a90, .prod = 0x10940000, .disable = 0x00000000},
238 {.addr = 0x00419a94, .prod = 0xff00a725, .disable = 0x00000000},
239 {.addr = 0x00419a98, .prod = 0x10940000, .disable = 0x00000000},
240 {.addr = 0x00419a9c, .prod = 0xff00a725, .disable = 0x00000000},
241 {.addr = 0x00419aa0, .prod = 0x10940000, .disable = 0x00000000},
242 {.addr = 0x00419aa4, .prod = 0xff00a725, .disable = 0x00000000},
243 {.addr = 0x00419ad4, .prod = 0x10940000, .disable = 0x00000000},
244 {.addr = 0x00419ad8, .prod = 0xff00a725, .disable = 0x00000000},
245 {.addr = 0x00419870, .prod = 0x10940000, .disable = 0x00000000},
246 {.addr = 0x00419874, .prod = 0xff00a725, .disable = 0x00000000},
247 {.addr = 0x00419ce4, .prod = 0x10940000, .disable = 0x00000000},
248 {.addr = 0x00419cf0, .prod = 0xff00a725, .disable = 0x00000000},
249 {.addr = 0x00419c78, .prod = 0x10940000, .disable = 0x00000000},
250 {.addr = 0x00419c7c, .prod = 0xff00a725, .disable = 0x00000000},
251 {.addr = 0x00419fa0, .prod = 0x10940000, .disable = 0x00000000},
252 {.addr = 0x00419fa4, .prod = 0xff00a725, .disable = 0x00000000},
253 {.addr = 0x00419fa8, .prod = 0x10940000, .disable = 0x00000000},
254 {.addr = 0x00419fac, .prod = 0xff00a725, .disable = 0x00000000},
255 {.addr = 0x00419fb0, .prod = 0x10940000, .disable = 0x00000000},
256 {.addr = 0x00419fb4, .prod = 0xff00a725, .disable = 0x00000000},
257 {.addr = 0x00419fb8, .prod = 0x10940000, .disable = 0x00000000},
258 {.addr = 0x00419fbc, .prod = 0xff00a725, .disable = 0x00000000},
259 {.addr = 0x00419fc0, .prod = 0x10940000, .disable = 0x00000000},
260 {.addr = 0x00419fc4, .prod = 0xff00a725, .disable = 0x00000000},
261 {.addr = 0x00419fc8, .prod = 0x10940000, .disable = 0x00000000},
262 {.addr = 0x00419fcc, .prod = 0xff00a725, .disable = 0x00000000},
263 {.addr = 0x0041be30, .prod = 0x10940000, .disable = 0x00000000},
264 {.addr = 0x0041be34, .prod = 0xff00a725, .disable = 0x00000000},
265 {.addr = 0x0041bff0, .prod = 0x10747c00, .disable = 0x00000000},
266 {.addr = 0x0041bff4, .prod = 0xff00000a, .disable = 0x00000000},
267 {.addr = 0x0041bed8, .prod = 0x10240a00, .disable = 0x00000000},
268 {.addr = 0x0041bee0, .prod = 0xff00000a, .disable = 0x00000000},
269 {.addr = 0x00408820, .prod = 0x10940000, .disable = 0x00000000},
270 {.addr = 0x00408824, .prod = 0xff00a725, .disable = 0x00000000},
271 {.addr = 0x00408828, .prod = 0x10940000, .disable = 0x00000000},
272 {.addr = 0x0040882c, .prod = 0xff00a725, .disable = 0x00000000},
273 {.addr = 0x00408ac0, .prod = 0x10940000, .disable = 0x00000000},
274 {.addr = 0x00408ac4, .prod = 0xff00a725, .disable = 0x00000000},
275 {.addr = 0x00408ac8, .prod = 0x10940000, .disable = 0x00000000},
276 {.addr = 0x00408acc, .prod = 0xff00a725, .disable = 0x00000000},
277 {.addr = 0x00408ad0, .prod = 0x10940000, .disable = 0x00000000},
278 {.addr = 0x00408ad4, .prod = 0xff00a725, .disable = 0x00000000},
279 {.addr = 0x00408ad8, .prod = 0x10940000, .disable = 0x00000000},
280 {.addr = 0x00408adc, .prod = 0xff00a725, .disable = 0x00000000},
281 {.addr = 0x00408ae0, .prod = 0x10940000, .disable = 0x00000000},
282 {.addr = 0x00408ae4, .prod = 0xff00a725, .disable = 0x00000000},
283 {.addr = 0x00408ae8, .prod = 0x10940000, .disable = 0x00000000},
284 {.addr = 0x00408aec, .prod = 0xff00a725, .disable = 0x00000000},
285 {.addr = 0x004089c0, .prod = 0x10940000, .disable = 0x00000000},
286 {.addr = 0x004089c4, .prod = 0xff00a725, .disable = 0x00000000},
287 {.addr = 0x004089c8, .prod = 0x10940000, .disable = 0x00000000},
288 {.addr = 0x004089cc, .prod = 0xff00a725, .disable = 0x00000000},
289 {.addr = 0x004089d0, .prod = 0x10940000, .disable = 0x00000000},
290 {.addr = 0x004089d4, .prod = 0xff00a725, .disable = 0x00000000},
291};
292
293/* slcg therm */
294const struct gating_desc gk20a_slcg_therm[] = {
295 {.addr = 0x000206b8, .prod = 0x00000000, .disable = 0x0000000f},
296};
297
298/* gating reglist load functions */
299void gr_gk20a_slcg_gr_load_gating_prod(struct gk20a *g,
300 bool prod)
301{
302 u32 i;
303 u32 size = sizeof(gk20a_slcg_gr) / sizeof(struct gating_desc);
304 for (i = 0; i < size; i++) {
305 if (prod)
306 gk20a_writel(g, gk20a_slcg_gr[i].addr,
307 gk20a_slcg_gr[i].prod);
308 else
309 gk20a_writel(g, gk20a_slcg_gr[i].addr,
310 gk20a_slcg_gr[i].disable);
311 }
312}
313
314void gr_gk20a_slcg_perf_load_gating_prod(struct gk20a *g,
315 bool prod)
316{
317 u32 i;
318 u32 size = sizeof(gk20a_slcg_perf) / sizeof(struct gating_desc);
319 for (i = 0; i < size; i++) {
320 if (prod)
321 gk20a_writel(g, gk20a_slcg_perf[i].addr,
322 gk20a_slcg_perf[i].prod);
323 else
324 gk20a_writel(g, gk20a_slcg_perf[i].addr,
325 gk20a_slcg_perf[i].disable);
326 }
327}
328
329void gr_gk20a_blcg_gr_load_gating_prod(struct gk20a *g,
330 bool prod)
331{
332 u32 i;
333 u32 size = sizeof(gk20a_blcg_gr) / sizeof(struct gating_desc);
334 for (i = 0; i < size; i++) {
335 if (prod)
336 gk20a_writel(g, gk20a_blcg_gr[i].addr,
337 gk20a_blcg_gr[i].prod);
338 else
339 gk20a_writel(g, gk20a_blcg_gr[i].addr,
340 gk20a_blcg_gr[i].disable);
341 }
342}
343
344void gr_gk20a_pg_gr_load_gating_prod(struct gk20a *g,
345 bool prod)
346{
347 u32 i;
348 u32 size = sizeof(gk20a_pg_gr) / sizeof(struct gating_desc);
349 for (i = 0; i < size; i++) {
350 if (prod)
351 gk20a_writel(g, gk20a_pg_gr[i].addr,
352 gk20a_pg_gr[i].prod);
353 else
354 gk20a_writel(g, gk20a_pg_gr[i].addr,
355 gk20a_pg_gr[i].disable);
356 }
357}
358
359void gr_gk20a_slcg_therm_load_gating_prod(struct gk20a *g,
360 bool prod)
361{
362 u32 i;
363 u32 size = sizeof(gk20a_slcg_therm) / sizeof(struct gating_desc);
364 for (i = 0; i < size; i++) {
365 if (prod)
366 gk20a_writel(g, gk20a_slcg_therm[i].addr,
367 gk20a_slcg_therm[i].prod);
368 else
369 gk20a_writel(g, gk20a_slcg_therm[i].addr,
370 gk20a_slcg_therm[i].disable);
371 }
372}
373
374#endif /* __gk20a_gating_reglist_h__ */
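Editor's note: each of the load_gating_prod() helpers above walks its descriptor table the same way, writing the .prod value when production gating is requested and the .disable value otherwise. Below is a minimal sketch of a shared walker, assuming the struct gating_desc and gk20a_writel() definitions above are in scope; the helper name itself is hypothetical and not part of the driver.

static void gk20a_load_gating_table(struct gk20a *g,
		const struct gating_desc *descs, u32 count, bool prod)
{
	u32 i;

	/* Write either the production or the disable value of every entry. */
	for (i = 0; i < count; i++)
		gk20a_writel(g, descs[i].addr,
			     prod ? descs[i].prod : descs[i].disable);
}

/* With such a helper, e.g. the SLCG perf routine would reduce to: */
void gr_gk20a_slcg_perf_load_gating_prod(struct gk20a *g, bool prod)
{
	gk20a_load_gating_table(g, gk20a_slcg_perf,
			sizeof(gk20a_slcg_perf) / sizeof(struct gating_desc),
			prod);
}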
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h
new file mode 100644
index 00000000..40a6c545
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h
@@ -0,0 +1,39 @@
1/*
2 * drivers/video/tegra/host/gk20a/gk20a_gating_reglist.h
3 *
4 * Copyright (c) 2012, NVIDIA Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 *
19 * This file is autogenerated. Do not edit.
20 */
21
22#include "gk20a.h"
23
24void gr_gk20a_slcg_gr_load_gating_prod(struct gk20a *g,
25 bool prod);
26
27void gr_gk20a_slcg_perf_load_gating_prod(struct gk20a *g,
28 bool prod);
29
30void gr_gk20a_blcg_gr_load_gating_prod(struct gk20a *g,
31 bool prod);
32
33void gr_gk20a_pg_gr_load_gating_prod(struct gk20a *g,
34 bool prod);
35
36void gr_gk20a_slcg_therm_load_gating_prod(struct gk20a *g,
37 bool prod);
38
39
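Editor's note: these declarations are consumed through the per-GPU HAL; gk20a_sysfs.c further below toggles gating via g->ops.clock_gating.*. A sketch of how the gk20a HAL could wire the functions up follows; the exact layout of the ops structure is an assumption here, inferred only from those call sites.

/* Sketch only: the clock_gating ops field names are assumed from the
 * g->ops.clock_gating.* calls in gk20a_sysfs.c. */
void gk20a_init_clock_gating_ops(struct gpu_ops *gops)
{
	gops->clock_gating.slcg_gr_load_gating_prod =
		gr_gk20a_slcg_gr_load_gating_prod;
	gops->clock_gating.slcg_perf_load_gating_prod =
		gr_gk20a_slcg_perf_load_gating_prod;
	gops->clock_gating.blcg_gr_load_gating_prod =
		gr_gk20a_blcg_gr_load_gating_prod;
}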
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_scale.c b/drivers/gpu/nvgpu/gk20a/gk20a_scale.c
new file mode 100644
index 00000000..d1fd71fe
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_scale.c
@@ -0,0 +1,358 @@
1/*
2 * gk20a clock scaling profile
3 *
4 * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#include <linux/devfreq.h>
20#include <linux/debugfs.h>
21#include <linux/types.h>
22#include <linux/clk.h>
23#include <linux/export.h>
24#include <linux/slab.h>
25#include <linux/clk/tegra.h>
26#include <linux/tegra-soc.h>
27#include <linux/platform_data/tegra_edp.h>
28#include <linux/pm_qos.h>
29
30#include <governor.h>
31
32#include "gk20a.h"
33#include "pmu_gk20a.h"
34#include "clk_gk20a.h"
35#include "gk20a_scale.h"
36
37static ssize_t gk20a_scale_load_show(struct device *dev,
38 struct device_attribute *attr,
39 char *buf)
40{
41 struct platform_device *pdev = to_platform_device(dev);
42 struct gk20a *g = get_gk20a(pdev);
43 u32 busy_time;
44 ssize_t res;
45
46 if (!g->power_on) {
47 busy_time = 0;
48 } else {
49 gk20a_busy(g->dev);
50 gk20a_pmu_load_norm(g, &busy_time);
51 gk20a_idle(g->dev);
52 }
53
54 res = snprintf(buf, PAGE_SIZE, "%u\n", busy_time);
55
56 return res;
57}
58
59static DEVICE_ATTR(load, S_IRUGO, gk20a_scale_load_show, NULL);
60
61/*
62 * gk20a_scale_qos_notify()
63 *
64 * This function is called when the minimum QoS requirement for the device
 65 * has changed. The function calls the postscaling callback if one is defined.
66 */
67
68static int gk20a_scale_qos_notify(struct notifier_block *nb,
69 unsigned long n, void *p)
70{
71 struct gk20a_scale_profile *profile =
72 container_of(nb, struct gk20a_scale_profile,
73 qos_notify_block);
74 struct gk20a_platform *platform = platform_get_drvdata(profile->pdev);
75 struct gk20a *g = get_gk20a(profile->pdev);
76 unsigned long freq;
77
78 if (!platform->postscale)
79 return NOTIFY_OK;
80
 81	/* get the frequency requirement. if devfreq is enabled, check whether
 82	 * it has a higher demand than qos */
83 freq = gk20a_clk_round_rate(g, pm_qos_request(platform->qos_id));
84 if (g->devfreq)
85 freq = max(g->devfreq->previous_freq, freq);
86
87 platform->postscale(profile->pdev, freq);
88
89 return NOTIFY_OK;
90}
91
92/*
93 * gk20a_scale_make_freq_table(profile)
94 *
95 * This function initialises the frequency table for the given device profile
96 */
97
98static int gk20a_scale_make_freq_table(struct gk20a_scale_profile *profile)
99{
100 struct gk20a *g = get_gk20a(profile->pdev);
101 unsigned long *freqs;
102 int num_freqs, err;
103
104 /* make sure the clock is available */
105 if (!gk20a_clk_get(g))
106 return -ENOSYS;
107
108 /* get gpu dvfs table */
109 err = tegra_dvfs_get_freqs(clk_get_parent(g->clk.tegra_clk),
110 &freqs, &num_freqs);
111 if (err)
112 return -ENOSYS;
113
114 profile->devfreq_profile.freq_table = (unsigned long *)freqs;
115 profile->devfreq_profile.max_state = num_freqs;
116
117 return 0;
118}
119
120/*
121 * gk20a_scale_target(dev, *freq, flags)
122 *
123 * This function scales the clock
124 */
125
126static int gk20a_scale_target(struct device *dev, unsigned long *freq,
127 u32 flags)
128{
129 struct gk20a *g = get_gk20a(to_platform_device(dev));
130 struct gk20a_platform *platform = dev_get_drvdata(dev);
131 struct gk20a_scale_profile *profile = g->scale_profile;
132 unsigned long rounded_rate = gk20a_clk_round_rate(g, *freq);
133
134 if (gk20a_clk_get_rate(g) == rounded_rate) {
135 *freq = rounded_rate;
136 return 0;
137 }
138
139 gk20a_clk_set_rate(g, rounded_rate);
140 if (platform->postscale)
141 platform->postscale(profile->pdev, rounded_rate);
142 *freq = gk20a_clk_get_rate(g);
143
144 return 0;
145}
146
147/*
148 * update_load_estimate_gpmu(profile)
149 *
150 * Update load estimate using gpmu. The gpmu value is normalised
151 * based on the time it was asked last time.
152 */
153
154static void update_load_estimate_gpmu(struct platform_device *pdev)
155{
156 struct gk20a *g = get_gk20a(pdev);
157 struct gk20a_scale_profile *profile = g->scale_profile;
158 unsigned long dt;
159 u32 busy_time;
160 ktime_t t;
161
162 t = ktime_get();
163 dt = ktime_us_delta(t, profile->last_event_time);
164
165 profile->dev_stat.total_time = dt;
166 profile->last_event_time = t;
167 gk20a_pmu_load_norm(g, &busy_time);
168 profile->dev_stat.busy_time = (busy_time * dt) / 1000;
169}
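Editor's note: to make the normalisation concrete, gk20a_pmu_load_norm() appears to report load on a per-mille scale (hence the division by 1000), so a reading of 650 taken 20000 us after the previous event yields busy_time = 650 * 20000 / 1000 = 13000 us against total_time = 20000 us, i.e. 65% utilisation for the interval. A tiny sketch of the same arithmetic with made-up numbers:

/* Worked example of the interval load estimate; the per-mille scale of
 * the PMU value is an assumption based on the division by 1000 above. */
static unsigned long example_busy_estimate(void)
{
	unsigned int pmu_load = 650;	/* hypothetical PMU reading */
	unsigned long dt = 20000;	/* us since the last event */

	return (pmu_load * dt) / 1000;	/* 13000 us busy out of 20000 us */
}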
170
171/*
172 * gk20a_scale_suspend(pdev)
173 *
174 * This function informs devfreq of suspend
175 */
176
177void gk20a_scale_suspend(struct platform_device *pdev)
178{
179 struct gk20a *g = get_gk20a(pdev);
180 struct devfreq *devfreq = g->devfreq;
181
182 if (!devfreq)
183 return;
184
185 devfreq_suspend_device(devfreq);
186}
187
188/*
189 * gk20a_scale_resume(pdev)
190 *
191 * This function informs devfreq of resume
192 */
193
194void gk20a_scale_resume(struct platform_device *pdev)
195{
196 struct gk20a *g = get_gk20a(pdev);
197 struct devfreq *devfreq = g->devfreq;
198
199 if (!devfreq)
200 return;
201
202 devfreq_resume_device(devfreq);
203}
204
205/*
206 * gk20a_scale_notify(pdev, busy)
207 *
208 * Calling this function informs the scaling mechanism that the device is
209 * idling (or busy). This data is used to estimate the current load.
210 */
211
212static void gk20a_scale_notify(struct platform_device *pdev, bool busy)
213{
214 struct gk20a_platform *platform = platform_get_drvdata(pdev);
215 struct gk20a *g = get_gk20a(pdev);
216 struct gk20a_scale_profile *profile = g->scale_profile;
217 struct devfreq *devfreq = g->devfreq;
218
219 /* inform edp about new constraint */
220 if (platform->prescale)
221 platform->prescale(pdev);
222
223 /* Is the device profile initialised? */
224 if (!(profile && devfreq))
225 return;
226
227 mutex_lock(&devfreq->lock);
228 profile->dev_stat.busy = busy;
229 update_devfreq(devfreq);
230 mutex_unlock(&devfreq->lock);
231}
232
233void gk20a_scale_notify_idle(struct platform_device *pdev)
234{
235 gk20a_scale_notify(pdev, false);
236
237}
238
239void gk20a_scale_notify_busy(struct platform_device *pdev)
240{
241 gk20a_scale_notify(pdev, true);
242}
243
244/*
245 * gk20a_scale_get_dev_status(dev, *stat)
246 *
247 * This function queries the current device status.
248 */
249
250static int gk20a_scale_get_dev_status(struct device *dev,
251 struct devfreq_dev_status *stat)
252{
253 struct gk20a *g = get_gk20a(to_platform_device(dev));
254 struct gk20a_scale_profile *profile = g->scale_profile;
255
256 /* Make sure there are correct values for the current frequency */
257 profile->dev_stat.current_frequency = gk20a_clk_get_rate(g);
258
259 /* Update load estimate */
260 update_load_estimate_gpmu(to_platform_device(dev));
261
262 /* Copy the contents of the current device status */
263 *stat = profile->dev_stat;
264
265 /* Finally, clear out the local values */
266 profile->dev_stat.total_time = 0;
267 profile->dev_stat.busy_time = 0;
268
269 return 0;
270}
271
272/*
273 * gk20a_scale_init(pdev)
274 */
275
276void gk20a_scale_init(struct platform_device *pdev)
277{
278 struct gk20a_platform *platform = platform_get_drvdata(pdev);
279 struct gk20a *g = platform->g;
280 struct gk20a_scale_profile *profile;
281 int err;
282
283 if (g->scale_profile)
284 return;
285
286 profile = kzalloc(sizeof(*profile), GFP_KERNEL);
287
288 profile->pdev = pdev;
289 profile->dev_stat.busy = false;
290
291 /* Create frequency table */
292 err = gk20a_scale_make_freq_table(profile);
293 if (err || !profile->devfreq_profile.max_state)
294 goto err_get_freqs;
295
296 if (device_create_file(&pdev->dev, &dev_attr_load))
297 goto err_create_sysfs_entry;
298
299	/* Store the device profile so it can be accessed if the devfreq
300	 * governor init needs it */
301 g->scale_profile = profile;
302
303 if (platform->devfreq_governor) {
304 struct devfreq *devfreq;
305
306 profile->devfreq_profile.initial_freq =
307 profile->devfreq_profile.freq_table[0];
308 profile->devfreq_profile.target = gk20a_scale_target;
309 profile->devfreq_profile.get_dev_status =
310 gk20a_scale_get_dev_status;
311
312 devfreq = devfreq_add_device(&pdev->dev,
313 &profile->devfreq_profile,
314 platform->devfreq_governor, NULL);
315
316 if (IS_ERR(devfreq))
317 devfreq = NULL;
318
319 g->devfreq = devfreq;
320 }
321
322 /* Should we register QoS callback for this device? */
323 if (platform->qos_id < PM_QOS_NUM_CLASSES &&
324 platform->qos_id != PM_QOS_RESERVED &&
325 platform->postscale) {
326 profile->qos_notify_block.notifier_call =
327 &gk20a_scale_qos_notify;
328 pm_qos_add_notifier(platform->qos_id,
329 &profile->qos_notify_block);
330 }
331
332 return;
333
334err_get_freqs:
335 device_remove_file(&pdev->dev, &dev_attr_load);
336err_create_sysfs_entry:
337 kfree(g->scale_profile);
338 g->scale_profile = NULL;
339}
340
341/*
342 * gk20a_scale_hw_init(dev)
343 *
344 * Initialize hardware portion of the device
345 */
346
347void gk20a_scale_hw_init(struct platform_device *pdev)
348{
349 struct gk20a_platform *platform = platform_get_drvdata(pdev);
350 struct gk20a_scale_profile *profile = platform->g->scale_profile;
351
352	/* make sure that scaling has been initialised */
353 if (!profile)
354 return;
355
356 profile->dev_stat.total_time = 0;
357 profile->last_event_time = ktime_get();
358}
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_scale.h b/drivers/gpu/nvgpu/gk20a/gk20a_scale.h
new file mode 100644
index 00000000..e76b1662
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_scale.h
@@ -0,0 +1,51 @@
1/*
2 * gk20a clock scaling profile
3 *
4 * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#ifndef GK20A_SCALE_H
20#define GK20A_SCALE_H
21
22#include <linux/nvhost.h>
23#include <linux/devfreq.h>
24
25struct platform_device;
26struct clk;
27
28struct gk20a_scale_profile {
29 struct platform_device *pdev;
30 ktime_t last_event_time;
31 struct devfreq_dev_profile devfreq_profile;
32 struct devfreq_dev_status dev_stat;
33 struct notifier_block qos_notify_block;
34 void *private_data;
35};
36
37/* Initialization and de-initialization for module */
38void gk20a_scale_init(struct platform_device *);
39void gk20a_scale_hw_init(struct platform_device *pdev);
40
41/*
42 * Call when performing a submit to notify the scaling mechanism that the
43 * module is in use.
44 */
45void gk20a_scale_notify_busy(struct platform_device *);
46void gk20a_scale_notify_idle(struct platform_device *);
47
48void gk20a_scale_suspend(struct platform_device *);
49void gk20a_scale_resume(struct platform_device *);
50
51#endif
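Editor's note: the busy/idle notifiers above are meant to bracket periods of GPU activity so devfreq sees an up-to-date load. A hedged sketch of a caller follows; the submit function and its surroundings are made up for illustration only.

/* Hypothetical submit path: mark the device busy before pushing work
 * and idle once the work has been handed off. */
static int example_submit(struct platform_device *pdev)
{
	gk20a_scale_notify_busy(pdev);

	/* ... push the job to the channel here ... */

	gk20a_scale_notify_idle(pdev);
	return 0;
}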
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
new file mode 100644
index 00000000..f6b43f50
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
@@ -0,0 +1,335 @@
1/*
2 * drivers/video/tegra/host/gk20a/gk20a_sysfs.c
3 *
4 * GK20A Graphics
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include <linux/platform_device.h>
22#include <linux/pm_runtime.h>
23#include <linux/kernel.h>
24#include <linux/fb.h>
25
26#include <mach/clk.h>
27
28#include "gk20a.h"
29#include "gr_gk20a.h"
30#include "fifo_gk20a.h"
31
32
33#define PTIMER_FP_FACTOR 1000000
34/* PTIMER_REF_FREQ_HZ corresponds to a period of 32 nanoseconds. 32 ns is
35 the resolution of ptimer. */
36#define PTIMER_REF_FREQ_HZ 31250000
37
38
39static ssize_t elcg_enable_store(struct device *device,
40 struct device_attribute *attr, const char *buf, size_t count)
41{
42 struct platform_device *ndev = to_platform_device(device);
43 struct gk20a *g = get_gk20a(ndev);
44 unsigned long val = 0;
45
46 if (kstrtoul(buf, 10, &val) < 0)
47 return -EINVAL;
48
49 gk20a_busy(g->dev);
50 if (val) {
51 g->elcg_enabled = true;
52 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
53 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
54 } else {
55 g->elcg_enabled = false;
56 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
57 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
58 }
59 gk20a_idle(g->dev);
60
61 dev_info(device, "ELCG is %s.\n", g->elcg_enabled ? "enabled" :
62 "disabled");
63
64 return count;
65}
66
67static ssize_t elcg_enable_read(struct device *device,
68 struct device_attribute *attr, char *buf)
69{
70 struct platform_device *ndev = to_platform_device(device);
71 struct gk20a *g = get_gk20a(ndev);
72
73 return sprintf(buf, "%d\n", g->elcg_enabled ? 1 : 0);
74}
75
76static DEVICE_ATTR(elcg_enable, S_IRWXUGO, elcg_enable_read, elcg_enable_store);
77
78static ssize_t blcg_enable_store(struct device *device,
79 struct device_attribute *attr, const char *buf, size_t count)
80{
81 struct platform_device *ndev = to_platform_device(device);
82 struct gk20a *g = get_gk20a(ndev);
83 unsigned long val = 0;
84
85 if (kstrtoul(buf, 10, &val) < 0)
86 return -EINVAL;
87
88 if (val)
89 g->blcg_enabled = true;
90 else
91 g->blcg_enabled = false;
92
93 gk20a_busy(g->dev);
94 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
95 gk20a_idle(g->dev);
96
97 dev_info(device, "BLCG is %s.\n", g->blcg_enabled ? "enabled" :
98 "disabled");
99
100 return count;
101}
102
103static ssize_t blcg_enable_read(struct device *device,
104 struct device_attribute *attr, char *buf)
105{
106 struct platform_device *ndev = to_platform_device(device);
107 struct gk20a *g = get_gk20a(ndev);
108
109 return sprintf(buf, "%d\n", g->blcg_enabled ? 1 : 0);
110}
111
112static DEVICE_ATTR(blcg_enable, S_IRWXUGO, blcg_enable_read, blcg_enable_store);
113
114static ssize_t slcg_enable_store(struct device *device,
115 struct device_attribute *attr, const char *buf, size_t count)
116{
117 struct platform_device *ndev = to_platform_device(device);
118 struct gk20a *g = get_gk20a(ndev);
119 unsigned long val = 0;
120
121 if (kstrtoul(buf, 10, &val) < 0)
122 return -EINVAL;
123
124 if (val)
125 g->slcg_enabled = true;
126 else
127 g->slcg_enabled = false;
128
129 /*
130 * TODO: slcg_therm_load_gating is not enabled anywhere during
131 * init. Therefore, it would be incongruous to add it here. Once
132 * it is added to init, we should add it here too.
133 */
134 gk20a_busy(g->dev);
135 g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
136 g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
137 gk20a_idle(g->dev);
138
139 dev_info(device, "SLCG is %s.\n", g->slcg_enabled ? "enabled" :
140 "disabled");
141
142 return count;
143}
144
145static ssize_t slcg_enable_read(struct device *device,
146 struct device_attribute *attr, char *buf)
147{
148 struct platform_device *ndev = to_platform_device(device);
149 struct gk20a *g = get_gk20a(ndev);
150
151 return sprintf(buf, "%d\n", g->slcg_enabled ? 1 : 0);
152}
153
154static DEVICE_ATTR(slcg_enable, S_IRWXUGO, slcg_enable_read, slcg_enable_store);
155
156static ssize_t ptimer_scale_factor_show(struct device *dev,
157 struct device_attribute *attr,
158 char *buf)
159{
160 u32 tsc_freq_hz = clk_get_rate(clk_get_sys(NULL, "clk_m"));
161 u32 scaling_factor_fp = (u32)(PTIMER_REF_FREQ_HZ) /
162 ((u32)(tsc_freq_hz) /
163 (u32)(PTIMER_FP_FACTOR));
164 ssize_t res = snprintf(buf,
165 PAGE_SIZE,
166 "%u.%u\n",
167 scaling_factor_fp / PTIMER_FP_FACTOR,
168 scaling_factor_fp % PTIMER_FP_FACTOR);
169
170 return res;
171}
172
173static DEVICE_ATTR(ptimer_scale_factor,
174 S_IRUGO,
175 ptimer_scale_factor_show,
176 NULL);
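Editor's note: as a sanity check on the fixed-point arithmetic in ptimer_scale_factor_show(): with clk_m at 12 MHz (an example rate, not a claim about any board), scaling_factor_fp = 31250000 / (12000000 / 1000000) = 2604166, which the attribute prints as "2.604166". A small sketch using the macros defined above:

/* Worked example of the ptimer scale factor; the 12 MHz clk_m rate is
 * assumed purely for illustration. */
static u32 example_ptimer_scale_fp(void)
{
	u32 tsc_freq_hz = 12000000;

	/* 31250000 / (12000000 / 1000000) == 2604166 -> "2.604166" */
	return PTIMER_REF_FREQ_HZ / (tsc_freq_hz / PTIMER_FP_FACTOR);
}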
177
178static ssize_t railgate_delay_store(struct device *dev,
179 struct device_attribute *attr,
180 const char *buf, size_t count)
181{
182 struct gk20a_platform *platform = dev_get_drvdata(dev);
183 int railgate_delay = 0, ret = 0;
184
185 if (!platform->can_railgate) {
186 dev_info(dev, "does not support power-gating\n");
187 return count;
188 }
189
190 ret = sscanf(buf, "%d", &railgate_delay);
191 if (ret == 1 && railgate_delay >= 0) {
192 struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain);
193 platform->railgate_delay = railgate_delay;
194 pm_genpd_set_poweroff_delay(genpd, platform->railgate_delay);
195 } else
196 dev_err(dev, "Invalid powergate delay\n");
197
198 return count;
199}
200static ssize_t railgate_delay_show(struct device *dev,
201 struct device_attribute *attr, char *buf)
202{
203 struct gk20a_platform *platform = dev_get_drvdata(dev);
204 return snprintf(buf, PAGE_SIZE, "%d\n", platform->railgate_delay);
205}
206static DEVICE_ATTR(railgate_delay, S_IRWXUGO, railgate_delay_show,
207 railgate_delay_store);
208
209static ssize_t clockgate_delay_store(struct device *dev,
210 struct device_attribute *attr,
211 const char *buf, size_t count)
212{
213 struct gk20a_platform *platform = dev_get_drvdata(dev);
214 int clockgate_delay = 0, ret = 0;
215
216 ret = sscanf(buf, "%d", &clockgate_delay);
217 if (ret == 1 && clockgate_delay >= 0) {
218 platform->clockgate_delay = clockgate_delay;
219 pm_runtime_set_autosuspend_delay(dev,
220 platform->clockgate_delay);
221 } else
222 dev_err(dev, "Invalid clockgate delay\n");
223
224 return count;
225}
226static ssize_t clockgate_delay_show(struct device *dev,
227 struct device_attribute *attr, char *buf)
228{
229 struct gk20a_platform *platform = dev_get_drvdata(dev);
230 return snprintf(buf, PAGE_SIZE, "%d\n", platform->clockgate_delay);
231}
232static DEVICE_ATTR(clockgate_delay, S_IRWXUGO, clockgate_delay_show,
233 clockgate_delay_store);
234
235static ssize_t counters_show(struct device *dev,
236 struct device_attribute *attr, char *buf)
237{
238 struct platform_device *pdev = to_platform_device(dev);
239 struct gk20a *g = get_gk20a(pdev);
240 u32 busy_cycles, total_cycles;
241 ssize_t res;
242
243 gk20a_pmu_get_load_counters(g, &busy_cycles, &total_cycles);
244
245 res = snprintf(buf, PAGE_SIZE, "%u %u\n", busy_cycles, total_cycles);
246
247 return res;
248}
249
250static DEVICE_ATTR(counters, S_IRUGO, counters_show, NULL);
251static ssize_t counters_show_reset(struct device *dev,
252 struct device_attribute *attr, char *buf)
253{
254 ssize_t res = counters_show(dev, attr, buf);
255 struct platform_device *pdev = to_platform_device(dev);
256 struct gk20a *g = get_gk20a(pdev);
257
258 gk20a_pmu_reset_load_counters(g);
259
260 return res;
261}
262
263static DEVICE_ATTR(counters_reset, S_IRUGO, counters_show_reset, NULL);
264
265static ssize_t elpg_enable_store(struct device *device,
266 struct device_attribute *attr, const char *buf, size_t count)
267{
268 struct platform_device *ndev = to_platform_device(device);
269 struct gk20a *g = get_gk20a(ndev);
270 unsigned long val = 0;
271
272 if (kstrtoul(buf, 10, &val) < 0)
273 return -EINVAL;
274
275 /*
276	 * Since elpg is refcounted, we should not call enable/disable
277	 * again when it is already in the requested state.
278 */
279 gk20a_channel_busy(g->dev);
280 if (val && !g->elpg_enabled) {
281 g->elpg_enabled = true;
282 gk20a_pmu_enable_elpg(g);
283 } else if (!val && g->elpg_enabled) {
284 g->elpg_enabled = false;
285 gk20a_pmu_disable_elpg(g);
286 }
287 gk20a_channel_idle(g->dev);
288
289 dev_info(device, "ELPG is %s.\n", g->elpg_enabled ? "enabled" :
290 "disabled");
291
292 return count;
293}
294
295static ssize_t elpg_enable_read(struct device *device,
296 struct device_attribute *attr, char *buf)
297{
298 struct platform_device *ndev = to_platform_device(device);
299 struct gk20a *g = get_gk20a(ndev);
300
301 return sprintf(buf, "%d\n", g->elpg_enabled ? 1 : 0);
302}
303
304static DEVICE_ATTR(elpg_enable, S_IRWXUGO, elpg_enable_read, elpg_enable_store);
305
306void gk20a_remove_sysfs(struct device *dev)
307{
308 device_remove_file(dev, &dev_attr_elcg_enable);
309 device_remove_file(dev, &dev_attr_blcg_enable);
310 device_remove_file(dev, &dev_attr_slcg_enable);
311 device_remove_file(dev, &dev_attr_ptimer_scale_factor);
312 device_remove_file(dev, &dev_attr_elpg_enable);
313 device_remove_file(dev, &dev_attr_counters);
314 device_remove_file(dev, &dev_attr_counters_reset);
315 device_remove_file(dev, &dev_attr_railgate_delay);
316 device_remove_file(dev, &dev_attr_clockgate_delay);
317}
318
319void gk20a_create_sysfs(struct platform_device *dev)
320{
321 int error = 0;
322
323 error |= device_create_file(&dev->dev, &dev_attr_elcg_enable);
324 error |= device_create_file(&dev->dev, &dev_attr_blcg_enable);
325 error |= device_create_file(&dev->dev, &dev_attr_slcg_enable);
326 error |= device_create_file(&dev->dev, &dev_attr_ptimer_scale_factor);
327 error |= device_create_file(&dev->dev, &dev_attr_elpg_enable);
328 error |= device_create_file(&dev->dev, &dev_attr_counters);
329 error |= device_create_file(&dev->dev, &dev_attr_counters_reset);
330 error |= device_create_file(&dev->dev, &dev_attr_railgate_delay);
331 error |= device_create_file(&dev->dev, &dev_attr_clockgate_delay);
332
333 if (error)
334 dev_err(&dev->dev, "Failed to create sysfs attributes!\n");
335}
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c
new file mode 100644
index 00000000..59404f1d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c
@@ -0,0 +1,333 @@
1/*
2 * drivers/video/tegra/host/gk20a/gr_ctx_gk20a.c
3 *
4 * GK20A Graphics Context
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21
22#include <linux/firmware.h>
23
24#include "gk20a.h"
25#include "gr_ctx_gk20a.h"
26#include "hw_gr_gk20a.h"
27
28static int gr_gk20a_alloc_load_netlist_u32(u32 *src, u32 len,
29 struct u32_list_gk20a *u32_list)
30{
31 u32_list->count = (len + sizeof(u32) - 1) / sizeof(u32);
32 if (!alloc_u32_list_gk20a(u32_list))
33 return -ENOMEM;
34
35 memcpy(u32_list->l, src, len);
36
37 return 0;
38}
39
40static int gr_gk20a_alloc_load_netlist_av(u32 *src, u32 len,
41 struct av_list_gk20a *av_list)
42{
43 av_list->count = len / sizeof(struct av_gk20a);
44 if (!alloc_av_list_gk20a(av_list))
45 return -ENOMEM;
46
47 memcpy(av_list->l, src, len);
48
49 return 0;
50}
51
52static int gr_gk20a_alloc_load_netlist_aiv(u32 *src, u32 len,
53 struct aiv_list_gk20a *aiv_list)
54{
55 aiv_list->count = len / sizeof(struct aiv_gk20a);
56 if (!alloc_aiv_list_gk20a(aiv_list))
57 return -ENOMEM;
58
59 memcpy(aiv_list->l, src, len);
60
61 return 0;
62}
63
64static int gr_gk20a_get_netlist_name(int index, char *name)
65{
66 switch (index) {
67#ifdef GK20A_NETLIST_IMAGE_FW_NAME
68 case NETLIST_FINAL:
69 sprintf(name, GK20A_NETLIST_IMAGE_FW_NAME);
70 return 0;
71#endif
72#ifdef GK20A_NETLIST_IMAGE_A
73 case NETLIST_SLOT_A:
74 sprintf(name, GK20A_NETLIST_IMAGE_A);
75 return 0;
76#endif
77#ifdef GK20A_NETLIST_IMAGE_B
78 case NETLIST_SLOT_B:
79 sprintf(name, GK20A_NETLIST_IMAGE_B);
80 return 0;
81#endif
82#ifdef GK20A_NETLIST_IMAGE_C
83 case NETLIST_SLOT_C:
84 sprintf(name, GK20A_NETLIST_IMAGE_C);
85 return 0;
86#endif
87#ifdef GK20A_NETLIST_IMAGE_D
88 case NETLIST_SLOT_D:
89 sprintf(name, GK20A_NETLIST_IMAGE_D);
90 return 0;
91#endif
92 default:
93 return -1;
94 }
95
96 return -1;
97}
98
99static int gr_gk20a_init_ctx_vars_fw(struct gk20a *g, struct gr_gk20a *gr)
100{
101 struct device *d = dev_from_gk20a(g);
102 const struct firmware *netlist_fw;
103 struct netlist_image *netlist = NULL;
104 char name[MAX_NETLIST_NAME];
105 u32 i, major_v = ~0, major_v_hw, netlist_num;
106 int net, max, err = -ENOENT;
107
108 gk20a_dbg_fn("");
109
110#ifdef GK20A_NETLIST_IMAGE_FW_NAME
111 net = NETLIST_FINAL;
112 max = 0;
113 major_v_hw = ~0;
114 g->gr.ctx_vars.dynamic = false;
115#else
116 net = NETLIST_SLOT_A;
117 max = MAX_NETLIST;
118 major_v_hw = gk20a_readl(g, gr_fecs_ctx_state_store_major_rev_id_r());
119 g->gr.ctx_vars.dynamic = true;
120#endif
121
122 for (; net < max; net++) {
123
124 if (gr_gk20a_get_netlist_name(net, name) != 0) {
125 gk20a_warn(d, "invalid netlist index %d", net);
126 continue;
127 }
128
129 netlist_fw = gk20a_request_firmware(g, name);
130 if (!netlist_fw) {
131 gk20a_warn(d, "failed to load netlist %s", name);
132 continue;
133 }
134
135 netlist = (struct netlist_image *)netlist_fw->data;
136
137 for (i = 0; i < netlist->header.regions; i++) {
138 u32 *src = (u32 *)((u8 *)netlist + netlist->regions[i].data_offset);
139 u32 size = netlist->regions[i].data_size;
140
141 switch (netlist->regions[i].region_id) {
142 case NETLIST_REGIONID_FECS_UCODE_DATA:
143 gk20a_dbg_info("NETLIST_REGIONID_FECS_UCODE_DATA");
144 err = gr_gk20a_alloc_load_netlist_u32(
145 src, size, &g->gr.ctx_vars.ucode.fecs.data);
146 if (err)
147 goto clean_up;
148 break;
149 case NETLIST_REGIONID_FECS_UCODE_INST:
150 gk20a_dbg_info("NETLIST_REGIONID_FECS_UCODE_INST");
151 err = gr_gk20a_alloc_load_netlist_u32(
152 src, size, &g->gr.ctx_vars.ucode.fecs.inst);
153 if (err)
154 goto clean_up;
155 break;
156 case NETLIST_REGIONID_GPCCS_UCODE_DATA:
157 gk20a_dbg_info("NETLIST_REGIONID_GPCCS_UCODE_DATA");
158 err = gr_gk20a_alloc_load_netlist_u32(
159 src, size, &g->gr.ctx_vars.ucode.gpccs.data);
160 if (err)
161 goto clean_up;
162 break;
163 case NETLIST_REGIONID_GPCCS_UCODE_INST:
164 gk20a_dbg_info("NETLIST_REGIONID_GPCCS_UCODE_INST");
165 err = gr_gk20a_alloc_load_netlist_u32(
166 src, size, &g->gr.ctx_vars.ucode.gpccs.inst);
167 if (err)
168 goto clean_up;
169 break;
170 case NETLIST_REGIONID_SW_BUNDLE_INIT:
171 gk20a_dbg_info("NETLIST_REGIONID_SW_BUNDLE_INIT");
172 err = gr_gk20a_alloc_load_netlist_av(
173 src, size, &g->gr.ctx_vars.sw_bundle_init);
174 if (err)
175 goto clean_up;
176 break;
177 case NETLIST_REGIONID_SW_METHOD_INIT:
178 gk20a_dbg_info("NETLIST_REGIONID_SW_METHOD_INIT");
179 err = gr_gk20a_alloc_load_netlist_av(
180 src, size, &g->gr.ctx_vars.sw_method_init);
181 if (err)
182 goto clean_up;
183 break;
184 case NETLIST_REGIONID_SW_CTX_LOAD:
185 gk20a_dbg_info("NETLIST_REGIONID_SW_CTX_LOAD");
186 err = gr_gk20a_alloc_load_netlist_aiv(
187 src, size, &g->gr.ctx_vars.sw_ctx_load);
188 if (err)
189 goto clean_up;
190 break;
191 case NETLIST_REGIONID_SW_NON_CTX_LOAD:
192 gk20a_dbg_info("NETLIST_REGIONID_SW_NON_CTX_LOAD");
193 err = gr_gk20a_alloc_load_netlist_av(
194 src, size, &g->gr.ctx_vars.sw_non_ctx_load);
195 if (err)
196 goto clean_up;
197 break;
198 case NETLIST_REGIONID_CTXREG_SYS:
199 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_SYS");
200 err = gr_gk20a_alloc_load_netlist_aiv(
201 src, size, &g->gr.ctx_vars.ctxsw_regs.sys);
202 if (err)
203 goto clean_up;
204 break;
205 case NETLIST_REGIONID_CTXREG_GPC:
206 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_GPC");
207 err = gr_gk20a_alloc_load_netlist_aiv(
208 src, size, &g->gr.ctx_vars.ctxsw_regs.gpc);
209 if (err)
210 goto clean_up;
211 break;
212 case NETLIST_REGIONID_CTXREG_TPC:
213 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_TPC");
214 err = gr_gk20a_alloc_load_netlist_aiv(
215 src, size, &g->gr.ctx_vars.ctxsw_regs.tpc);
216 if (err)
217 goto clean_up;
218 break;
219 case NETLIST_REGIONID_CTXREG_ZCULL_GPC:
220 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_ZCULL_GPC");
221 err = gr_gk20a_alloc_load_netlist_aiv(
222 src, size, &g->gr.ctx_vars.ctxsw_regs.zcull_gpc);
223 if (err)
224 goto clean_up;
225 break;
226 case NETLIST_REGIONID_CTXREG_PPC:
227 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PPC");
228 err = gr_gk20a_alloc_load_netlist_aiv(
229 src, size, &g->gr.ctx_vars.ctxsw_regs.ppc);
230 if (err)
231 goto clean_up;
232 break;
233 case NETLIST_REGIONID_CTXREG_PM_SYS:
234 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PM_SYS");
235 err = gr_gk20a_alloc_load_netlist_aiv(
236 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_sys);
237 if (err)
238 goto clean_up;
239 break;
240 case NETLIST_REGIONID_CTXREG_PM_GPC:
241 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PM_GPC");
242 err = gr_gk20a_alloc_load_netlist_aiv(
243 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_gpc);
244 if (err)
245 goto clean_up;
246 break;
247 case NETLIST_REGIONID_CTXREG_PM_TPC:
248 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PM_TPC");
249 err = gr_gk20a_alloc_load_netlist_aiv(
250 src, size, &g->gr.ctx_vars.ctxsw_regs.pm_tpc);
251 if (err)
252 goto clean_up;
253 break;
254 case NETLIST_REGIONID_BUFFER_SIZE:
255 g->gr.ctx_vars.buffer_size = *src;
256 gk20a_dbg_info("NETLIST_REGIONID_BUFFER_SIZE : %d",
257 g->gr.ctx_vars.buffer_size);
258 break;
259 case NETLIST_REGIONID_CTXSW_REG_BASE_INDEX:
260 g->gr.ctx_vars.regs_base_index = *src;
261 gk20a_dbg_info("NETLIST_REGIONID_CTXSW_REG_BASE_INDEX : %d",
262 g->gr.ctx_vars.regs_base_index);
263 break;
264 case NETLIST_REGIONID_MAJORV:
265 major_v = *src;
266 gk20a_dbg_info("NETLIST_REGIONID_MAJORV : %d",
267 major_v);
268 break;
269 case NETLIST_REGIONID_NETLIST_NUM:
270 netlist_num = *src;
271 gk20a_dbg_info("NETLIST_REGIONID_NETLIST_NUM : %d",
272 netlist_num);
273 break;
274 case NETLIST_REGIONID_CTXREG_PMPPC:
275 gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMPPC skipped");
276 break;
277 default:
278 gk20a_warn(d, "unrecognized region %d skipped", i);
279 break;
280 }
281 }
282
283 if (net != NETLIST_FINAL && major_v != major_v_hw) {
284 gk20a_dbg_info("skip %s: major_v 0x%08x doesn't match hw 0x%08x",
285 name, major_v, major_v_hw);
286 goto clean_up;
287 }
288
289 g->gr.ctx_vars.valid = true;
290 g->gr.netlist = net;
291
292 release_firmware(netlist_fw);
293 gk20a_dbg_fn("done");
294 goto done;
295
296clean_up:
297 kfree(g->gr.ctx_vars.ucode.fecs.inst.l);
298 kfree(g->gr.ctx_vars.ucode.fecs.data.l);
299 kfree(g->gr.ctx_vars.ucode.gpccs.inst.l);
300 kfree(g->gr.ctx_vars.ucode.gpccs.data.l);
301 kfree(g->gr.ctx_vars.sw_bundle_init.l);
302 kfree(g->gr.ctx_vars.sw_method_init.l);
303 kfree(g->gr.ctx_vars.sw_ctx_load.l);
304 kfree(g->gr.ctx_vars.sw_non_ctx_load.l);
305 kfree(g->gr.ctx_vars.ctxsw_regs.sys.l);
306 kfree(g->gr.ctx_vars.ctxsw_regs.gpc.l);
307 kfree(g->gr.ctx_vars.ctxsw_regs.tpc.l);
308 kfree(g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l);
309 kfree(g->gr.ctx_vars.ctxsw_regs.ppc.l);
310 kfree(g->gr.ctx_vars.ctxsw_regs.pm_sys.l);
311 kfree(g->gr.ctx_vars.ctxsw_regs.pm_gpc.l);
312 kfree(g->gr.ctx_vars.ctxsw_regs.pm_tpc.l);
313 release_firmware(netlist_fw);
314 err = -ENOENT;
315 }
316
317done:
318 if (g->gr.ctx_vars.valid) {
319 gk20a_dbg_info("netlist image %s loaded", name);
320 return 0;
321 } else {
322 gk20a_err(d, "failed to load netlist image!!");
323 return err;
324 }
325}
326
327int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr)
328{
329 if (tegra_platform_is_linsim())
330 return gr_gk20a_init_ctx_vars_sim(g, gr);
331 else
332 return gr_gk20a_init_ctx_vars_fw(g, gr);
333}
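Editor's note: the firmware blob parsed above is self-describing: a netlist_image_header followed by an array of netlist_region entries whose data_offset/data_size fields point back into the blob (the structures are defined in gr_ctx_gk20a.h below). A minimal sketch of the same region walk, stripped of the gk20a bookkeeping; the callback shape is made up for illustration.

/* Sketch: hand every region of a raw netlist blob to a callback.
 * Mirrors the loop in gr_gk20a_init_ctx_vars_fw() above. */
static void netlist_for_each_region(const struct firmware *fw,
		void (*cb)(u32 region_id, const u32 *data, u32 size))
{
	const struct netlist_image *img =
		(const struct netlist_image *)fw->data;
	u32 i;

	for (i = 0; i < img->header.regions; i++) {
		const u32 *src = (const u32 *)((const u8 *)img +
				img->regions[i].data_offset);

		cb(img->regions[i].region_id, src,
		   img->regions[i].data_size);
	}
}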
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h
new file mode 100644
index 00000000..909a166a
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h
@@ -0,0 +1,149 @@
1/*
2 * GK20A Graphics Context
3 *
4 * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18#ifndef __GR_CTX_GK20A_H__
19#define __GR_CTX_GK20A_H__
20
21
22/* production netlist, one and only one from below */
23/*#undef GK20A_NETLIST_IMAGE_FW_NAME*/
24#define GK20A_NETLIST_IMAGE_FW_NAME GK20A_NETLIST_IMAGE_B
25/* emulation netlists, match majorV with HW */
26#define GK20A_NETLIST_IMAGE_A "NETA_img.bin"
27#define GK20A_NETLIST_IMAGE_B "NETB_img.bin"
28#define GK20A_NETLIST_IMAGE_C "NETC_img.bin"
29#define GK20A_NETLIST_IMAGE_D "NETD_img.bin"
30
31union __max_name {
32#ifdef GK20A_NETLIST_IMAGE_A
33 char __name_a[sizeof(GK20A_NETLIST_IMAGE_A)];
34#endif
35#ifdef GK20A_NETLIST_IMAGE_B
36 char __name_b[sizeof(GK20A_NETLIST_IMAGE_B)];
37#endif
38#ifdef GK20A_NETLIST_IMAGE_C
39 char __name_c[sizeof(GK20A_NETLIST_IMAGE_C)];
40#endif
41#ifdef GK20A_NETLIST_IMAGE_D
42 char __name_d[sizeof(GK20A_NETLIST_IMAGE_D)];
43#endif
44};
45
46#define MAX_NETLIST_NAME sizeof(union __max_name)
47
48/* index for emulation netlists */
49#define NETLIST_FINAL -1
50#define NETLIST_SLOT_A 0
51#define NETLIST_SLOT_B 1
52#define NETLIST_SLOT_C 2
53#define NETLIST_SLOT_D 3
54#define MAX_NETLIST 4
55
56/* netlist regions */
57#define NETLIST_REGIONID_FECS_UCODE_DATA 0
58#define NETLIST_REGIONID_FECS_UCODE_INST 1
59#define NETLIST_REGIONID_GPCCS_UCODE_DATA 2
60#define NETLIST_REGIONID_GPCCS_UCODE_INST 3
61#define NETLIST_REGIONID_SW_BUNDLE_INIT 4
62#define NETLIST_REGIONID_SW_CTX_LOAD 5
63#define NETLIST_REGIONID_SW_NON_CTX_LOAD 6
64#define NETLIST_REGIONID_SW_METHOD_INIT 7
65#define NETLIST_REGIONID_CTXREG_SYS 8
66#define NETLIST_REGIONID_CTXREG_GPC 9
67#define NETLIST_REGIONID_CTXREG_TPC 10
68#define NETLIST_REGIONID_CTXREG_ZCULL_GPC 11
69#define NETLIST_REGIONID_CTXREG_PM_SYS 12
70#define NETLIST_REGIONID_CTXREG_PM_GPC 13
71#define NETLIST_REGIONID_CTXREG_PM_TPC 14
72#define NETLIST_REGIONID_MAJORV 15
73#define NETLIST_REGIONID_BUFFER_SIZE 16
74#define NETLIST_REGIONID_CTXSW_REG_BASE_INDEX 17
75#define NETLIST_REGIONID_NETLIST_NUM 18
76#define NETLIST_REGIONID_CTXREG_PPC 19
77#define NETLIST_REGIONID_CTXREG_PMPPC 20
78
79struct netlist_region {
80 u32 region_id;
81 u32 data_size;
82 u32 data_offset;
83};
84
85struct netlist_image_header {
86 u32 version;
87 u32 regions;
88};
89
90struct netlist_image {
91 struct netlist_image_header header;
92 struct netlist_region regions[1];
93};
94
95struct av_gk20a {
96 u32 addr;
97 u32 value;
98};
99struct aiv_gk20a {
100 u32 addr;
101 u32 index;
102 u32 value;
103};
104struct aiv_list_gk20a {
105 struct aiv_gk20a *l;
106 u32 count;
107};
108struct av_list_gk20a {
109 struct av_gk20a *l;
110 u32 count;
111};
112struct u32_list_gk20a {
113 u32 *l;
114 u32 count;
115};
116
117static inline
118struct av_gk20a *alloc_av_list_gk20a(struct av_list_gk20a *avl)
119{
120 avl->l = kzalloc(avl->count * sizeof(*avl->l), GFP_KERNEL);
121 return avl->l;
122}
123
124static inline
125struct aiv_gk20a *alloc_aiv_list_gk20a(struct aiv_list_gk20a *aivl)
126{
127 aivl->l = kzalloc(aivl->count * sizeof(*aivl->l), GFP_KERNEL);
128 return aivl->l;
129}
130
131static inline
132u32 *alloc_u32_list_gk20a(struct u32_list_gk20a *u32l)
133{
134 u32l->l = kzalloc(u32l->count * sizeof(*u32l->l), GFP_KERNEL);
135 return u32l->l;
136}
137
138struct gr_ucode_gk20a {
139 struct {
140 struct u32_list_gk20a inst;
141 struct u32_list_gk20a data;
142 } gpccs, fecs;
143};
144
145/* main entry for grctx loading */
146int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr);
147int gr_gk20a_init_ctx_vars_sim(struct gk20a *g, struct gr_gk20a *gr);
148
149#endif /*__GR_CTX_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c
new file mode 100644
index 00000000..12bba1fd
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c
@@ -0,0 +1,256 @@
1/*
2 * drivers/video/tegra/host/gk20a/gr_ctx_sim_gk20a.c
3 *
4 * GK20A Graphics Context for Simulation
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21
22#include "gk20a.h"
23#include "gr_ctx_gk20a.h"
24
25int gr_gk20a_init_ctx_vars_sim(struct gk20a *g, struct gr_gk20a *gr)
26{
27 int err = 0;
28 u32 i, temp;
29 char *size_path = NULL;
30 char *reg_path = NULL;
31 char *value_path = NULL;
32
33 gk20a_dbg(gpu_dbg_fn | gpu_dbg_info,
34 "querying grctx info from chiplib");
35
36 g->gr.ctx_vars.dynamic = true;
37 g->gr.netlist = GR_NETLIST_DYNAMIC;
38
39 /* query sizes and counts */
40 gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_FECS_COUNT", 0,
41 &g->gr.ctx_vars.ucode.fecs.inst.count);
42 gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_FECS_COUNT", 0,
43 &g->gr.ctx_vars.ucode.fecs.data.count);
44 gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_GPCCS_COUNT", 0,
45 &g->gr.ctx_vars.ucode.gpccs.inst.count);
46 gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_GPCCS_COUNT", 0,
47 &g->gr.ctx_vars.ucode.gpccs.data.count);
48 gk20a_sim_esc_readl(g, "GRCTX_ALL_CTX_TOTAL_WORDS", 0, &temp);
49 g->gr.ctx_vars.buffer_size = temp << 2;
50 gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT_SIZE", 0,
51 &g->gr.ctx_vars.sw_bundle_init.count);
52 gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT_SIZE", 0,
53 &g->gr.ctx_vars.sw_method_init.count);
54 gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD_SIZE", 0,
55 &g->gr.ctx_vars.sw_ctx_load.count);
56
57 switch (0) { /*g->gr.ctx_vars.reg_init_override)*/
58#if 0
59 case NV_REG_STR_RM_GR_REG_INIT_OVERRIDE_PROD_DIFF:
60		size_path = "GRCTX_NONCTXSW_PROD_DIFF_REG_SIZE";
61		reg_path = "GRCTX_NONCTXSW_PROD_DIFF_REG:REG";
62		value_path = "GRCTX_NONCTXSW_PROD_DIFF_REG:VALUE";
63 break;
64#endif
65 default:
66 size_path = "GRCTX_NONCTXSW_REG_SIZE";
67 reg_path = "GRCTX_NONCTXSW_REG:REG";
68 value_path = "GRCTX_NONCTXSW_REG:VALUE";
69 break;
70 }
71
72 gk20a_sim_esc_readl(g, size_path, 0,
73 &g->gr.ctx_vars.sw_non_ctx_load.count);
74
75 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS_COUNT", 0,
76 &g->gr.ctx_vars.ctxsw_regs.sys.count);
77 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC_COUNT", 0,
78 &g->gr.ctx_vars.ctxsw_regs.gpc.count);
79 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC_COUNT", 0,
80 &g->gr.ctx_vars.ctxsw_regs.tpc.count);
81#if 0
82 /* looks to be unused, actually chokes the sim */
83 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC_COUNT", 0,
84 &g->gr.ctx_vars.ctxsw_regs.ppc.count);
85#endif
86 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC_COUNT", 0,
87 &g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count);
88 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS_COUNT", 0,
89 &g->gr.ctx_vars.ctxsw_regs.pm_sys.count);
90 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC_COUNT", 0,
91 &g->gr.ctx_vars.ctxsw_regs.pm_gpc.count);
92 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC_COUNT", 0,
93 &g->gr.ctx_vars.ctxsw_regs.pm_tpc.count);
94
95 err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.fecs.inst);
96 err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.fecs.data);
97 err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.gpccs.inst);
98 err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.gpccs.data);
99 err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_bundle_init);
100 err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_method_init);
101 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.sw_ctx_load);
102 err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_non_ctx_load);
103 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.sys);
104 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.gpc);
105 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.tpc);
106 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.zcull_gpc);
107 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.ppc);
108 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_sys);
109 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_gpc);
110 err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_tpc);
111
112 if (err)
113 goto fail;
114
115 for (i = 0; i < g->gr.ctx_vars.ucode.fecs.inst.count; i++)
116 gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_FECS",
117 i, &g->gr.ctx_vars.ucode.fecs.inst.l[i]);
118
119 for (i = 0; i < g->gr.ctx_vars.ucode.fecs.data.count; i++)
120 gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_FECS",
121 i, &g->gr.ctx_vars.ucode.fecs.data.l[i]);
122
123 for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.inst.count; i++)
124 gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_GPCCS",
125 i, &g->gr.ctx_vars.ucode.gpccs.inst.l[i]);
126
127 for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.data.count; i++)
128 gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_GPCCS",
129 i, &g->gr.ctx_vars.ucode.gpccs.data.l[i]);
130
131 for (i = 0; i < g->gr.ctx_vars.sw_bundle_init.count; i++) {
132 struct av_gk20a *l = g->gr.ctx_vars.sw_bundle_init.l;
133 gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT:ADDR",
134 i, &l[i].addr);
135 gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT:VALUE",
136 i, &l[i].value);
137 }
138
139 for (i = 0; i < g->gr.ctx_vars.sw_method_init.count; i++) {
140 struct av_gk20a *l = g->gr.ctx_vars.sw_method_init.l;
141 gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT:ADDR",
142 i, &l[i].addr);
143 gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT:VALUE",
144 i, &l[i].value);
145 }
146
147 for (i = 0; i < g->gr.ctx_vars.sw_ctx_load.count; i++) {
148 struct aiv_gk20a *l = g->gr.ctx_vars.sw_ctx_load.l;
149 gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:ADDR",
150 i, &l[i].addr);
151 gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:INDEX",
152 i, &l[i].index);
153 gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:VALUE",
154 i, &l[i].value);
155 }
156
157 for (i = 0; i < g->gr.ctx_vars.sw_non_ctx_load.count; i++) {
158 struct av_gk20a *l = g->gr.ctx_vars.sw_non_ctx_load.l;
159 gk20a_sim_esc_readl(g, reg_path, i, &l[i].addr);
160 gk20a_sim_esc_readl(g, value_path, i, &l[i].value);
161 }
162
163 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
164 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.sys.l;
165 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:ADDR",
166 i, &l[i].addr);
167 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:INDEX",
168 i, &l[i].index);
169 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:VALUE",
170 i, &l[i].value);
171 }
172
173 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
174 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.gpc.l;
175 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:ADDR",
176 i, &l[i].addr);
177 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:INDEX",
178 i, &l[i].index);
179 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:VALUE",
180 i, &l[i].value);
181 }
182
183 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
184 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.tpc.l;
185 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:ADDR",
186 i, &l[i].addr);
187 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:INDEX",
188 i, &l[i].index);
189 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:VALUE",
190 i, &l[i].value);
191 }
192
193 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
194 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.ppc.l;
195 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:ADDR",
196 i, &l[i].addr);
197 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:INDEX",
198 i, &l[i].index);
199 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:VALUE",
200 i, &l[i].value);
201 }
202
203 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count; i++) {
204 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l;
205 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:ADDR",
206 i, &l[i].addr);
207 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:INDEX",
208 i, &l[i].index);
209 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:VALUE",
210 i, &l[i].value);
211 }
212
213 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_sys.count; i++) {
214 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_sys.l;
215 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:ADDR",
216 i, &l[i].addr);
217 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:INDEX",
218 i, &l[i].index);
219 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:VALUE",
220 i, &l[i].value);
221 }
222
223 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_gpc.count; i++) {
224 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_gpc.l;
225 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:ADDR",
226 i, &l[i].addr);
227 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:INDEX",
228 i, &l[i].index);
229 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:VALUE",
230 i, &l[i].value);
231 }
232
233 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_tpc.count; i++) {
234 struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_tpc.l;
235 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:ADDR",
236 i, &l[i].addr);
237 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:INDEX",
238 i, &l[i].index);
239 gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:VALUE",
240 i, &l[i].value);
241 }
242
243 g->gr.ctx_vars.valid = true;
244
245 gk20a_sim_esc_readl(g, "GRCTX_GEN_CTX_REGS_BASE_INDEX", 0,
246 &g->gr.ctx_vars.regs_base_index);
247
248 gk20a_dbg(gpu_dbg_info | gpu_dbg_fn, "finished querying grctx info from chiplib");
249 return 0;
250fail:
251 gk20a_err(dev_from_gk20a(g),
252 "failed querying grctx info from chiplib");
253 return err;
254
255}
256
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
new file mode 100644
index 00000000..0f93940b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -0,0 +1,6747 @@
1/*
2 * GK20A Graphics
3 *
4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 */
19
20#include <linux/delay.h> /* for udelay */
21#include <linux/mm.h> /* for totalram_pages */
22#include <linux/scatterlist.h>
23#include <linux/tegra-soc.h>
24#include <linux/nvhost_dbg_gpu_ioctl.h>
25#include <linux/vmalloc.h>
26#include <linux/dma-mapping.h>
27#include <linux/firmware.h>
28#include <linux/nvhost.h>
29
30#include "gk20a.h"
31#include "kind_gk20a.h"
32#include "gr_ctx_gk20a.h"
33
34#include "hw_ccsr_gk20a.h"
35#include "hw_ctxsw_prog_gk20a.h"
36#include "hw_fifo_gk20a.h"
37#include "hw_gr_gk20a.h"
38#include "hw_gmmu_gk20a.h"
39#include "hw_mc_gk20a.h"
40#include "hw_ram_gk20a.h"
41#include "hw_pri_ringmaster_gk20a.h"
42#include "hw_pri_ringstation_sys_gk20a.h"
43#include "hw_pri_ringstation_gpc_gk20a.h"
44#include "hw_pri_ringstation_fbp_gk20a.h"
45#include "hw_proj_gk20a.h"
46#include "hw_top_gk20a.h"
47#include "hw_ltc_gk20a.h"
48#include "hw_fb_gk20a.h"
49#include "hw_therm_gk20a.h"
50#include "hw_pbdma_gk20a.h"
51#include "gr_pri_gk20a.h"
52#include "regops_gk20a.h"
53#include "dbg_gpu_gk20a.h"
54
55#define BLK_SIZE (256)
56
57static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
58
59/* global ctx buffer */
60static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
61static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
62static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
63 struct channel_gk20a *c);
64static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
65
66/* channel gr ctx buffer */
67static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
68 struct channel_gk20a *c);
69static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
70
71/* channel patch ctx buffer */
72static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
73 struct channel_gk20a *c);
74static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
75
76/* golden ctx image */
77static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
78 struct channel_gk20a *c);
79static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
80 struct channel_gk20a *c);
81
82void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
83{
84 int i;
85
86 gk20a_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
87 gk20a_readl(g, gr_fecs_os_r()));
88 gk20a_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
89 gk20a_readl(g, gr_fecs_cpuctl_r()));
90 gk20a_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
91 gk20a_readl(g, gr_fecs_idlestate_r()));
92 gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
93 gk20a_readl(g, gr_fecs_mailbox0_r()));
94 gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
95 gk20a_readl(g, gr_fecs_mailbox1_r()));
96 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
97 gk20a_readl(g, gr_fecs_irqstat_r()));
98 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
99 gk20a_readl(g, gr_fecs_irqmode_r()));
100 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
101 gk20a_readl(g, gr_fecs_irqmask_r()));
102 gk20a_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
103 gk20a_readl(g, gr_fecs_irqdest_r()));
104 gk20a_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
105 gk20a_readl(g, gr_fecs_debug1_r()));
106 gk20a_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
107 gk20a_readl(g, gr_fecs_debuginfo_r()));
108
109 for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
110 gk20a_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
111 i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
112
113 gk20a_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
114 gk20a_readl(g, gr_fecs_engctl_r()));
115 gk20a_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
116 gk20a_readl(g, gr_fecs_curctx_r()));
117 gk20a_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
118 gk20a_readl(g, gr_fecs_nxtctx_r()));
119
120 gk20a_writel(g, gr_fecs_icd_cmd_r(),
121 gr_fecs_icd_cmd_opc_rreg_f() |
122 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
123 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
124 gk20a_readl(g, gr_fecs_icd_rdata_r()));
125
126 gk20a_writel(g, gr_fecs_icd_cmd_r(),
127 gr_fecs_icd_cmd_opc_rreg_f() |
128 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
129 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
130 gk20a_readl(g, gr_fecs_icd_rdata_r()));
131
132 gk20a_writel(g, gr_fecs_icd_cmd_r(),
133 gr_fecs_icd_cmd_opc_rreg_f() |
134 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
135 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
136 gk20a_readl(g, gr_fecs_icd_rdata_r()));
137
138 gk20a_writel(g, gr_fecs_icd_cmd_r(),
139 gr_fecs_icd_cmd_opc_rreg_f() |
140 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
141 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
142 gk20a_readl(g, gr_fecs_icd_rdata_r()));
143
144 gk20a_writel(g, gr_fecs_icd_cmd_r(),
145 gr_fecs_icd_cmd_opc_rreg_f() |
146 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
147 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
148 gk20a_readl(g, gr_fecs_icd_rdata_r()));
149
150 for (i = 0; i < 4; i++) {
151 gk20a_writel(g, gr_fecs_icd_cmd_r(),
152 gr_fecs_icd_cmd_opc_rreg_f() |
153 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
154 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
155 gk20a_readl(g, gr_fecs_icd_rdata_r()));
156
157 gk20a_writel(g, gr_fecs_icd_cmd_r(),
158 gr_fecs_icd_cmd_opc_rreg_f() |
159 gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
160 gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
161 gk20a_readl(g, gr_fecs_icd_rdata_r()));
162 }
163}
164
165static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
166{
167 u32 i, ucode_u32_size;
168 const u32 *ucode_u32_data;
169 u32 checksum;
170
171 gk20a_dbg_fn("");
172
173 gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
174 gr_gpccs_dmemc_blk_f(0) |
175 gr_gpccs_dmemc_aincw_f(1)));
176
177 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
178 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
179
180 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
181 gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
182 checksum += ucode_u32_data[i];
183 }
184
185 gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
186 gr_fecs_dmemc_blk_f(0) |
187 gr_fecs_dmemc_aincw_f(1)));
188
189 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
190 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
191
192 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
193 gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
194 checksum += ucode_u32_data[i];
195 }
196 gk20a_dbg_fn("done");
197}
198
199static void gr_gk20a_load_falcon_imem(struct gk20a *g)
200{
201 u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
202 const u32 *ucode_u32_data;
203 u32 tag, i, pad_start, pad_end;
204 u32 checksum;
205
206 gk20a_dbg_fn("");
207
208 cfg = gk20a_readl(g, gr_fecs_cfg_r());
209 fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
210
211 cfg = gk20a_readl(g, gr_gpc0_cfg_r());
212 gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
213
214 /* Use the broadcast address to access all of the GPCCS units. */
215 gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
216 gr_gpccs_imemc_blk_f(0) |
217 gr_gpccs_imemc_aincw_f(1)));
218
219 /* Setup the tags for the instruction memory. */
220 tag = 0;
221 gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
222
223 ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
224 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
225
226 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
227 if (i && ((i % (256/sizeof(u32))) == 0)) {
228 tag++;
229 gk20a_writel(g, gr_gpccs_imemt_r(0),
230 gr_gpccs_imemt_tag_f(tag));
231 }
232 gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
233 checksum += ucode_u32_data[i];
234 }
235
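	/*
	 * Pad to the end of the current 256-byte IMEM block plus one extra
	 * block with zeroes, bumping the tag at every 256-byte boundary,
	 * without writing past the IMEM size reported by the config register.
	 */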
236 pad_start = i*4;
237 pad_end = pad_start+(256-pad_start%256)+256;
238 for (i = pad_start;
239 (i < gpccs_imem_size * 256) && (i < pad_end);
240 i += 4) {
241 if (i && ((i % 256) == 0)) {
242 tag++;
243 gk20a_writel(g, gr_gpccs_imemt_r(0),
244 gr_gpccs_imemt_tag_f(tag));
245 }
246 gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
247 }
248
249 gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
250 gr_fecs_imemc_blk_f(0) |
251 gr_fecs_imemc_aincw_f(1)));
252
253 /* Setup the tags for the instruction memory. */
254 tag = 0;
255 gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
256
257 ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
258 ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
259
260 for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
261 if (i && ((i % (256/sizeof(u32))) == 0)) {
262 tag++;
263 gk20a_writel(g, gr_fecs_imemt_r(0),
264 gr_fecs_imemt_tag_f(tag));
265 }
266 gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
267 checksum += ucode_u32_data[i];
268 }
269
270 pad_start = i*4;
271 pad_end = pad_start+(256-pad_start%256)+256;
272 for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
273 if (i && ((i % 256) == 0)) {
274 tag++;
275 gk20a_writel(g, gr_fecs_imemt_r(0),
276 gr_fecs_imemt_tag_f(tag));
277 }
278 gk20a_writel(g, gr_fecs_imemd_r(0), 0);
279 }
280}
281
282static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
283 u32 expect_delay)
284{
285 u32 delay = expect_delay;
286 bool gr_enabled;
287 bool ctxsw_active;
288 bool gr_busy;
289
290 gk20a_dbg_fn("");
291
292 do {
293 /* fmodel: host gets fifo_engine_status(gr) from gr
294 only when gr_status is read */
295 gk20a_readl(g, gr_status_r());
296
297 gr_enabled = gk20a_readl(g, mc_enable_r()) &
298 mc_enable_pgraph_enabled_f();
299
300 ctxsw_active = gk20a_readl(g,
301 fifo_engine_status_r(ENGINE_GR_GK20A)) &
302 fifo_engine_status_ctxsw_in_progress_f();
303
304 gr_busy = gk20a_readl(g, gr_engine_status_r()) &
305 gr_engine_status_value_busy_f();
306
307 if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
308 gk20a_dbg_fn("done");
309 return 0;
310 }
311
312 usleep_range(delay, delay * 2);
313 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
314
315 } while (time_before(jiffies, end_jiffies)
316 || !tegra_platform_is_silicon());
317
318 gk20a_err(dev_from_gk20a(g),
319 "timeout, ctxsw busy : %d, gr busy : %d",
320 ctxsw_active, gr_busy);
321
322 return -EAGAIN;
323}
324
325static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
326{
327 u32 delay = GR_IDLE_CHECK_DEFAULT;
328 unsigned long end_jiffies = jiffies +
329 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
330 u32 reg;
331
332 gk20a_dbg_fn("");
333
334 if (!tegra_platform_is_linsim()) {
335 /* Force clocks on */
336 gk20a_writel(g, gr_fe_pwr_mode_r(),
337 gr_fe_pwr_mode_req_send_f() |
338 gr_fe_pwr_mode_mode_force_on_f());
339
340 /* Wait for the clocks to indicate that they are on */
341 do {
342 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
343
344 if (gr_fe_pwr_mode_req_v(reg) ==
345 gr_fe_pwr_mode_req_done_v())
346 break;
347
348 usleep_range(delay, delay * 2);
349 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
350
351 } while (time_before(jiffies, end_jiffies));
352
353 if (!time_before(jiffies, end_jiffies)) {
354 gk20a_err(dev_from_gk20a(g),
355 "failed to force the clocks on\n");
356 WARN_ON(1);
357 }
358 }
359 if (rst_mask) {
360 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
361 } else {
362 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
363 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
364 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
365 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
366 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
367 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
368 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
369 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
370 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
371 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
372 }
373
374 /* we need to read the reset register *and* wait for a moment to ensure
375 * reset propagation */
376
377 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
378 udelay(20);
379
380 gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
381 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
382 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
383 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
384 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
385 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
386 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
387 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
388 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
389 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
390
 391	/* read back the reset register and then wait a small moment after that */
392 gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
393 udelay(20);
394
395 if (!tegra_platform_is_linsim()) {
396 /* Set power mode back to auto */
397 gk20a_writel(g, gr_fe_pwr_mode_r(),
398 gr_fe_pwr_mode_req_send_f() |
399 gr_fe_pwr_mode_mode_auto_f());
400
401 /* Wait for the request to complete */
402 end_jiffies = jiffies +
403 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
404 do {
405 reg = gk20a_readl(g, gr_fe_pwr_mode_r());
406
407 if (gr_fe_pwr_mode_req_v(reg) ==
408 gr_fe_pwr_mode_req_done_v())
409 break;
410
411 usleep_range(delay, delay * 2);
412 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
413
414 } while (time_before(jiffies, end_jiffies));
415
416 if (!time_before(jiffies, end_jiffies))
417 gk20a_warn(dev_from_gk20a(g),
418 "failed to set power mode to auto\n");
419 }
420
421 return 0;
422}
423
424static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
425 u32 *mailbox_ret, u32 opc_success,
426 u32 mailbox_ok, u32 opc_fail,
427 u32 mailbox_fail)
428{
429 unsigned long end_jiffies = jiffies +
430 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
431 u32 delay = GR_IDLE_CHECK_DEFAULT;
432 u32 check = WAIT_UCODE_LOOP;
433 u32 reg;
434
435 gk20a_dbg_fn("");
436
437 while (check == WAIT_UCODE_LOOP) {
438 if (!time_before(jiffies, end_jiffies) &&
439 tegra_platform_is_silicon())
440 check = WAIT_UCODE_TIMEOUT;
441
442 reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
443
444 if (mailbox_ret)
445 *mailbox_ret = reg;
446
447 switch (opc_success) {
448 case GR_IS_UCODE_OP_EQUAL:
449 if (reg == mailbox_ok)
450 check = WAIT_UCODE_OK;
451 break;
452 case GR_IS_UCODE_OP_NOT_EQUAL:
453 if (reg != mailbox_ok)
454 check = WAIT_UCODE_OK;
455 break;
456 case GR_IS_UCODE_OP_AND:
457 if (reg & mailbox_ok)
458 check = WAIT_UCODE_OK;
459 break;
460 case GR_IS_UCODE_OP_LESSER:
461 if (reg < mailbox_ok)
462 check = WAIT_UCODE_OK;
463 break;
464 case GR_IS_UCODE_OP_LESSER_EQUAL:
465 if (reg <= mailbox_ok)
466 check = WAIT_UCODE_OK;
467 break;
468 case GR_IS_UCODE_OP_SKIP:
 469			/* no success check */
470 break;
471 default:
472 gk20a_err(dev_from_gk20a(g),
473 "invalid success opcode 0x%x", opc_success);
474
475 check = WAIT_UCODE_ERROR;
476 break;
477 }
478
479 switch (opc_fail) {
480 case GR_IS_UCODE_OP_EQUAL:
481 if (reg == mailbox_fail)
482 check = WAIT_UCODE_ERROR;
483 break;
484 case GR_IS_UCODE_OP_NOT_EQUAL:
485 if (reg != mailbox_fail)
486 check = WAIT_UCODE_ERROR;
487 break;
488 case GR_IS_UCODE_OP_AND:
489 if (reg & mailbox_fail)
490 check = WAIT_UCODE_ERROR;
491 break;
492 case GR_IS_UCODE_OP_LESSER:
493 if (reg < mailbox_fail)
494 check = WAIT_UCODE_ERROR;
495 break;
496 case GR_IS_UCODE_OP_LESSER_EQUAL:
497 if (reg <= mailbox_fail)
498 check = WAIT_UCODE_ERROR;
499 break;
500 case GR_IS_UCODE_OP_SKIP:
 501			/* no check on fail */
502 break;
503 default:
504 gk20a_err(dev_from_gk20a(g),
505 "invalid fail opcode 0x%x", opc_fail);
506 check = WAIT_UCODE_ERROR;
507 break;
508 }
509
510 usleep_range(delay, delay * 2);
511 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
512 }
513
514 if (check == WAIT_UCODE_TIMEOUT) {
515 gk20a_err(dev_from_gk20a(g),
516 "timeout waiting on ucode response");
517 gk20a_fecs_dump_falcon_stats(g);
518 return -1;
519 } else if (check == WAIT_UCODE_ERROR) {
520 gk20a_err(dev_from_gk20a(g),
521 "ucode method failed on mailbox=%d value=0x%08x",
522 mailbox_id, reg);
523 gk20a_fecs_dump_falcon_stats(g);
524 return -1;
525 }
526
527 gk20a_dbg_fn("done");
528 return 0;
529}
530
 531/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...).
 532 * We should replace most, if not all, FECS method calls with this instead. */
533struct fecs_method_op_gk20a {
534 struct {
535 u32 addr;
536 u32 data;
537 } method;
538
539 struct {
540 u32 id;
541 u32 data;
542 u32 clr;
543 u32 *ret;
544 u32 ok;
545 u32 fail;
546 } mailbox;
547
548 struct {
549 u32 ok;
550 u32 fail;
551 } cond;
552
553};
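/*
 * Usage sketch (mirrors gr_gk20a_fecs_ctx_bind_channel() below); all names
 * come from this file:
 *
 *	ret = gr_gk20a_submit_fecs_method_op(g,
 *		(struct fecs_method_op_gk20a) {
 *			.method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
 *			.method.data = gr_fecs_current_ctx_ptr_f(inst_base_ptr),
 *			.mailbox = { .id = 0, .data = 0, .clr = 0x30,
 *				     .ret = NULL, .ok = 0x10, .fail = 0x20, },
 *			.cond.ok = GR_IS_UCODE_OP_AND,
 *			.cond.fail = GR_IS_UCODE_OP_AND });
 */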
554
555int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
556 struct fecs_method_op_gk20a op)
557{
558 struct gr_gk20a *gr = &g->gr;
559 int ret;
560
561 mutex_lock(&gr->fecs_mutex);
562
563 if (op.mailbox.id != 0)
564 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
565 op.mailbox.data);
566
567 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
568 gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
569
570 gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
571 gk20a_writel(g, gr_fecs_method_push_r(),
572 gr_fecs_method_push_adr_f(op.method.addr));
573
 574	/* op.mb.id == 4 cases report completion on mailbox 0, so wait
 575	 * there (op.mb.id == 0) instead */
576 if (op.mailbox.id == 4)
577 op.mailbox.id = 0;
578
579 ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
580 op.cond.ok, op.mailbox.ok,
581 op.cond.fail, op.mailbox.fail);
582
583 mutex_unlock(&gr->fecs_mutex);
584
585 return ret;
586}
587
588int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
589{
590 return gr_gk20a_submit_fecs_method_op(g,
591 (struct fecs_method_op_gk20a) {
592 .method.addr = fecs_method,
593 .method.data = ~0,
594 .mailbox = { .id = 1, /*sideband?*/
595 .data = ~0, .clr = ~0, .ret = ret,
596 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
597 .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
598 .cond.ok = GR_IS_UCODE_OP_EQUAL,
599 .cond.fail = GR_IS_UCODE_OP_EQUAL });
600}
601
602/* Stop processing (stall) context switches at FECS.
 603 * The caller must hold the dbg_sessions_lock, else if multiple stop methods
604 * are sent to the ucode in sequence, it can get into an undefined state. */
605int gr_gk20a_disable_ctxsw(struct gk20a *g)
606{
607 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
608 return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
609}
610
611/* Start processing (continue) context switches at FECS */
612int gr_gk20a_enable_ctxsw(struct gk20a *g)
613{
614 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
615 return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
616}
617
618
619static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
620{
621 u32 addr_lo;
622 u32 addr_hi;
623 void *inst_ptr = NULL;
624
625 gk20a_dbg_fn("");
626
627 /* flush gpu_va before commit */
628 gk20a_mm_fb_flush(c->g);
629 gk20a_mm_l2_flush(c->g, true);
630
631 inst_ptr = c->inst_block.cpuva;
632 if (!inst_ptr)
633 return -ENOMEM;
634
635 addr_lo = u64_lo32(gpu_va) >> 12;
636 addr_hi = u64_hi32(gpu_va);
637
638 gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
639 ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
640 ram_in_gr_wfi_ptr_lo_f(addr_lo));
641
642 gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
643 ram_in_gr_wfi_ptr_hi_f(addr_hi));
644
645 gk20a_mm_l2_invalidate(c->g);
646
647 return 0;
648}
649
650/*
651 * Context state can be written directly or "patched" at times.
 652 * So that code can be used in either situation, it is written
 653 * using a series of _ctx_patch_write(..., patch) statements.
 654 * However, any necessary cpu map/unmap and gpu l2 invalidates
 655 * should be minimized (to avoid doing them once per patch write).
 656 * Before a sequence of these, set up with "_ctx_patch_write_begin"
 657 * and close with "_ctx_patch_write_end."
658 */
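/*
 * Typical sequence (sketch, as used by the commit_global_* helpers below):
 *
 *	err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
 *	if (err)
 *		return err;
 *	gr_gk20a_ctx_patch_write(g, ch_ctx, addr, data, true);
 *	...
 *	gr_gk20a_ctx_patch_write_end(g, ch_ctx);
 */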
659int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
660 struct channel_ctx_gk20a *ch_ctx)
661{
662 /* being defensive still... */
663 if (ch_ctx->patch_ctx.cpu_va) {
664 gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
665 return -EBUSY;
666 }
667
668 ch_ctx->patch_ctx.cpu_va = vmap(ch_ctx->patch_ctx.pages,
669 PAGE_ALIGN(ch_ctx->patch_ctx.size) >> PAGE_SHIFT,
670 0, pgprot_dmacoherent(PAGE_KERNEL));
671
672 if (!ch_ctx->patch_ctx.cpu_va)
673 return -ENOMEM;
674
675 return 0;
676}
677
678int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
679 struct channel_ctx_gk20a *ch_ctx)
680{
681 /* being defensive still... */
682 if (!ch_ctx->patch_ctx.cpu_va) {
683 gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
684 return -EINVAL;
685 }
686
687 vunmap(ch_ctx->patch_ctx.cpu_va);
688 ch_ctx->patch_ctx.cpu_va = NULL;
689
690 gk20a_mm_l2_invalidate(g);
691 return 0;
692}
693
694int gr_gk20a_ctx_patch_write(struct gk20a *g,
695 struct channel_ctx_gk20a *ch_ctx,
696 u32 addr, u32 data, bool patch)
697{
698 u32 patch_slot = 0;
699 void *patch_ptr = NULL;
700 bool mapped_here = false;
701
702 BUG_ON(patch != 0 && ch_ctx == NULL);
703
704 if (patch) {
705 if (!ch_ctx)
706 return -EINVAL;
707 /* we added an optimization prolog, epilog
708 * to get rid of unnecessary maps and l2 invals.
709 * but be defensive still... */
710 if (!ch_ctx->patch_ctx.cpu_va) {
711 int err;
712 gk20a_err(dev_from_gk20a(g),
713 "per-write ctx patch begin?");
714 /* yes, gr_gk20a_ctx_patch_smpc causes this one */
715 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
716 if (err)
717 return err;
718 mapped_here = true;
719 } else
720 mapped_here = false;
721
722 patch_ptr = ch_ctx->patch_ctx.cpu_va;
723 patch_slot = ch_ctx->patch_ctx.data_count * 2;
724
725 gk20a_mem_wr32(patch_ptr, patch_slot++, addr);
726 gk20a_mem_wr32(patch_ptr, patch_slot++, data);
727
728 ch_ctx->patch_ctx.data_count++;
729
730 if (mapped_here)
731 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
732
733 } else
734 gk20a_writel(g, addr, data);
735
736 return 0;
737}
738
739static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
740 struct channel_gk20a *c)
741{
742 u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
743 >> ram_in_base_shift_v());
744 u32 ret;
745
746 gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
747 c->hw_chid, inst_base_ptr);
748
749 ret = gr_gk20a_submit_fecs_method_op(g,
750 (struct fecs_method_op_gk20a) {
751 .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
752 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
753 gr_fecs_current_ctx_target_vid_mem_f() |
754 gr_fecs_current_ctx_valid_f(1)),
755 .mailbox = { .id = 0, .data = 0,
756 .clr = 0x30,
757 .ret = NULL,
758 .ok = 0x10,
759 .fail = 0x20, },
760 .cond.ok = GR_IS_UCODE_OP_AND,
761 .cond.fail = GR_IS_UCODE_OP_AND});
762 if (ret)
763 gk20a_err(dev_from_gk20a(g),
764 "bind channel instance failed");
765
766 return ret;
767}
768
769static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
770 bool disable_fifo)
771{
772 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
773 struct fifo_gk20a *f = &g->fifo;
774 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
775 u32 va_lo, va_hi, va;
776 int ret = 0;
777 void *ctx_ptr = NULL;
778
779 gk20a_dbg_fn("");
780
781 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
782 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
783 0, pgprot_dmacoherent(PAGE_KERNEL));
784 if (!ctx_ptr)
785 return -ENOMEM;
786
787 if (ch_ctx->zcull_ctx.gpu_va == 0 &&
788 ch_ctx->zcull_ctx.ctx_sw_mode ==
789 ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
790 ret = -EINVAL;
791 goto clean_up;
792 }
793
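	/*
	 * The zcull buffer VA is 256-byte aligned; pack VA bits 39:8 into a
	 * single 32-bit word for the ctxsw header write below.
	 */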
794 va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
795 va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
796 va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
797
798 if (disable_fifo) {
799 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
800 if (ret) {
801 gk20a_err(dev_from_gk20a(g),
802 "failed to disable gr engine activity\n");
803 goto clean_up;
804 }
805 }
806
807 /* Channel gr_ctx buffer is gpu cacheable.
808 Flush and invalidate before cpu update. */
809 gk20a_mm_fb_flush(g);
810 gk20a_mm_l2_flush(g, true);
811
812 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
813 ch_ctx->zcull_ctx.ctx_sw_mode);
814
815 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
816
817 if (disable_fifo) {
818 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
819 if (ret) {
820 gk20a_err(dev_from_gk20a(g),
821 "failed to enable gr engine activity\n");
822 goto clean_up;
823 }
824 }
825 gk20a_mm_l2_invalidate(g);
826
827clean_up:
828 vunmap(ctx_ptr);
829
830 return ret;
831}
832
833static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
834 struct channel_gk20a *c, bool patch)
835{
836 struct gr_gk20a *gr = &g->gr;
837 struct channel_ctx_gk20a *ch_ctx = NULL;
838 u32 attrib_offset_in_chunk = 0;
839 u32 alpha_offset_in_chunk = 0;
840 u32 pd_ab_max_output;
841 u32 gpc_index, ppc_index;
842 u32 temp;
843 u32 cbm_cfg_size1, cbm_cfg_size2;
844
845 gk20a_dbg_fn("");
846
847 if (patch) {
848 int err;
849 ch_ctx = &c->ch_ctx;
850 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
851 if (err)
852 return err;
853 }
854
855 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
856 gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
857 gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
858 patch);
859
860 pd_ab_max_output = (gr->alpha_cb_default_size *
861 gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
862 gr_pd_ab_dist_cfg1_max_output_granularity_v();
863
864 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
865 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
866 gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
867
868 alpha_offset_in_chunk = attrib_offset_in_chunk +
869 gr->tpc_count * gr->attrib_cb_size;
870
871 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
872 temp = proj_gpc_stride_v() * gpc_index;
873 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
874 ppc_index++) {
875 cbm_cfg_size1 = gr->attrib_cb_default_size *
876 gr->pes_tpc_count[ppc_index][gpc_index];
877 cbm_cfg_size2 = gr->alpha_cb_default_size *
878 gr->pes_tpc_count[ppc_index][gpc_index];
879
880 gr_gk20a_ctx_patch_write(g, ch_ctx,
881 gr_gpc0_ppc0_cbm_cfg_r() + temp +
882 proj_ppc_in_gpc_stride_v() * ppc_index,
883 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
884 gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
885 gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
886
887 attrib_offset_in_chunk += gr->attrib_cb_size *
888 gr->pes_tpc_count[ppc_index][gpc_index];
889
890 gr_gk20a_ctx_patch_write(g, ch_ctx,
891 gr_gpc0_ppc0_cbm_cfg2_r() + temp +
892 proj_ppc_in_gpc_stride_v() * ppc_index,
893 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
894 gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
895
896 alpha_offset_in_chunk += gr->alpha_cb_size *
897 gr->pes_tpc_count[ppc_index][gpc_index];
898 }
899 }
900
901 if (patch)
902 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
903
904 return 0;
905}
906
907static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
908 struct channel_gk20a *c, bool patch)
909{
910 struct gr_gk20a *gr = &g->gr;
911 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
912 u64 addr;
913 u32 size;
914
915 gk20a_dbg_fn("");
916 if (patch) {
917 int err;
918 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
919 if (err)
920 return err;
921 }
922
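	/* Each global buffer VA below is packed into 32 bits holding VA bits
	 * 39:N, where N is that buffer's alignment (8 or 12 bits). */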
923 /* global pagepool buffer */
924 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
925 gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
926 (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
927 (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
928
929 size = gr->global_ctx_buffer[PAGEPOOL].size /
930 gr_scc_pagepool_total_pages_byte_granularity_v();
931
932 if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
933 size = gr_scc_pagepool_total_pages_hwmax_v();
934
935 gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
936 addr, size);
937
938 g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);
939
940 /* global bundle cb */
941 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
942 gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
943 (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
944 (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
945
946 size = gr->bundle_cb_default_size;
947
948 gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
949 addr, size);
950
951 g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);
952
953 /* global attrib cb */
954 addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
955 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
956 (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
957 (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
958
959 gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
960 g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);
961
962 if (patch)
963 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
964
965 return 0;
966}
967
968static void gr_gk20a_commit_global_attrib_cb(struct gk20a *g,
969 struct channel_ctx_gk20a *ch_ctx,
970 u64 addr, bool patch)
971{
972 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
973 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
974 gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
975
976 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
977 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
978 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
979}
980
981static void gr_gk20a_commit_global_bundle_cb(struct gk20a *g,
982 struct channel_ctx_gk20a *ch_ctx,
983 u64 addr, u64 size, bool patch)
984{
985 u32 data;
986
987 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
988 gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
989
990 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
991 gr_scc_bundle_cb_size_div_256b_f(size) |
992 gr_scc_bundle_cb_size_valid_true_f(), patch);
993
994 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
995 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
996
997 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
998 gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
999 gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
1000
1001 /* data for state_limit */
1002 data = (g->gr.bundle_cb_default_size *
1003 gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
1004 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
1005
1006 data = min_t(u32, data, g->gr.min_gpm_fifo_depth);
1007
1008 gk20a_dbg_info("bundle cb token limit : %d, state limit : %d",
1009 g->gr.bundle_cb_token_limit, data);
1010
1011 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
1012 gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) |
1013 gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
1014
1015}
1016
1017static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
1018{
1019 struct gr_gk20a *gr = &g->gr;
1020 struct channel_ctx_gk20a *ch_ctx = NULL;
1021 u32 gpm_pd_cfg;
1022 u32 pd_ab_dist_cfg0;
1023 u32 ds_debug;
1024 u32 mpc_vtg_debug;
1025 u32 pe_vaf;
1026 u32 pe_vsc_vpc;
1027
1028 gk20a_dbg_fn("");
1029
1030 gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
1031 pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
1032 ds_debug = gk20a_readl(g, gr_ds_debug_r());
1033 mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
1034
1035 if (patch) {
1036 int err;
1037 ch_ctx = &c->ch_ctx;
1038 err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
1039 if (err)
1040 return err;
1041 }
1042
1043 if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
1044 pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
1045 pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
1046
1047 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
1048 pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
1049 pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
1050 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
1051 ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
1052 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
1053
1054 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1055 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
1056 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
1057 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1058 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1059 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1060 } else {
1061 gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
1062 pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
1063 ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
1064 mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
1065
1066 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
1067 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
1068 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
1069 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
1070 }
1071
1072 if (patch)
1073 gr_gk20a_ctx_patch_write_end(g, ch_ctx);
1074
1075 return 0;
1076}
1077
1078int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
1079{
1080 u32 norm_entries, norm_shift;
1081 u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
1082 u32 map0, map1, map2, map3, map4, map5;
1083
1084 if (!gr->map_tiles)
1085 return -1;
1086
1087 gk20a_dbg_fn("");
1088
1089 gk20a_writel(g, gr_crstr_map_table_cfg_r(),
1090 gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
1091 gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
1092
1093 map0 = gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
1094 gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
1095 gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
1096 gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
1097 gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
1098 gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
1099
1100 map1 = gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
1101 gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
1102 gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
1103 gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
1104 gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
1105 gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
1106
1107 map2 = gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
1108 gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
1109 gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
1110 gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
1111 gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
1112 gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
1113
1114 map3 = gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
1115 gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
1116 gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
1117 gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
1118 gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
1119 gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
1120
1121 map4 = gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
1122 gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
1123 gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
1124 gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
1125 gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
1126 gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
1127
1128 map5 = gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
1129 gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
1130 gr_crstr_gpc_map5_tile32_f(0) |
1131 gr_crstr_gpc_map5_tile33_f(0) |
1132 gr_crstr_gpc_map5_tile34_f(0) |
1133 gr_crstr_gpc_map5_tile35_f(0);
1134
1135 gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
1136 gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
1137 gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
1138 gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
1139 gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
1140 gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
1141
1142 switch (gr->tpc_count) {
1143 case 1:
1144 norm_shift = 4;
1145 break;
1146 case 2:
1147 case 3:
1148 norm_shift = 3;
1149 break;
1150 case 4:
1151 case 5:
1152 case 6:
1153 case 7:
1154 norm_shift = 2;
1155 break;
1156 case 8:
1157 case 9:
1158 case 10:
1159 case 11:
1160 case 12:
1161 case 13:
1162 case 14:
1163 case 15:
1164 norm_shift = 1;
1165 break;
1166 default:
1167 norm_shift = 0;
1168 break;
1169 }
1170
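	/*
	 * Precompute 2^5 .. 2^11 modulo the normalized entry count; these
	 * coefficients feed the WWDX map table config registers below.
	 */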
1171 norm_entries = gr->tpc_count << norm_shift;
1172 coeff5_mod = (1 << 5) % norm_entries;
1173 coeff6_mod = (1 << 6) % norm_entries;
1174 coeff7_mod = (1 << 7) % norm_entries;
1175 coeff8_mod = (1 << 8) % norm_entries;
1176 coeff9_mod = (1 << 9) % norm_entries;
1177 coeff10_mod = (1 << 10) % norm_entries;
1178 coeff11_mod = (1 << 11) % norm_entries;
1179
1180 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
1181 gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
1182 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
1183 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
1184 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
1185 gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
1186
1187 gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
1188 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
1189 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
1190 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
1191 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
1192 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
1193 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
1194
1195 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
1196 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
1197 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
1198 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
1199 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
1200 gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
1201
1202 gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
1203 gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
1204 gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
1205
1206 gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
1207 gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
1208 gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
1209 gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
1210 gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
1211 gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
1212
1213 return 0;
1214}
1215
1216static inline u32 count_bits(u32 mask)
1217{
1218 u32 temp = mask;
1219 u32 count;
1220 for (count = 0; temp != 0; count++)
1221 temp &= temp - 1;
1222
1223 return count;
1224}
1225
1226static inline u32 clear_count_bits(u32 num, u32 clear_count)
1227{
1228 u32 count = clear_count;
1229 for (; (num != 0) && (count != 0); count--)
1230 num &= num - 1;
1231
1232 return num;
1233}
1234
1235static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
1236 struct gr_gk20a *gr)
1237{
1238 u32 table_index_bits = 5;
1239 u32 rows = (1 << table_index_bits);
1240 u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
1241
1242 u32 row;
1243 u32 index;
1244 u32 gpc_index;
1245 u32 gpcs_per_reg = 4;
1246 u32 pes_index;
1247 u32 tpc_count_pes;
1248 u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
1249
1250 u32 alpha_target, beta_target;
1251 u32 alpha_bits, beta_bits;
1252 u32 alpha_mask, beta_mask, partial_mask;
1253 u32 reg_offset;
1254 bool assign_alpha;
1255
1256 u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
1257 u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
1258 u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
1259
1260 gk20a_dbg_fn("");
1261
1262 memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1263 memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1264 memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
1265
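	/*
	 * Each of the 32 table rows corresponds to one alpha:beta split of
	 * the TPCs. For every row, walk the PES TPC masks of each GPC and
	 * partition them into an alpha mask and a beta mask, alternating
	 * which side takes the next PES so the split stays roughly balanced.
	 */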
1266 for (row = 0; row < rows; ++row) {
1267 alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
1268 beta_target = gr->tpc_count - alpha_target;
1269
1270 assign_alpha = (alpha_target < beta_target);
1271
1272 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1273 reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
1274 alpha_mask = beta_mask = 0;
1275
1276 for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
1277 tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
1278
1279 if (assign_alpha) {
1280 alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
1281 beta_bits = tpc_count_pes - alpha_bits;
1282 } else {
1283 beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
1284 alpha_bits = tpc_count_pes - beta_bits;
1285 }
1286
1287 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
1288 partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
1289 alpha_mask |= partial_mask;
1290
1291 partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
1292 beta_mask |= partial_mask;
1293
1294 alpha_target -= min(alpha_bits, alpha_target);
1295 beta_target -= min(beta_bits, beta_target);
1296
1297 if ((alpha_bits > 0) || (beta_bits > 0))
1298 assign_alpha = !assign_alpha;
1299 }
1300
1301 switch (gpc_index % gpcs_per_reg) {
1302 case 0:
1303 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
1304 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
1305 break;
1306 case 1:
1307 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
1308 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
1309 break;
1310 case 2:
1311 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
1312 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
1313 break;
1314 case 3:
1315 map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
1316 map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
1317 break;
1318 }
1319 map_reg_used[reg_offset] = true;
1320 }
1321 }
1322
1323 for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
1324 if (map_reg_used[index]) {
1325 gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
1326 gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
1327 }
1328 }
1329
1330 return 0;
1331}
1332
1333static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
1334{
1335 struct gr_gk20a *gr = &g->gr;
1336 u32 tpc_index, gpc_index;
1337 u32 tpc_offset, gpc_offset;
1338 u32 sm_id = 0, gpc_id = 0;
1339 u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
1340 u32 tpc_per_gpc;
1341 u32 max_ways_evict = INVALID_MAX_WAYS;
1342 u32 l1c_dbg_reg_val;
1343
1344 gk20a_dbg_fn("");
1345
1346 for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
1347 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
1348 gpc_offset = proj_gpc_stride_v() * gpc_index;
1349 if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
1350 tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
1351
1352 gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
1353 gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
1354 gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
1355 gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
1356 gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
1357 gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
1358 gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
1359 gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
1360
1361 sm_id_to_gpc_id[sm_id] = gpc_index;
1362 sm_id++;
1363 }
1364
1365 gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
1366 gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1367 gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
1368 gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
1369 }
1370 }
1371
1372 for (tpc_index = 0, gpc_id = 0;
1373 tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
1374 tpc_index++, gpc_id += 8) {
1375
1376 if (gpc_id >= gr->gpc_count)
1377 gpc_id = 0;
1378
1379 tpc_per_gpc =
1380 gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
1381 gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
1382 gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
1383 gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
1384 gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
1385 gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
1386 gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
1387 gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
1388
1389 gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1390 gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
1391 }
1392
1393 /* gr__setup_pd_mapping stubbed for gk20a */
1394 gr_gk20a_setup_rop_mapping(g, gr);
1395 if (g->ops.gr.setup_alpha_beta_tables)
1396 g->ops.gr.setup_alpha_beta_tables(g, gr);
1397
1398 if (gr->num_fbps == 1)
1399 max_ways_evict = 9;
1400
1401 if (max_ways_evict != INVALID_MAX_WAYS)
1402 g->ops.ltc.set_max_ways_evict_last(g, max_ways_evict);
1403
1404 for (gpc_index = 0;
1405 gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
1406 gpc_index += 4) {
1407
1408 gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
 1409		gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
 1410		gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
 1411		gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
 1412		gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
1413 }
1414
1415 gk20a_writel(g, gr_cwd_fs_r(),
1416 gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
1417 gr_cwd_fs_num_tpcs_f(gr->tpc_count));
1418
1419 gk20a_writel(g, gr_bes_zrop_settings_r(),
1420 gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
1421 gk20a_writel(g, gr_bes_crop_settings_r(),
1422 gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
1423
1424 /* turn on cya15 bit for a default val that missed the cut */
1425 l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
1426 l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
1427 gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
1428
1429 return 0;
1430}
1431
1432static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
1433{
1434 struct gk20a *g = c->g;
1435 int ret;
1436
1437 u32 inst_base_ptr =
1438 u64_lo32(c->inst_block.cpu_pa
1439 >> ram_in_base_shift_v());
1440
1441
1442 gk20a_dbg_fn("");
1443
1444 ret = gr_gk20a_submit_fecs_method_op(g,
1445 (struct fecs_method_op_gk20a) {
1446 .method.addr = save_type,
1447 .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1448 gr_fecs_current_ctx_target_vid_mem_f() |
1449 gr_fecs_current_ctx_valid_f(1)),
1450 .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
1451 .ok = 1, .fail = 2,
1452 },
1453 .cond.ok = GR_IS_UCODE_OP_AND,
1454 .cond.fail = GR_IS_UCODE_OP_AND,
1455 });
1456
1457 if (ret)
1458 gk20a_err(dev_from_gk20a(g), "save context image failed");
1459
1460 return ret;
1461}
1462
1463static u32 gk20a_init_sw_bundle(struct gk20a *g)
1464{
1465 struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
1466 u32 last_bundle_data = 0;
1467 u32 err = 0;
1468 int i;
1469 unsigned long end_jiffies = jiffies +
1470 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
1471
1472 /* enable pipe mode override */
1473 gk20a_writel(g, gr_pipe_bundle_config_r(),
1474 gr_pipe_bundle_config_override_pipe_mode_enabled_f());
1475
1476 /* load bundle init */
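	/* Write each (addr, value) pair from the netlist, skipping redundant
	 * writes of a repeated data value and waiting for idle whenever the
	 * GO_IDLE bundle address is issued. */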
1477 for (i = 0; i < sw_bundle_init->count; i++) {
1478
1479 if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
1480 gk20a_writel(g, gr_pipe_bundle_data_r(),
1481 sw_bundle_init->l[i].value);
1482 last_bundle_data = sw_bundle_init->l[i].value;
1483 }
1484
1485 gk20a_writel(g, gr_pipe_bundle_address_r(),
1486 sw_bundle_init->l[i].addr);
1487
1488 if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
1489 GR_GO_IDLE_BUNDLE)
1490 err |= gr_gk20a_wait_idle(g, end_jiffies,
1491 GR_IDLE_CHECK_DEFAULT);
1492 }
1493
1494 /* disable pipe mode override */
1495 gk20a_writel(g, gr_pipe_bundle_config_r(),
1496 gr_pipe_bundle_config_override_pipe_mode_disabled_f());
1497
1498 return err;
1499}
1500
1501/* init global golden image from a fresh gr_ctx in channel ctx.
1502 save a copy in local_golden_image in ctx_vars */
1503static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
1504 struct channel_gk20a *c)
1505{
1506 struct gr_gk20a *gr = &g->gr;
1507 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1508 u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
1509 u32 ctx_header_words;
1510 u32 i;
1511 u32 data;
1512 void *ctx_ptr = NULL;
1513 void *gold_ptr = NULL;
1514 u32 err = 0;
1515
1516 gk20a_dbg_fn("");
1517
 1518	/* golden ctx is global to all channels. Although only the first
 1519	   channel initializes the golden image, the driver needs to prevent
 1520	   multiple channels from initializing golden ctx at the same time */
1521 mutex_lock(&gr->ctx_mutex);
1522
1523 if (gr->ctx_vars.golden_image_initialized)
1524 goto clean_up;
1525
1526 err = gr_gk20a_fecs_ctx_bind_channel(g, c);
1527 if (err)
1528 goto clean_up;
1529
1530 err = gk20a_init_sw_bundle(g);
1531 if (err)
1532 goto clean_up;
1533
1534 err = gr_gk20a_elpg_protected_call(g,
1535 gr_gk20a_commit_global_ctx_buffers(g, c, false));
1536 if (err)
1537 goto clean_up;
1538
1539 gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].pages,
1540 PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].size) >>
1541 PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
1542 if (!gold_ptr)
1543 goto clean_up;
1544
1545 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1546 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1547 0, pgprot_dmacoherent(PAGE_KERNEL));
1548 if (!ctx_ptr)
1549 goto clean_up;
1550
1551 ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
1552 ctx_header_words >>= 2;
1553
1554 /* Channel gr_ctx buffer is gpu cacheable.
1555 Flush before cpu read. */
1556 gk20a_mm_fb_flush(g);
1557 gk20a_mm_l2_flush(g, false);
1558
1559 for (i = 0; i < ctx_header_words; i++) {
1560 data = gk20a_mem_rd32(ctx_ptr, i);
1561 gk20a_mem_wr32(gold_ptr, i, data);
1562 }
1563
1564 gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
1565 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
1566
1567 gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
1568
1569 gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
1570
1571 gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
1572
1573 if (gr->ctx_vars.local_golden_image == NULL) {
1574
1575 gr->ctx_vars.local_golden_image =
1576 kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
1577
1578 if (gr->ctx_vars.local_golden_image == NULL) {
1579 err = -ENOMEM;
1580 goto clean_up;
1581 }
1582
1583 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1584 gr->ctx_vars.local_golden_image[i] =
1585 gk20a_mem_rd32(gold_ptr, i);
1586 }
1587
1588 gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
1589
1590 gr->ctx_vars.golden_image_initialized = true;
1591
1592 gk20a_mm_l2_invalidate(g);
1593
1594 gk20a_writel(g, gr_fecs_current_ctx_r(),
1595 gr_fecs_current_ctx_valid_false_f());
1596
1597clean_up:
1598 if (err)
1599 gk20a_err(dev_from_gk20a(g), "fail");
1600 else
1601 gk20a_dbg_fn("done");
1602
1603 if (gold_ptr)
1604 vunmap(gold_ptr);
1605 if (ctx_ptr)
1606 vunmap(ctx_ptr);
1607
1608 mutex_unlock(&gr->ctx_mutex);
1609 return err;
1610}
1611
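/* Toggle the SMPC mode bit in the channel's gr_ctx image so that SM
 * performance counter state is (or is not) saved and restored on context
 * switch. The gr_ctx is flushed from FB/L2 before the CPU update. */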
1612int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
1613 struct channel_gk20a *c,
1614 bool enable_smpc_ctxsw)
1615{
1616 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1617 void *ctx_ptr = NULL;
1618 u32 data;
1619
1620 /*XXX caller responsible for making sure the channel is quiesced? */
1621
1622 /* Channel gr_ctx buffer is gpu cacheable.
1623 Flush and invalidate before cpu update. */
1624 gk20a_mm_fb_flush(g);
1625 gk20a_mm_l2_flush(g, true);
1626
1627 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1628 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1629 0, pgprot_dmacoherent(PAGE_KERNEL));
1630 if (!ctx_ptr)
1631 return -ENOMEM;
1632
1633 data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1634 data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
1635 data |= enable_smpc_ctxsw ?
1636 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
1637 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
1638 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1639 data);
1640
1641 vunmap(ctx_ptr);
1642
1643 gk20a_mm_l2_invalidate(g);
1644
1645 return 0;
1646}
1647
1648/* load the saved copy of the golden image into the channel gr_ctx */
1649static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
1650 struct channel_gk20a *c)
1651{
1652 struct gr_gk20a *gr = &g->gr;
1653 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
1654 u32 virt_addr_lo;
1655 u32 virt_addr_hi;
1656 u32 i, v, data;
1657 int ret = 0;
1658 void *ctx_ptr = NULL;
1659
1660 gk20a_dbg_fn("");
1661
1662 if (gr->ctx_vars.local_golden_image == NULL)
1663 return -1;
1664
1665 /* Channel gr_ctx buffer is gpu cacheable.
1666 Flush and invalidate before cpu update. */
1667 gk20a_mm_fb_flush(g);
1668 gk20a_mm_l2_flush(g, true);
1669
1670 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
1671 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
1672 0, pgprot_dmacoherent(PAGE_KERNEL));
1673 if (!ctx_ptr)
1674 return -ENOMEM;
1675
1676 for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
1677 gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
1678
1679 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
1680 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
1681
1682 virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
1683 virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
1684
1685 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
1686 ch_ctx->patch_ctx.data_count);
1687 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
1688 virt_addr_lo);
1689 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
1690 virt_addr_hi);
1691
1692	/* no users of the client-managed performance counter ctx */
1693 ch_ctx->pm_ctx.ctx_sw_mode =
1694 ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
1695 data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
1696 data = data & ~ctxsw_prog_main_image_pm_mode_m();
1697 data |= ch_ctx->pm_ctx.ctx_sw_mode;
1698 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
1699 data);
1700
1701 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
1702
1703 /* set priv access map */
1704 virt_addr_lo =
1705 u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1706 virt_addr_hi =
1707 u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
1708
1709 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
1710 ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
1711 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
1712 virt_addr_lo);
1713 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
1714 virt_addr_hi);
1715 /* disable verif features */
1716 v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
1717 v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
1718 v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
1719 gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
1720
1721
1722 vunmap(ctx_ptr);
1723
1724 gk20a_mm_l2_invalidate(g);
1725
1726 if (tegra_platform_is_linsim()) {
1727 u32 inst_base_ptr =
1728 u64_lo32(c->inst_block.cpu_pa
1729 >> ram_in_base_shift_v());
1730
1731 ret = gr_gk20a_submit_fecs_method_op(g,
1732 (struct fecs_method_op_gk20a) {
1733 .method.data =
1734 (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
1735 gr_fecs_current_ctx_target_vid_mem_f() |
1736 gr_fecs_current_ctx_valid_f(1)),
1737 .method.addr =
1738 gr_fecs_method_push_adr_restore_golden_v(),
1739 .mailbox = {
1740 .id = 0, .data = 0,
1741 .clr = ~0, .ret = NULL,
1742 .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
1743 .fail = 0},
1744 .cond.ok = GR_IS_UCODE_OP_EQUAL,
1745 .cond.fail = GR_IS_UCODE_OP_SKIP});
1746
1747 if (ret)
1748 gk20a_err(dev_from_gk20a(g),
1749 "restore context image failed");
1750 }
1751
1752 return ret;
1753}
1754
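/* Start the FECS and GPCCS falcons: clear the FECS mailbox, drop the
 * require-ctx bit on both DMA controllers and start both falcon CPUs. */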
1755static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
1756{
1757 gk20a_dbg_fn("");
1758
1759 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
1760 gr_fecs_ctxsw_mailbox_clear_value_f(~0));
1761
1762 gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
1763 gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
1764
1765 gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
1766 gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
1767
1768 gk20a_dbg_fn("done");
1769}
1770
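/* Build the instance block used while bootstrapping the ctxsw falcons:
 * allocate it, program the VA limit and page directory base from the PMU
 * VM, and map the ucode surface read-only into that VM. */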
1771static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
1772{
1773 struct mm_gk20a *mm = &g->mm;
1774 struct vm_gk20a *vm = &mm->pmu.vm;
1775 struct device *d = dev_from_gk20a(g);
1776 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1777 void *inst_ptr;
1778 u32 pde_addr_lo;
1779 u32 pde_addr_hi;
1780 u64 pde_addr;
1781 dma_addr_t iova;
1782
1783	/* Allocate memory for the instance block */
1784 ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
1785 ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
1786 ucode_info->inst_blk_desc.size,
1787 &iova,
1788 GFP_KERNEL);
1789 if (!ucode_info->inst_blk_desc.cpuva) {
1790 gk20a_err(d, "failed to allocate memory\n");
1791 return -ENOMEM;
1792 }
1793
1794 ucode_info->inst_blk_desc.iova = iova;
1795 ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
1796 ucode_info->inst_blk_desc.iova);
1797
1798 inst_ptr = ucode_info->inst_blk_desc.cpuva;
1799
1800 /* Set inst block */
1801 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
1802 u64_lo32(vm->va_limit) | 0xFFF);
1803 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
1804 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
1805
1806 pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
1807 pde_addr_lo = u64_lo32(pde_addr >> 12);
1808 pde_addr_hi = u64_hi32(pde_addr);
1809 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
1810 ram_in_page_dir_base_target_vid_mem_f() |
1811 ram_in_page_dir_base_vol_true_f() |
1812 ram_in_page_dir_base_lo_f(pde_addr_lo));
1813 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
1814 ram_in_page_dir_base_hi_f(pde_addr_hi));
1815
1816 /* Map ucode surface to GMMU */
1817 ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
1818 &ucode_info->surface_desc.sgt,
1819 ucode_info->surface_desc.size,
1820 0, /* flags */
1821 gk20a_mem_flag_read_only);
1822 if (!ucode_info->ucode_gpuva) {
1823 gk20a_err(d, "failed to update gmmu ptes\n");
1824 return -ENOMEM;
1825 }
1826
1827 return 0;
1828}
1829
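/* Place one ucode segment at the current offset and advance the offset to
 * the next BLK_SIZE-aligned boundary. */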
1830static void gr_gk20a_init_ctxsw_ucode_segment(
1831 struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
1832{
1833 p_seg->offset = *offset;
1834 p_seg->size = size;
1835 *offset = ALIGN(*offset + size, BLK_SIZE);
1836}
1837
1838static void gr_gk20a_init_ctxsw_ucode_segments(
1839 struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
1840 struct gk20a_ctxsw_bootloader_desc *bootdesc,
1841 u32 code_size, u32 data_size)
1842{
1843 u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
1844 segments->boot_entry = bootdesc->entry_point;
1845 segments->boot_imem_offset = bootdesc->imem_offset;
1846 gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
1847 gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
1848 gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
1849}
1850
1851static int gr_gk20a_copy_ctxsw_ucode_segments(
1852 u8 *buf,
1853 struct gk20a_ctxsw_ucode_segments *segments,
1854 u32 *bootimage,
1855 u32 *code, u32 *data)
1856{
1857 memcpy(buf + segments->boot.offset, bootimage, segments->boot.size);
1858 memcpy(buf + segments->code.offset, code, segments->code.size);
1859 memcpy(buf + segments->data.offset, data, segments->data.size);
1860 return 0;
1861}
1862
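/* Load the FECS and GPCCS firmware blobs, lay out their boot/code/data
 * segments in a single DMA surface, copy the images into it and map the
 * surface into the ucode virtual address space. */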
1863static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
1864{
1865 struct device *d = dev_from_gk20a(g);
1866 struct mm_gk20a *mm = &g->mm;
1867 struct vm_gk20a *vm = &mm->pmu.vm;
1868 struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
1869 struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
1870 const struct firmware *fecs_fw;
1871 const struct firmware *gpccs_fw;
1872 u32 *fecs_boot_image;
1873 u32 *gpccs_boot_image;
1874 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1875 u8 *buf;
1876 u32 ucode_size;
1877 int err = 0;
1878 dma_addr_t iova;
1879 DEFINE_DMA_ATTRS(attrs);
1880
1881 fecs_fw = gk20a_request_firmware(g, GK20A_FECS_UCODE_IMAGE);
1882 if (!fecs_fw) {
1883 gk20a_err(d, "failed to load fecs ucode!!");
1884 return -ENOENT;
1885 }
1886
1887 fecs_boot_desc = (void *)fecs_fw->data;
1888 fecs_boot_image = (void *)(fecs_fw->data +
1889 sizeof(struct gk20a_ctxsw_bootloader_desc));
1890
1891 gpccs_fw = gk20a_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE);
1892 if (!gpccs_fw) {
1893 release_firmware(fecs_fw);
1894 gk20a_err(d, "failed to load gpccs ucode!!");
1895 return -ENOENT;
1896 }
1897
1898 gpccs_boot_desc = (void *)gpccs_fw->data;
1899 gpccs_boot_image = (void *)(gpccs_fw->data +
1900 sizeof(struct gk20a_ctxsw_bootloader_desc));
1901
1902 ucode_size = 0;
1903 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
1904 fecs_boot_desc,
1905 g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
1906 g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
1907 gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
1908 gpccs_boot_desc,
1909 g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
1910 g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
1911
1912 ucode_info->surface_desc.size = ucode_size;
1913 dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1914 ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
1915 ucode_info->surface_desc.size,
1916 &iova,
1917 GFP_KERNEL,
1918 &attrs);
1919 if (!ucode_info->surface_desc.cpuva) {
1920 gk20a_err(d, "memory allocation failed\n");
1921 err = -ENOMEM;
1922 goto clean_up;
1923 }
1924
1925 ucode_info->surface_desc.iova = iova;
1926 err = gk20a_get_sgtable(d, &ucode_info->surface_desc.sgt,
1927 ucode_info->surface_desc.cpuva,
1928 ucode_info->surface_desc.iova,
1929 ucode_info->surface_desc.size);
1930 if (err) {
1931 gk20a_err(d, "failed to create sg table\n");
1932 goto clean_up;
1933 }
1934
1935 buf = (u8 *)ucode_info->surface_desc.cpuva;
1936 if (!buf) {
1937 gk20a_err(d, "failed to map surface desc buffer");
1938 err = -ENOMEM;
1939 goto clean_up;
1940 }
1941
1942 gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
1943 fecs_boot_image,
1944 g->gr.ctx_vars.ucode.fecs.inst.l,
1945 g->gr.ctx_vars.ucode.fecs.data.l);
1946
1947 release_firmware(fecs_fw);
1948 fecs_fw = NULL;
1949
1950 gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs,
1951 gpccs_boot_image,
1952 g->gr.ctx_vars.ucode.gpccs.inst.l,
1953 g->gr.ctx_vars.ucode.gpccs.data.l);
1954
1955 release_firmware(gpccs_fw);
1956 gpccs_fw = NULL;
1957
1958 err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
1959 if (err)
1960 goto clean_up;
1961
1962 gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1963
1964 return 0;
1965
1966 clean_up:
1967 if (ucode_info->ucode_gpuva)
1968 gk20a_gmmu_unmap(vm, ucode_info->ucode_gpuva,
1969 ucode_info->surface_desc.size, gk20a_mem_flag_none);
1970 if (ucode_info->surface_desc.sgt)
1971 gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
1972 if (ucode_info->surface_desc.cpuva)
1973 dma_free_attrs(d, ucode_info->surface_desc.size,
1974 ucode_info->surface_desc.cpuva,
1975 ucode_info->surface_desc.iova,
1976 &attrs);
1977 ucode_info->surface_desc.cpuva = NULL;
1978 ucode_info->surface_desc.iova = 0;
1979
1980 release_firmware(gpccs_fw);
1981 gpccs_fw = NULL;
1982 release_firmware(fecs_fw);
1983 fecs_fw = NULL;
1984
1985 return err;
1986}
1987
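/* Bind the ctxsw ucode instance block to the FECS context arbiter: wait
 * for the arbiter to go idle, program the new/current context pointers and
 * issue arbiter commands, polling each one for completion. */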
1988static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
1989{
1990 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
1991 int retries = 20;
1992 phys_addr_t inst_ptr;
1993 u32 val;
1994
1995 while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
1996 gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
1997 udelay(2);
1998 retries--;
1999 }
2000 if (!retries)
2001 gk20a_err(dev_from_gk20a(g), "arbiter idle timeout");
2002
2003 gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
2004
2005 inst_ptr = ucode_info->inst_blk_desc.cpu_pa;
2006 gk20a_writel(g, gr_fecs_new_ctx_r(),
2007 gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
2008 gr_fecs_new_ctx_target_m() |
2009 gr_fecs_new_ctx_valid_m());
2010
2011 gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
2012 gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
2013 gr_fecs_arb_ctx_ptr_target_m());
2014
2015 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
2016
2017 /* Wait for arbiter command to complete */
2018 retries = 20;
2019 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2020 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2021 udelay(2);
2022 retries--;
2023 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2024 }
2025 if (!retries)
2026 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2027
2028 gk20a_writel(g, gr_fecs_current_ctx_r(),
2029 gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
2030 gr_fecs_current_ctx_target_m() |
2031 gr_fecs_current_ctx_valid_m());
2032 /* Send command to arbiter to flush */
2033 gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
2034
2035 retries = 20;
2036 val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
2037 while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
2038 udelay(2);
2039 retries--;
2040 val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
2041 }
2042 if (!retries)
2043 gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
2044}
2045
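/* Bootstrap one ctxsw falcon: write the bootloader header into DMEM through
 * the auto-incrementing port, DMA the boot image into IMEM in 256-byte
 * blocks, program the boot vector and start the falcon CPU. */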
2046static int gr_gk20a_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
2047 struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
2048{
2049 u32 addr_code32;
2050 u32 addr_data32;
2051 u32 addr_load32;
2052 u32 dst = 0;
2053 u32 blocks;
2054 u32 b;
2055
2056 addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
2057 addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
2058 addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
2059
2060 gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
2061 gr_fecs_dmactl_require_ctx_f(0));
2062
2063 /*
2064 * Copy falcon bootloader header into dmem at offset 0.
2065 * Configure dmem port 0 for auto-incrementing writes starting at dmem
2066 * offset 0.
2067 */
2068 gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
2069 gr_fecs_dmemc_offs_f(0) |
2070 gr_fecs_dmemc_blk_f(0) |
2071 gr_fecs_dmemc_aincw_f(1));
2072
2073 /* Write out the actual data */
2074 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2075 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2076 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2077 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size);
2078 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2079 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
2080 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size);
2081 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
2082 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2083 gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
2084
2085 blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
2086
2087 /*
2088 * Set the base FB address for the DMA transfer. Subtract off the 256
2089 * byte IMEM block offset such that the relative FB and IMEM offsets
2090 * match, allowing the IMEM tags to be properly created.
2091 */
2092
2093 dst = segments->boot_imem_offset;
2094 gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
2095 (addr_load32 - (dst >> 8)));
2096
2097 for (b = 0; b < blocks; b++) {
2098 /* Setup destination IMEM offset */
2099 gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
2100 dst + (b << 8));
2101
2102 /* Setup source offset (relative to BASE) */
2103 gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
2104 dst + (b << 8));
2105
2106 gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
2107 gr_fecs_dmatrfcmd_imem_f(0x01) |
2108 gr_fecs_dmatrfcmd_write_f(0x00) |
2109 gr_fecs_dmatrfcmd_size_f(0x06) |
2110 gr_fecs_dmatrfcmd_ctxdma_f(0));
2111 }
2112
2113 /* Specify the falcon boot vector */
2114 gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
2115 gr_fecs_bootvec_vec_f(segments->boot_entry));
2116
2117 /* Write to CPUCTL to start the falcon */
2118 gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
2119 gr_fecs_cpuctl_startcpu_f(0x01));
2120
2121 return 0;
2122}
2123
2124static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
2125{
2126 struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
2127 u64 addr_base = ucode_info->ucode_gpuva;
2128
2129 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
2130
2131 gr_gk20a_load_falcon_bind_instblk(g);
2132
2133 gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2134 &g->ctxsw_ucode_info.fecs, 0);
2135
2136 gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
2137 &g->ctxsw_ucode_info.gpccs,
2138 gr_gpcs_gpccs_falcon_hwcfg_r() -
2139 gr_fecs_falcon_hwcfg_r());
2140}
2141
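/* Top-level ctxsw ucode load: use the legacy DMEM/IMEM path when the PMU is
 * not in use, otherwise the bootloader-based path, then wait for the
 * init-complete handshake from the ucode. */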
2142static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
2143{
2144 u32 ret;
2145
2146 gk20a_dbg_fn("");
2147
2148 if (tegra_platform_is_linsim()) {
2149 gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
2150 gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
2151 gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
2152 gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
2153 }
2154
2155 /*
2156 * In case the gPMU falcon is not being used, revert to the old way of
2157 * loading gr ucode, without the faster bootstrap routine.
2158 */
2159 if (!support_gk20a_pmu()) {
2160 gr_gk20a_load_falcon_dmem(g);
2161 gr_gk20a_load_falcon_imem(g);
2162 gr_gk20a_start_falcon_ucode(g);
2163 } else {
2164 if (!gr->skip_ucode_init)
2165 gr_gk20a_init_ctxsw_ucode(g);
2166 gr_gk20a_load_falcon_with_bootloader(g);
2167 gr->skip_ucode_init = true;
2168 }
2169
2170 ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
2171 GR_IS_UCODE_OP_EQUAL,
2172 eUcodeHandshakeInitComplete,
2173 GR_IS_UCODE_OP_SKIP, 0);
2174 if (ret) {
2175 gk20a_err(dev_from_gk20a(g), "falcon ucode init timeout");
2176 return ret;
2177 }
2178
2179 if (support_gk20a_pmu())
2180 gk20a_writel(g, gr_fecs_current_ctx_r(),
2181 gr_fecs_current_ctx_valid_false_f());
2182
2183 gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
2184 gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
2185 gk20a_writel(g, gr_fecs_method_push_r(),
2186 gr_fecs_method_push_adr_set_watchdog_timeout_f());
2187
2188 gk20a_dbg_fn("done");
2189 return 0;
2190}
2191
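/* Query the golden, zcull and PM context image sizes from the FECS ucode;
 * the golden and zcull sizes are cached in gr.ctx_vars and must not change
 * across railgating. */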
2192static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
2193{
2194 u32 golden_ctx_image_size = 0;
2195 u32 zcull_ctx_image_size = 0;
2196 u32 pm_ctx_image_size = 0;
2197 u32 ret;
2198 struct fecs_method_op_gk20a op = {
2199 .mailbox = { .id = 0, .data = 0,
2200 .clr = ~0, .ok = 0, .fail = 0},
2201 .method.data = 0,
2202 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
2203 .cond.fail = GR_IS_UCODE_OP_SKIP,
2204 };
2205
2206 gk20a_dbg_fn("");
2207 op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
2208 op.mailbox.ret = &golden_ctx_image_size;
2209 ret = gr_gk20a_submit_fecs_method_op(g, op);
2210 if (ret) {
2211 gk20a_err(dev_from_gk20a(g),
2212 "query golden image size failed");
2213 return ret;
2214 }
2215 op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
2216 op.mailbox.ret = &zcull_ctx_image_size;
2217 ret = gr_gk20a_submit_fecs_method_op(g, op);
2218 if (ret) {
2219 gk20a_err(dev_from_gk20a(g),
2220 "query zcull ctx image size failed");
2221 return ret;
2222 }
2223 op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
2224 op.mailbox.ret = &pm_ctx_image_size;
2225 ret = gr_gk20a_submit_fecs_method_op(g, op);
2226 if (ret) {
2227 gk20a_err(dev_from_gk20a(g),
2228 "query pm ctx image size failed");
2229 return ret;
2230 }
2231
2232 if (!g->gr.ctx_vars.golden_image_size &&
2233 !g->gr.ctx_vars.zcull_ctxsw_image_size) {
2234 g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
2235 g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
2236 } else {
2237 /* hw is different after railgating? */
2238 BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
2239 BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
2240 }
2241
2242 g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
2243
2244 gk20a_dbg_fn("done");
2245 return 0;
2246}
2247
2248static void gk20a_gr_destroy_ctx_buffer(struct platform_device *pdev,
2249 struct gr_ctx_buffer_desc *desc)
2250{
2251 struct device *dev = &pdev->dev;
2252 gk20a_free_sgtable(&desc->sgt);
2253 dma_free_attrs(dev, desc->size, desc->pages,
2254 desc->iova, &desc->attrs);
2255}
2256
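/* Allocate one global context buffer without a kernel mapping and build an
 * sg_table for it so it can later be mapped through the GMMU. */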
2257static int gk20a_gr_alloc_ctx_buffer(struct platform_device *pdev,
2258 struct gr_ctx_buffer_desc *desc,
2259 size_t size)
2260{
2261 struct device *dev = &pdev->dev;
2262 DEFINE_DMA_ATTRS(attrs);
2263 dma_addr_t iova;
2264 int err = 0;
2265
2266 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2267
2268 desc->pages = dma_alloc_attrs(&pdev->dev, size, &iova,
2269 GFP_KERNEL, &attrs);
2270 if (!desc->pages)
2271 return -ENOMEM;
2272
2273 desc->iova = iova;
2274 desc->size = size;
2275 desc->attrs = attrs;
2276 desc->destroy = gk20a_gr_destroy_ctx_buffer;
2277 err = gk20a_get_sgtable_from_pages(&pdev->dev, &desc->sgt, desc->pages,
2278 desc->iova, desc->size);
2279 if (err) {
2280 dma_free_attrs(dev, desc->size, desc->pages,
2281 desc->iova, &desc->attrs);
2282 memset(desc, 0, sizeof(*desc));
2283 }
2284
2285 return err;
2286}
2287
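/* Allocate the global context buffers shared by all channels: circular
 * buffer, pagepool and attribute buffer (plus VPR copies when a secure
 * allocator is available), golden context and priv access map. */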
2288static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
2289{
2290 struct gk20a_platform *platform = platform_get_drvdata(g->dev);
2291 struct gr_gk20a *gr = &g->gr;
2292 int i, attr_buffer_size, err;
2293 struct platform_device *pdev = g->dev;
2294
2295 u32 cb_buffer_size = gr->bundle_cb_default_size *
2296 gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
2297
2298 u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
2299 gr_scc_pagepool_total_pages_byte_granularity_v();
2300
2301 gk20a_dbg_fn("");
2302
2303 attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
2304
2305 gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);
2306
2307 err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[CIRCULAR],
2308 cb_buffer_size);
2309 if (err)
2310 goto clean_up;
2311
2312 if (platform->secure_alloc)
2313 platform->secure_alloc(pdev,
2314 &gr->global_ctx_buffer[CIRCULAR_VPR],
2315 cb_buffer_size);
2316
2317 gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
2318
2319 err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[PAGEPOOL],
2320 pagepool_buffer_size);
2321 if (err)
2322 goto clean_up;
2323
2324 if (platform->secure_alloc)
2325 platform->secure_alloc(pdev,
2326 &gr->global_ctx_buffer[PAGEPOOL_VPR],
2327 pagepool_buffer_size);
2328
2329 gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);
2330
2331 err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[ATTRIBUTE],
2332 attr_buffer_size);
2333 if (err)
2334 goto clean_up;
2335
2336 if (platform->secure_alloc)
2337 platform->secure_alloc(pdev,
2338 &gr->global_ctx_buffer[ATTRIBUTE_VPR],
2339 attr_buffer_size);
2340
2341 gk20a_dbg_info("golden_image_size : %d",
2342 gr->ctx_vars.golden_image_size);
2343
2344 err = gk20a_gr_alloc_ctx_buffer(pdev,
2345 &gr->global_ctx_buffer[GOLDEN_CTX],
2346 gr->ctx_vars.golden_image_size);
2347 if (err)
2348 goto clean_up;
2349
2350 gk20a_dbg_info("priv_access_map_size : %d",
2351 gr->ctx_vars.priv_access_map_size);
2352
2353 err = gk20a_gr_alloc_ctx_buffer(pdev,
2354 &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
2355 gr->ctx_vars.priv_access_map_size);
2356
2357 if (err)
2358 goto clean_up;
2359
2360 gk20a_dbg_fn("done");
2361 return 0;
2362
2363 clean_up:
2364 gk20a_err(dev_from_gk20a(g), "fail");
2365 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2366 if (gr->global_ctx_buffer[i].destroy) {
2367 gr->global_ctx_buffer[i].destroy(pdev,
2368 &gr->global_ctx_buffer[i]);
2369 }
2370 }
2371 return -ENOMEM;
2372}
2373
2374static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
2375{
2376 struct platform_device *pdev = g->dev;
2377 struct gr_gk20a *gr = &g->gr;
2378 DEFINE_DMA_ATTRS(attrs);
2379 u32 i;
2380
2381 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2382
2383 for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
2384 gr->global_ctx_buffer[i].destroy(pdev,
2385 &gr->global_ctx_buffer[i]);
2386 }
2387
2388 gk20a_dbg_fn("done");
2389}
2390
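/* Map the global context buffers into this channel's GPU VA space,
 * preferring the VPR copies for VPR channels when they exist. */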
2391static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
2392 struct channel_gk20a *c)
2393{
2394 struct vm_gk20a *ch_vm = c->vm;
2395 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2396 u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2397 struct gr_gk20a *gr = &g->gr;
2398 struct sg_table *sgt;
2399 u64 size;
2400 u64 gpu_va;
2401 u32 i;
2402 gk20a_dbg_fn("");
2403
2404 /* Circular Buffer */
2405 if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].sgt == NULL)) {
2406 sgt = gr->global_ctx_buffer[CIRCULAR].sgt;
2407 size = gr->global_ctx_buffer[CIRCULAR].size;
2408 } else {
2409 sgt = gr->global_ctx_buffer[CIRCULAR_VPR].sgt;
2410 size = gr->global_ctx_buffer[CIRCULAR_VPR].size;
2411 }
2412
2413 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2414 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2415 gk20a_mem_flag_none);
2416 if (!gpu_va)
2417 goto clean_up;
2418 g_bfr_va[CIRCULAR_VA] = gpu_va;
2419 g_bfr_size[CIRCULAR_VA] = size;
2420
2421 /* Attribute Buffer */
2422 if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt == NULL)) {
2423 sgt = gr->global_ctx_buffer[ATTRIBUTE].sgt;
2424 size = gr->global_ctx_buffer[ATTRIBUTE].size;
2425 } else {
2426 sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt;
2427 size = gr->global_ctx_buffer[ATTRIBUTE_VPR].size;
2428 }
2429
2430 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2431 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2432 gk20a_mem_flag_none);
2433 if (!gpu_va)
2434 goto clean_up;
2435 g_bfr_va[ATTRIBUTE_VA] = gpu_va;
2436 g_bfr_size[ATTRIBUTE_VA] = size;
2437
2438 /* Page Pool */
2439 if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].sgt == NULL)) {
2440 sgt = gr->global_ctx_buffer[PAGEPOOL].sgt;
2441 size = gr->global_ctx_buffer[PAGEPOOL].size;
2442 } else {
2443 sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].sgt;
2444 size = gr->global_ctx_buffer[PAGEPOOL_VPR].size;
2445 }
2446
2447 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
2448 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2449 gk20a_mem_flag_none);
2450 if (!gpu_va)
2451 goto clean_up;
2452 g_bfr_va[PAGEPOOL_VA] = gpu_va;
2453 g_bfr_size[PAGEPOOL_VA] = size;
2454
2455 /* Golden Image */
2456 sgt = gr->global_ctx_buffer[GOLDEN_CTX].sgt;
2457 size = gr->global_ctx_buffer[GOLDEN_CTX].size;
2458 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2459 gk20a_mem_flag_none);
2460 if (!gpu_va)
2461 goto clean_up;
2462 g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
2463 g_bfr_size[GOLDEN_CTX_VA] = size;
2464
2465 /* Priv register Access Map */
2466 sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].sgt;
2467 size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].size;
2468 gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
2469 gk20a_mem_flag_none);
2470 if (!gpu_va)
2471 goto clean_up;
2472 g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
2473 g_bfr_size[PRIV_ACCESS_MAP_VA] = size;
2474
2475 c->ch_ctx.global_ctx_buffer_mapped = true;
2476 return 0;
2477
2478 clean_up:
2479 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2480 if (g_bfr_va[i]) {
2481 gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2482 gr->global_ctx_buffer[i].size,
2483 gk20a_mem_flag_none);
2484 g_bfr_va[i] = 0;
2485 }
2486 }
2487 return -ENOMEM;
2488}
2489
2490static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
2491{
2492 struct vm_gk20a *ch_vm = c->vm;
2493 u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
2494 u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
2495 u32 i;
2496
2497 gk20a_dbg_fn("");
2498
2499 for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
2500 if (g_bfr_va[i]) {
2501 gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
2502 g_bfr_size[i],
2503 gk20a_mem_flag_none);
2504 g_bfr_va[i] = 0;
2505 g_bfr_size[i] = 0;
2506 }
2507 }
2508 c->ch_ctx.global_ctx_buffer_mapped = false;
2509}
2510
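/* Allocate the per-channel gr_ctx backing store (sized to the golden image)
 * and map it cacheable into the channel VM. */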
2511static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
2512 struct channel_gk20a *c)
2513{
2514 struct gr_gk20a *gr = &g->gr;
2515 struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
2516 struct vm_gk20a *ch_vm = c->vm;
2517 struct device *d = dev_from_gk20a(g);
2518 struct sg_table *sgt;
2519 DEFINE_DMA_ATTRS(attrs);
2520 int err = 0;
2521 dma_addr_t iova;
2522
2523 gk20a_dbg_fn("");
2524
2525 if (gr->ctx_vars.buffer_size == 0)
2526 return 0;
2527
2528 /* alloc channel gr ctx buffer */
2529 gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
2530 gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
2531
2532 gr_ctx->size = gr->ctx_vars.buffer_total_size;
2533 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2534 gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
2535 &iova, GFP_KERNEL, &attrs);
2536 if (!gr_ctx->pages)
2537 return -ENOMEM;
2538
2539 gr_ctx->iova = iova;
2540 err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
2541 gr_ctx->iova, gr_ctx->size);
2542 if (err)
2543 goto err_free;
2544
2545 gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
2546 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
2547 gk20a_mem_flag_none);
2548 if (!gr_ctx->gpu_va)
2549 goto err_free_sgt;
2550
2551 gk20a_free_sgtable(&sgt);
2552
2553 return 0;
2554
2555 err_free_sgt:
2556 gk20a_free_sgtable(&sgt);
2557 err_free:
2558 dma_free_attrs(d, gr_ctx->size,
2559 gr_ctx->pages, gr_ctx->iova, &attrs);
2560 gr_ctx->pages = NULL;
2561 gr_ctx->iova = 0;
2562
2563 return err;
2564}
2565
2566static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
2567{
2568 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2569 struct vm_gk20a *ch_vm = c->vm;
2570 struct gk20a *g = c->g;
2571 struct device *d = dev_from_gk20a(g);
2572 DEFINE_DMA_ATTRS(attrs);
2573
2574 gk20a_dbg_fn("");
2575
2576 if (!ch_ctx->gr_ctx.gpu_va)
2577 return;
2578
2579 gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
2580 ch_ctx->gr_ctx.size, gk20a_mem_flag_none);
2581 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2582 dma_free_attrs(d, ch_ctx->gr_ctx.size,
2583 ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
2584 ch_ctx->gr_ctx.pages = NULL;
2585 ch_ctx->gr_ctx.iova = 0;
2586}
2587
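/* Allocate the small (128-word) per-channel patch context buffer and map it
 * into the channel VM. */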
2588static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
2589 struct channel_gk20a *c)
2590{
2591 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2592 struct device *d = dev_from_gk20a(g);
2593 struct vm_gk20a *ch_vm = c->vm;
2594 DEFINE_DMA_ATTRS(attrs);
2595 struct sg_table *sgt;
2596 int err = 0;
2597 dma_addr_t iova;
2598
2599 gk20a_dbg_fn("");
2600
2601 patch_ctx->size = 128 * sizeof(u32);
2602 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2603 patch_ctx->pages = dma_alloc_attrs(d, patch_ctx->size,
2604 &iova, GFP_KERNEL,
2605 &attrs);
2606 if (!patch_ctx->pages)
2607 return -ENOMEM;
2608
2609 patch_ctx->iova = iova;
2610 err = gk20a_get_sgtable_from_pages(d, &sgt, patch_ctx->pages,
2611 patch_ctx->iova, patch_ctx->size);
2612 if (err)
2613 goto err_free;
2614
2615 patch_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, patch_ctx->size,
2616 0, gk20a_mem_flag_none);
2617 if (!patch_ctx->gpu_va)
2618 goto err_free_sgtable;
2619
2620 gk20a_free_sgtable(&sgt);
2621
2622 gk20a_dbg_fn("done");
2623 return 0;
2624
2625 err_free_sgtable:
2626 gk20a_free_sgtable(&sgt);
2627 err_free:
2628 dma_free_attrs(d, patch_ctx->size,
2629 patch_ctx->pages, patch_ctx->iova, &attrs);
2630 patch_ctx->pages = NULL;
2631 patch_ctx->iova = 0;
2632 gk20a_err(dev_from_gk20a(g), "fail");
2633 return err;
2634}
2635
2636static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
2637{
2638 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2639 struct vm_gk20a *ch_vm = c->vm;
2640
2641 gk20a_dbg_fn("");
2642
2643 if (patch_ctx->gpu_va)
2644 gk20a_gmmu_unmap(ch_vm, patch_ctx->gpu_va,
2645 patch_ctx->size, gk20a_mem_flag_none);
2646 patch_ctx->gpu_va = 0;
2647 patch_ctx->data_count = 0;
2648}
2649
2650static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
2651{
2652 struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
2653 struct gk20a *g = c->g;
2654 struct device *d = dev_from_gk20a(g);
2655 DEFINE_DMA_ATTRS(attrs);
2656
2657 gk20a_dbg_fn("");
2658
2659 gr_gk20a_unmap_channel_patch_ctx(c);
2660
2661 if (patch_ctx->pages) {
2662 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2663 dma_free_attrs(d, patch_ctx->size,
2664 patch_ctx->pages, patch_ctx->iova, &attrs);
2665 patch_ctx->pages = NULL;
2666 patch_ctx->iova = 0;
2667 }
2668}
2669
2670void gk20a_free_channel_ctx(struct channel_gk20a *c)
2671{
2672 gr_gk20a_unmap_global_ctx_buffers(c);
2673 gr_gk20a_free_channel_patch_ctx(c);
2674 gr_gk20a_free_channel_gr_ctx(c);
2675
2676 /* zcull_ctx, pm_ctx */
2677
2678 memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
2679
2680 c->num_objects = 0;
2681 c->first_init = false;
2682}
2683
2684static bool gr_gk20a_is_valid_class(struct gk20a *g, u32 class_num)
2685{
2686 bool valid = false;
2687
2688 switch (class_num) {
2689 case KEPLER_COMPUTE_A:
2690 case KEPLER_C:
2691 case FERMI_TWOD_A:
2692 case KEPLER_DMA_COPY_A:
2693 valid = true;
2694 break;
2695
2696 default:
2697 break;
2698 }
2699
2700 return valid;
2701}
2702
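/* Allocate and commit everything a channel needs to run the given class:
 * gr_ctx and patch buffers, global buffer mappings, and the golden image
 * (initialized once globally, then loaded into this channel's gr_ctx). */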
2703int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
2704 struct nvhost_alloc_obj_ctx_args *args)
2705{
2706 struct gk20a *g = c->g;
2707 struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
2708 int err = 0;
2709
2710 gk20a_dbg_fn("");
2711
2712	/* an address space needs to have been bound at this point. */
2713 if (!gk20a_channel_as_bound(c)) {
2714 gk20a_err(dev_from_gk20a(g),
2715 "not bound to address space at time"
2716 " of grctx allocation");
2717 return -EINVAL;
2718 }
2719
2720 if (!g->ops.gr.is_valid_class(g, args->class_num)) {
2721 gk20a_err(dev_from_gk20a(g),
2722 "invalid obj class 0x%x", args->class_num);
2723 err = -EINVAL;
2724 goto out;
2725 }
2726
2727 /* allocate gr ctx buffer */
2728 if (ch_ctx->gr_ctx.pages == NULL) {
2729 err = gr_gk20a_alloc_channel_gr_ctx(g, c);
2730 if (err) {
2731 gk20a_err(dev_from_gk20a(g),
2732 "fail to allocate gr ctx buffer");
2733 goto out;
2734 }
2735 c->obj_class = args->class_num;
2736 } else {
2737		/* TBD: needs to be more subtle about which object is being allocated,
2738		 * as some are allowed to be allocated along the same channel */
2739 gk20a_err(dev_from_gk20a(g),
2740 "too many classes alloc'd on same channel");
2741 err = -EINVAL;
2742 goto out;
2743 }
2744
2745 /* commit gr ctx buffer */
2746 err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
2747 if (err) {
2748 gk20a_err(dev_from_gk20a(g),
2749 "fail to commit gr ctx buffer");
2750 goto out;
2751 }
2752
2753 /* allocate patch buffer */
2754 if (ch_ctx->patch_ctx.pages == NULL) {
2755 err = gr_gk20a_alloc_channel_patch_ctx(g, c);
2756 if (err) {
2757 gk20a_err(dev_from_gk20a(g),
2758 "fail to allocate patch buffer");
2759 goto out;
2760 }
2761 }
2762
2763 /* map global buffer to channel gpu_va and commit */
2764 if (!ch_ctx->global_ctx_buffer_mapped) {
2765 err = gr_gk20a_map_global_ctx_buffers(g, c);
2766 if (err) {
2767 gk20a_err(dev_from_gk20a(g),
2768 "fail to map global ctx buffer");
2769 goto out;
2770 }
2771 gr_gk20a_elpg_protected_call(g,
2772 gr_gk20a_commit_global_ctx_buffers(g, c, true));
2773 }
2774
2775 /* init golden image, ELPG enabled after this is done */
2776 err = gr_gk20a_init_golden_ctx_image(g, c);
2777 if (err) {
2778 gk20a_err(dev_from_gk20a(g),
2779 "fail to init golden ctx image");
2780 goto out;
2781 }
2782
2783 /* load golden image */
2784 if (!c->first_init) {
2785 err = gr_gk20a_elpg_protected_call(g,
2786 gr_gk20a_load_golden_ctx_image(g, c));
2787 if (err) {
2788 gk20a_err(dev_from_gk20a(g),
2789 "fail to load golden ctx image");
2790 goto out;
2791 }
2792 c->first_init = true;
2793 }
2794 gk20a_mm_l2_invalidate(g);
2795
2796 c->num_objects++;
2797
2798 gk20a_dbg_fn("done");
2799 return 0;
2800out:
2801	/* 1. gr_ctx, patch_ctx and the global ctx buffer mappings
2802	   can be reused, so there is no need to release them.
2803	   2. golden image init and load are one-time operations, so once
2804	   they pass there is nothing to undo. */
2805 gk20a_err(dev_from_gk20a(g), "fail");
2806 return err;
2807}
2808
2809int gk20a_free_obj_ctx(struct channel_gk20a *c,
2810 struct nvhost_free_obj_ctx_args *args)
2811{
2812 unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
2813
2814 gk20a_dbg_fn("");
2815
2816 if (c->num_objects == 0)
2817 return 0;
2818
2819 c->num_objects--;
2820
2821 if (c->num_objects == 0) {
2822 c->first_init = false;
2823 gk20a_disable_channel(c,
2824 !c->has_timedout,
2825 timeout);
2826 gr_gk20a_unmap_channel_patch_ctx(c);
2827 }
2828
2829 return 0;
2830}
2831
2832static void gk20a_remove_gr_support(struct gr_gk20a *gr)
2833{
2834 struct gk20a *g = gr->g;
2835 struct device *d = dev_from_gk20a(g);
2836 DEFINE_DMA_ATTRS(attrs);
2837
2838 gk20a_dbg_fn("");
2839
2840 gr_gk20a_free_global_ctx_buffers(g);
2841
2842 dma_free_coherent(d, gr->mmu_wr_mem.size,
2843 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
2844 gr->mmu_wr_mem.cpuva = NULL;
2845 gr->mmu_wr_mem.iova = 0;
2846 dma_free_coherent(d, gr->mmu_rd_mem.size,
2847 gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
2848 gr->mmu_rd_mem.cpuva = NULL;
2849 gr->mmu_rd_mem.iova = 0;
2850
2851 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
2852 dma_free_attrs(d, gr->compbit_store.size, gr->compbit_store.pages,
2853 gr->compbit_store.base_iova, &attrs);
2854
2855 memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
2856 memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
2857 memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
2858
2859 kfree(gr->gpc_tpc_count);
2860 kfree(gr->gpc_zcb_count);
2861 kfree(gr->gpc_ppc_count);
2862 kfree(gr->pes_tpc_count[0]);
2863 kfree(gr->pes_tpc_count[1]);
2864 kfree(gr->pes_tpc_mask[0]);
2865 kfree(gr->pes_tpc_mask[1]);
2866 kfree(gr->gpc_skip_mask);
2867 kfree(gr->map_tiles);
2868 gr->gpc_tpc_count = NULL;
2869 gr->gpc_zcb_count = NULL;
2870 gr->gpc_ppc_count = NULL;
2871 gr->pes_tpc_count[0] = NULL;
2872 gr->pes_tpc_count[1] = NULL;
2873 gr->pes_tpc_mask[0] = NULL;
2874 gr->pes_tpc_mask[1] = NULL;
2875 gr->gpc_skip_mask = NULL;
2876 gr->map_tiles = NULL;
2877
2878 kfree(gr->ctx_vars.ucode.fecs.inst.l);
2879 kfree(gr->ctx_vars.ucode.fecs.data.l);
2880 kfree(gr->ctx_vars.ucode.gpccs.inst.l);
2881 kfree(gr->ctx_vars.ucode.gpccs.data.l);
2882 kfree(gr->ctx_vars.sw_bundle_init.l);
2883 kfree(gr->ctx_vars.sw_method_init.l);
2884 kfree(gr->ctx_vars.sw_ctx_load.l);
2885 kfree(gr->ctx_vars.sw_non_ctx_load.l);
2886 kfree(gr->ctx_vars.ctxsw_regs.sys.l);
2887 kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
2888 kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
2889 kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
2890 kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
2891 kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
2892 kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
2893 kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
2894
2895 kfree(gr->ctx_vars.local_golden_image);
2896 gr->ctx_vars.local_golden_image = NULL;
2897
2898 gk20a_allocator_destroy(&gr->comp_tags);
2899}
2900
2901static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
2902{
2903 struct gr_gk20a *gr = &g->gr;
2904
2905 gr->bundle_cb_default_size =
2906 gr_scc_bundle_cb_size_div_256b__prod_v();
2907 gr->min_gpm_fifo_depth =
2908 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
2909 gr->bundle_cb_token_limit =
2910 gr_pd_ab_dist_cfg2_token_limit_init_v();
2911}
2912
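/* Read the GR unit configuration (FBP/GPC/TPC/zcull counts and per-PES TPC
 * masks) from priv registers, allocate the per-GPC bookkeeping arrays and
 * derive the per-GPC skip masks. */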
2913static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
2914{
2915 u32 gpc_index, pes_index;
2916 u32 pes_tpc_mask;
2917 u32 pes_tpc_count;
2918 u32 pes_heavy_index;
2919 u32 gpc_new_skip_mask;
2920 u32 tmp;
2921
2922 tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
2923 gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
2924
2925 tmp = gk20a_readl(g, top_num_gpcs_r());
2926 gr->max_gpc_count = top_num_gpcs_value_v(tmp);
2927
2928 tmp = gk20a_readl(g, top_num_fbps_r());
2929 gr->max_fbps_count = top_num_fbps_value_v(tmp);
2930
2931 tmp = gk20a_readl(g, top_tpc_per_gpc_r());
2932 gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
2933
2934 gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
2935
2936 tmp = gk20a_readl(g, top_num_fbps_r());
2937 gr->sys_count = top_num_fbps_value_v(tmp);
2938
2939 tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
2940 gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
2941
2942 gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
2943 gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
2944
2945 if (!gr->gpc_count) {
2946 gk20a_err(dev_from_gk20a(g), "gpc_count==0!");
2947 goto clean_up;
2948 }
2949
2950 gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2951 gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2952 gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2953 gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2954 gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2955 gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2956 gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
2957 gr->gpc_skip_mask =
2958 kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
2959 GFP_KERNEL);
2960
2961 if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
2962 !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
2963 !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
2964 goto clean_up;
2965
2966 gr->ppc_count = 0;
2967 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
2968 tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
2969
2970 gr->gpc_tpc_count[gpc_index] =
2971 gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
2972 gr->tpc_count += gr->gpc_tpc_count[gpc_index];
2973
2974 gr->gpc_zcb_count[gpc_index] =
2975 gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
2976 gr->zcb_count += gr->gpc_zcb_count[gpc_index];
2977
2978 gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
2979 gr->ppc_count += gr->gpc_ppc_count[gpc_index];
2980 for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
2981
2982 tmp = gk20a_readl(g,
2983 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
2984 gpc_index * proj_gpc_stride_v());
2985
2986 pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
2987 pes_tpc_count = count_bits(pes_tpc_mask);
2988
2989 gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
2990 gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
2991 }
2992
2993 gpc_new_skip_mask = 0;
2994 if (gr->pes_tpc_count[0][gpc_index] +
2995 gr->pes_tpc_count[1][gpc_index] == 5) {
2996 pes_heavy_index =
2997 gr->pes_tpc_count[0][gpc_index] >
2998 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
2999
3000 gpc_new_skip_mask =
3001 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3002 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3003 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3004
3005 } else if ((gr->pes_tpc_count[0][gpc_index] +
3006 gr->pes_tpc_count[1][gpc_index] == 4) &&
3007 (gr->pes_tpc_count[0][gpc_index] !=
3008 gr->pes_tpc_count[1][gpc_index])) {
3009 pes_heavy_index =
3010 gr->pes_tpc_count[0][gpc_index] >
3011 gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
3012
3013 gpc_new_skip_mask =
3014 gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
3015 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
3016 (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
3017 }
3018 gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
3019 }
3020
3021 gk20a_dbg_info("fbps: %d", gr->num_fbps);
3022 gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
3023 gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
3024 gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
3025 gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
3026 gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
3027 gk20a_dbg_info("sys_count: %d", gr->sys_count);
3028 gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
3029 gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
3030 gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
3031 gk20a_dbg_info("ppc_count: %d", gr->ppc_count);
3032
3033 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3034 gk20a_dbg_info("gpc_tpc_count[%d] : %d",
3035 gpc_index, gr->gpc_tpc_count[gpc_index]);
3036 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3037 gk20a_dbg_info("gpc_zcb_count[%d] : %d",
3038 gpc_index, gr->gpc_zcb_count[gpc_index]);
3039 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3040 gk20a_dbg_info("gpc_ppc_count[%d] : %d",
3041 gpc_index, gr->gpc_ppc_count[gpc_index]);
3042 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3043 gk20a_dbg_info("gpc_skip_mask[%d] : %d",
3044 gpc_index, gr->gpc_skip_mask[gpc_index]);
3045 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3046 for (pes_index = 0;
3047 pes_index < gr->pe_count_per_gpc;
3048 pes_index++)
3049 gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
3050 pes_index, gpc_index,
3051 gr->pes_tpc_count[pes_index][gpc_index]);
3052
3053 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3054 for (pes_index = 0;
3055 pes_index < gr->pe_count_per_gpc;
3056 pes_index++)
3057 gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
3058 pes_index, gpc_index,
3059 gr->pes_tpc_mask[pes_index][gpc_index]);
3060
3061 g->ops.gr.bundle_cb_defaults(g);
3062 g->ops.gr.cb_size_default(g);
3063 g->ops.gr.calc_global_ctx_buffer_size(g);
3064 gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
3065
3066 gk20a_dbg_info("bundle_cb_default_size: %d",
3067 gr->bundle_cb_default_size);
3068 gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
3069 gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
3070 gk20a_dbg_info("attrib_cb_default_size: %d",
3071 gr->attrib_cb_default_size);
3072 gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
3073 gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
3074 gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
3075 gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
3076
3077 return 0;
3078
3079clean_up:
3080 return -ENOMEM;
3081}
3082
3083static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
3084{
3085 struct device *d = dev_from_gk20a(g);
3086 dma_addr_t iova;
3087
3088 gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
3089
3090 gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
3091 gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
3092 &iova, GFP_KERNEL);
3093 if (!gr->mmu_wr_mem.cpuva)
3094 goto err;
3095
3096 gr->mmu_wr_mem.iova = iova;
3097
3098 gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
3099 gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
3100 &iova, GFP_KERNEL);
3101 if (!gr->mmu_rd_mem.cpuva)
3102 goto err_free_wr_mem;
3103
3104 gr->mmu_rd_mem.iova = iova;
3105 return 0;
3106
3107 err_free_wr_mem:
3108 dma_free_coherent(d, gr->mmu_wr_mem.size,
3109 gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
3110 gr->mmu_wr_mem.cpuva = NULL;
3111 gr->mmu_wr_mem.iova = 0;
3112 err:
3113 return -ENOMEM;
3114}
3115
3116static u32 prime_set[18] = {
3117 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
3118
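/* Build the screen-tile to TPC map: choose a map row offset (a small prime
 * that does not divide the TPC count, with special cases for some counts)
 * and distribute tiles across the GPCs, sorted by TPC count, using an
 * error-accumulation scheme. */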
3119static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
3120{
3121 s32 comm_denom;
3122 s32 mul_factor;
3123 s32 *init_frac = NULL;
3124 s32 *init_err = NULL;
3125 s32 *run_err = NULL;
3126 s32 *sorted_num_tpcs = NULL;
3127 s32 *sorted_to_unsorted_gpc_map = NULL;
3128 u32 gpc_index;
3129 u32 gpc_mark = 0;
3130 u32 num_tpc;
3131 u32 max_tpc_count = 0;
3132 u32 swap;
3133 u32 tile_count;
3134 u32 index;
3135 bool delete_map = false;
3136 bool gpc_sorted;
3137 int ret = 0;
3138
3139 init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3140 init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3141 run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3142 sorted_num_tpcs =
3143 kzalloc(proj_scal_max_gpcs_v() *
3144 proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
3145 GFP_KERNEL);
3146 sorted_to_unsorted_gpc_map =
3147 kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
3148
3149 if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
3150 sorted_to_unsorted_gpc_map)) {
3151 ret = -ENOMEM;
3152 goto clean_up;
3153 }
3154
3155 gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
3156
3157 if (gr->tpc_count == 3)
3158 gr->map_row_offset = 2;
3159 else if (gr->tpc_count < 3)
3160 gr->map_row_offset = 1;
3161 else {
3162 gr->map_row_offset = 3;
3163
3164 for (index = 1; index < 18; index++) {
3165 u32 prime = prime_set[index];
3166 if ((gr->tpc_count % prime) != 0) {
3167 gr->map_row_offset = prime;
3168 break;
3169 }
3170 }
3171 }
3172
3173 switch (gr->tpc_count) {
3174 case 15:
3175 gr->map_row_offset = 6;
3176 break;
3177 case 14:
3178 gr->map_row_offset = 5;
3179 break;
3180 case 13:
3181 gr->map_row_offset = 2;
3182 break;
3183 case 11:
3184 gr->map_row_offset = 7;
3185 break;
3186 case 10:
3187 gr->map_row_offset = 6;
3188 break;
3189 case 7:
3190 case 5:
3191 gr->map_row_offset = 1;
3192 break;
3193 default:
3194 break;
3195 }
3196
3197 if (gr->map_tiles) {
3198 if (gr->map_tile_count != gr->tpc_count)
3199 delete_map = true;
3200
3201 for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
3202 if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
3203 delete_map = true;
3204 }
3205
3206 if (delete_map) {
3207 kfree(gr->map_tiles);
3208 gr->map_tiles = NULL;
3209 gr->map_tile_count = 0;
3210 }
3211 }
3212
3213 if (gr->map_tiles == NULL) {
3214 gr->map_tile_count = proj_scal_max_gpcs_v();
3215
3216 gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
3217 if (gr->map_tiles == NULL) {
3218 ret = -ENOMEM;
3219 goto clean_up;
3220 }
3221
3222 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3223 sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
3224 sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
3225 }
3226
3227 gpc_sorted = false;
3228 while (!gpc_sorted) {
3229 gpc_sorted = true;
3230 for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
3231 if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
3232 gpc_sorted = false;
3233 swap = sorted_num_tpcs[gpc_index];
3234 sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
3235 sorted_num_tpcs[gpc_index + 1] = swap;
3236 swap = sorted_to_unsorted_gpc_map[gpc_index];
3237 sorted_to_unsorted_gpc_map[gpc_index] =
3238 sorted_to_unsorted_gpc_map[gpc_index + 1];
3239 sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
3240 }
3241 }
3242 }
3243
3244 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
3245 if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
3246 max_tpc_count = gr->gpc_tpc_count[gpc_index];
3247
3248 mul_factor = gr->gpc_count * max_tpc_count;
3249 if (mul_factor & 0x1)
3250 mul_factor = 2;
3251 else
3252 mul_factor = 1;
3253
3254 comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
3255
3256 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3257 num_tpc = sorted_num_tpcs[gpc_index];
3258
3259 init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
3260
3261 if (num_tpc != 0)
3262 init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
3263 else
3264 init_err[gpc_index] = 0;
3265
3266 run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
3267 }
3268
3269 while (gpc_mark < gr->tpc_count) {
3270 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3271 if ((run_err[gpc_index] * 2) >= comm_denom) {
3272 gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
3273 run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
3274 } else
3275 run_err[gpc_index] += init_frac[gpc_index];
3276 }
3277 }
3278 }
3279
3280clean_up:
3281 kfree(init_frac);
3282 kfree(init_err);
3283 kfree(run_err);
3284 kfree(sorted_num_tpcs);
3285 kfree(sorted_to_unsorted_gpc_map);
3286
3287 if (ret)
3288 gk20a_err(dev_from_gk20a(g), "fail");
3289 else
3290 gk20a_dbg_fn("done");
3291
3292 return ret;
3293}
3294
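/* Derive the zcull aliquot geometry and alignment requirements from the TPC
 * and zcull-bank counts and read the total aliquot count from hardware. */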
3295static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
3296{
3297 struct gr_zcull_gk20a *zcull = &gr->zcull;
3298
3299 zcull->aliquot_width = gr->tpc_count * 16;
3300 zcull->aliquot_height = 16;
3301
3302 zcull->width_align_pixels = gr->tpc_count * 16;
3303 zcull->height_align_pixels = 32;
3304
3305 zcull->aliquot_size =
3306 zcull->aliquot_width * zcull->aliquot_height;
3307
3308 /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
3309 zcull->pixel_squares_by_aliquots =
3310 gr->zcb_count * 16 * 16 * gr->tpc_count /
3311 (gr->gpc_count * gr->gpc_tpc_count[0]);
3312
3313 zcull->total_aliquots =
3314 gr_gpc0_zcull_total_ram_size_num_aliquots_f(
3315 gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
3316
3317 return 0;
3318}
3319
3320u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
3321{
3322 /* assuming gr has already been initialized */
3323 return gr->ctx_vars.zcull_ctxsw_image_size;
3324}
3325
3326int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
3327 struct channel_gk20a *c, u64 zcull_va, u32 mode)
3328{
3329 struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
3330
3331 zcull_ctx->ctx_sw_mode = mode;
3332 zcull_ctx->gpu_va = zcull_va;
3333
3334 /* TBD: don't disable channel in sw method processing */
3335 return gr_gk20a_ctx_zcull_setup(g, c, true);
3336}
3337
3338int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
3339 struct gr_zcull_info *zcull_params)
3340{
3341 struct gr_zcull_gk20a *zcull = &gr->zcull;
3342
3343 zcull_params->width_align_pixels = zcull->width_align_pixels;
3344 zcull_params->height_align_pixels = zcull->height_align_pixels;
3345 zcull_params->pixel_squares_by_aliquots =
3346 zcull->pixel_squares_by_aliquots;
3347 zcull_params->aliquot_total = zcull->total_aliquots;
3348
3349 zcull_params->region_byte_multiplier =
3350 gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
3351 zcull_params->region_header_size =
3352 proj_scal_litter_num_gpcs_v() *
3353 gr_zcull_save_restore_header_bytes_per_gpc_v();
3354
3355 zcull_params->subregion_header_size =
3356 proj_scal_litter_num_gpcs_v() *
3357 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
3358
3359 zcull_params->subregion_width_align_pixels =
3360 gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
3361 zcull_params->subregion_height_align_pixels =
3362 gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
3363 zcull_params->subregion_count = gr_zcull_subregion_qty_v();
3364
3365 return 0;
3366}
3367
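/* Program one ZBC color table entry: quiesce the GR engine, update the L2
 * and DS copies of the entry, then re-enable engine activity and mirror the
 * values into the SW table. */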
3368static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
3369 struct zbc_entry *color_val, u32 index)
3370{
3371 struct fifo_gk20a *f = &g->fifo;
3372 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3373 u32 i;
3374 unsigned long end_jiffies = jiffies +
3375 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3376 u32 ret;
3377
3378 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3379 if (ret) {
3380 gk20a_err(dev_from_gk20a(g),
3381 "failed to disable gr engine activity\n");
3382 return ret;
3383 }
3384
3385 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3386 if (ret) {
3387 gk20a_err(dev_from_gk20a(g),
3388 "failed to idle graphics\n");
3389 goto clean_up;
3390 }
3391
3392 /* update l2 table */
3393 g->ops.ltc.set_zbc_color_entry(g, color_val, index);
3394
3395 /* update ds table */
3396 gk20a_writel(g, gr_ds_zbc_color_r_r(),
3397 gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
3398 gk20a_writel(g, gr_ds_zbc_color_g_r(),
3399 gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
3400 gk20a_writel(g, gr_ds_zbc_color_b_r(),
3401 gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
3402 gk20a_writel(g, gr_ds_zbc_color_a_r(),
3403 gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
3404
3405 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3406 gr_ds_zbc_color_fmt_val_f(color_val->format));
3407
3408 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3409 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3410
3411 /* trigger the write */
3412 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3413 gr_ds_zbc_tbl_ld_select_c_f() |
3414 gr_ds_zbc_tbl_ld_action_write_f() |
3415 gr_ds_zbc_tbl_ld_trigger_active_f());
3416
3417 /* update local copy */
3418 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3419 gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
3420 gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
3421 }
3422 gr->zbc_col_tbl[index].format = color_val->format;
3423 gr->zbc_col_tbl[index].ref_cnt++;
3424
3425clean_up:
3426 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3427 if (ret) {
3428 gk20a_err(dev_from_gk20a(g),
3429 "failed to enable gr engine activity\n");
3430 }
3431
3432 return ret;
3433}
3434
3435static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
3436 struct zbc_entry *depth_val, u32 index)
3437{
3438 struct fifo_gk20a *f = &g->fifo;
3439 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3440 unsigned long end_jiffies = jiffies +
3441 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3442 u32 ret;
3443
3444 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3445 if (ret) {
3446 gk20a_err(dev_from_gk20a(g),
3447 "failed to disable gr engine activity\n");
3448 return ret;
3449 }
3450
3451 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3452 if (ret) {
3453 gk20a_err(dev_from_gk20a(g),
3454 "failed to idle graphics\n");
3455 goto clean_up;
3456 }
3457
3458 /* update l2 table */
3459 g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
3460
3461 /* update ds table */
3462 gk20a_writel(g, gr_ds_zbc_z_r(),
3463 gr_ds_zbc_z_val_f(depth_val->depth));
3464
3465 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3466 gr_ds_zbc_z_fmt_val_f(depth_val->format));
3467
3468 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3469 gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
3470
3471 /* trigger the write */
3472 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3473 gr_ds_zbc_tbl_ld_select_z_f() |
3474 gr_ds_zbc_tbl_ld_action_write_f() |
3475 gr_ds_zbc_tbl_ld_trigger_active_f());
3476
3477 /* update local copy */
3478 gr->zbc_dep_tbl[index].depth = depth_val->depth;
3479 gr->zbc_dep_tbl[index].format = depth_val->format;
3480 gr->zbc_dep_tbl[index].ref_cnt++;
3481
3482clean_up:
3483 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3484 if (ret) {
3485 gk20a_err(dev_from_gk20a(g),
3486 "failed to enable gr engine activity\n");
3487 }
3488
3489 return ret;
3490}
3491
3492int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
3493 struct zbc_entry *zbc_val)
3494{
3495 struct zbc_color_table *c_tbl;
3496 struct zbc_depth_table *d_tbl;
3497 u32 i, ret = -ENOMEM;
3498 bool added = false;
3499 u32 entries;
3500
3501 /* no endian swap ? */
3502
3503 switch (zbc_val->type) {
3504 case GK20A_ZBC_TYPE_COLOR:
3505 /* search existing tables */
3506 for (i = 0; i < gr->max_used_color_index; i++) {
3507
3508 c_tbl = &gr->zbc_col_tbl[i];
3509
3510 if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
3511 memcmp(c_tbl->color_ds, zbc_val->color_ds,
3512 sizeof(zbc_val->color_ds)) == 0) {
3513
3514 if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
3515 sizeof(zbc_val->color_l2))) {
3516 gk20a_err(dev_from_gk20a(g),
3517 					"zbc l2 and ds colors don't match existing entries");
3518 return -EINVAL;
3519 }
3520 added = true;
3521 c_tbl->ref_cnt++;
3522 ret = 0;
3523 break;
3524 }
3525 }
3526 /* add new table */
3527 if (!added &&
3528 gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
3529
3530 c_tbl =
3531 &gr->zbc_col_tbl[gr->max_used_color_index];
3532 WARN_ON(c_tbl->ref_cnt != 0);
3533
3534 ret = gr_gk20a_add_zbc_color(g, gr,
3535 zbc_val, gr->max_used_color_index);
3536
3537 if (!ret)
3538 gr->max_used_color_index++;
3539 }
3540 break;
3541 case GK20A_ZBC_TYPE_DEPTH:
3542 /* search existing tables */
3543 for (i = 0; i < gr->max_used_depth_index; i++) {
3544
3545 d_tbl = &gr->zbc_dep_tbl[i];
3546
3547 if (d_tbl->ref_cnt &&
3548 d_tbl->depth == zbc_val->depth &&
3549 d_tbl->format == zbc_val->format) {
3550 added = true;
3551 d_tbl->ref_cnt++;
3552 ret = 0;
3553 break;
3554 }
3555 }
3556 /* add new table */
3557 if (!added &&
3558 gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
3559
3560 d_tbl =
3561 &gr->zbc_dep_tbl[gr->max_used_depth_index];
3562 WARN_ON(d_tbl->ref_cnt != 0);
3563
3564 ret = gr_gk20a_add_zbc_depth(g, gr,
3565 zbc_val, gr->max_used_depth_index);
3566
3567 if (!ret)
3568 gr->max_used_depth_index++;
3569 }
3570 break;
3571 default:
3572 gk20a_err(dev_from_gk20a(g),
3573 "invalid zbc table type %d", zbc_val->type);
3574 return -EINVAL;
3575 }
3576
3577 if (!added && ret == 0) {
3578 /* update zbc for elpg only when new entry is added */
3579 entries = max(gr->max_used_color_index,
3580 gr->max_used_depth_index);
3581 gk20a_pmu_save_zbc(g, entries);
3582 }
3583
3584 return ret;
3585}
3586
3587int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
3588{
3589 struct fifo_gk20a *f = &g->fifo;
3590 struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
3591 u32 i, j;
3592 unsigned long end_jiffies = jiffies +
3593 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
3594 u32 ret;
3595
3596 ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
3597 if (ret) {
3598 gk20a_err(dev_from_gk20a(g),
3599 "failed to disable gr engine activity\n");
3600 return ret;
3601 }
3602
3603 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
3604 if (ret) {
3605 gk20a_err(dev_from_gk20a(g),
3606 "failed to idle graphics\n");
3607 goto clean_up;
3608 }
3609
3610 for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3611 gr->zbc_col_tbl[i].format = 0;
3612 gr->zbc_col_tbl[i].ref_cnt = 0;
3613
3614 gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
3615 gr_ds_zbc_color_fmt_val_invalid_f());
3616 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3617 gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3618
3619 /* trigger the write */
3620 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3621 gr_ds_zbc_tbl_ld_select_c_f() |
3622 gr_ds_zbc_tbl_ld_action_write_f() |
3623 gr_ds_zbc_tbl_ld_trigger_active_f());
3624
3625 /* clear l2 table */
3626 g->ops.ltc.clear_zbc_color_entry(g, i);
3627
3628 for (j = 0; j < GK20A_ZBC_COLOR_VALUE_SIZE; j++) {
3629 gr->zbc_col_tbl[i].color_l2[j] = 0;
3630 gr->zbc_col_tbl[i].color_ds[j] = 0;
3631 }
3632 }
3633 gr->max_used_color_index = 0;
3634 gr->max_default_color_index = 0;
3635
3636 for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
3637 gr->zbc_dep_tbl[i].depth = 0;
3638 gr->zbc_dep_tbl[i].format = 0;
3639 gr->zbc_dep_tbl[i].ref_cnt = 0;
3640
3641 gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
3642 gr_ds_zbc_z_fmt_val_invalid_f());
3643 gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
3644 gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
3645
3646 /* trigger the write */
3647 gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
3648 gr_ds_zbc_tbl_ld_select_z_f() |
3649 gr_ds_zbc_tbl_ld_action_write_f() |
3650 gr_ds_zbc_tbl_ld_trigger_active_f());
3651
3652 /* clear l2 table */
3653 g->ops.ltc.clear_zbc_depth_entry(g, i);
3654 }
3655 gr->max_used_depth_index = 0;
3656 gr->max_default_depth_index = 0;
3657
3658clean_up:
3659 ret = gk20a_fifo_enable_engine_activity(g, gr_info);
3660 if (ret) {
3661 gk20a_err(dev_from_gk20a(g),
3662 "failed to enable gr engine activity\n");
3663 }
3664
3665 /* elpg stuff */
3666
3667 return ret;
3668}
3669
3670/* get a zbc table entry specified by index
3671 * return table size when type is invalid */
3672int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
3673 struct zbc_query_params *query_params)
3674{
3675 u32 index = query_params->index_size;
3676 u32 i;
3677
3678 switch (query_params->type) {
3679 case GK20A_ZBC_TYPE_INVALID:
3680 query_params->index_size = GK20A_ZBC_TABLE_SIZE;
3681 break;
3682 case GK20A_ZBC_TYPE_COLOR:
3683 if (index >= GK20A_ZBC_TABLE_SIZE) {
3684 gk20a_err(dev_from_gk20a(g),
3685 "invalid zbc color table index\n");
3686 return -EINVAL;
3687 }
3688 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3689 query_params->color_l2[i] =
3690 gr->zbc_col_tbl[index].color_l2[i];
3691 query_params->color_ds[i] =
3692 gr->zbc_col_tbl[index].color_ds[i];
3693 }
3694 query_params->format = gr->zbc_col_tbl[index].format;
3695 query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
3696 break;
3697 case GK20A_ZBC_TYPE_DEPTH:
3698 if (index >= GK20A_ZBC_TABLE_SIZE) {
3699 gk20a_err(dev_from_gk20a(g),
3700 "invalid zbc depth table index\n");
3701 return -EINVAL;
3702 }
3703 query_params->depth = gr->zbc_dep_tbl[index].depth;
3704 query_params->format = gr->zbc_dep_tbl[index].format;
3705 query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
3706 break;
3707 default:
3708 gk20a_err(dev_from_gk20a(g),
3709 "invalid zbc table type\n");
3710 return -EINVAL;
3711 }
3712
3713 return 0;
3714}
3715
3716int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
3717{
3718 struct zbc_entry zbc_val;
3719 u32 i, err;
3720
3721 /* load default color table */
3722 zbc_val.type = GK20A_ZBC_TYPE_COLOR;
3723
3724 zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
3725 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3726 zbc_val.color_ds[i] = 0;
3727 zbc_val.color_l2[i] = 0;
3728 }
3729 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3730
3731 zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
3732 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3733 zbc_val.color_ds[i] = 0xffffffff;
3734 zbc_val.color_l2[i] = 0x3f800000;
3735 }
3736 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3737
3738 zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3739 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3740 zbc_val.color_ds[i] = 0;
3741 zbc_val.color_l2[i] = 0;
3742 }
3743 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3744
3745 zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
3746 for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
3747 zbc_val.color_ds[i] = 0x3f800000;
3748 zbc_val.color_l2[i] = 0x3f800000;
3749 }
3750 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3751
3752 if (!err)
3753 gr->max_default_color_index = 4;
3754 else {
3755 gk20a_err(dev_from_gk20a(g),
3756 "fail to load default zbc color table\n");
3757 return err;
3758 }
3759
3760 /* load default depth table */
3761 zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
3762
3763 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3764 zbc_val.depth = 0;
3765 err = gr_gk20a_add_zbc(g, gr, &zbc_val);
3766
3767 zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
3768 zbc_val.depth = 0x3f800000;
3769 err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
3770
3771 if (!err)
3772 gr->max_default_depth_index = 2;
3773 else {
3774 gk20a_err(dev_from_gk20a(g),
3775 "fail to load default zbc depth table\n");
3776 return err;
3777 }
3778
3779 return 0;
3780}
3781
3782int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
3783 struct zbc_entry *zbc_val)
3784{
3785 gk20a_dbg_fn("");
3786
3787 return gr_gk20a_elpg_protected_call(g,
3788 gr_gk20a_add_zbc(g, gr, zbc_val));
3789}
3790
3791void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
3792{
3793 u32 gate_ctrl;
3794
3795 gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3796
3797 switch (mode) {
3798 case BLCG_RUN:
3799 gate_ctrl = set_field(gate_ctrl,
3800 therm_gate_ctrl_blk_clk_m(),
3801 therm_gate_ctrl_blk_clk_run_f());
3802 break;
3803 case BLCG_AUTO:
3804 gate_ctrl = set_field(gate_ctrl,
3805 therm_gate_ctrl_blk_clk_m(),
3806 therm_gate_ctrl_blk_clk_auto_f());
3807 break;
3808 default:
3809 gk20a_err(dev_from_gk20a(g),
3810 "invalid blcg mode %d", mode);
3811 return;
3812 }
3813
3814 gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3815}
3816
3817void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
3818{
3819 u32 gate_ctrl, idle_filter;
3820
3821 gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
3822
3823 switch (mode) {
3824 case ELCG_RUN:
3825 gate_ctrl = set_field(gate_ctrl,
3826 therm_gate_ctrl_eng_clk_m(),
3827 therm_gate_ctrl_eng_clk_run_f());
3828 gate_ctrl = set_field(gate_ctrl,
3829 therm_gate_ctrl_eng_pwr_m(),
3830 /* set elpg to auto to meet hw expectation */
3831 therm_gate_ctrl_eng_pwr_auto_f());
3832 break;
3833 case ELCG_STOP:
3834 gate_ctrl = set_field(gate_ctrl,
3835 therm_gate_ctrl_eng_clk_m(),
3836 therm_gate_ctrl_eng_clk_stop_f());
3837 break;
3838 case ELCG_AUTO:
3839 gate_ctrl = set_field(gate_ctrl,
3840 therm_gate_ctrl_eng_clk_m(),
3841 therm_gate_ctrl_eng_clk_auto_f());
3842 break;
3843 default:
3844 gk20a_err(dev_from_gk20a(g),
3845 "invalid elcg mode %d", mode);
3846 }
3847
3848 if (tegra_platform_is_linsim()) {
3849 gate_ctrl = set_field(gate_ctrl,
3850 therm_gate_ctrl_eng_delay_after_m(),
3851 therm_gate_ctrl_eng_delay_after_f(4));
3852 }
3853
3854 /* 2 * (1 << 9) = 1024 clks */
3855 gate_ctrl = set_field(gate_ctrl,
3856 therm_gate_ctrl_eng_idle_filt_exp_m(),
3857 therm_gate_ctrl_eng_idle_filt_exp_f(9));
3858 gate_ctrl = set_field(gate_ctrl,
3859 therm_gate_ctrl_eng_idle_filt_mant_m(),
3860 therm_gate_ctrl_eng_idle_filt_mant_f(2));
3861 gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
3862
3863 /* default fecs_idle_filter to 0 */
3864 idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
3865 idle_filter &= ~therm_fecs_idle_filter_value_m();
3866 gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
3867 /* default hubmmu_idle_filter to 0 */
3868 idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
3869 idle_filter &= ~therm_hubmmu_idle_filter_value_m();
3870 gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
3871}
3872
3873static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
3874{
3875 u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
3876 u32 *zcull_map_tiles, *zcull_bank_counters;
3877 u32 map_counter;
3878 u32 rcp_conserv;
3879 u32 offset;
3880 bool floorsweep = false;
3881
3882 if (!gr->map_tiles)
3883 return -1;
3884
3885 zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
3886 proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3887 if (!zcull_map_tiles) {
3888 gk20a_err(dev_from_gk20a(g),
3889 "failed to allocate zcull temp buffers");
3890 return -ENOMEM;
3891 }
3892 zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
3893 proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
3894
3895 if (!zcull_bank_counters) {
3896 gk20a_err(dev_from_gk20a(g),
3897 "failed to allocate zcull temp buffers");
3898 kfree(zcull_map_tiles);
3899 return -ENOMEM;
3900 }
3901
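	/*
	 * Turn the gpc-ordered tile map into per-gpc zcull bank indices: the
	 * bank counter tracks how many tiles have already gone to each gpc,
	 * so each tile's entry becomes its local bank number within that gpc.
	 */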
3902 for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
3903 zcull_map_tiles[map_counter] =
3904 zcull_bank_counters[gr->map_tiles[map_counter]];
3905 zcull_bank_counters[gr->map_tiles[map_counter]]++;
3906 }
3907
3908 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
3909 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
3910 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
3911 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
3912 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
3913 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
3914 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
3915 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
3916 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
3917
3918 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
3919 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
3920 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
3921 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
3922 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
3923 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
3924 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
3925 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
3926 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
3927
3928 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
3929 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
3930 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
3931 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
3932 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
3933 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
3934 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
3935 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
3936 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
3937
3938 gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
3939 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
3940 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
3941 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
3942 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
3943 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
3944 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
3945 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
3946 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
3947
3948 kfree(zcull_map_tiles);
3949 kfree(zcull_bank_counters);
3950
3951 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3952 gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
3953 gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
3954
3955 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3956 gpc_zcull_count < gpc_tpc_count) {
3957 gk20a_err(dev_from_gk20a(g),
3958 "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
3959 gpc_zcull_count, gpc_tpc_count, gpc_index);
3960 return -EINVAL;
3961 }
3962 if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
3963 gpc_zcull_count != 0)
3964 floorsweep = true;
3965 }
3966
3967 /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
3968 rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
3969
3970 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
3971 offset = gpc_index * proj_gpc_stride_v();
3972
3973 if (floorsweep) {
3974 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
3975 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
3976 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
3977 gr->max_zcull_per_gpc_count));
3978 } else {
3979 gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
3980 gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
3981 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
3982 gr->gpc_tpc_count[gpc_index]));
3983 }
3984
3985 gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
3986 gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
3987 gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
3988
3989 gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
3990 gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
3991 }
3992
3993 gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
3994 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
3995
3996 return 0;
3997}
3998
3999static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
4000{
4001 /* enable tpc exception forwarding */
4002 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
4003 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
4004
4005 /* enable gpc exception forwarding */
4006 gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
4007 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
4008}
4009
4010void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
4011{
4012 /* enable exceptions */
4013 gk20a_writel(g, gr_fe_hww_esr_r(),
4014 gr_fe_hww_esr_en_enable_f() |
4015 gr_fe_hww_esr_reset_active_f());
4016 gk20a_writel(g, gr_memfmt_hww_esr_r(),
4017 gr_memfmt_hww_esr_en_enable_f() |
4018 gr_memfmt_hww_esr_reset_active_f());
4019 gk20a_writel(g, gr_scc_hww_esr_r(),
4020 gr_scc_hww_esr_en_enable_f() |
4021 gr_scc_hww_esr_reset_active_f());
4022 gk20a_writel(g, gr_mme_hww_esr_r(),
4023 gr_mme_hww_esr_en_enable_f() |
4024 gr_mme_hww_esr_reset_active_f());
4025 gk20a_writel(g, gr_pd_hww_esr_r(),
4026 gr_pd_hww_esr_en_enable_f() |
4027 gr_pd_hww_esr_reset_active_f());
4028 gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
4029 gr_sked_hww_esr_reset_active_f());
4030 gk20a_writel(g, gr_ds_hww_esr_r(),
4031 gr_ds_hww_esr_en_enabled_f() |
4032 gr_ds_hww_esr_reset_task_f());
4033 gk20a_writel(g, gr_ds_hww_report_mask_r(),
4034 gr_ds_hww_report_mask_sph0_err_report_f() |
4035 gr_ds_hww_report_mask_sph1_err_report_f() |
4036 gr_ds_hww_report_mask_sph2_err_report_f() |
4037 gr_ds_hww_report_mask_sph3_err_report_f() |
4038 gr_ds_hww_report_mask_sph4_err_report_f() |
4039 gr_ds_hww_report_mask_sph5_err_report_f() |
4040 gr_ds_hww_report_mask_sph6_err_report_f() |
4041 gr_ds_hww_report_mask_sph7_err_report_f() |
4042 gr_ds_hww_report_mask_sph8_err_report_f() |
4043 gr_ds_hww_report_mask_sph9_err_report_f() |
4044 gr_ds_hww_report_mask_sph10_err_report_f() |
4045 gr_ds_hww_report_mask_sph11_err_report_f() |
4046 gr_ds_hww_report_mask_sph12_err_report_f() |
4047 gr_ds_hww_report_mask_sph13_err_report_f() |
4048 gr_ds_hww_report_mask_sph14_err_report_f() |
4049 gr_ds_hww_report_mask_sph15_err_report_f() |
4050 gr_ds_hww_report_mask_sph16_err_report_f() |
4051 gr_ds_hww_report_mask_sph17_err_report_f() |
4052 gr_ds_hww_report_mask_sph18_err_report_f() |
4053 gr_ds_hww_report_mask_sph19_err_report_f() |
4054 gr_ds_hww_report_mask_sph20_err_report_f() |
4055 gr_ds_hww_report_mask_sph21_err_report_f() |
4056 gr_ds_hww_report_mask_sph22_err_report_f() |
4057 gr_ds_hww_report_mask_sph23_err_report_f());
4058}
4059
4060static void gr_gk20a_set_hww_esr_report_mask(struct gk20a *g)
4061{
4062 /* setup sm warp esr report masks */
4063 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4064 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4065 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4066 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4067 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4068 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4069 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4070 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4071 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4072 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4073 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4074 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4075 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4076 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4077 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4078 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4079 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4080 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4081 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4082 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4083 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4084
4085 /* setup sm global esr report mask */
4086 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4087 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4088 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4089 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4090 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4091 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4092 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4093 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4094}
4095
4096static int gk20a_init_gr_setup_hw(struct gk20a *g)
4097{
4098 struct gr_gk20a *gr = &g->gr;
4099 struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
4100 struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
4101 u32 data;
4102 u32 addr_lo, addr_hi;
4103 u64 addr;
4104 unsigned long end_jiffies = jiffies +
4105 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4106 u32 fe_go_idle_timeout_save;
4107 u32 last_method_data = 0;
4108 u32 i, err;
4109
4110 gk20a_dbg_fn("");
4111
4112 /* slcg prod values */
4113 g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
4114 g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
4115
4116 /* init mmu debug buffer */
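	/*
	 * The debug buffer address is programmed as a packed field: the low
	 * word is shifted down by the register's alignment and the high word
	 * is spliced in above it.
	 */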
4117 addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
4118 addr_lo = u64_lo32(addr);
4119 addr_hi = u64_hi32(addr);
4120 addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
4121 (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
4122
4123 gk20a_writel(g, fb_mmu_debug_wr_r(),
4124 fb_mmu_debug_wr_aperture_vid_mem_f() |
4125 fb_mmu_debug_wr_vol_false_f() |
4126 fb_mmu_debug_wr_addr_v(addr));
4127
4128 addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
4129 addr_lo = u64_lo32(addr);
4130 addr_hi = u64_hi32(addr);
4131 addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
4132 (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
4133
4134 gk20a_writel(g, fb_mmu_debug_rd_r(),
4135 fb_mmu_debug_rd_aperture_vid_mem_f() |
4136 fb_mmu_debug_rd_vol_false_f() |
4137 fb_mmu_debug_rd_addr_v(addr));
4138
4139 /* load gr floorsweeping registers */
4140 data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
4141 data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
4142 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
4143 gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
4144
4145 gr_gk20a_zcull_init_hw(g, gr);
4146
4147 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4148 g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
4149
4150 if (g->elcg_enabled) {
4151 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
4152 gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
4153 } else {
4154 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
4155 gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
4156 }
4157
4158 /* Bug 1340570: increase the clock timeout to avoid potential
4159 * operation failure at high gpcclk rate. Default values are 0x400.
4160 */
4161 gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
4162 gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
4163 gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
4164
4165 /* enable fifo access */
4166 gk20a_writel(g, gr_gpfifo_ctl_r(),
4167 gr_gpfifo_ctl_access_enabled_f() |
4168 gr_gpfifo_ctl_semaphore_access_enabled_f());
4169
4170 /* TBD: reload gr ucode when needed */
4171
4172 /* enable interrupts */
4173 gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
4174 gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
4175
4176 /* enable fecs error interrupts */
4177 gk20a_writel(g, gr_fecs_host_int_enable_r(),
4178 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
4179 gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
4180 gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
4181 gr_fecs_host_int_enable_watchdog_enable_f());
4182
4183 g->ops.gr.enable_hww_exceptions(g);
4184 g->ops.gr.set_hww_esr_report_mask(g);
4185
4186 /* enable per GPC exceptions */
4187 gk20a_gr_enable_gpc_exceptions(g);
4188
4189 /* TBD: ECC for L1/SM */
4190 /* TBD: enable per BE exceptions */
4191
4192 /* reset and enable all exceptions */
4193 gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
4194 gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
4195 gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
4196 gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
4197 gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
4198 gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
4199
4200 /* ignore status from some units */
4201 data = gk20a_readl(g, gr_status_mask_r());
4202 gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
4203
4204 g->ops.ltc.init_zbc(g, gr);
4205 g->ops.ltc.init_cbc(g, gr);
4206
4207 /* load ctx init */
4208 for (i = 0; i < sw_ctx_load->count; i++)
4209 gk20a_writel(g, sw_ctx_load->l[i].addr,
4210 sw_ctx_load->l[i].value);
4211
4212 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4213 if (err)
4214 goto out;
4215
4216 /* save and disable fe_go_idle */
4217 fe_go_idle_timeout_save =
4218 gk20a_readl(g, gr_fe_go_idle_timeout_r());
4219 gk20a_writel(g, gr_fe_go_idle_timeout_r(),
4220 (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
4221 gr_fe_go_idle_timeout_count_disabled_f());
4222
4223 /* override a few ctx state registers */
4224 g->ops.gr.commit_global_cb_manager(g, NULL, false);
4225 gr_gk20a_commit_global_timeslice(g, NULL, false);
4226
4227 /* floorsweep anything left */
4228 g->ops.gr.init_fs_state(g);
4229
4230 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4231 if (err)
4232 goto restore_fe_go_idle;
4233
4234restore_fe_go_idle:
4235 /* restore fe_go_idle */
4236 gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
4237
4238 if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
4239 goto out;
4240
4241 /* load method init */
4242 if (sw_method_init->count) {
4243 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4244 sw_method_init->l[0].value);
4245 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4246 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4247 sw_method_init->l[0].addr);
4248 last_method_data = sw_method_init->l[0].value;
4249 }
4250 for (i = 1; i < sw_method_init->count; i++) {
4251 if (sw_method_init->l[i].value != last_method_data) {
4252 gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
4253 sw_method_init->l[i].value);
4254 last_method_data = sw_method_init->l[i].value;
4255 }
4256 gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
4257 gr_pri_mme_shadow_raw_index_write_trigger_f() |
4258 sw_method_init->l[i].addr);
4259 }
4260
4261 gk20a_mm_l2_invalidate(g);
4262
4263 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4264 if (err)
4265 goto out;
4266
4267out:
4268 gk20a_dbg_fn("done");
4269 	return err;
4270}
4271
4272static int gk20a_init_gr_prepare(struct gk20a *g)
4273{
4274 u32 gpfifo_ctrl, pmc_en;
4275 u32 err = 0;
4276
4277 /* disable fifo access */
4278 pmc_en = gk20a_readl(g, mc_enable_r());
4279 if (pmc_en & mc_enable_pgraph_enabled_f()) {
4280 gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
4281 gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
4282 gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
4283 }
4284
4285 /* reset gr engine */
4286 gk20a_reset(g, mc_enable_pgraph_enabled_f()
4287 | mc_enable_blg_enabled_f()
4288 | mc_enable_perfmon_enabled_f());
4289
4290 /* enable fifo access */
4291 gk20a_writel(g, gr_gpfifo_ctl_r(),
4292 gr_gpfifo_ctl_access_enabled_f() |
4293 gr_gpfifo_ctl_semaphore_access_enabled_f());
4294
4295 if (!g->gr.ctx_vars.valid) {
4296 err = gr_gk20a_init_ctx_vars(g, &g->gr);
4297 if (err)
4298 gk20a_err(dev_from_gk20a(g),
4299 "fail to load gr init ctx");
4300 }
4301 return err;
4302}
4303
4304static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
4305{
4306 int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
4307 bool fecs_scrubbing;
4308 bool gpccs_scrubbing;
4309
4310 gk20a_dbg_fn("");
4311
4312 do {
4313 fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
4314 (gr_fecs_dmactl_imem_scrubbing_m() |
4315 gr_fecs_dmactl_dmem_scrubbing_m());
4316
4317 gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
4318 (gr_gpccs_dmactl_imem_scrubbing_m() |
4319 			 gr_gpccs_dmactl_dmem_scrubbing_m());
4320
4321 if (!fecs_scrubbing && !gpccs_scrubbing) {
4322 gk20a_dbg_fn("done");
4323 return 0;
4324 }
4325
4326 udelay(GR_IDLE_CHECK_DEFAULT);
4327 } while (--retries || !tegra_platform_is_silicon());
4328
4329 gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
4330 return -ETIMEDOUT;
4331}
4332
4333static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
4334{
4335 struct gr_gk20a *gr = &g->gr;
4336 struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
4337 unsigned long end_jiffies = jiffies +
4338 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
4339 u32 i, err = 0;
4340
4341 gk20a_dbg_fn("");
4342
4343 /* enable interrupts */
4344 gk20a_writel(g, gr_intr_r(), ~0);
4345 gk20a_writel(g, gr_intr_en_r(), ~0);
4346
4347 /* reset ctx switch state */
4348 gr_gk20a_ctx_reset(g, 0);
4349
4350 /* clear scc ram */
4351 gk20a_writel(g, gr_scc_init_r(),
4352 gr_scc_init_ram_trigger_f());
4353
4354 /* load non_ctx init */
4355 for (i = 0; i < sw_non_ctx_load->count; i++)
4356 gk20a_writel(g, sw_non_ctx_load->l[i].addr,
4357 sw_non_ctx_load->l[i].value);
4358
4359 err = gr_gk20a_wait_mem_scrubbing(g);
4360 if (err)
4361 goto out;
4362
4363 err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
4364 if (err)
4365 goto out;
4366
4367 err = gr_gk20a_load_ctxsw_ucode(g, gr);
4368 if (err)
4369 goto out;
4370
4371 	/* this appears to query sw state, but fecs actually initializes the
4372 	   ramchain etc., so this is really hw init */
4373 err = gr_gk20a_init_ctx_state(g, gr);
4374 if (err)
4375 goto out;
4376
4377out:
4378 if (err)
4379 gk20a_err(dev_from_gk20a(g), "fail");
4380 else
4381 gk20a_dbg_fn("done");
4382
4383 	return err;
4384}
4385
4386/*
4387 * XXX Merge this list with the debugger/profiler
4388 * session regops whitelists?
4389 */
4390static u32 wl_addr_gk20a[] = {
4391 /* this list must be sorted (low to high) */
4392 0x404468, /* gr_pri_mme_max_instructions */
4393 0x418800, /* gr_pri_gpcs_setup_debug */
4394 0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg */
4395 0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg */
4396 0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
4397 0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl */
4398};
4399
4400static int gr_gk20a_init_access_map(struct gk20a *g)
4401{
4402 struct gr_gk20a *gr = &g->gr;
4403 void *data;
4404 int err = 0;
4405 u32 w, nr_pages =
4406 DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
4407 PAGE_SIZE);
4408
4409 data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].pages,
4410 PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].size) >>
4411 PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
4412 if (!data) {
4413 gk20a_err(dev_from_gk20a(g),
4414 "failed to map priv access map memory");
4415 err = -ENOMEM;
4416 goto clean_up;
4417 }
4418
4419 memset(data, 0x0, PAGE_SIZE * nr_pages);
4420
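	/*
	 * Each whitelisted priv register (4-byte aligned) maps to a single
	 * bit in the access map: word index = addr >> 2, then split into a
	 * byte offset and a bit position within that byte.
	 */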
4421 for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
4422 u32 map_bit, map_byte, map_shift;
4423 map_bit = wl_addr_gk20a[w] >> 2;
4424 map_byte = map_bit >> 3;
4425 map_shift = map_bit & 0x7; /* i.e. 0-7 */
4426 gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
4427 wl_addr_gk20a[w], map_byte, map_shift);
4428 ((u8 *)data)[map_byte] |= 1 << map_shift;
4429 }
4430
4431clean_up:
4432 if (data)
4433 vunmap(data);
4434 	return err;
4435}
4436
4437static int gk20a_init_gr_setup_sw(struct gk20a *g)
4438{
4439 struct gr_gk20a *gr = &g->gr;
4440 int err;
4441
4442 gk20a_dbg_fn("");
4443
4444 if (gr->sw_ready) {
4445 gk20a_dbg_fn("skip init");
4446 return 0;
4447 }
4448
4449 gr->g = g;
4450
4451 err = gr_gk20a_init_gr_config(g, gr);
4452 if (err)
4453 goto clean_up;
4454
4455 err = gr_gk20a_init_mmu_sw(g, gr);
4456 if (err)
4457 goto clean_up;
4458
4459 err = gr_gk20a_init_map_tiles(g, gr);
4460 if (err)
4461 goto clean_up;
4462
4463 if (tegra_cpu_is_asim())
4464 gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
4465 else {
4466 gk20a_dbg_info("total ram pages : %lu", totalram_pages);
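		/* totalram_pages is in PAGE_SIZE units; the shift converts
		 * it to MBs of system memory worth of comptag coverage */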
4467 gr->max_comptag_mem = totalram_pages
4468 >> (10 - (PAGE_SHIFT - 10));
4469 }
4470 err = g->ops.ltc.init_comptags(g, gr);
4471 if (err)
4472 goto clean_up;
4473
4474 err = gr_gk20a_init_zcull(g, gr);
4475 if (err)
4476 goto clean_up;
4477
4478 err = gr_gk20a_alloc_global_ctx_buffers(g);
4479 if (err)
4480 goto clean_up;
4481
4482 err = gr_gk20a_init_access_map(g);
4483 if (err)
4484 goto clean_up;
4485
4486 mutex_init(&gr->ctx_mutex);
4487 spin_lock_init(&gr->ch_tlb_lock);
4488
4489 gr->remove_support = gk20a_remove_gr_support;
4490 gr->sw_ready = true;
4491
4492 gk20a_dbg_fn("done");
4493 return 0;
4494
4495clean_up:
4496 gk20a_err(dev_from_gk20a(g), "fail");
4497 gk20a_remove_gr_support(gr);
4498 return err;
4499}
4500
4501int gk20a_init_gr_support(struct gk20a *g)
4502{
4503 u32 err;
4504
4505 gk20a_dbg_fn("");
4506
4507 err = gk20a_init_gr_prepare(g);
4508 if (err)
4509 return err;
4510
4511 /* this is required before gr_gk20a_init_ctx_state */
4512 mutex_init(&g->gr.fecs_mutex);
4513
4514 err = gk20a_init_gr_reset_enable_hw(g);
4515 if (err)
4516 return err;
4517
4518 err = gk20a_init_gr_setup_sw(g);
4519 if (err)
4520 return err;
4521
4522 err = gk20a_init_gr_setup_hw(g);
4523 if (err)
4524 return err;
4525
4526 return 0;
4527}
4528
4529#define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
4530#define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
4531#define NVA297_SET_SHADER_EXCEPTIONS 0x1528
4532#define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
4533
4534#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
4535
4536struct gr_isr_data {
4537 u32 addr;
4538 u32 data_lo;
4539 u32 data_hi;
4540 u32 curr_ctx;
4541 u32 chid;
4542 u32 offset;
4543 u32 sub_chan;
4544 u32 class_num;
4545};
4546
4547void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
4548{
4549 gk20a_dbg_fn("");
4550
4551 if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
4552 gk20a_writel(g,
4553 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
4554 gk20a_writel(g,
4555 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
4556 } else {
4557 /* setup sm warp esr report masks */
4558 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
4559 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
4560 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
4561 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
4562 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
4563 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
4564 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
4565 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
4566 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
4567 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
4568 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
4569 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
4570 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
4571 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
4572 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
4573 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
4574 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
4575 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
4576 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
4577 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
4578 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
4579
4580 /* setup sm global esr report mask */
4581 gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
4582 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
4583 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
4584 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
4585 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
4586 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
4587 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
4588 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
4589 }
4590}
4591
4592static void gk20a_gr_set_circular_buffer_size(struct gk20a *g, u32 data)
4593{
4594 struct gr_gk20a *gr = &g->gr;
4595 u32 gpc_index, ppc_index, stride, val, offset;
4596 u32 cb_size = data * 4;
4597
4598 gk20a_dbg_fn("");
4599
4600 if (cb_size > gr->attrib_cb_size)
4601 cb_size = gr->attrib_cb_size;
4602
4603 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4604 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4605 ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
4606 gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
4607
4608 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4609 stride = proj_gpc_stride_v() * gpc_index;
4610
4611 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4612 ppc_index++) {
4613
4614 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
4615 stride +
4616 proj_ppc_in_gpc_stride_v() * ppc_index);
4617
4618 offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
4619
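			/* write the new beta cb size with start_offset bumped
			 * by one, then restore the original offset; this
			 * appears to be how the cbm config is latched */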
4620 val = set_field(val,
4621 gr_gpc0_ppc0_cbm_cfg_size_m(),
4622 gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
4623 gr->pes_tpc_count[ppc_index][gpc_index]));
4624 val = set_field(val,
4625 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4626 (offset + 1));
4627
4628 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4629 stride +
4630 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4631
4632 val = set_field(val,
4633 gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
4634 offset);
4635
4636 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
4637 stride +
4638 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4639 }
4640 }
4641}
4642
4643static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g, u32 data)
4644{
4645 struct gr_gk20a *gr = &g->gr;
4646 u32 gpc_index, ppc_index, stride, val;
4647 u32 pd_ab_max_output;
4648 u32 alpha_cb_size = data * 4;
4649
4650 gk20a_dbg_fn("");
4651 /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
4652 return; */
4653
4654 if (alpha_cb_size > gr->alpha_cb_size)
4655 alpha_cb_size = gr->alpha_cb_size;
4656
4657 gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
4658 (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
4659 ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
4660 gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
4661
4662 pd_ab_max_output = alpha_cb_size *
4663 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
4664 gr_pd_ab_dist_cfg1_max_output_granularity_v();
4665
4666 gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
4667 gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
4668
4669 for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
4670 stride = proj_gpc_stride_v() * gpc_index;
4671
4672 for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
4673 ppc_index++) {
4674
4675 val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4676 stride +
4677 proj_ppc_in_gpc_stride_v() * ppc_index);
4678
4679 val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
4680 gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
4681 gr->pes_tpc_count[ppc_index][gpc_index]));
4682
4683 gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
4684 stride +
4685 proj_ppc_in_gpc_stride_v() * ppc_index, val);
4686 }
4687 }
4688}
4689
4690void gk20a_gr_reset(struct gk20a *g)
4691{
4692 int err;
4693 err = gk20a_init_gr_prepare(g);
4694 BUG_ON(err);
4695 err = gk20a_init_gr_reset_enable_hw(g);
4696 BUG_ON(err);
4697 err = gk20a_init_gr_setup_hw(g);
4698 BUG_ON(err);
4699}
4700
4701static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
4702 u32 class_num, u32 offset, u32 data)
4703{
4704 gk20a_dbg_fn("");
4705
4706 if (class_num == KEPLER_COMPUTE_A) {
4707 switch (offset << 2) {
4708 case NVA0C0_SET_SHADER_EXCEPTIONS:
4709 gk20a_gr_set_shader_exceptions(g, data);
4710 break;
4711 default:
4712 goto fail;
4713 }
4714 }
4715
4716 if (class_num == KEPLER_C) {
4717 switch (offset << 2) {
4718 case NVA297_SET_SHADER_EXCEPTIONS:
4719 gk20a_gr_set_shader_exceptions(g, data);
4720 break;
4721 case NVA297_SET_CIRCULAR_BUFFER_SIZE:
4722 g->ops.gr.set_circular_buffer_size(g, data);
4723 break;
4724 case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
4725 g->ops.gr.set_alpha_circular_buffer_size(g, data);
4726 break;
4727 default:
4728 goto fail;
4729 }
4730 }
4731 return 0;
4732
4733fail:
4734 return -EINVAL;
4735}
4736
4737static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
4738 struct gr_isr_data *isr_data)
4739{
4740 struct fifo_gk20a *f = &g->fifo;
4741 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4742 gk20a_dbg_fn("");
4743 gk20a_set_error_notifier(ch,
4744 NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
4745 gk20a_err(dev_from_gk20a(g),
4746 "gr semaphore timeout\n");
4747 return -EINVAL;
4748}
4749
4750static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
4751 struct gr_isr_data *isr_data)
4752{
4753 struct fifo_gk20a *f = &g->fifo;
4754 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4755 gk20a_dbg_fn("");
4756 gk20a_set_error_notifier(ch,
4757 NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
4758 /* This is an unrecoverable error, reset is needed */
4759 gk20a_err(dev_from_gk20a(g),
4760 		"gr illegal notify pending\n");
4761 return -EINVAL;
4762}
4763
4764static int gk20a_gr_handle_illegal_method(struct gk20a *g,
4765 struct gr_isr_data *isr_data)
4766{
4767 int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
4768 isr_data->class_num, isr_data->offset,
4769 isr_data->data_lo);
4770 if (ret)
4771 gk20a_err(dev_from_gk20a(g), "invalid method class 0x%08x"
4772 ", offset 0x%08x address 0x%08x\n",
4773 isr_data->class_num, isr_data->offset, isr_data->addr);
4774
4775 return ret;
4776}
4777
4778static int gk20a_gr_handle_illegal_class(struct gk20a *g,
4779 struct gr_isr_data *isr_data)
4780{
4781 struct fifo_gk20a *f = &g->fifo;
4782 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4783 gk20a_dbg_fn("");
4784 gk20a_set_error_notifier(ch,
4785 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4786 gk20a_err(dev_from_gk20a(g),
4787 "invalid class 0x%08x, offset 0x%08x",
4788 isr_data->class_num, isr_data->offset);
4789 return -EINVAL;
4790}
4791
4792static int gk20a_gr_handle_class_error(struct gk20a *g,
4793 struct gr_isr_data *isr_data)
4794{
4795 struct fifo_gk20a *f = &g->fifo;
4796 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4797 gk20a_dbg_fn("");
4798
4799 gk20a_set_error_notifier(ch,
4800 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
4801 gk20a_err(dev_from_gk20a(g),
4802 "class error 0x%08x, offset 0x%08x",
4803 isr_data->class_num, isr_data->offset);
4804 return -EINVAL;
4805}
4806
4807static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
4808 struct gr_isr_data *isr_data)
4809{
4810 struct fifo_gk20a *f = &g->fifo;
4811 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4812
4813 wake_up(&ch->semaphore_wq);
4814
4815 return 0;
4816}
4817
4818#if defined(CONFIG_GK20A_CYCLE_STATS)
4819static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
4820 u32 offset)
4821{
4822 /* support only 24-bit 4-byte aligned offsets */
4823 bool valid = !(offset & 0xFF000003);
4824 /* whitelist check */
4825 valid = valid &&
4826 is_bar0_global_offset_whitelisted_gk20a(offset);
4827 /* resource size check in case there was a problem
4828 * with allocating the assumed size of bar0 */
4829 valid = valid &&
4830 offset < resource_size(g->reg_mem);
4831 return valid;
4832}
4833#endif
4834
4835static int gk20a_gr_handle_notify_pending(struct gk20a *g,
4836 struct gr_isr_data *isr_data)
4837{
4838 struct fifo_gk20a *f = &g->fifo;
4839 struct channel_gk20a *ch = &f->channel[isr_data->chid];
4840
4841#if defined(CONFIG_GK20A_CYCLE_STATS)
4842 void *virtual_address;
4843 u32 buffer_size;
4844 u32 offset;
4845 u32 new_offset;
4846 bool exit;
4847 struct share_buffer_head *sh_hdr;
4848 u32 raw_reg;
4849 u64 mask_orig;
4850 u64 v = 0;
4851 struct gk20a_cyclestate_buffer_elem *op_elem;
4852 /* GL will never use payload 0 for cycle state */
4853 if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
4854 return 0;
4855
4856 mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
4857
4858 virtual_address = ch->cyclestate.cyclestate_buffer;
4859 buffer_size = ch->cyclestate.cyclestate_buffer_size;
4860 offset = isr_data->data_lo;
4861 exit = false;
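	/* walk the shared cyclestats buffer, executing each BAR0 read/write
	 * request in place until an OP_END header is reached */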
4862 while (!exit) {
4863 if (offset >= buffer_size) {
4864 WARN_ON(1);
4865 break;
4866 }
4867
4868 sh_hdr = (struct share_buffer_head *)
4869 ((char *)virtual_address + offset);
4870
4871 if (sh_hdr->size < sizeof(struct share_buffer_head)) {
4872 WARN_ON(1);
4873 break;
4874 }
4875 new_offset = offset + sh_hdr->size;
4876
4877 switch (sh_hdr->operation) {
4878 case OP_END:
4879 exit = true;
4880 break;
4881
4882 case BAR0_READ32:
4883 case BAR0_WRITE32:
4884 {
4885 bool valid;
4886 op_elem =
4887 (struct gk20a_cyclestate_buffer_elem *)
4888 sh_hdr;
4889 valid = is_valid_cyclestats_bar0_offset_gk20a(g,
4890 op_elem->offset_bar0);
4891 if (!valid) {
4892 gk20a_err(dev_from_gk20a(g),
4893 					"invalid cyclestats op offset: 0x%x\n",
4894 op_elem->offset_bar0);
4895
4896 sh_hdr->failed = exit = true;
4897 break;
4898 }
4899
4900
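			/* build a mask covering bits first_bit..last_bit
			 * (inclusive) of the target register */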
4901 			mask_orig =
4902 				((1ULL <<
4903 				  (op_elem->last_bit + 1)) - 1) &
4904 				~((1ULL <<
4905 				   op_elem->first_bit) - 1);
4906
4907 raw_reg =
4908 gk20a_readl(g,
4909 op_elem->offset_bar0);
4910
4911 switch (sh_hdr->operation) {
4912 case BAR0_READ32:
4913 op_elem->data =
4914 (raw_reg & mask_orig)
4915 >> op_elem->first_bit;
4916 break;
4917
4918 case BAR0_WRITE32:
4919 v = 0;
4920 if ((unsigned int)mask_orig !=
4921 (unsigned int)~0) {
4922 v = (unsigned int)
4923 (raw_reg & ~mask_orig);
4924 }
4925
4926 v |= ((op_elem->data
4927 << op_elem->first_bit)
4928 & mask_orig);
4929
4930 gk20a_writel(g,
4931 op_elem->offset_bar0,
4932 (unsigned int)v);
4933 break;
4934 default:
4935 				/* nop ok? */
4936 break;
4937 }
4938 }
4939 break;
4940
4941 default:
4942 /* no operation content case */
4943 exit = true;
4944 break;
4945 }
4946 sh_hdr->completed = true;
4947 offset = new_offset;
4948 }
4949 mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
4950#endif
4951 gk20a_dbg_fn("");
4952 wake_up(&ch->notifier_wq);
4953 return 0;
4954}
4955
4956/* Used by sw interrupt thread to translate current ctx to chid.
4957 * For performance, we don't want to go through 128 channels every time.
4958 * A small tlb is used here to cache translation */
4959static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
4960{
4961 struct fifo_gk20a *f = &g->fifo;
4962 struct gr_gk20a *gr = &g->gr;
4963 u32 chid = -1;
4964 u32 i;
4965
4966 spin_lock(&gr->ch_tlb_lock);
4967
4968 /* check cache first */
4969 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
4970 if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
4971 chid = gr->chid_tlb[i].hw_chid;
4972 goto unlock;
4973 }
4974 }
4975
4976 /* slow path */
4977 for (chid = 0; chid < f->num_channels; chid++)
4978 if (f->channel[chid].in_use) {
4979 if ((u32)(f->channel[chid].inst_block.cpu_pa >>
4980 ram_in_base_shift_v()) ==
4981 gr_fecs_current_ctx_ptr_v(curr_ctx))
4982 break;
4983 }
4984
4985 if (chid >= f->num_channels) {
4986 chid = -1;
4987 goto unlock;
4988 }
4989
4990 /* add to free tlb entry */
4991 for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
4992 if (gr->chid_tlb[i].curr_ctx == 0) {
4993 gr->chid_tlb[i].curr_ctx = curr_ctx;
4994 gr->chid_tlb[i].hw_chid = chid;
4995 goto unlock;
4996 }
4997 }
4998
4999 /* no free entry, flush one */
5000 gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
5001 gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
5002
5003 gr->channel_tlb_flush_index =
5004 (gr->channel_tlb_flush_index + 1) &
5005 (GR_CHANNEL_MAP_TLB_SIZE - 1);
5006
5007unlock:
5008 spin_unlock(&gr->ch_tlb_lock);
5009 return chid;
5010}
5011
5012static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
5013{
5014 unsigned long end_jiffies = jiffies +
5015 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5016 u32 delay = GR_IDLE_CHECK_DEFAULT;
5017 bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
5018 u32 dbgr_control0;
5019
5020 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locking down SM");
5021
5022 /* assert stop trigger */
5023 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5024 dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5025 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5026
5027 /* wait for the sm to lock down */
5028 do {
5029 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5030 u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5031 u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
5032 bool locked_down =
5033 (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
5034 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
5035 bool error_pending =
5036 (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
5037 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
5038 ((global_esr & ~global_esr_mask) != 0);
5039
5040 if (locked_down || !error_pending) {
5041 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locked down SM");
5042
5043 /* de-assert stop trigger */
5044 dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
5045 gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
5046
5047 return 0;
5048 }
5049
5050 /* if an mmu fault is pending and mmu debug mode is not
5051 * enabled, the sm will never lock down. */
5052 if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
5053 gk20a_err(dev_from_gk20a(g), "mmu fault pending, sm will"
5054 " never lock down!");
5055 return -EFAULT;
5056 }
5057
5058 usleep_range(delay, delay * 2);
5059 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
5060
5061 } while (time_before(jiffies, end_jiffies)
5062 || !tegra_platform_is_silicon());
5063
5064 gk20a_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
5065
5066 return -EAGAIN;
5067}
5068
5069bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
5070{
5071 u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
5072
5073 /* check if an sm debugger is attached */
5074 if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
5075 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
5076 return true;
5077
5078 return false;
5079}
5080
5081static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
5082{
5083 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
5084
5085 /* clear the warp hww */
5086 gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
5087 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
5088}
5089
5090static struct channel_gk20a *
5091channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
5092{
5093 return g->fifo.channel+hw_chid;
5094}
5095
5096static int gk20a_gr_handle_sm_exception(struct gk20a *g,
5097 struct gr_isr_data *isr_data)
5098{
5099 int ret = 0;
5100 bool do_warp_sync = false;
5101 /* these three interrupts don't require locking down the SM. They can
5102 * be handled by usermode clients as they aren't fatal. Additionally,
5103 * usermode clients may wish to allow some warps to execute while others
5104 * are at breakpoints, as opposed to fatal errors where all warps should
5105 * halt. */
5106 u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
5107 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
5108 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
5109 u32 global_esr, warp_esr;
5110 bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
5111 struct channel_gk20a *fault_ch;
5112
5113 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
5114
5115 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5116 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
5117
5118 /* if an sm debugger is attached, disable forwarding of tpc exceptions.
5119 * the debugger will reenable exceptions after servicing them. */
5120 if (sm_debugger_attached) {
5121 u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
5122 tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
5123 gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
5124 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM debugger attached");
5125 }
5126
5127 /* if a debugger is present and an error has occurred, do a warp sync */
5128 if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
5129 gk20a_dbg(gpu_dbg_intr, "warp sync needed");
5130 do_warp_sync = true;
5131 }
5132
5133 if (do_warp_sync) {
5134 ret = gk20a_gr_lock_down_sm(g, global_mask);
5135 if (ret) {
5136 gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
5137 return ret;
5138 }
5139 }
5140
5141 /* finally, signal any client waiting on an event */
5142 fault_ch = channel_from_hw_chid(g, isr_data->chid);
5143 if (fault_ch)
5144 gk20a_dbg_gpu_post_events(fault_ch);
5145
5146 return ret;
5147}
5148
5149static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
5150 struct gr_isr_data *isr_data)
5151{
5152 int ret = 0;
5153 u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
5154
5155 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5156
5157	/* check if an sm exception is pending */
5158 if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
5159 gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
5160 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM exception pending");
5161 ret = gk20a_gr_handle_sm_exception(g, isr_data);
5162 }
5163
5164 return ret;
5165}
5166
5167static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
5168 struct gr_isr_data *isr_data)
5169{
5170 int ret = 0;
5171 u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
5172
5173 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
5174
5175 /* check if tpc 0 has an exception */
5176 if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
5177 gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
5178 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "TPC exception pending");
5179 ret = gk20a_gr_handle_tpc_exception(g, isr_data);
5180 }
5181
5182 return ret;
5183}
5184
5185int gk20a_gr_isr(struct gk20a *g)
5186{
5187 struct gr_isr_data isr_data;
5188 u32 grfifo_ctl;
5189 u32 obj_table;
5190 int need_reset = 0;
5191 u32 gr_intr = gk20a_readl(g, gr_intr_r());
5192
5193 gk20a_dbg_fn("");
5194 gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
5195
5196 if (!gr_intr)
5197 return 0;
5198
5199 grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
5200 grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
5201 grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
5202
5203 gk20a_writel(g, gr_gpfifo_ctl_r(),
5204 grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
5205 gr_gpfifo_ctl_semaphore_access_f(0));
5206
5207 isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
5208 isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
5209 isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
5210 isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
5211 isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
5212 isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
5213 obj_table = gk20a_readl(g,
5214 gr_fe_object_table_r(isr_data.sub_chan));
5215 isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
5216
5217 isr_data.chid =
5218 gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
5219 if (isr_data.chid == -1) {
5220 gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
5221 isr_data.curr_ctx);
5222 goto clean_up;
5223 }
5224
5225 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5226 "channel %d: addr 0x%08x, "
5227		   "data 0x%08x 0x%08x, "
5228 "ctx 0x%08x, offset 0x%08x, "
5229 "subchannel 0x%08x, class 0x%08x",
5230 isr_data.chid, isr_data.addr,
5231 isr_data.data_hi, isr_data.data_lo,
5232 isr_data.curr_ctx, isr_data.offset,
5233 isr_data.sub_chan, isr_data.class_num);
5234
5235 if (gr_intr & gr_intr_notify_pending_f()) {
5236 gk20a_gr_handle_notify_pending(g, &isr_data);
5237 gk20a_writel(g, gr_intr_r(),
5238 gr_intr_notify_reset_f());
5239 gr_intr &= ~gr_intr_notify_pending_f();
5240 }
5241
5242 if (gr_intr & gr_intr_semaphore_pending_f()) {
5243 gk20a_gr_handle_semaphore_pending(g, &isr_data);
5244 gk20a_writel(g, gr_intr_r(),
5245 gr_intr_semaphore_reset_f());
5246 gr_intr &= ~gr_intr_semaphore_pending_f();
5247 }
5248
5249 if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
5250 need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
5251 &isr_data);
5252 gk20a_writel(g, gr_intr_r(),
5253 gr_intr_semaphore_reset_f());
5254		gr_intr &= ~gr_intr_semaphore_timeout_pending_f();
5255 }
5256
5257 if (gr_intr & gr_intr_illegal_notify_pending_f()) {
5258 need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
5259 &isr_data);
5260 gk20a_writel(g, gr_intr_r(),
5261 gr_intr_illegal_notify_reset_f());
5262 gr_intr &= ~gr_intr_illegal_notify_pending_f();
5263 }
5264
5265 if (gr_intr & gr_intr_illegal_method_pending_f()) {
5266 need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
5267 gk20a_writel(g, gr_intr_r(),
5268 gr_intr_illegal_method_reset_f());
5269 gr_intr &= ~gr_intr_illegal_method_pending_f();
5270 }
5271
5272 if (gr_intr & gr_intr_illegal_class_pending_f()) {
5273 need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
5274 gk20a_writel(g, gr_intr_r(),
5275 gr_intr_illegal_class_reset_f());
5276 gr_intr &= ~gr_intr_illegal_class_pending_f();
5277 }
5278
5279 if (gr_intr & gr_intr_class_error_pending_f()) {
5280 need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
5281 gk20a_writel(g, gr_intr_r(),
5282 gr_intr_class_error_reset_f());
5283 gr_intr &= ~gr_intr_class_error_pending_f();
5284 }
5285
5286 /* this one happens if someone tries to hit a non-whitelisted
5287 * register using set_falcon[4] */
5288 if (gr_intr & gr_intr_firmware_method_pending_f()) {
5289 need_reset |= true;
5290 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
5291 gk20a_writel(g, gr_intr_r(),
5292 gr_intr_firmware_method_reset_f());
5293 gr_intr &= ~gr_intr_firmware_method_pending_f();
5294 }
5295
5296 if (gr_intr & gr_intr_exception_pending_f()) {
5297 u32 exception = gk20a_readl(g, gr_exception_r());
5298 struct fifo_gk20a *f = &g->fifo;
5299 struct channel_gk20a *ch = &f->channel[isr_data.chid];
5300
5301 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
5302
5303 if (exception & gr_exception_fe_m()) {
5304 u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
5305 gk20a_dbg(gpu_dbg_intr, "fe warning %08x\n", fe);
5306 gk20a_writel(g, gr_fe_hww_esr_r(), fe);
5307 }
5308
5309 /* check if a gpc exception has occurred */
5310 if (exception & gr_exception_gpc_m() && need_reset == 0) {
5311 u32 exception1 = gk20a_readl(g, gr_exception1_r());
5312 u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
5313
5314 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending");
5315
5316 /* if no sm debugger is present, clean up the channel */
5317 if (!gk20a_gr_sm_debugger_attached(g)) {
5318 gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
5319 "SM debugger not attached, clearing interrupt");
5320 need_reset |= -EFAULT;
5321 } else {
5322 /* check if gpc 0 has an exception */
5323 if (exception1 & gr_exception1_gpc_0_pending_f())
5324 need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
5325 /* clear the hwws, also causes tpc and gpc
5326 * exceptions to be cleared */
5327 gk20a_gr_clear_sm_hww(g, global_esr);
5328 }
5329
5330 if (need_reset)
5331 gk20a_set_error_notifier(ch,
5332 NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
5333 }
5334
5335 gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
5336 gr_intr &= ~gr_intr_exception_pending_f();
5337 }
5338
5339 if (need_reset)
5340 gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
5341
5342clean_up:
5343 gk20a_writel(g, gr_gpfifo_ctl_r(),
5344 grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
5345 gr_gpfifo_ctl_semaphore_access_f(1));
5346
5347 if (gr_intr)
5348 gk20a_err(dev_from_gk20a(g),
5349 "unhandled gr interrupt 0x%08x", gr_intr);
5350
5351 return 0;
5352}
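
#if 0
/*
 * Editorial sketch, not part of the original driver: every interrupt branch
 * in gk20a_gr_isr() follows the same acknowledge pattern, which a
 * hypothetical helper could capture as:
 */
static void gk20a_gr_ack_intr(struct gk20a *g, u32 *gr_intr,
			      u32 pending_f, u32 reset_f)
{
	gk20a_writel(g, gr_intr_r(), reset_f);	/* ack the bit in hardware */
	*gr_intr &= ~pending_f;			/* drop it from the local copy */
}

/* e.g. the notify branch would then read:
 *	gk20a_gr_handle_notify_pending(g, &isr_data);
 *	gk20a_gr_ack_intr(g, &gr_intr, gr_intr_notify_pending_f(),
 *			  gr_intr_notify_reset_f());
 */
#endif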
5353
5354int gk20a_gr_nonstall_isr(struct gk20a *g)
5355{
5356 u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
5357 u32 clear_intr = 0;
5358
5359 gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
5360
5361 if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
5362 gk20a_channel_semaphore_wakeup(g);
5363 clear_intr |= gr_intr_nonstall_trap_pending_f();
5364 }
5365
5366 gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
5367
5368 return 0;
5369}
5370
5371int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
5372{
5373 BUG_ON(size == NULL);
5374 return gr_gk20a_submit_fecs_method_op(g,
5375 (struct fecs_method_op_gk20a) {
5376 .mailbox.id = 0,
5377 .mailbox.data = 0,
5378 .mailbox.clr = ~0,
5379 .method.data = 1,
5380 .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
5381 .mailbox.ret = size,
5382 .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
5383 .mailbox.ok = 0,
5384 .cond.fail = GR_IS_UCODE_OP_SKIP,
5385 .mailbox.fail = 0});
5386}
5387
5388int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
5389{
5390 return gr_gk20a_submit_fecs_method_op(g,
5391 (struct fecs_method_op_gk20a){
5392 .mailbox.id = 4,
5393 .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
5394 gr_fecs_current_ctx_valid_f(1) |
5395 gr_fecs_current_ctx_target_vid_mem_f()),
5396 .mailbox.clr = ~0,
5397 .method.data = 1,
5398 .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
5399 .mailbox.ret = NULL,
5400 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5401 .mailbox.ok = 1,
5402 .cond.fail = GR_IS_UCODE_OP_SKIP,
5403 .mailbox.fail = 0});
5404}
5405
5406int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va)
5407{
5408 return gr_gk20a_submit_fecs_method_op(g,
5409 (struct fecs_method_op_gk20a) {
5410 .mailbox.id = 4,
5411 .mailbox.data = u64_lo32(pmu_va >> 8),
5412 .mailbox.clr = ~0,
5413 .method.data = 1,
5414 .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
5415 .mailbox.ret = NULL,
5416 .cond.ok = GR_IS_UCODE_OP_EQUAL,
5417 .mailbox.ok = 1,
5418 .cond.fail = GR_IS_UCODE_OP_SKIP,
5419 .mailbox.fail = 0});
5420}
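
#if 0
/*
 * Editorial usage sketch, not part of the original source: the three FECS
 * reglist methods above are typically used together.  'inst_block_pa' and
 * 'pmu_va' are placeholder names for the reglist instance block physical
 * address and its PMU virtual address.
 */
static int example_bind_reglist(struct gk20a *g, phys_addr_t inst_block_pa,
				u64 pmu_va)
{
	u32 reglist_size;	/* would size the PMU-side buffer backing pmu_va */
	int err;

	err = gr_gk20a_fecs_get_reglist_img_size(g, &reglist_size);
	if (!err)
		err = gr_gk20a_fecs_set_reglist_bind_inst(g, inst_block_pa);
	if (!err)
		err = gr_gk20a_fecs_set_reglist_virual_addr(g, pmu_va);
	return err;
}
#endif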
5421
5422int gk20a_gr_suspend(struct gk20a *g)
5423{
5424 unsigned long end_jiffies = jiffies +
5425 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
5426 u32 ret = 0;
5427
5428 gk20a_dbg_fn("");
5429
5430 ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
5431 if (ret)
5432 return ret;
5433
5434 gk20a_writel(g, gr_gpfifo_ctl_r(),
5435 gr_gpfifo_ctl_access_disabled_f());
5436
5437 /* disable gr intr */
5438 gk20a_writel(g, gr_intr_r(), 0);
5439 gk20a_writel(g, gr_intr_en_r(), 0);
5440
5441 /* disable all exceptions */
5442 gk20a_writel(g, gr_exception_r(), 0);
5443 gk20a_writel(g, gr_exception_en_r(), 0);
5444 gk20a_writel(g, gr_exception1_r(), 0);
5445 gk20a_writel(g, gr_exception1_en_r(), 0);
5446 gk20a_writel(g, gr_exception2_r(), 0);
5447 gk20a_writel(g, gr_exception2_en_r(), 0);
5448
5449 gk20a_gr_flush_channel_tlb(&g->gr);
5450
5451 gk20a_dbg_fn("done");
5452 return ret;
5453}
5454
5455static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
5456 u32 addr,
5457 bool is_quad, u32 quad,
5458 u32 *context_buffer,
5459 u32 context_buffer_size,
5460 u32 *priv_offset);
5461
5462/* This function will decode a priv address and return the partition type and numbers. */
5463int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
5464 int *addr_type, /* enum ctxsw_addr_type */
5465 u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
5466 u32 *broadcast_flags)
5467{
5468 u32 gpc_addr;
5469 u32 ppc_address;
5470 u32 ppc_broadcast_addr;
5471
5472 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5473
5474 /* setup defaults */
5475 ppc_address = 0;
5476 ppc_broadcast_addr = 0;
5477 *addr_type = CTXSW_ADDR_TYPE_SYS;
5478 *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
5479 *gpc_num = 0;
5480 *tpc_num = 0;
5481 *ppc_num = 0;
5482 *be_num = 0;
5483
5484 if (pri_is_gpc_addr(addr)) {
5485 *addr_type = CTXSW_ADDR_TYPE_GPC;
5486 gpc_addr = pri_gpccs_addr_mask(addr);
5487 if (pri_is_gpc_addr_shared(addr)) {
5488 *addr_type = CTXSW_ADDR_TYPE_GPC;
5489 *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
5490 } else
5491 *gpc_num = pri_get_gpc_num(addr);
5492
5493 if (pri_is_tpc_addr(gpc_addr)) {
5494 *addr_type = CTXSW_ADDR_TYPE_TPC;
5495 if (pri_is_tpc_addr_shared(gpc_addr)) {
5496 *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
5497 return 0;
5498 }
5499 *tpc_num = pri_get_tpc_num(gpc_addr);
5500 }
5501 return 0;
5502 } else if (pri_is_be_addr(addr)) {
5503 *addr_type = CTXSW_ADDR_TYPE_BE;
5504 if (pri_is_be_addr_shared(addr)) {
5505 *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
5506 return 0;
5507 }
5508 *be_num = pri_get_be_num(addr);
5509 return 0;
5510 } else {
5511 *addr_type = CTXSW_ADDR_TYPE_SYS;
5512 return 0;
5513 }
5514 /* PPC!?!?!?! */
5515
5516 /*NOTREACHED*/
5517 return -EINVAL;
5518}
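
/*
 * Editorial example (the numeric values are illustrative assumptions, not
 * quoted from hw_proj_gk20a.h): suppose proj_gpc_base_v() == 0x00500000,
 * proj_gpc_stride_v() == 0x8000, proj_tpc_in_gpc_base_v() == 0x4000 and
 * proj_tpc_in_gpc_stride_v() == 0x800.  A unicast address such as
 * 0x0050C500 then decodes as:
 *
 *   gpc_num  = (0x0050C500 - 0x00500000) / 0x8000 = 1
 *   gpc_addr = pri_gpccs_addr_mask(addr)          = 0x4500
 *   tpc_num  = (0x4500 - 0x4000) / 0x800          = 0
 *
 * giving addr_type = CTXSW_ADDR_TYPE_TPC, gpc 1, tpc 0, and no broadcast
 * flags.  A gr_gpcs_* or gr_gpcs_tpcs_* broadcast address instead sets
 * PRI_BROADCAST_FLAGS_GPC and/or PRI_BROADCAST_FLAGS_TPC.
 */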
5519
5520static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
5521 u32 gpc_num,
5522 u32 *priv_addr_table, u32 *t)
5523{
5524 u32 ppc_num;
5525
5526 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5527
5528 for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
5529 priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
5530 gpc_num, ppc_num);
5531
5532 return 0;
5533}
5534
5535/*
5536 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
5537 * unicast addresses. This function will convert a BE unicast address to a BE
5538 * broadcast address and split a GPC/TPC broadcast address into a table of
5539 * GPC/TPC addresses. The addresses generated by this function can be
5540 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
5541 */
5542static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
5543 u32 addr,
5544 u32 *priv_addr_table,
5545 u32 *num_registers)
5546{
5547 int addr_type; /*enum ctxsw_addr_type */
5548 u32 gpc_num, tpc_num, ppc_num, be_num;
5549 u32 broadcast_flags;
5550 u32 t;
5551 int err;
5552
5553 t = 0;
5554 *num_registers = 0;
5555
5556 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5557
5558 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
5559 &gpc_num, &tpc_num, &ppc_num, &be_num,
5560 &broadcast_flags);
5561 gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
5562 if (err)
5563 return err;
5564
5565 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
5566 (addr_type == CTXSW_ADDR_TYPE_BE)) {
5567 /* The BE broadcast registers are included in the compressed PRI
5568 * table. Convert a BE unicast address to a broadcast address
5569 * so that we can look up the offset. */
5570 if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
5571 !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
5572 priv_addr_table[t++] = pri_be_shared_addr(addr);
5573 else
5574 priv_addr_table[t++] = addr;
5575
5576 *num_registers = t;
5577 return 0;
5578 }
5579
5580 /* The GPC/TPC unicast registers are included in the compressed PRI
5581 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
5582 * that we can look up the offsets. */
5583 if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
5584 for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
5585
5586 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5587 for (tpc_num = 0;
5588 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5589 tpc_num++)
5590 priv_addr_table[t++] =
5591 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5592 gpc_num, tpc_num);
5593
5594 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
5595 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5596 priv_addr_table, &t);
5597 if (err)
5598 return err;
5599 } else
5600 priv_addr_table[t++] =
5601 pri_gpc_addr(pri_gpccs_addr_mask(addr),
5602 gpc_num);
5603 }
5604 } else {
5605 if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
5606 for (tpc_num = 0;
5607 tpc_num < g->gr.gpc_tpc_count[gpc_num];
5608 tpc_num++)
5609 priv_addr_table[t++] =
5610 pri_tpc_addr(pri_tpccs_addr_mask(addr),
5611 gpc_num, tpc_num);
5612 else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
5613 err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
5614 priv_addr_table, &t);
5615 else
5616 priv_addr_table[t++] = addr;
5617 }
5618
5619 *num_registers = t;
5620 return 0;
5621}
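
/*
 * Editorial example (not part of the original source): with
 * g->gr.gpc_count == 1 and g->gr.gpc_tpc_count[0] == 2, passing a
 * gr_gpcs_tpcs_* broadcast register to gr_gk20a_create_priv_addr_table()
 * sets PRI_BROADCAST_FLAGS_GPC | PRI_BROADCAST_FLAGS_TPC and produces
 *
 *   priv_addr_table[0] = pri_tpc_addr(pri_tpccs_addr_mask(addr), 0, 0);
 *   priv_addr_table[1] = pri_tpc_addr(pri_tpccs_addr_mask(addr), 0, 1);
 *   *num_registers     = 2;
 *
 * i.e. one unicast address per TPC, which is the form that
 * gr_gk20a_find_priv_offset_in_buffer() expects.
 */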
5622
5623int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
5624 u32 addr,
5625 u32 max_offsets,
5626 u32 *offsets, u32 *offset_addrs,
5627 u32 *num_offsets,
5628 bool is_quad, u32 quad)
5629{
5630 u32 i;
5631 u32 priv_offset = 0;
5632 u32 *priv_registers;
5633 u32 num_registers = 0;
5634 int err = 0;
5635 u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
5636 proj_scal_litter_num_tpc_per_gpc_v();
5637
5638 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5639
5640 /* implementation is crossed-up if either of these happen */
5641 if (max_offsets > potential_offsets)
5642 return -EINVAL;
5643
5644 if (!g->gr.ctx_vars.golden_image_initialized)
5645 return -ENODEV;
5646
5647 priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
5648	if (!priv_registers) {
5649 gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
5650		err = -ENOMEM;
5651 goto cleanup;
5652 }
5653 memset(offsets, 0, sizeof(u32) * max_offsets);
5654 memset(offset_addrs, 0, sizeof(u32) * max_offsets);
5655 *num_offsets = 0;
5656
5657 gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
5658
5659 if ((max_offsets > 1) && (num_registers > max_offsets)) {
5660 err = -EINVAL;
5661 goto cleanup;
5662 }
5663
5664 if ((max_offsets == 1) && (num_registers > 1))
5665 num_registers = 1;
5666
5667 if (!g->gr.ctx_vars.local_golden_image) {
5668 gk20a_dbg_fn("no context switch header info to work with");
5669 err = -EINVAL;
5670 goto cleanup;
5671 }
5672
5673 for (i = 0; i < num_registers; i++) {
5674 err = gr_gk20a_find_priv_offset_in_buffer(g,
5675 priv_registers[i],
5676 is_quad, quad,
5677 g->gr.ctx_vars.local_golden_image,
5678 g->gr.ctx_vars.golden_image_size,
5679 &priv_offset);
5680 if (err) {
5681 gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x",
5682 addr); /*, grPriRegStr(addr)));*/
5683 goto cleanup;
5684 }
5685
5686 offsets[i] = priv_offset;
5687 offset_addrs[i] = priv_registers[i];
5688 }
5689
5690 *num_offsets = num_registers;
5691
5692 cleanup:
5693
5694 if (!IS_ERR_OR_NULL(priv_registers))
5695 kfree(priv_registers);
5696
5697 return err;
5698}
5699
5700/* Setup some register tables. This looks hacky; our
5701 * register/offset functions are just that, functions.
5702 * So they can't be used as initializers... TBD: fix to
5703 * generate consts at least on an as-needed basis.
5704 */
5705static const u32 _num_ovr_perf_regs = 17;
5706static u32 _ovr_perf_regs[17] = { 0, };
5707/* Following are the blocks of registers that the ucode
5708 * stores in the extended region. */
5709/* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
5710static const u32 _num_sm_dsm_perf_regs = 5;
5711/* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
5712static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
5713static u32 _sm_dsm_perf_regs[5];
5714static u32 _sm_dsm_perf_ctrl_regs[4];
5715
5716static void init_sm_dsm_reg_info(void)
5717{
5718 if (_ovr_perf_regs[0] != 0)
5719 return;
5720
5721 _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
5722 _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
5723 _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
5724 _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
5725 _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
5726 _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
5727 _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
5728 _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
5729 _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
5730 _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
5731 _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
5732 _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
5733 _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
5734 _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
5735 _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
5736 _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
5737 _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
5738
5739
5740 _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
5741 _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
5742 _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
5743 _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
5744 _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
5745
5746 _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
5747 _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
5748 _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
5749 _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
5750
5751}
5752
5753/* TBD: would like to handle this elsewhere, at a higher level.
5754 * these are currently constructed in a "test-then-write" style
5755 * which makes it impossible to know externally whether a ctx
5756 * write will actually occur. so later we should put a lazy,
5757 * map-and-hold system in the patch write state */
5758int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
5759 struct channel_ctx_gk20a *ch_ctx,
5760 u32 addr, u32 data,
5761 u8 *context)
5762{
5763 u32 num_gpc = g->gr.gpc_count;
5764 u32 num_tpc;
5765 u32 tpc, gpc, reg;
5766 u32 chk_addr;
5767 u32 vaddr_lo;
5768 u32 vaddr_hi;
5769 u32 tmp;
5770
5771 init_sm_dsm_reg_info();
5772
5773 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
5774
5775 for (reg = 0; reg < _num_ovr_perf_regs; reg++) {
5776 for (gpc = 0; gpc < num_gpc; gpc++) {
5777 num_tpc = g->gr.gpc_tpc_count[gpc];
5778 for (tpc = 0; tpc < num_tpc; tpc++) {
5779 chk_addr = ((proj_gpc_stride_v() * gpc) +
5780 (proj_tpc_in_gpc_stride_v() * tpc) +
5781 _ovr_perf_regs[reg]);
5782 if (chk_addr != addr)
5783 continue;
5784 /* reset the patch count from previous
5785				   runs, if ucode has already processed
5786 it */
5787 tmp = gk20a_mem_rd32(context +
5788 ctxsw_prog_main_image_patch_count_o(), 0);
5789
5790 if (!tmp)
5791 ch_ctx->patch_ctx.data_count = 0;
5792
5793 gr_gk20a_ctx_patch_write(g, ch_ctx,
5794 addr, data, true);
5795
5796 vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
5797 vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
5798
5799 gk20a_mem_wr32(context +
5800 ctxsw_prog_main_image_patch_count_o(),
5801 0, ch_ctx->patch_ctx.data_count);
5802 gk20a_mem_wr32(context +
5803 ctxsw_prog_main_image_patch_adr_lo_o(),
5804 0, vaddr_lo);
5805 gk20a_mem_wr32(context +
5806 ctxsw_prog_main_image_patch_adr_hi_o(),
5807 0, vaddr_hi);
5808
5809 /* we're not caching these on cpu side,
5810 but later watch for it */
5811
5812 /* the l2 invalidate in the patch_write
5813 * would be too early for this? */
5814 gk20a_mm_l2_invalidate(g);
5815 return 0;
5816 }
5817 }
5818 }
5819
5820 return 0;
5821}
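
/*
 * Editorial note on the address match above (stride values are illustrative
 * assumptions): _ovr_perf_regs[] holds full gpc0/tpc0 register addresses, so
 * the PRI bases are already included and only the strides are added.  With a
 * GPC stride of 0x8000 and a TPC-in-GPC stride of 0x800, the gpc 0 / tpc 1
 * instance of a given entry is matched when
 *
 *   addr == _ovr_perf_regs[reg] + (0x8000 * 0) + (0x800 * 1)
 *
 * at which point the patch count, the patch buffer address and the new value
 * are written back into the context image.
 */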
5822
5823static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
5824{
5825 u32 reg;
5826 u32 quad_ctrl;
5827 u32 half_ctrl;
5828 u32 tpc, gpc;
5829 u32 gpc_tpc_addr;
5830 u32 gpc_tpc_stride;
5831
5832 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "offset=0x%x", offset);
5833
5834 gpc = pri_get_gpc_num(offset);
5835 gpc_tpc_addr = pri_gpccs_addr_mask(offset);
5836 tpc = pri_get_tpc_num(gpc_tpc_addr);
5837
5838 quad_ctrl = quad & 0x1; /* first bit tells us quad */
5839 half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */
5840
5841 gpc_tpc_stride = gpc * proj_gpc_stride_v() +
5842 tpc * proj_tpc_in_gpc_stride_v();
5843 gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride;
5844
5845 reg = gk20a_readl(g, gpc_tpc_addr);
5846 reg = set_field(reg,
5847 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(),
5848 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(quad_ctrl));
5849
5850 gk20a_writel(g, gpc_tpc_addr, reg);
5851
5852 gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride;
5853 reg = gk20a_readl(g, gpc_tpc_addr);
5854 reg = set_field(reg,
5855 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(),
5856 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(half_ctrl));
5857 gk20a_writel(g, gpc_tpc_addr, reg);
5858}
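
/*
 * Editorial example: the 'quad' argument above packs two selector bits, so
 *
 *   quad = 0 -> quad_ctrl = 0, half_ctrl = 0
 *   quad = 1 -> quad_ctrl = 1, half_ctrl = 0
 *   quad = 2 -> quad_ctrl = 0, half_ctrl = 1
 *   quad = 3 -> quad_ctrl = 1, half_ctrl = 1
 *
 * quad_ctrl is written into the per-TPC halfctl_ctrl register and half_ctrl
 * into debug_sfe_control before the SMPC register is accessed.
 */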
5859
5860#define ILLEGAL_ID (~0)
5861
5862static inline bool check_main_image_header_magic(void *context)
5863{
5864 u32 magic = gk20a_mem_rd32(context +
5865 ctxsw_prog_main_image_magic_value_o(), 0);
5866 gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic);
5867 return magic == ctxsw_prog_main_image_magic_value_v_value_v();
5868}
5869static inline bool check_local_header_magic(void *context)
5870{
5871 u32 magic = gk20a_mem_rd32(context +
5872 ctxsw_prog_local_magic_value_o(), 0);
5873 gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic);
5874 return magic == ctxsw_prog_local_magic_value_v_value_v();
5875
5876}
5877
5878/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
5879static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
5880{
5881 return 256;
5882}
5883
5884void gr_gk20a_get_sm_dsm_perf_regs(struct gk20a *g,
5885 u32 *num_sm_dsm_perf_regs,
5886 u32 **sm_dsm_perf_regs,
5887 u32 *perf_register_stride)
5888{
5889 *num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
5890 *sm_dsm_perf_regs = _sm_dsm_perf_regs;
5891 *perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
5892}
5893
5894void gr_gk20a_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
5895 u32 *num_sm_dsm_perf_ctrl_regs,
5896 u32 **sm_dsm_perf_ctrl_regs,
5897 u32 *ctrl_register_stride)
5898{
5899 *num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
5900 *sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
5901 *ctrl_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
5902}
5903
5904static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
5905 u32 addr,
5906 bool is_quad, u32 quad,
5907 u32 *context_buffer,
5908 u32 context_buffer_size,
5909 u32 *priv_offset)
5910{
5911 u32 i, data32;
5912 u32 gpc_num, tpc_num;
5913 u32 num_gpcs, num_tpcs;
5914 u32 chk_addr;
5915 u32 ext_priv_offset, ext_priv_size;
5916 void *context;
5917 u32 offset_to_segment, offset_to_segment_end;
5918 u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
5919 u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
5920 u32 num_ext_gpccs_ext_buffer_segments;
5921 u32 inter_seg_offset;
5922 u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1);
5923 u32 max_tpc_count;
5924 u32 *sm_dsm_perf_ctrl_regs = NULL;
5925 u32 num_sm_dsm_perf_ctrl_regs = 0;
5926 u32 *sm_dsm_perf_regs = NULL;
5927 u32 num_sm_dsm_perf_regs = 0;
5928 u32 buffer_segments_size = 0;
5929 u32 marker_size = 0;
5930 u32 control_register_stride = 0;
5931 u32 perf_register_stride = 0;
5932
5933 /* Only have TPC registers in extended region, so if not a TPC reg,
5934 then return error so caller can look elsewhere. */
5935 if (pri_is_gpc_addr(addr)) {
5936 u32 gpc_addr = 0;
5937 gpc_num = pri_get_gpc_num(addr);
5938 gpc_addr = pri_gpccs_addr_mask(addr);
5939 if (pri_is_tpc_addr(gpc_addr))
5940 tpc_num = pri_get_tpc_num(gpc_addr);
5941 else
5942 return -EINVAL;
5943
5944 gk20a_dbg_info(" gpc = %d tpc = %d",
5945 gpc_num, tpc_num);
5946 } else
5947 return -EINVAL;
5948
5949 buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
5950 /* note below is in words/num_registers */
5951 marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
5952
5953 context = context_buffer;
5954 /* sanity check main header */
5955 if (!check_main_image_header_magic(context)) {
5956 gk20a_err(dev_from_gk20a(g),
5957 "Invalid main header: magic value");
5958 return -EINVAL;
5959 }
5960 num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
5961 if (gpc_num >= num_gpcs) {
5962 gk20a_err(dev_from_gk20a(g),
5963 "GPC 0x%08x is greater than total count 0x%08x!\n",
5964 gpc_num, num_gpcs);
5965 return -EINVAL;
5966 }
5967
5968 data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0);
5969 ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
5970 if (0 == ext_priv_size) {
5971 gk20a_dbg_info(" No extended memory in context buffer");
5972 return -EINVAL;
5973 }
5974 ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
5975
5976 offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
5977 offset_to_segment_end = offset_to_segment +
5978 (ext_priv_size * buffer_segments_size);
5979
5980 /* check local header magic */
5981 context += ctxsw_prog_ucode_header_size_in_bytes();
5982 if (!check_local_header_magic(context)) {
5983 gk20a_err(dev_from_gk20a(g),
5984 "Invalid local header: magic value\n");
5985 return -EINVAL;
5986 }
5987
5988 /*
5989 * See if the incoming register address is in the first table of
5990 * registers. We check this by decoding only the TPC addr portion.
5991 * If we get a hit on the TPC bit, we then double check the address
5992 * by computing it from the base gpc/tpc strides. Then make sure
5993 * it is a real match.
5994 */
5995 g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
5996 &sm_dsm_perf_regs,
5997 &perf_register_stride);
5998
5999 init_sm_dsm_reg_info();
6000
6001 for (i = 0; i < num_sm_dsm_perf_regs; i++) {
6002 if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
6003 sm_dsm_perf_reg_id = i;
6004
6005 gk20a_dbg_info("register match: 0x%08x",
6006 sm_dsm_perf_regs[i]);
6007
6008 chk_addr = (proj_gpc_base_v() +
6009 (proj_gpc_stride_v() * gpc_num) +
6010 proj_tpc_in_gpc_base_v() +
6011 (proj_tpc_in_gpc_stride_v() * tpc_num) +
6012 (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask));
6013
6014 if (chk_addr != addr) {
6015 gk20a_err(dev_from_gk20a(g),
6016 "Oops addr miss-match! : 0x%08x != 0x%08x\n",
6017 addr, chk_addr);
6018 return -EINVAL;
6019 }
6020 break;
6021 }
6022 }
6023
6024 /* Didn't find reg in supported group 1.
6025 * so try the second group now */
6026 g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
6027 &sm_dsm_perf_ctrl_regs,
6028 &control_register_stride);
6029
6030 if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
6031 for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
6032 if ((addr & tpc_gpc_mask) ==
6033 (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
6034 sm_dsm_perf_ctrl_reg_id = i;
6035
6036 gk20a_dbg_info("register match: 0x%08x",
6037 sm_dsm_perf_ctrl_regs[i]);
6038
6039 chk_addr = (proj_gpc_base_v() +
6040 (proj_gpc_stride_v() * gpc_num) +
6041 proj_tpc_in_gpc_base_v() +
6042 (proj_tpc_in_gpc_stride_v() * tpc_num) +
6043 (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
6044 tpc_gpc_mask));
6045
6046 if (chk_addr != addr) {
6047 gk20a_err(dev_from_gk20a(g),
6048 "Oops addr miss-match! : 0x%08x != 0x%08x\n",
6049 addr, chk_addr);
6050 return -EINVAL;
6051
6052 }
6053
6054 break;
6055 }
6056 }
6057 }
6058
6059 if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
6060 (ILLEGAL_ID == sm_dsm_perf_reg_id))
6061 return -EINVAL;
6062
6063 /* Skip the FECS extended header, nothing there for us now. */
6064 offset_to_segment += buffer_segments_size;
6065
6066 /* skip through the GPCCS extended headers until we get to the data for
6067 * our GPC. The size of each gpc extended segment is enough to hold the
6068	 * max tpc count for the gpcs, in 256b chunks.
6069 */
6070
6071 max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v();
6072
6073 num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
6074
6075 offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
6076 buffer_segments_size * gpc_num);
6077
6078 num_tpcs = g->gr.gpc_tpc_count[gpc_num];
6079
6080 /* skip the head marker to start with */
6081 inter_seg_offset = marker_size;
6082
6083 if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
6084 /* skip over control regs of TPC's before the one we want.
6085 * then skip to the register in this tpc */
6086 inter_seg_offset = inter_seg_offset +
6087 (tpc_num * control_register_stride) +
6088 sm_dsm_perf_ctrl_reg_id;
6089 } else {
6090 /* skip all the control registers */
6091 inter_seg_offset = inter_seg_offset +
6092 (num_tpcs * control_register_stride);
6093
6094 /* skip the marker between control and counter segments */
6095 inter_seg_offset += marker_size;
6096
6097 /* skip over counter regs of TPCs before the one we want */
6098 inter_seg_offset = inter_seg_offset +
6099 (tpc_num * perf_register_stride) *
6100 ctxsw_prog_extended_num_smpc_quadrants_v();
6101
6102 /* skip over the register for the quadrants we do not want.
6103 * then skip to the register in this tpc */
6104 inter_seg_offset = inter_seg_offset +
6105 (perf_register_stride * quad) +
6106 sm_dsm_perf_reg_id;
6107 }
6108
6109 /* set the offset to the segment offset plus the inter segment offset to
6110 * our register */
6111 offset_to_segment += (inter_seg_offset * 4);
6112
6113 /* last sanity check: did we somehow compute an offset outside the
6114 * extended buffer? */
6115 if (offset_to_segment > offset_to_segment_end) {
6116 gk20a_err(dev_from_gk20a(g),
6117 "Overflow ctxsw buffer! 0x%08x > 0x%08x\n",
6118 offset_to_segment, offset_to_segment_end);
6119 return -EINVAL;
6120 }
6121
6122 *priv_offset = offset_to_segment;
6123
6124 return 0;
6125}
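
/*
 * Editorial summary of the offset computation above (not from the original
 * source).  For a perf-counter register (sm_dsm_perf_reg_id valid) the final
 * byte offset inside the context buffer is, in effect:
 *
 *   offset = ext_priv_offset * 256                        FECS-relative start
 *          + buffer_segments_size                         skip FECS ext header
 *          + num_ext_gpccs_ext_buffer_segments
 *            * buffer_segments_size * gpc_num             skip earlier GPCs
 *          + 4 * ( marker_size                            head marker
 *                + num_tpcs * control_register_stride     all ctrl regs
 *                + marker_size                            ctrl/counter marker
 *                + tpc_num * perf_register_stride * num_quadrants
 *                + perf_register_stride * quad
 *                + sm_dsm_perf_reg_id )
 *
 * When the address matches a control register instead, the word offset is
 * simply marker_size + tpc_num * control_register_stride +
 * sm_dsm_perf_ctrl_reg_id.  The result is then range-checked against
 * offset_to_segment_end.
 */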
6126
6127
6128static int
6129gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
6130 int addr_type,/* enum ctxsw_addr_type */
6131 u32 pri_addr,
6132 u32 gpc_num, u32 num_tpcs,
6133 u32 num_ppcs, u32 ppc_mask,
6134 u32 *priv_offset)
6135{
6136 u32 i;
6137 u32 address, base_address;
6138 u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
6139 u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
6140 struct aiv_gk20a *reg;
6141
6142 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
6143
6144 if (!g->gr.ctx_vars.valid)
6145 return -EINVAL;
6146
6147 /* Process the SYS/BE segment. */
6148 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6149 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6150 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
6151 reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
6152 address = reg->addr;
6153 sys_offset = reg->index;
6154
6155 if (pri_addr == address) {
6156 *priv_offset = sys_offset;
6157 return 0;
6158 }
6159 }
6160 }
6161
6162 /* Process the TPC segment. */
6163 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6164 for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
6165 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
6166 reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
6167 address = reg->addr;
6168 tpc_addr = pri_tpccs_addr_mask(address);
6169 base_address = proj_gpc_base_v() +
6170 (gpc_num * proj_gpc_stride_v()) +
6171 proj_tpc_in_gpc_base_v() +
6172 (tpc_num * proj_tpc_in_gpc_stride_v());
6173 address = base_address + tpc_addr;
6174 /*
6175 * The data for the TPCs is interleaved in the context buffer.
6176 * Example with num_tpcs = 2
6177 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
6178 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
6179 */
6180 tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
6181
6182 if (pri_addr == address) {
6183 *priv_offset = tpc_offset;
6184 return 0;
6185 }
6186 }
6187 }
6188 }
6189
6190 /* Process the PPC segment. */
6191 if (addr_type == CTXSW_ADDR_TYPE_PPC) {
6192 for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
6193 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
6194 reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
6195 address = reg->addr;
6196 ppc_addr = pri_ppccs_addr_mask(address);
6197 base_address = proj_gpc_base_v() +
6198 (gpc_num * proj_gpc_stride_v()) +
6199 proj_ppc_in_gpc_base_v() +
6200 (ppc_num * proj_ppc_in_gpc_stride_v());
6201 address = base_address + ppc_addr;
6202 /*
6203 * The data for the PPCs is interleaved in the context buffer.
6204 * Example with numPpcs = 2
6205 * 0 1 2 3 4 5 6 7 8 9 10 11 ...
6206 * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
6207 */
6208 ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
6209
6210 if (pri_addr == address) {
6211 *priv_offset = ppc_offset;
6212 return 0;
6213 }
6214 }
6215 }
6216 }
6217
6218
6219 /* Process the GPC segment. */
6220 if (addr_type == CTXSW_ADDR_TYPE_GPC) {
6221 for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
6222 reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
6223
6224 address = reg->addr;
6225 gpc_addr = pri_gpccs_addr_mask(address);
6226 gpc_offset = reg->index;
6227
6228 base_address = proj_gpc_base_v() +
6229 (gpc_num * proj_gpc_stride_v());
6230 address = base_address + gpc_addr;
6231
6232 if (pri_addr == address) {
6233 *priv_offset = gpc_offset;
6234 return 0;
6235 }
6236 }
6237 }
6238
6239 return -EINVAL;
6240}
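
/*
 * Editorial example of the TPC interleave formula above (assuming reg->index
 * is a byte offset, as the "* 4" term suggests): with num_tpcs = 2, the
 * register at index 0 lands at byte 0 for tpc 0 and byte 4 for tpc 1, the
 * register at index 4 at bytes 8 and 12, and in general
 *
 *   tpc_offset = reg->index * num_tpcs + tpc_num * 4
 *
 * which reproduces the "0-0 1-0 0-1 1-1 ..." layout shown in the comment.
 * The PPC segment uses the same scheme with num_ppcs in place of num_tpcs.
 */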
6241
6242static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
6243 void *context,
6244 u32 *num_ppcs, u32 *ppc_mask,
6245 u32 *reg_ppc_count)
6246{
6247 u32 data32;
6248 u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
6249
6250 /*
6251 * if there is only 1 PES_PER_GPC, then we put the PES registers
6252 * in the GPC reglist, so we can't error out if ppc.count == 0
6253 */
6254 if ((!g->gr.ctx_vars.valid) ||
6255 ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
6256 (litter_num_pes_per_gpc > 1)))
6257 return -EINVAL;
6258
6259 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0);
6260
6261 *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
6262 *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
6263
6264 *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
6265
6266 return 0;
6267}
6268
6269
6270
6271/*
6272 * This function will return the 32 bit offset for a priv register if it is
6273 * present in the context buffer.
6274 */
6275static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
6276 u32 addr,
6277 bool is_quad, u32 quad,
6278 u32 *context_buffer,
6279 u32 context_buffer_size,
6280 u32 *priv_offset)
6281{
6282 struct gr_gk20a *gr = &g->gr;
6283 u32 i, data32;
6284 int err;
6285 int addr_type; /*enum ctxsw_addr_type */
6286 u32 broadcast_flags;
6287 u32 gpc_num, tpc_num, ppc_num, be_num;
6288 u32 num_gpcs, num_tpcs, num_ppcs;
6289 u32 offset;
6290 u32 sys_priv_offset, gpc_priv_offset;
6291 u32 ppc_mask, reg_list_ppc_count;
6292 void *context;
6293 u32 offset_to_segment;
6294
6295 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
6296
6297 err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
6298 &gpc_num, &tpc_num, &ppc_num, &be_num,
6299 &broadcast_flags);
6300 if (err)
6301 return err;
6302
6303 context = context_buffer;
6304 if (!check_main_image_header_magic(context)) {
6305 gk20a_err(dev_from_gk20a(g),
6306 "Invalid main header: magic value");
6307 return -EINVAL;
6308 }
6309 num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
6310
6311 /* Parse the FECS local header. */
6312 context += ctxsw_prog_ucode_header_size_in_bytes();
6313 if (!check_local_header_magic(context)) {
6314 gk20a_err(dev_from_gk20a(g),
6315 "Invalid FECS local header: magic value\n");
6316 return -EINVAL;
6317 }
6318 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
6319 sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
6320
6321 /* If found in Ext buffer, ok.
6322 * If it failed and we expected to find it there (quad offset)
6323 * then return the error. Otherwise continue on.
6324 */
6325 err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
6326 addr, is_quad, quad, context_buffer,
6327 context_buffer_size, priv_offset);
6328 if (!err || (err && is_quad))
6329 return err;
6330
6331 if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
6332 (addr_type == CTXSW_ADDR_TYPE_BE)) {
6333 /* Find the offset in the FECS segment. */
6334 offset_to_segment = sys_priv_offset *
6335 ctxsw_prog_ucode_header_size_in_bytes();
6336
6337 err = gr_gk20a_process_context_buffer_priv_segment(g,
6338 addr_type, addr,
6339 0, 0, 0, 0,
6340 &offset);
6341 if (err)
6342 return err;
6343
6344 *priv_offset = (offset_to_segment + offset);
6345 return 0;
6346 }
6347
6348 if ((gpc_num + 1) > num_gpcs) {
6349 gk20a_err(dev_from_gk20a(g),
6350 "GPC %d not in this context buffer.\n",
6351 gpc_num);
6352 return -EINVAL;
6353 }
6354
6355 /* Parse the GPCCS local header(s).*/
6356 for (i = 0; i < num_gpcs; i++) {
6357 context += ctxsw_prog_ucode_header_size_in_bytes();
6358 if (!check_local_header_magic(context)) {
6359 gk20a_err(dev_from_gk20a(g),
6360 "Invalid GPCCS local header: magic value\n");
6361 return -EINVAL;
6362
6363 }
6364 data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
6365 gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
6366
6367 err = gr_gk20a_determine_ppc_configuration(g, context,
6368 &num_ppcs, &ppc_mask,
6369 &reg_list_ppc_count);
6370 if (err)
6371 return err;
6372
6373 num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0);
6374
6375 if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
6376 gk20a_err(dev_from_gk20a(g),
6377 "GPC %d TPC %d not in this context buffer.\n",
6378 gpc_num, tpc_num);
6379 return -EINVAL;
6380 }
6381
6382 /* Find the offset in the GPCCS segment.*/
6383 if (i == gpc_num) {
6384 offset_to_segment = gpc_priv_offset *
6385 ctxsw_prog_ucode_header_size_in_bytes();
6386
6387 if (addr_type == CTXSW_ADDR_TYPE_TPC) {
6388 /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
6389 } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
6390 /* The ucode stores TPC data before PPC data.
6391 * Advance offset past TPC data to PPC data. */
6392 offset_to_segment +=
6393 ((gr->ctx_vars.ctxsw_regs.tpc.count *
6394 num_tpcs) << 2);
6395 } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
6396 /* The ucode stores TPC/PPC data before GPC data.
6397 * Advance offset past TPC/PPC data to GPC data. */
6398 /* note 1 PES_PER_GPC case */
6399 u32 litter_num_pes_per_gpc =
6400 proj_scal_litter_num_pes_per_gpc_v();
6401 if (litter_num_pes_per_gpc > 1) {
6402 offset_to_segment +=
6403 (((gr->ctx_vars.ctxsw_regs.tpc.count *
6404 num_tpcs) << 2) +
6405 ((reg_list_ppc_count * num_ppcs) << 2));
6406 } else {
6407 offset_to_segment +=
6408 ((gr->ctx_vars.ctxsw_regs.tpc.count *
6409 num_tpcs) << 2);
6410 }
6411 } else {
6412 gk20a_err(dev_from_gk20a(g),
6413 " Unknown address type.\n");
6414 return -EINVAL;
6415 }
6416 err = gr_gk20a_process_context_buffer_priv_segment(g,
6417 addr_type, addr,
6418 i, num_tpcs,
6419 num_ppcs, ppc_mask,
6420 &offset);
6421 if (err)
6422 return -EINVAL;
6423
6424 *priv_offset = offset_to_segment + offset;
6425 return 0;
6426 }
6427 }
6428
6429 return -EINVAL;
6430}
6431
6432
6433int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
6434 struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
6435 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
6436{
6437 struct gk20a *g = ch->g;
6438 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
6439 void *ctx_ptr = NULL;
6440 int curr_gr_chid, curr_gr_ctx;
6441 bool ch_is_curr_ctx, restart_gr_ctxsw = false;
6442 u32 i, j, offset, v;
6443 u32 max_offsets = proj_scal_litter_num_gpcs_v() *
6444 proj_scal_litter_num_tpc_per_gpc_v();
6445 u32 *offsets = NULL;
6446 u32 *offset_addrs = NULL;
6447 u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
6448 int err, pass;
6449
6450 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
6451 num_ctx_wr_ops, num_ctx_rd_ops);
6452
6453 /* disable channel switching.
6454 * at that point the hardware state can be inspected to
6455 * determine if the context we're interested in is current.
6456 */
6457 err = gr_gk20a_disable_ctxsw(g);
6458 if (err) {
6459 gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw");
6460 /* this should probably be ctx-fatal... */
6461 goto cleanup;
6462 }
6463
6464 restart_gr_ctxsw = true;
6465
6466 curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
6467 curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx);
6468 ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid);
6469
6470 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx);
6471 if (ch_is_curr_ctx) {
6472 for (pass = 0; pass < 2; pass++) {
6473 ctx_op_nr = 0;
6474 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6475 /* only do ctx ops and only on the right pass */
6476 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6477 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6478 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6479 continue;
6480
6481 /* if this is a quad access, setup for special access*/
6482 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)
6483 && g->ops.gr.access_smpc_reg)
6484 g->ops.gr.access_smpc_reg(g,
6485 ctx_ops[i].quad,
6486 ctx_ops[i].offset);
6487 offset = ctx_ops[i].offset;
6488
6489 if (pass == 0) { /* write pass */
6490 v = gk20a_readl(g, offset);
6491 v &= ~ctx_ops[i].and_n_mask_lo;
6492 v |= ctx_ops[i].value_lo;
6493 gk20a_writel(g, offset, v);
6494
6495 gk20a_dbg(gpu_dbg_gpu_dbg,
6496 "direct wr: offset=0x%x v=0x%x",
6497 offset, v);
6498
6499 if (ctx_ops[i].op == REGOP(WRITE_64)) {
6500 v = gk20a_readl(g, offset + 4);
6501 v &= ~ctx_ops[i].and_n_mask_hi;
6502 v |= ctx_ops[i].value_hi;
6503 gk20a_writel(g, offset + 4, v);
6504
6505 gk20a_dbg(gpu_dbg_gpu_dbg,
6506 "direct wr: offset=0x%x v=0x%x",
6507 offset + 4, v);
6508 }
6509
6510 } else { /* read pass */
6511 ctx_ops[i].value_lo =
6512 gk20a_readl(g, offset);
6513
6514 gk20a_dbg(gpu_dbg_gpu_dbg,
6515 "direct rd: offset=0x%x v=0x%x",
6516 offset, ctx_ops[i].value_lo);
6517
6518 if (ctx_ops[i].op == REGOP(READ_64)) {
6519 ctx_ops[i].value_hi =
6520 gk20a_readl(g, offset + 4);
6521
6522 gk20a_dbg(gpu_dbg_gpu_dbg,
6523 "direct rd: offset=0x%x v=0x%x",
6524						   offset + 4, ctx_ops[i].value_hi);
6525 } else
6526 ctx_ops[i].value_hi = 0;
6527 }
6528 ctx_op_nr++;
6529 }
6530 }
6531 goto cleanup;
6532 }
6533
6534 /* they're the same size, so just use one alloc for both */
6535 offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL);
6536 if (!offsets) {
6537 err = -ENOMEM;
6538 goto cleanup;
6539 }
6540 offset_addrs = offsets + max_offsets;
6541
6542 /* would have been a variant of gr_gk20a_apply_instmem_overrides */
6543 /* recoded in-place instead.*/
6544 ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
6545 PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
6546 0, pgprot_dmacoherent(PAGE_KERNEL));
6547 if (!ctx_ptr) {
6548 err = -ENOMEM;
6549 goto cleanup;
6550 }
6551
6552 /* Channel gr_ctx buffer is gpu cacheable; so flush and invalidate.
6553 * There should be no on-going/in-flight references by the gpu now. */
6554 gk20a_mm_fb_flush(g);
6555 gk20a_mm_l2_flush(g, true);
6556
6557 /* write to appropriate place in context image,
6558 * first have to figure out where that really is */
6559
6560 /* first pass is writes, second reads */
6561 for (pass = 0; pass < 2; pass++) {
6562 ctx_op_nr = 0;
6563 for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
6564 u32 num_offsets;
6565
6566 /* only do ctx ops and only on the right pass */
6567 if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
6568 (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
6569 ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
6570 continue;
6571
6572 err = gr_gk20a_get_ctx_buffer_offsets(g,
6573 ctx_ops[i].offset,
6574 max_offsets,
6575 offsets, offset_addrs,
6576 &num_offsets,
6577 ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
6578 ctx_ops[i].quad);
6579 if (err) {
6580 gk20a_dbg(gpu_dbg_gpu_dbg,
6581 "ctx op invalid offset: offset=0x%x",
6582 ctx_ops[i].offset);
6583 ctx_ops[i].status =
6584 NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
6585 continue;
6586 }
6587
6588 /* if this is a quad access, setup for special access*/
6589 if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD) &&
6590 g->ops.gr.access_smpc_reg)
6591 g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
6592 ctx_ops[i].offset);
6593
6594 for (j = 0; j < num_offsets; j++) {
6595 /* sanity check, don't write outside, worst case */
6596 if (offsets[j] >= g->gr.ctx_vars.golden_image_size)
6597 continue;
6598 if (pass == 0) { /* write pass */
6599 v = gk20a_mem_rd32(ctx_ptr + offsets[j], 0);
6600 v &= ~ctx_ops[i].and_n_mask_lo;
6601 v |= ctx_ops[i].value_lo;
6602 gk20a_mem_wr32(ctx_ptr + offsets[j], 0, v);
6603
6604 gk20a_dbg(gpu_dbg_gpu_dbg,
6605 "context wr: offset=0x%x v=0x%x",
6606 offsets[j], v);
6607
6608 if (ctx_ops[i].op == REGOP(WRITE_64)) {
6609 v = gk20a_mem_rd32(ctx_ptr + offsets[j] + 4, 0);
6610 v &= ~ctx_ops[i].and_n_mask_hi;
6611 v |= ctx_ops[i].value_hi;
6612 gk20a_mem_wr32(ctx_ptr + offsets[j] + 4, 0, v);
6613
6614 gk20a_dbg(gpu_dbg_gpu_dbg,
6615 "context wr: offset=0x%x v=0x%x",
6616 offsets[j] + 4, v);
6617 }
6618
6619 /* check to see if we need to add a special WAR
6620 for some of the SMPC perf regs */
6621 gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
6622 v, ctx_ptr);
6623
6624 } else { /* read pass */
6625 ctx_ops[i].value_lo =
6626 gk20a_mem_rd32(ctx_ptr + offsets[0], 0);
6627
6628 gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
6629 offsets[0], ctx_ops[i].value_lo);
6630
6631 if (ctx_ops[i].op == REGOP(READ_64)) {
6632 ctx_ops[i].value_hi =
6633 gk20a_mem_rd32(ctx_ptr + offsets[0] + 4, 0);
6634
6635 gk20a_dbg(gpu_dbg_gpu_dbg,
6636 "context rd: offset=0x%x v=0x%x",
6637 offsets[0] + 4, ctx_ops[i].value_hi);
6638 } else
6639 ctx_ops[i].value_hi = 0;
6640 }
6641 }
6642 ctx_op_nr++;
6643 }
6644 }
6645#if 0
6646 /* flush cpu caches for the ctx buffer? only if cpu cached, of course.
6647 * they aren't, yet */
6648 if (cached) {
6649 FLUSH_CPU_DCACHE(ctx_ptr,
6650 sg_phys(ch_ctx->gr_ctx.mem.ref), size);
6651 }
6652#endif
6653
6654 cleanup:
6655 if (offsets)
6656 kfree(offsets);
6657
6658 if (ctx_ptr)
6659 vunmap(ctx_ptr);
6660
6661 if (restart_gr_ctxsw) {
6662 int tmp_err = gr_gk20a_enable_ctxsw(g);
6663 if (tmp_err) {
6664 gk20a_err(dev_from_gk20a(g), "unable to restart ctxsw!\n");
6665 err = tmp_err;
6666 }
6667 }
6668
6669 return err;
6670}
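
#if 0
/*
 * Editorial usage sketch (hypothetical; the READ_32/TYPE_GR_CTX spellings are
 * assumptions -- only READ_64, TYPE_GR_CTX_QUAD and TYPE_GLOBAL appear
 * above): a single 32-bit read of a context register via
 * gr_gk20a_exec_ctx_ops() could look like this.
 */
static int example_ctx_read(struct channel_gk20a *ch, u32 reg, u32 *value)
{
	struct nvhost_dbg_gpu_reg_op op = {
		.op     = REGOP(READ_32),
		.type   = REGOP(TYPE_GR_CTX),
		.offset = reg,
	};
	int err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1);

	if (!err)
		*value = op.value_lo;	/* read live or from the saved ctx image */
	return err;
}
#endif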
6671
6672static void gr_gk20a_cb_size_default(struct gk20a *g)
6673{
6674 struct gr_gk20a *gr = &g->gr;
6675
6676 gr->attrib_cb_default_size =
6677 gr_gpc0_ppc0_cbm_cfg_size_default_v();
6678 gr->alpha_cb_default_size =
6679 gr_gpc0_ppc0_cbm_cfg2_size_default_v();
6680}
6681
6682static int gr_gk20a_calc_global_ctx_buffer_size(struct gk20a *g)
6683{
6684 struct gr_gk20a *gr = &g->gr;
6685 int size;
6686
6687 gr->attrib_cb_size = gr->attrib_cb_default_size;
6688 gr->alpha_cb_size = gr->alpha_cb_default_size
6689 + (gr->alpha_cb_default_size >> 1);
6690
6691 size = gr->attrib_cb_size *
6692 gr_gpc0_ppc0_cbm_cfg_size_granularity_v() *
6693 gr->max_tpc_count;
6694
6695 size += gr->alpha_cb_size *
6696 gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() *
6697 gr->max_tpc_count;
6698
6699 return size;
6700}
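
/*
 * Editorial example with illustrative numbers (the real defaults come from
 * the hw_gr_gk20a.h *_default_v()/_granularity_v() functions): if
 * attrib_cb_default_size were 0x240 and alpha_cb_default_size 0x800, then
 *
 *   attrib_cb_size = 0x240
 *   alpha_cb_size  = 0x800 + 0x800/2 = 0xC00
 *   size           = 0x240 * attrib_granularity * max_tpc_count
 *                  + 0xC00 * alpha_granularity  * max_tpc_count
 *
 * i.e. both circular buffers scale linearly with the TPC count, and the
 * alpha buffer is provisioned at 1.5x its default size.
 */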
6701
6702void gr_gk20a_commit_global_pagepool(struct gk20a *g,
6703 struct channel_ctx_gk20a *ch_ctx,
6704 u64 addr, u32 size, bool patch)
6705{
6706 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
6707 gr_scc_pagepool_base_addr_39_8_f(addr), patch);
6708
6709 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
6710 gr_scc_pagepool_total_pages_f(size) |
6711 gr_scc_pagepool_valid_true_f(), patch);
6712
6713 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
6714 gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
6715
6716 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
6717 gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
6718
6719 gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
6720 gr_pd_pagepool_total_pages_f(size) |
6721 gr_pd_pagepool_valid_true_f(), patch);
6722}
6723
6724void gk20a_init_gr(struct gpu_ops *gops)
6725{
6726 gops->gr.access_smpc_reg = gr_gk20a_access_smpc_reg;
6727 gops->gr.bundle_cb_defaults = gr_gk20a_bundle_cb_defaults;
6728 gops->gr.cb_size_default = gr_gk20a_cb_size_default;
6729 gops->gr.calc_global_ctx_buffer_size =
6730 gr_gk20a_calc_global_ctx_buffer_size;
6731 gops->gr.commit_global_attrib_cb = gr_gk20a_commit_global_attrib_cb;
6732 gops->gr.commit_global_bundle_cb = gr_gk20a_commit_global_bundle_cb;
6733 gops->gr.commit_global_cb_manager = gr_gk20a_commit_global_cb_manager;
6734 gops->gr.commit_global_pagepool = gr_gk20a_commit_global_pagepool;
6735 gops->gr.handle_sw_method = gr_gk20a_handle_sw_method;
6736	gops->gr.set_alpha_circular_buffer_size =
6737		gk20a_gr_set_alpha_circular_buffer_size;
6738	gops->gr.set_circular_buffer_size =
6739		gk20a_gr_set_circular_buffer_size;
6740 gops->gr.enable_hww_exceptions = gr_gk20a_enable_hww_exceptions;
6741 gops->gr.is_valid_class = gr_gk20a_is_valid_class;
6742 gops->gr.get_sm_dsm_perf_regs = gr_gk20a_get_sm_dsm_perf_regs;
6743 gops->gr.get_sm_dsm_perf_ctrl_regs = gr_gk20a_get_sm_dsm_perf_ctrl_regs;
6744 gops->gr.init_fs_state = gr_gk20a_ctx_state_floorsweep;
6745 gops->gr.set_hww_esr_report_mask = gr_gk20a_set_hww_esr_report_mask;
6746 gops->gr.setup_alpha_beta_tables = gr_gk20a_setup_alpha_beta_tables;
6747}
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
new file mode 100644
index 00000000..7eb2923a
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -0,0 +1,406 @@
1/*
2 * GK20A Graphics Engine
3 *
4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18#ifndef __GR_GK20A_H__
19#define __GR_GK20A_H__
20
21#include <linux/slab.h>
22
23#include "gr_ctx_gk20a.h"
24
25#define GR_IDLE_CHECK_DEFAULT 100 /* usec */
26#define GR_IDLE_CHECK_MAX 5000 /* usec */
27
28#define INVALID_SCREEN_TILE_ROW_OFFSET 0xFFFFFFFF
29#define INVALID_MAX_WAYS 0xFFFFFFFF
30
31#define GK20A_FECS_UCODE_IMAGE "fecs.bin"
32#define GK20A_GPCCS_UCODE_IMAGE "gpccs.bin"
33
34enum /* global_ctx_buffer */ {
35 CIRCULAR = 0,
36 PAGEPOOL = 1,
37 ATTRIBUTE = 2,
38 CIRCULAR_VPR = 3,
39 PAGEPOOL_VPR = 4,
40 ATTRIBUTE_VPR = 5,
41 GOLDEN_CTX = 6,
42 PRIV_ACCESS_MAP = 7,
43 NR_GLOBAL_CTX_BUF = 8
44};
45
46/* either ATTRIBUTE or ATTRIBUTE_VPR maps to ATTRIBUTE_VA */
47enum /*global_ctx_buffer_va */ {
48 CIRCULAR_VA = 0,
49 PAGEPOOL_VA = 1,
50 ATTRIBUTE_VA = 2,
51 GOLDEN_CTX_VA = 3,
52 PRIV_ACCESS_MAP_VA = 4,
53 NR_GLOBAL_CTX_BUF_VA = 5
54};
55
56enum {
57 WAIT_UCODE_LOOP,
58 WAIT_UCODE_TIMEOUT,
59 WAIT_UCODE_ERROR,
60 WAIT_UCODE_OK
61};
62
63enum {
64 GR_IS_UCODE_OP_EQUAL,
65 GR_IS_UCODE_OP_NOT_EQUAL,
66 GR_IS_UCODE_OP_AND,
67 GR_IS_UCODE_OP_LESSER,
68 GR_IS_UCODE_OP_LESSER_EQUAL,
69 GR_IS_UCODE_OP_SKIP
70};
71
72enum {
73 eUcodeHandshakeInitComplete = 1,
74 eUcodeHandshakeMethodFinished
75};
76
77enum {
78 ELCG_RUN, /* clk always run, i.e. disable elcg */
79 ELCG_STOP, /* clk is stopped */
80 ELCG_AUTO /* clk will run when non-idle, standard elcg mode */
81};
82
83enum {
84 BLCG_RUN, /* clk always run, i.e. disable blcg */
85 BLCG_AUTO /* clk will run when non-idle, standard blcg mode */
86};
87
88#ifndef GR_GO_IDLE_BUNDLE
89#define GR_GO_IDLE_BUNDLE 0x0000e100 /* --V-B */
90#endif
91
92struct gr_channel_map_tlb_entry {
93 u32 curr_ctx;
94 u32 hw_chid;
95};
96
97struct gr_zcull_gk20a {
98 u32 aliquot_width;
99 u32 aliquot_height;
100 u32 aliquot_size;
101 u32 total_aliquots;
102
103 u32 width_align_pixels;
104 u32 height_align_pixels;
105 u32 pixel_squares_by_aliquots;
106};
107
108struct gr_zcull_info {
109 u32 width_align_pixels;
110 u32 height_align_pixels;
111 u32 pixel_squares_by_aliquots;
112 u32 aliquot_total;
113 u32 region_byte_multiplier;
114 u32 region_header_size;
115 u32 subregion_header_size;
116 u32 subregion_width_align_pixels;
117 u32 subregion_height_align_pixels;
118 u32 subregion_count;
119};
120
121#define GK20A_ZBC_COLOR_VALUE_SIZE 4 /* RGBA */
122
123#define GK20A_STARTOF_ZBC_TABLE 1 /* index zero reserved to indicate "not ZBCd" */
124#define GK20A_SIZEOF_ZBC_TABLE 16 /* match ltcs_ltss_dstg_zbc_index_address width (4) */
125#define GK20A_ZBC_TABLE_SIZE (16 - 1)
126
127#define GK20A_ZBC_TYPE_INVALID 0
128#define GK20A_ZBC_TYPE_COLOR 1
129#define GK20A_ZBC_TYPE_DEPTH 2
130
131struct zbc_color_table {
132 u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
133 u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
134 u32 format;
135 u32 ref_cnt;
136};
137
138struct zbc_depth_table {
139 u32 depth;
140 u32 format;
141 u32 ref_cnt;
142};
143
144struct zbc_entry {
145 u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
146 u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
147 u32 depth;
148 u32 type; /* color or depth */
149 u32 format;
150};
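/*
 * Illustrative sketch of filling a color entry before handing it to
 * gr_gk20a_add_zbc() (declared below). The variables g, gr, fmt, ds[],
 * l2[], i and err are caller-supplied placeholders, not names defined
 * in this header:
 *
 *	struct zbc_entry val = { 0 };
 *	val.type = GK20A_ZBC_TYPE_COLOR;
 *	val.format = fmt;			// hw color format chosen by the caller
 *	for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
 *		val.color_ds[i] = ds[i];	// value seen by the DS unit
 *		val.color_l2[i] = l2[i];	// value written to L2
 *	}
 *	err = gr_gk20a_add_zbc(g, gr, &val);
 */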
151
152struct zbc_query_params {
153 u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
154 u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
155 u32 depth;
156 u32 ref_cnt;
157 u32 format;
158 u32 type; /* color or depth */
159 u32 index_size; /* [out] size, [in] index */
160};
161
162struct gr_gk20a {
163 struct gk20a *g;
164 struct {
165 bool dynamic;
166
167 u32 buffer_size;
168 u32 buffer_total_size;
169
170 bool golden_image_initialized;
171 u32 golden_image_size;
172 u32 *local_golden_image;
173
174 u32 zcull_ctxsw_image_size;
175
176 u32 buffer_header_size;
177
178 u32 priv_access_map_size;
179
180 struct gr_ucode_gk20a ucode;
181
182 struct av_list_gk20a sw_bundle_init;
183 struct av_list_gk20a sw_method_init;
184 struct aiv_list_gk20a sw_ctx_load;
185 struct av_list_gk20a sw_non_ctx_load;
186 struct {
187 struct aiv_list_gk20a sys;
188 struct aiv_list_gk20a gpc;
189 struct aiv_list_gk20a tpc;
190 struct aiv_list_gk20a zcull_gpc;
191 struct aiv_list_gk20a ppc;
192 struct aiv_list_gk20a pm_sys;
193 struct aiv_list_gk20a pm_gpc;
194 struct aiv_list_gk20a pm_tpc;
195 } ctxsw_regs;
196 int regs_base_index;
197 bool valid;
198 } ctx_vars;
199
200 struct mutex ctx_mutex; /* protect golden ctx init */
201 struct mutex fecs_mutex; /* protect fecs method */
202
203#define GR_NETLIST_DYNAMIC -1
204#define GR_NETLIST_STATIC_A 'A'
205 int netlist;
206
207 int initialized;
208 u32 num_fbps;
209
210 u32 max_gpc_count;
211 u32 max_fbps_count;
212 u32 max_tpc_per_gpc_count;
213 u32 max_zcull_per_gpc_count;
214 u32 max_tpc_count;
215
216 u32 sys_count;
217 u32 gpc_count;
218 u32 pe_count_per_gpc;
219 u32 ppc_count;
220 u32 *gpc_ppc_count;
221 u32 tpc_count;
222 u32 *gpc_tpc_count;
223 u32 zcb_count;
224 u32 *gpc_zcb_count;
225 u32 *pes_tpc_count[2];
226 u32 *pes_tpc_mask[2];
227 u32 *gpc_skip_mask;
228
229 u32 bundle_cb_default_size;
230 u32 min_gpm_fifo_depth;
231 u32 bundle_cb_token_limit;
232 u32 attrib_cb_default_size;
233 u32 attrib_cb_size;
234 u32 alpha_cb_default_size;
235 u32 alpha_cb_size;
236 u32 timeslice_mode;
237
238 struct gr_ctx_buffer_desc global_ctx_buffer[NR_GLOBAL_CTX_BUF];
239
240 struct mmu_desc mmu_wr_mem;
241 u32 mmu_wr_mem_size;
242 struct mmu_desc mmu_rd_mem;
243 u32 mmu_rd_mem_size;
244
245 u8 *map_tiles;
246 u32 map_tile_count;
247 u32 map_row_offset;
248
249#define COMP_TAG_LINE_SIZE_SHIFT (17) /* one tag covers 128K */
250#define COMP_TAG_LINE_SIZE (1 << COMP_TAG_LINE_SIZE_SHIFT)
251
252 u32 max_comptag_mem; /* max memory size (MB) for comptag */
253 struct compbit_store_desc compbit_store;
254 struct gk20a_allocator comp_tags;
255
256 struct gr_zcull_gk20a zcull;
257
258 struct zbc_color_table zbc_col_tbl[GK20A_ZBC_TABLE_SIZE];
259 struct zbc_depth_table zbc_dep_tbl[GK20A_ZBC_TABLE_SIZE];
260
261 s32 max_default_color_index;
262 s32 max_default_depth_index;
263
264 s32 max_used_color_index;
265 s32 max_used_depth_index;
266
267 u32 status_disable_mask;
268
269#define GR_CHANNEL_MAP_TLB_SIZE 2 /* must be a power of 2 */
270 struct gr_channel_map_tlb_entry chid_tlb[GR_CHANNEL_MAP_TLB_SIZE];
271 u32 channel_tlb_flush_index;
272 spinlock_t ch_tlb_lock;
273
274 void (*remove_support)(struct gr_gk20a *gr);
275 bool sw_ready;
276 bool skip_ucode_init;
277};
278
279void gk20a_fecs_dump_falcon_stats(struct gk20a *g);
280
281struct gk20a_ctxsw_ucode_segment {
282 u32 offset;
283 u32 size;
284};
285
286struct gk20a_ctxsw_ucode_segments {
287 u32 boot_entry;
288 u32 boot_imem_offset;
289 struct gk20a_ctxsw_ucode_segment boot;
290 struct gk20a_ctxsw_ucode_segment code;
291 struct gk20a_ctxsw_ucode_segment data;
292};
293
294struct gk20a_ctxsw_ucode_info {
295 u64 *p_va;
296 struct inst_desc inst_blk_desc;
297 struct surface_mem_desc surface_desc;
298 u64 ucode_gpuva;
299 struct gk20a_ctxsw_ucode_segments fecs;
300 struct gk20a_ctxsw_ucode_segments gpccs;
301};
302
303struct gk20a_ctxsw_bootloader_desc {
304 u32 start_offset;
305 u32 size;
306 u32 imem_offset;
307 u32 entry_point;
308};
309
310struct gpu_ops;
311void gk20a_init_gr(struct gpu_ops *gops);
312int gk20a_init_gr_support(struct gk20a *g);
313void gk20a_gr_reset(struct gk20a *g);
314
315int gk20a_init_gr_channel(struct channel_gk20a *ch_gk20a);
316
317int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr);
318
319struct nvhost_alloc_obj_ctx_args;
320struct nvhost_free_obj_ctx_args;
321
322int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
323 struct nvhost_alloc_obj_ctx_args *args);
324int gk20a_free_obj_ctx(struct channel_gk20a *c,
325 struct nvhost_free_obj_ctx_args *args);
326void gk20a_free_channel_ctx(struct channel_gk20a *c);
327
328int gk20a_gr_isr(struct gk20a *g);
329int gk20a_gr_nonstall_isr(struct gk20a *g);
330
331/* zcull */
332u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr);
333int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
334 struct channel_gk20a *c, u64 zcull_va, u32 mode);
335int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
336 struct gr_zcull_info *zcull_params);
337/* zbc */
338int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
339 struct zbc_entry *zbc_val);
340int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
341 struct zbc_query_params *query_params);
342int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
343 struct zbc_entry *zbc_val);
344int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr);
345int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr);
346
347/* pmu */
348int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size);
349int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr);
350int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va);
351
352void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine);
353void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine);
354
355/* sm */
356bool gk20a_gr_sm_debugger_attached(struct gk20a *g);
357
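/*
 * Run 'func' with engine-level power gating (ELPG) disabled around the
 * call, so the graphics engine cannot be power-gated mid-operation.
 * Illustrative use (all names other than the macro are placeholders):
 *
 *	err = gr_gk20a_elpg_protected_call(g,
 *			gr_gk20a_exec_ctx_ops(ch, ops, n, n_wr, n_rd));
 */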
358#define gr_gk20a_elpg_protected_call(g, func) \
359 ({ \
360 int err; \
361 if (support_gk20a_pmu()) \
362 gk20a_pmu_disable_elpg(g); \
363 err = func; \
364 if (support_gk20a_pmu()) \
365 gk20a_pmu_enable_elpg(g); \
366 err; \
367 })
368
369int gk20a_gr_suspend(struct gk20a *g);
370
371struct nvhost_dbg_gpu_reg_op;
372int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
373 struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
374 u32 num_ctx_wr_ops, u32 num_ctx_rd_ops);
375int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
376 u32 addr,
377 u32 max_offsets,
378 u32 *offsets, u32 *offset_addrs,
379 u32 *num_offsets,
380 bool is_quad, u32 quad);
381int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
382 struct channel_gk20a *c,
383 bool enable_smpc_ctxsw);
384
385struct channel_ctx_gk20a;
386int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
387 u32 addr, u32 data, bool patch);
388int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
389 struct channel_ctx_gk20a *ch_ctx);
390int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
391 struct channel_ctx_gk20a *ch_ctx);
392void gr_gk20a_commit_global_pagepool(struct gk20a *g,
393 struct channel_ctx_gk20a *ch_ctx,
394 u64 addr, u32 size, bool patch);
395void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data);
396void gr_gk20a_enable_hww_exceptions(struct gk20a *g);
397void gr_gk20a_get_sm_dsm_perf_regs(struct gk20a *g,
398 u32 *num_sm_dsm_perf_regs,
399 u32 **sm_dsm_perf_regs,
400 u32 *perf_register_stride);
401void gr_gk20a_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
402 u32 *num_sm_dsm_perf_regs,
403 u32 **sm_dsm_perf_regs,
404 u32 *perf_register_stride);
405int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr);
406#endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h
new file mode 100644
index 00000000..a82a1ee7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h
@@ -0,0 +1,179 @@
1/*
2 * GK20A Graphics Context Pri Register Addressing
3 *
4 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18#ifndef _NVHOST_GR_PRI_GK20A_H_
19#define _NVHOST_GR_PRI_GK20A_H_
20
21/*
22 * These convenience macros are generally for use in the management/modification
23 * of the context state store for gr/compute contexts.
24 */
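/*
 * Illustrative example (variable names are placeholders): to turn an
 * offset-style address 'addr' into the unicast address of TPC 't' in
 * GPC 'g', mask off the per-TPC offset and rebase it:
 *
 *	u32 unicast = pri_tpc_addr(pri_tpccs_addr_mask(addr), g, t);
 *
 * pri_is_gpc_addr()/pri_is_tpc_addr() below can be used to classify an
 * incoming address before such a conversion.
 */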
25
26/*
27 * GPC pri addressing
28 */
29static inline u32 pri_gpccs_addr_width(void)
30{
31 return 15; /* from where? */
32}
33static inline u32 pri_gpccs_addr_mask(u32 addr)
34{
35 return addr & ((1 << pri_gpccs_addr_width()) - 1);
36}
37static inline u32 pri_gpc_addr(u32 addr, u32 gpc)
38{
39 return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + addr;
40}
41static inline bool pri_is_gpc_addr_shared(u32 addr)
42{
43 return (addr >= proj_gpc_shared_base_v()) &&
44 (addr < proj_gpc_shared_base_v() + proj_gpc_stride_v());
45}
46static inline bool pri_is_gpc_addr(u32 addr)
47{
48 return ((addr >= proj_gpc_base_v()) &&
49 (addr < proj_gpc_base_v() +
50 proj_scal_litter_num_gpcs_v() * proj_gpc_stride_v())) ||
51 pri_is_gpc_addr_shared(addr);
52}
53static inline u32 pri_get_gpc_num(u32 addr)
54{
55 u32 i, start;
56 u32 num_gpcs = proj_scal_litter_num_gpcs_v();
57
58 for (i = 0; i < num_gpcs; i++) {
59 start = proj_gpc_base_v() + (i * proj_gpc_stride_v());
60 if ((addr >= start) && (addr < (start + proj_gpc_stride_v())))
61 return i;
62 }
63 return 0;
64}
65/*
66 * TPC pri addressing
67 */
68static inline u32 pri_tpccs_addr_width(void)
69{
70 return 11; /* from where? */
71}
72static inline u32 pri_tpccs_addr_mask(u32 addr)
73{
74 return addr & ((1 << pri_tpccs_addr_width()) - 1);
75}
76static inline u32 pri_tpc_addr(u32 addr, u32 gpc, u32 tpc)
77{
78 return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) +
79 proj_tpc_in_gpc_base_v() + (tpc * proj_tpc_in_gpc_stride_v()) +
80 addr;
81}
82static inline bool pri_is_tpc_addr_shared(u32 addr)
83{
84 return (addr >= proj_tpc_in_gpc_shared_base_v()) &&
85 (addr < (proj_tpc_in_gpc_shared_base_v() +
86 proj_tpc_in_gpc_stride_v()));
87}
88static inline bool pri_is_tpc_addr(u32 addr)
89{
90 return ((addr >= proj_tpc_in_gpc_base_v()) &&
91 (addr < proj_tpc_in_gpc_base_v() + (proj_scal_litter_num_tpc_per_gpc_v() *
92 proj_tpc_in_gpc_stride_v())))
93 ||
94 pri_is_tpc_addr_shared(addr);
95}
96static inline u32 pri_get_tpc_num(u32 addr)
97{
98 u32 i, start;
99 u32 num_tpcs = proj_scal_litter_num_tpc_per_gpc_v();
100
101 for (i = 0; i < num_tpcs; i++) {
102 start = proj_tpc_in_gpc_base_v() + (i * proj_tpc_in_gpc_stride_v());
103 if ((addr >= start) && (addr < (start + proj_tpc_in_gpc_stride_v())))
104 return i;
105 }
106 return 0;
107}
108
109/*
110 * BE pri addressing
111 */
112static inline u32 pri_becs_addr_width(void)
113{
114 return 10; /* from where? */
115}
116static inline u32 pri_becs_addr_mask(u32 addr)
117{
118 return addr & ((1 << pri_becs_addr_width()) - 1);
119}
120static inline bool pri_is_be_addr_shared(u32 addr)
121{
122 return (addr >= proj_rop_shared_base_v()) &&
123 (addr < proj_rop_shared_base_v() + proj_rop_stride_v());
124}
125static inline u32 pri_be_shared_addr(u32 addr)
126{
127 return proj_rop_shared_base_v() + pri_becs_addr_mask(addr);
128}
129static inline bool pri_is_be_addr(u32 addr)
130{
131 return ((addr >= proj_rop_base_v()) &&
132 (addr < proj_rop_base_v() + proj_scal_litter_num_fbps_v() * proj_rop_stride_v())) ||
133 pri_is_be_addr_shared(addr);
134}
135
136static inline u32 pri_get_be_num(u32 addr)
137{
138 u32 i, start;
139 u32 num_fbps = proj_scal_litter_num_fbps_v();
140 for (i = 0; i < num_fbps; i++) {
141 start = proj_rop_base_v() + (i * proj_rop_stride_v());
142 if ((addr >= start) && (addr < (start + proj_rop_stride_v())))
143 return i;
144 }
145 return 0;
146}
147
148/*
149 * PPC pri addressing
150 */
151static inline u32 pri_ppccs_addr_width(void)
152{
153 return 9; /* from where? */
154}
155static inline u32 pri_ppccs_addr_mask(u32 addr)
156{
157 return addr & ((1 << pri_ppccs_addr_width()) - 1);
158}
159static inline u32 pri_ppc_addr(u32 addr, u32 gpc, u32 ppc)
160{
161 return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) +
162 proj_ppc_in_gpc_base_v() + (ppc * proj_ppc_in_gpc_stride_v()) + addr;
163}
164
165enum ctxsw_addr_type {
166 CTXSW_ADDR_TYPE_SYS = 0,
167 CTXSW_ADDR_TYPE_GPC = 1,
168 CTXSW_ADDR_TYPE_TPC = 2,
169 CTXSW_ADDR_TYPE_BE = 3,
170 CTXSW_ADDR_TYPE_PPC = 4
171};
172
173#define PRI_BROADCAST_FLAGS_NONE 0
174#define PRI_BROADCAST_FLAGS_GPC BIT(0)
175#define PRI_BROADCAST_FLAGS_TPC BIT(1)
176#define PRI_BROADCAST_FLAGS_BE BIT(2)
177#define PRI_BROADCAST_FLAGS_PPC BIT(3)
178
179#endif /*_NVHOST_GR_PRI_GK20A_H_ */
diff --git a/drivers/gpu/nvgpu/gk20a/hal.c b/drivers/gpu/nvgpu/gk20a/hal.c
new file mode 100644
index 00000000..dea740c2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal.c
@@ -0,0 +1,33 @@
1/*
2 * NVIDIA GPU HAL interface.
3 *
4 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#include "gk20a.h"
17#include "hal_gk20a.h"
18
19int gpu_init_hal(struct gk20a *g)
20{
21 u32 ver = g->gpu_characteristics.arch + g->gpu_characteristics.impl;
22 switch (ver) {
23 case GK20A_GPUID_GK20A:
24 gk20a_dbg_info("gk20a detected");
25 gk20a_init_hal(&g->ops);
26 break;
27 default:
28 gk20a_err(&g->dev->dev, "no support for %x", ver);
29 return -ENODEV;
30 }
31
32 return 0;
33}
diff --git a/drivers/gpu/nvgpu/gk20a/hal.h b/drivers/gpu/nvgpu/gk20a/hal.h
new file mode 100644
index 00000000..da02cf5f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal.h
@@ -0,0 +1,25 @@
1/*
2 * NVIDIA GPU Hardware Abstraction Layer functions definitions.
3 *
4 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef __HAL_GPU__
17#define __HAL_GPU__
18
19#include <linux/kernel.h>
20
21struct gk20a;
22
23int gpu_init_hal(struct gk20a *g);
24
25#endif /* __HAL_GPU__ */
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
new file mode 100644
index 00000000..b3e9b0e6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -0,0 +1,50 @@
1/*
2 * drivers/video/tegra/host/gk20a/hal_gk20a.c
3 *
4 * GK20A Tegra HAL interface.
5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17
18#include "hal_gk20a.h"
19#include "ltc_gk20a.h"
20#include "fb_gk20a.h"
21#include "gk20a.h"
22#include "gk20a_gating_reglist.h"
23#include "channel_gk20a.h"
24
25struct gpu_ops gk20a_ops = {
26 .clock_gating = {
27 .slcg_gr_load_gating_prod =
28 gr_gk20a_slcg_gr_load_gating_prod,
29 .slcg_perf_load_gating_prod =
30 gr_gk20a_slcg_perf_load_gating_prod,
31 .blcg_gr_load_gating_prod =
32 gr_gk20a_blcg_gr_load_gating_prod,
33 .pg_gr_load_gating_prod =
34 gr_gk20a_pg_gr_load_gating_prod,
35 .slcg_therm_load_gating_prod =
36 gr_gk20a_slcg_therm_load_gating_prod,
37 }
38};
39
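/*
 * Copy the static clock-gating ops above into the caller's table, then
 * let each engine (LTC, GR, FB, FIFO) fill in its own function pointers.
 */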
40int gk20a_init_hal(struct gpu_ops *gops)
41{
42 *gops = gk20a_ops;
43 gk20a_init_ltc(gops);
44 gk20a_init_gr(gops);
45 gk20a_init_fb(gops);
46 gk20a_init_fifo(gops);
47 gops->name = "gk20a";
48
49 return 0;
50}
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.h b/drivers/gpu/nvgpu/gk20a/hal_gk20a.h
new file mode 100644
index 00000000..db77a4a7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.h
@@ -0,0 +1,28 @@
1/*
2 * drivers/video/tegra/host/gk20a/hal_gk20a.h
3 *
4 * GK20A Hardware Abstraction Layer functions definitions.
5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17
18#ifndef __HAL_GK20A__
19#define __HAL_GK20A__
20
21#include <linux/kernel.h>
22
23struct gpu_ops;
24struct gk20a;
25
26int gk20a_init_hal(struct gpu_ops *gops);
27
28#endif /* __HAL_GK20A__ */
diff --git a/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h
new file mode 100644
index 00000000..ebf8a873
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h
@@ -0,0 +1,105 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
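/*
 * Illustrative example ('inst_ptr' and the register-write step are
 * assumptions, not defined in this file): a full bus_bar1_block value is
 * built by OR'ing the field accessors below, e.g.
 *
 *	u32 v = bus_bar1_block_target_vid_mem_f() |
 *		bus_bar1_block_mode_virtual_f() |
 *		bus_bar1_block_ptr_f(inst_ptr >> bus_bar1_block_ptr_shift_v());
 *
 * and then written to the offset returned by bus_bar1_block_r().
 */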
50#ifndef _hw_bus_gk20a_h_
51#define _hw_bus_gk20a_h_
52
53static inline u32 bus_bar1_block_r(void)
54{
55 return 0x00001704;
56}
57static inline u32 bus_bar1_block_ptr_f(u32 v)
58{
59 return (v & 0xfffffff) << 0;
60}
61static inline u32 bus_bar1_block_target_vid_mem_f(void)
62{
63 return 0x0;
64}
65static inline u32 bus_bar1_block_mode_virtual_f(void)
66{
67 return 0x80000000;
68}
69static inline u32 bus_bar1_block_ptr_shift_v(void)
70{
71 return 0x0000000c;
72}
73static inline u32 bus_intr_0_r(void)
74{
75 return 0x00001100;
76}
77static inline u32 bus_intr_0_pri_squash_m(void)
78{
79 return 0x1 << 1;
80}
81static inline u32 bus_intr_0_pri_fecserr_m(void)
82{
83 return 0x1 << 2;
84}
85static inline u32 bus_intr_0_pri_timeout_m(void)
86{
87 return 0x1 << 3;
88}
89static inline u32 bus_intr_en_0_r(void)
90{
91 return 0x00001140;
92}
93static inline u32 bus_intr_en_0_pri_squash_m(void)
94{
95 return 0x1 << 1;
96}
97static inline u32 bus_intr_en_0_pri_fecserr_m(void)
98{
99 return 0x1 << 2;
100}
101static inline u32 bus_intr_en_0_pri_timeout_m(void)
102{
103 return 0x1 << 3;
104}
105#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h
new file mode 100644
index 00000000..573329f1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h
@@ -0,0 +1,113 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
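/*
 * Illustrative example ('reg' and the read/write helpers are assumed,
 * not defined here): a channel 'chid' is enabled by OR'ing
 * ccsr_channel_enable_set_true_f() into the value at ccsr_channel_r(chid),
 * while ccsr_channel_busy_v(reg) extracts the busy flag from a previously
 * read ccsr_channel_r() value.
 */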
50#ifndef _hw_ccsr_gk20a_h_
51#define _hw_ccsr_gk20a_h_
52
53static inline u32 ccsr_channel_inst_r(u32 i)
54{
55 return 0x00800000 + i*8;
56}
57static inline u32 ccsr_channel_inst__size_1_v(void)
58{
59 return 0x00000080;
60}
61static inline u32 ccsr_channel_inst_ptr_f(u32 v)
62{
63 return (v & 0xfffffff) << 0;
64}
65static inline u32 ccsr_channel_inst_target_vid_mem_f(void)
66{
67 return 0x0;
68}
69static inline u32 ccsr_channel_inst_bind_false_f(void)
70{
71 return 0x0;
72}
73static inline u32 ccsr_channel_inst_bind_true_f(void)
74{
75 return 0x80000000;
76}
77static inline u32 ccsr_channel_r(u32 i)
78{
79 return 0x00800004 + i*8;
80}
81static inline u32 ccsr_channel__size_1_v(void)
82{
83 return 0x00000080;
84}
85static inline u32 ccsr_channel_enable_v(u32 r)
86{
87 return (r >> 0) & 0x1;
88}
89static inline u32 ccsr_channel_enable_set_f(u32 v)
90{
91 return (v & 0x1) << 10;
92}
93static inline u32 ccsr_channel_enable_set_true_f(void)
94{
95 return 0x400;
96}
97static inline u32 ccsr_channel_enable_clr_true_f(void)
98{
99 return 0x800;
100}
101static inline u32 ccsr_channel_runlist_f(u32 v)
102{
103 return (v & 0xf) << 16;
104}
105static inline u32 ccsr_channel_status_v(u32 r)
106{
107 return (r >> 24) & 0xf;
108}
109static inline u32 ccsr_channel_busy_v(u32 r)
110{
111 return (r >> 28) & 0x1;
112}
113#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h
new file mode 100644
index 00000000..66bf01b0
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h
@@ -0,0 +1,85 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
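/*
 * Illustrative note: the *_config_1_ba_enable_yes_f() values below are
 * written into the matching *_config_1_r() registers when BA (block
 * activity) monitoring is enabled for the GPC and FBP chiplets.
 */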
50#ifndef _hw_chiplet_pwr_gk20a_h_
51#define _hw_chiplet_pwr_gk20a_h_
52
53static inline u32 chiplet_pwr_gpcs_weight_6_r(void)
54{
55 return 0x0010e018;
56}
57static inline u32 chiplet_pwr_gpcs_weight_7_r(void)
58{
59 return 0x0010e01c;
60}
61static inline u32 chiplet_pwr_gpcs_config_1_r(void)
62{
63 return 0x0010e03c;
64}
65static inline u32 chiplet_pwr_gpcs_config_1_ba_enable_yes_f(void)
66{
67 return 0x1;
68}
69static inline u32 chiplet_pwr_fbps_weight_0_r(void)
70{
71 return 0x0010e100;
72}
73static inline u32 chiplet_pwr_fbps_weight_1_r(void)
74{
75 return 0x0010e104;
76}
77static inline u32 chiplet_pwr_fbps_config_1_r(void)
78{
79 return 0x0010e13c;
80}
81static inline u32 chiplet_pwr_fbps_config_1_ba_enable_yes_f(void)
82{
83 return 0x1;
84}
85#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
new file mode 100644
index 00000000..e2a4f2f2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
@@ -0,0 +1,245 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
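/*
 * Illustrative example (the image pointer and memory-read helper are
 * assumed): a context image is validated by comparing the word at byte
 * offset ctxsw_prog_main_image_magic_value_o() against
 * ctxsw_prog_main_image_magic_value_v_value_v(); the per-GPC local header
 * is checked the same way with ctxsw_prog_local_magic_value_o() and
 * ctxsw_prog_local_magic_value_v_value_v().
 */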
50#ifndef _hw_ctxsw_prog_gk20a_h_
51#define _hw_ctxsw_prog_gk20a_h_
52
53static inline u32 ctxsw_prog_fecs_header_v(void)
54{
55 return 0x00000100;
56}
57static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
58{
59 return 0x00000008;
60}
61static inline u32 ctxsw_prog_main_image_patch_count_o(void)
62{
63 return 0x00000010;
64}
65static inline u32 ctxsw_prog_main_image_patch_adr_lo_o(void)
66{
67 return 0x00000014;
68}
69static inline u32 ctxsw_prog_main_image_patch_adr_hi_o(void)
70{
71 return 0x00000018;
72}
73static inline u32 ctxsw_prog_main_image_zcull_o(void)
74{
75 return 0x0000001c;
76}
77static inline u32 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v(void)
78{
79 return 0x00000001;
80}
81static inline u32 ctxsw_prog_main_image_zcull_mode_separate_buffer_v(void)
82{
83 return 0x00000002;
84}
85static inline u32 ctxsw_prog_main_image_zcull_ptr_o(void)
86{
87 return 0x00000020;
88}
89static inline u32 ctxsw_prog_main_image_pm_o(void)
90{
91 return 0x00000028;
92}
93static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
94{
95 return 0x7 << 0;
96}
97static inline u32 ctxsw_prog_main_image_pm_mode_v(u32 r)
98{
99 return (r >> 0) & 0x7;
100}
101static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
102{
103 return 0x0;
104}
105static inline u32 ctxsw_prog_main_image_pm_smpc_mode_m(void)
106{
107 return 0x7 << 3;
108}
109static inline u32 ctxsw_prog_main_image_pm_smpc_mode_v(u32 r)
110{
111 return (r >> 3) & 0x7;
112}
113static inline u32 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f(void)
114{
115 return 0x0;
116}
117static inline u32 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f(void)
118{
119 return 0x8;
120}
121static inline u32 ctxsw_prog_main_image_pm_ptr_o(void)
122{
123 return 0x0000002c;
124}
125static inline u32 ctxsw_prog_main_image_num_save_ops_o(void)
126{
127 return 0x000000f4;
128}
129static inline u32 ctxsw_prog_main_image_num_restore_ops_o(void)
130{
131 return 0x000000f8;
132}
133static inline u32 ctxsw_prog_main_image_magic_value_o(void)
134{
135 return 0x000000fc;
136}
137static inline u32 ctxsw_prog_main_image_magic_value_v_value_v(void)
138{
139 return 0x600dc0de;
140}
141static inline u32 ctxsw_prog_main_image_priv_access_map_config_o(void)
142{
143 return 0x000000a0;
144}
145static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f(void)
146{
147 return 0x0;
148}
149static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_allow_none_f(void)
150{
151 return 0x1;
152}
153static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f(void)
154{
155 return 0x2;
156}
157static inline u32 ctxsw_prog_main_image_priv_access_map_addr_lo_o(void)
158{
159 return 0x000000a4;
160}
161static inline u32 ctxsw_prog_main_image_priv_access_map_addr_hi_o(void)
162{
163 return 0x000000a8;
164}
165static inline u32 ctxsw_prog_main_image_misc_options_o(void)
166{
167 return 0x0000003c;
168}
169static inline u32 ctxsw_prog_main_image_misc_options_verif_features_m(void)
170{
171 return 0x1 << 3;
172}
173static inline u32 ctxsw_prog_main_image_misc_options_verif_features_disabled_f(void)
174{
175 return 0x0;
176}
177static inline u32 ctxsw_prog_main_image_misc_options_verif_features_enabled_f(void)
178{
179 return 0x8;
180}
181static inline u32 ctxsw_prog_local_priv_register_ctl_o(void)
182{
183 return 0x0000000c;
184}
185static inline u32 ctxsw_prog_local_priv_register_ctl_offset_v(u32 r)
186{
187 return (r >> 0) & 0xffff;
188}
189static inline u32 ctxsw_prog_local_image_ppc_info_o(void)
190{
191 return 0x000000f4;
192}
193static inline u32 ctxsw_prog_local_image_ppc_info_num_ppcs_v(u32 r)
194{
195 return (r >> 0) & 0xffff;
196}
197static inline u32 ctxsw_prog_local_image_ppc_info_ppc_mask_v(u32 r)
198{
199 return (r >> 16) & 0xffff;
200}
201static inline u32 ctxsw_prog_local_image_num_tpcs_o(void)
202{
203 return 0x000000f8;
204}
205static inline u32 ctxsw_prog_local_magic_value_o(void)
206{
207 return 0x000000fc;
208}
209static inline u32 ctxsw_prog_local_magic_value_v_value_v(void)
210{
211 return 0xad0becab;
212}
213static inline u32 ctxsw_prog_main_extended_buffer_ctl_o(void)
214{
215 return 0x000000ec;
216}
217static inline u32 ctxsw_prog_main_extended_buffer_ctl_offset_v(u32 r)
218{
219 return (r >> 0) & 0xffff;
220}
221static inline u32 ctxsw_prog_main_extended_buffer_ctl_size_v(u32 r)
222{
223 return (r >> 16) & 0xff;
224}
225static inline u32 ctxsw_prog_extended_buffer_segments_size_in_bytes_v(void)
226{
227 return 0x00000100;
228}
229static inline u32 ctxsw_prog_extended_marker_size_in_bytes_v(void)
230{
231 return 0x00000004;
232}
233static inline u32 ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v(void)
234{
235 return 0x00000005;
236}
237static inline u32 ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v(void)
238{
239 return 0x00000004;
240}
241static inline u32 ctxsw_prog_extended_num_smpc_quadrants_v(void)
242{
243 return 0x00000004;
244}
245#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h
new file mode 100644
index 00000000..b7edc29d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h
@@ -0,0 +1,213 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
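/*
 * Illustrative example ('pdb_addr' is an assumed, already-shifted PDB
 * address): a full TLB invalidate is requested by writing
 *
 *	fb_mmu_invalidate_pdb_addr_f(pdb_addr) |
 *	fb_mmu_invalidate_pdb_aperture_vid_mem_f()
 *
 * to fb_mmu_invalidate_pdb_r(), then
 *
 *	fb_mmu_invalidate_all_va_true_f() |
 *	fb_mmu_invalidate_trigger_true_f()
 *
 * to fb_mmu_invalidate_r(), and waiting for completion (e.g. by polling
 * fb_mmu_invalidate_trigger_v() on re-reads of that register).
 */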
50#ifndef _hw_fb_gk20a_h_
51#define _hw_fb_gk20a_h_
52
53static inline u32 fb_mmu_ctrl_r(void)
54{
55 return 0x00100c80;
56}
57static inline u32 fb_mmu_ctrl_vm_pg_size_f(u32 v)
58{
59 return (v & 0x1) << 0;
60}
61static inline u32 fb_mmu_ctrl_vm_pg_size_128kb_f(void)
62{
63 return 0x0;
64}
65static inline u32 fb_mmu_ctrl_pri_fifo_empty_v(u32 r)
66{
67 return (r >> 15) & 0x1;
68}
69static inline u32 fb_mmu_ctrl_pri_fifo_empty_false_f(void)
70{
71 return 0x0;
72}
73static inline u32 fb_mmu_ctrl_pri_fifo_space_v(u32 r)
74{
75 return (r >> 16) & 0xff;
76}
77static inline u32 fb_mmu_invalidate_pdb_r(void)
78{
79 return 0x00100cb8;
80}
81static inline u32 fb_mmu_invalidate_pdb_aperture_vid_mem_f(void)
82{
83 return 0x0;
84}
85static inline u32 fb_mmu_invalidate_pdb_addr_f(u32 v)
86{
87 return (v & 0xfffffff) << 4;
88}
89static inline u32 fb_mmu_invalidate_r(void)
90{
91 return 0x00100cbc;
92}
93static inline u32 fb_mmu_invalidate_all_va_true_f(void)
94{
95 return 0x1;
96}
97static inline u32 fb_mmu_invalidate_all_pdb_true_f(void)
98{
99 return 0x2;
100}
101static inline u32 fb_mmu_invalidate_trigger_s(void)
102{
103 return 1;
104}
105static inline u32 fb_mmu_invalidate_trigger_f(u32 v)
106{
107 return (v & 0x1) << 31;
108}
109static inline u32 fb_mmu_invalidate_trigger_m(void)
110{
111 return 0x1 << 31;
112}
113static inline u32 fb_mmu_invalidate_trigger_v(u32 r)
114{
115 return (r >> 31) & 0x1;
116}
117static inline u32 fb_mmu_invalidate_trigger_true_f(void)
118{
119 return 0x80000000;
120}
121static inline u32 fb_mmu_debug_wr_r(void)
122{
123 return 0x00100cc8;
124}
125static inline u32 fb_mmu_debug_wr_aperture_s(void)
126{
127 return 2;
128}
129static inline u32 fb_mmu_debug_wr_aperture_f(u32 v)
130{
131 return (v & 0x3) << 0;
132}
133static inline u32 fb_mmu_debug_wr_aperture_m(void)
134{
135 return 0x3 << 0;
136}
137static inline u32 fb_mmu_debug_wr_aperture_v(u32 r)
138{
139 return (r >> 0) & 0x3;
140}
141static inline u32 fb_mmu_debug_wr_aperture_vid_mem_f(void)
142{
143 return 0x0;
144}
145static inline u32 fb_mmu_debug_wr_vol_false_f(void)
146{
147 return 0x0;
148}
149static inline u32 fb_mmu_debug_wr_vol_true_v(void)
150{
151 return 0x00000001;
152}
153static inline u32 fb_mmu_debug_wr_vol_true_f(void)
154{
155 return 0x4;
156}
157static inline u32 fb_mmu_debug_wr_addr_v(u32 r)
158{
159 return (r >> 4) & 0xfffffff;
160}
161static inline u32 fb_mmu_debug_wr_addr_alignment_v(void)
162{
163 return 0x0000000c;
164}
165static inline u32 fb_mmu_debug_rd_r(void)
166{
167 return 0x00100ccc;
168}
169static inline u32 fb_mmu_debug_rd_aperture_vid_mem_f(void)
170{
171 return 0x0;
172}
173static inline u32 fb_mmu_debug_rd_vol_false_f(void)
174{
175 return 0x0;
176}
177static inline u32 fb_mmu_debug_rd_addr_v(u32 r)
178{
179 return (r >> 4) & 0xfffffff;
180}
181static inline u32 fb_mmu_debug_rd_addr_alignment_v(void)
182{
183 return 0x0000000c;
184}
185static inline u32 fb_mmu_debug_ctrl_r(void)
186{
187 return 0x00100cc4;
188}
189static inline u32 fb_mmu_debug_ctrl_debug_v(u32 r)
190{
191 return (r >> 16) & 0x1;
192}
193static inline u32 fb_mmu_debug_ctrl_debug_enabled_v(void)
194{
195 return 0x00000001;
196}
197static inline u32 fb_mmu_vpr_info_r(void)
198{
199 return 0x00100cd0;
200}
201static inline u32 fb_mmu_vpr_info_fetch_v(u32 r)
202{
203 return (r >> 2) & 0x1;
204}
205static inline u32 fb_mmu_vpr_info_fetch_false_v(void)
206{
207 return 0x00000000;
208}
209static inline u32 fb_mmu_vpr_info_fetch_true_v(void)
210{
211 return 0x00000001;
212}
213#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h
new file mode 100644
index 00000000..a39d3c51
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h
@@ -0,0 +1,565 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
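/*
 * Illustrative example ('reg' is an assumed value read back from
 * fifo_engine_status_r(i)): the scheduler state of engine 'i' is decoded
 * field by field, e.g.
 *
 *	busy  = fifo_engine_status_engine_v(reg) ==
 *			fifo_engine_status_engine_busy_v();
 *	ctxsw = fifo_engine_status_ctx_status_v(reg) ==
 *			fifo_engine_status_ctx_status_ctxsw_switch_v();
 *	chid  = fifo_engine_status_id_v(reg);
 */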
50#ifndef _hw_fifo_gk20a_h_
51#define _hw_fifo_gk20a_h_
52
53static inline u32 fifo_bar1_base_r(void)
54{
55 return 0x00002254;
56}
57static inline u32 fifo_bar1_base_ptr_f(u32 v)
58{
59 return (v & 0xfffffff) << 0;
60}
61static inline u32 fifo_bar1_base_ptr_align_shift_v(void)
62{
63 return 0x0000000c;
64}
65static inline u32 fifo_bar1_base_valid_false_f(void)
66{
67 return 0x0;
68}
69static inline u32 fifo_bar1_base_valid_true_f(void)
70{
71 return 0x10000000;
72}
73static inline u32 fifo_runlist_base_r(void)
74{
75 return 0x00002270;
76}
77static inline u32 fifo_runlist_base_ptr_f(u32 v)
78{
79 return (v & 0xfffffff) << 0;
80}
81static inline u32 fifo_runlist_base_target_vid_mem_f(void)
82{
83 return 0x0;
84}
85static inline u32 fifo_runlist_r(void)
86{
87 return 0x00002274;
88}
89static inline u32 fifo_runlist_engine_f(u32 v)
90{
91 return (v & 0xf) << 20;
92}
93static inline u32 fifo_eng_runlist_base_r(u32 i)
94{
95 return 0x00002280 + i*8;
96}
97static inline u32 fifo_eng_runlist_base__size_1_v(void)
98{
99 return 0x00000001;
100}
101static inline u32 fifo_eng_runlist_r(u32 i)
102{
103 return 0x00002284 + i*8;
104}
105static inline u32 fifo_eng_runlist__size_1_v(void)
106{
107 return 0x00000001;
108}
109static inline u32 fifo_eng_runlist_length_f(u32 v)
110{
111 return (v & 0xffff) << 0;
112}
113static inline u32 fifo_eng_runlist_pending_true_f(void)
114{
115 return 0x100000;
116}
117static inline u32 fifo_eng_timeslice_r(u32 i)
118{
119 return 0x00002310 + i*4;
120}
121static inline u32 fifo_eng_timeslice_timeout_128_f(void)
122{
123 return 0x80;
124}
125static inline u32 fifo_eng_timeslice_timescale_3_f(void)
126{
127 return 0x3000;
128}
129static inline u32 fifo_eng_timeslice_enable_true_f(void)
130{
131 return 0x10000000;
132}
133static inline u32 fifo_pb_timeslice_r(u32 i)
134{
135 return 0x00002350 + i*4;
136}
137static inline u32 fifo_pb_timeslice_timeout_16_f(void)
138{
139 return 0x10;
140}
141static inline u32 fifo_pb_timeslice_timescale_0_f(void)
142{
143 return 0x0;
144}
145static inline u32 fifo_pb_timeslice_enable_true_f(void)
146{
147 return 0x10000000;
148}
149static inline u32 fifo_pbdma_map_r(u32 i)
150{
151 return 0x00002390 + i*4;
152}
153static inline u32 fifo_intr_0_r(void)
154{
155 return 0x00002100;
156}
157static inline u32 fifo_intr_0_bind_error_pending_f(void)
158{
159 return 0x1;
160}
161static inline u32 fifo_intr_0_bind_error_reset_f(void)
162{
163 return 0x1;
164}
165static inline u32 fifo_intr_0_pio_error_pending_f(void)
166{
167 return 0x10;
168}
169static inline u32 fifo_intr_0_pio_error_reset_f(void)
170{
171 return 0x10;
172}
173static inline u32 fifo_intr_0_sched_error_pending_f(void)
174{
175 return 0x100;
176}
177static inline u32 fifo_intr_0_sched_error_reset_f(void)
178{
179 return 0x100;
180}
181static inline u32 fifo_intr_0_chsw_error_pending_f(void)
182{
183 return 0x10000;
184}
185static inline u32 fifo_intr_0_chsw_error_reset_f(void)
186{
187 return 0x10000;
188}
189static inline u32 fifo_intr_0_fb_flush_timeout_pending_f(void)
190{
191 return 0x800000;
192}
193static inline u32 fifo_intr_0_fb_flush_timeout_reset_f(void)
194{
195 return 0x800000;
196}
197static inline u32 fifo_intr_0_lb_error_pending_f(void)
198{
199 return 0x1000000;
200}
201static inline u32 fifo_intr_0_lb_error_reset_f(void)
202{
203 return 0x1000000;
204}
205static inline u32 fifo_intr_0_dropped_mmu_fault_pending_f(void)
206{
207 return 0x8000000;
208}
209static inline u32 fifo_intr_0_dropped_mmu_fault_reset_f(void)
210{
211 return 0x8000000;
212}
213static inline u32 fifo_intr_0_mmu_fault_pending_f(void)
214{
215 return 0x10000000;
216}
217static inline u32 fifo_intr_0_pbdma_intr_pending_f(void)
218{
219 return 0x20000000;
220}
221static inline u32 fifo_intr_0_runlist_event_pending_f(void)
222{
223 return 0x40000000;
224}
225static inline u32 fifo_intr_0_channel_intr_pending_f(void)
226{
227 return 0x80000000;
228}
229static inline u32 fifo_intr_en_0_r(void)
230{
231 return 0x00002140;
232}
233static inline u32 fifo_intr_en_1_r(void)
234{
235 return 0x00002528;
236}
237static inline u32 fifo_intr_bind_error_r(void)
238{
239 return 0x0000252c;
240}
241static inline u32 fifo_intr_sched_error_r(void)
242{
243 return 0x0000254c;
244}
245static inline u32 fifo_intr_sched_error_code_f(u32 v)
246{
247 return (v & 0xff) << 0;
248}
249static inline u32 fifo_intr_sched_error_code_ctxsw_timeout_v(void)
250{
251 return 0x0000000a;
252}
253static inline u32 fifo_intr_chsw_error_r(void)
254{
255 return 0x0000256c;
256}
257static inline u32 fifo_intr_mmu_fault_id_r(void)
258{
259 return 0x0000259c;
260}
261static inline u32 fifo_intr_mmu_fault_eng_id_graphics_v(void)
262{
263 return 0x00000000;
264}
265static inline u32 fifo_intr_mmu_fault_eng_id_graphics_f(void)
266{
267 return 0x0;
268}
269static inline u32 fifo_intr_mmu_fault_inst_r(u32 i)
270{
271 return 0x00002800 + i*16;
272}
273static inline u32 fifo_intr_mmu_fault_inst_ptr_v(u32 r)
274{
275 return (r >> 0) & 0xfffffff;
276}
277static inline u32 fifo_intr_mmu_fault_inst_ptr_align_shift_v(void)
278{
279 return 0x0000000c;
280}
281static inline u32 fifo_intr_mmu_fault_lo_r(u32 i)
282{
283 return 0x00002804 + i*16;
284}
285static inline u32 fifo_intr_mmu_fault_hi_r(u32 i)
286{
287 return 0x00002808 + i*16;
288}
289static inline u32 fifo_intr_mmu_fault_info_r(u32 i)
290{
291 return 0x0000280c + i*16;
292}
293static inline u32 fifo_intr_mmu_fault_info_type_v(u32 r)
294{
295 return (r >> 0) & 0xf;
296}
297static inline u32 fifo_intr_mmu_fault_info_engine_subid_v(u32 r)
298{
299 return (r >> 6) & 0x1;
300}
301static inline u32 fifo_intr_mmu_fault_info_engine_subid_gpc_v(void)
302{
303 return 0x00000000;
304}
305static inline u32 fifo_intr_mmu_fault_info_engine_subid_hub_v(void)
306{
307 return 0x00000001;
308}
309static inline u32 fifo_intr_mmu_fault_info_client_v(u32 r)
310{
311 return (r >> 8) & 0x1f;
312}
313static inline u32 fifo_intr_pbdma_id_r(void)
314{
315 return 0x000025a0;
316}
317static inline u32 fifo_intr_pbdma_id_status_f(u32 v, u32 i)
318{
319 return (v & 0x1) << (0 + i*1);
320}
321static inline u32 fifo_intr_pbdma_id_status__size_1_v(void)
322{
323 return 0x00000001;
324}
325static inline u32 fifo_intr_runlist_r(void)
326{
327 return 0x00002a00;
328}
329static inline u32 fifo_fb_timeout_r(void)
330{
331 return 0x00002a04;
332}
333static inline u32 fifo_fb_timeout_period_m(void)
334{
335 return 0x3fffffff << 0;
336}
337static inline u32 fifo_fb_timeout_period_max_f(void)
338{
339 return 0x3fffffff;
340}
341static inline u32 fifo_pb_timeout_r(void)
342{
343 return 0x00002a08;
344}
345static inline u32 fifo_pb_timeout_detection_enabled_f(void)
346{
347 return 0x80000000;
348}
349static inline u32 fifo_eng_timeout_r(void)
350{
351 return 0x00002a0c;
352}
353static inline u32 fifo_eng_timeout_period_m(void)
354{
355 return 0x7fffffff << 0;
356}
357static inline u32 fifo_eng_timeout_period_max_f(void)
358{
359 return 0x7fffffff;
360}
361static inline u32 fifo_eng_timeout_detection_m(void)
362{
363 return 0x1 << 31;
364}
365static inline u32 fifo_eng_timeout_detection_enabled_f(void)
366{
367 return 0x80000000;
368}
369static inline u32 fifo_eng_timeout_detection_disabled_f(void)
370{
371 return 0x0;
372}
373static inline u32 fifo_error_sched_disable_r(void)
374{
375 return 0x0000262c;
376}
377static inline u32 fifo_sched_disable_r(void)
378{
379 return 0x00002630;
380}
381static inline u32 fifo_sched_disable_runlist_f(u32 v, u32 i)
382{
383 return (v & 0x1) << (0 + i*1);
384}
385static inline u32 fifo_sched_disable_runlist_m(u32 i)
386{
387 return 0x1 << (0 + i*1);
388}
389static inline u32 fifo_sched_disable_true_v(void)
390{
391 return 0x00000001;
392}
393static inline u32 fifo_preempt_r(void)
394{
395 return 0x00002634;
396}
397static inline u32 fifo_preempt_pending_true_f(void)
398{
399 return 0x100000;
400}
401static inline u32 fifo_preempt_type_channel_f(void)
402{
403 return 0x0;
404}
405static inline u32 fifo_preempt_chid_f(u32 v)
406{
407 return (v & 0xfff) << 0;
408}
409static inline u32 fifo_trigger_mmu_fault_r(u32 i)
410{
411 return 0x00002a30 + i*4;
412}
413static inline u32 fifo_trigger_mmu_fault_id_f(u32 v)
414{
415 return (v & 0x1f) << 0;
416}
417static inline u32 fifo_trigger_mmu_fault_enable_f(u32 v)
418{
419 return (v & 0x1) << 8;
420}
421static inline u32 fifo_engine_status_r(u32 i)
422{
423 return 0x00002640 + i*8;
424}
425static inline u32 fifo_engine_status__size_1_v(void)
426{
427 return 0x00000002;
428}
429static inline u32 fifo_engine_status_id_v(u32 r)
430{
431 return (r >> 0) & 0xfff;
432}
433static inline u32 fifo_engine_status_id_type_v(u32 r)
434{
435 return (r >> 12) & 0x1;
436}
437static inline u32 fifo_engine_status_id_type_chid_v(void)
438{
439 return 0x00000000;
440}
441static inline u32 fifo_engine_status_ctx_status_v(u32 r)
442{
443 return (r >> 13) & 0x7;
444}
445static inline u32 fifo_engine_status_ctx_status_valid_v(void)
446{
447 return 0x00000001;
448}
449static inline u32 fifo_engine_status_ctx_status_ctxsw_load_v(void)
450{
451 return 0x00000005;
452}
453static inline u32 fifo_engine_status_ctx_status_ctxsw_save_v(void)
454{
455 return 0x00000006;
456}
457static inline u32 fifo_engine_status_ctx_status_ctxsw_switch_v(void)
458{
459 return 0x00000007;
460}
461static inline u32 fifo_engine_status_next_id_v(u32 r)
462{
463 return (r >> 16) & 0xfff;
464}
465static inline u32 fifo_engine_status_next_id_type_v(u32 r)
466{
467 return (r >> 28) & 0x1;
468}
469static inline u32 fifo_engine_status_next_id_type_chid_v(void)
470{
471 return 0x00000000;
472}
473static inline u32 fifo_engine_status_faulted_v(u32 r)
474{
475 return (r >> 30) & 0x1;
476}
477static inline u32 fifo_engine_status_faulted_true_v(void)
478{
479 return 0x00000001;
480}
481static inline u32 fifo_engine_status_engine_v(u32 r)
482{
483 return (r >> 31) & 0x1;
484}
485static inline u32 fifo_engine_status_engine_idle_v(void)
486{
487 return 0x00000000;
488}
489static inline u32 fifo_engine_status_engine_busy_v(void)
490{
491 return 0x00000001;
492}
493static inline u32 fifo_engine_status_ctxsw_v(u32 r)
494{
495 return (r >> 15) & 0x1;
496}
497static inline u32 fifo_engine_status_ctxsw_in_progress_v(void)
498{
499 return 0x00000001;
500}
501static inline u32 fifo_engine_status_ctxsw_in_progress_f(void)
502{
503 return 0x8000;
504}
505static inline u32 fifo_pbdma_status_r(u32 i)
506{
507 return 0x00003080 + i*4;
508}
509static inline u32 fifo_pbdma_status__size_1_v(void)
510{
511 return 0x00000001;
512}
513static inline u32 fifo_pbdma_status_id_v(u32 r)
514{
515 return (r >> 0) & 0xfff;
516}
517static inline u32 fifo_pbdma_status_id_type_v(u32 r)
518{
519 return (r >> 12) & 0x1;
520}
521static inline u32 fifo_pbdma_status_id_type_chid_v(void)
522{
523 return 0x00000000;
524}
525static inline u32 fifo_pbdma_status_chan_status_v(u32 r)
526{
527 return (r >> 13) & 0x7;
528}
529static inline u32 fifo_pbdma_status_chan_status_valid_v(void)
530{
531 return 0x00000001;
532}
533static inline u32 fifo_pbdma_status_chan_status_chsw_load_v(void)
534{
535 return 0x00000005;
536}
537static inline u32 fifo_pbdma_status_chan_status_chsw_save_v(void)
538{
539 return 0x00000006;
540}
541static inline u32 fifo_pbdma_status_chan_status_chsw_switch_v(void)
542{
543 return 0x00000007;
544}
545static inline u32 fifo_pbdma_status_next_id_v(u32 r)
546{
547 return (r >> 16) & 0xfff;
548}
549static inline u32 fifo_pbdma_status_next_id_type_v(u32 r)
550{
551 return (r >> 28) & 0x1;
552}
553static inline u32 fifo_pbdma_status_next_id_type_chid_v(void)
554{
555 return 0x00000000;
556}
557static inline u32 fifo_pbdma_status_chsw_v(u32 r)
558{
559 return (r >> 15) & 0x1;
560}
561static inline u32 fifo_pbdma_status_chsw_in_progress_v(void)
562{
563 return 0x00000001;
564}
565#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h
new file mode 100644
index 00000000..0aeb11f9
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h
@@ -0,0 +1,141 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_flush_gk20a_h_
51#define _hw_flush_gk20a_h_
52
53static inline u32 flush_l2_system_invalidate_r(void)
54{
55 return 0x00070004;
56}
57static inline u32 flush_l2_system_invalidate_pending_v(u32 r)
58{
59 return (r >> 0) & 0x1;
60}
61static inline u32 flush_l2_system_invalidate_pending_busy_v(void)
62{
63 return 0x00000001;
64}
65static inline u32 flush_l2_system_invalidate_pending_busy_f(void)
66{
67 return 0x1;
68}
69static inline u32 flush_l2_system_invalidate_outstanding_v(u32 r)
70{
71 return (r >> 1) & 0x1;
72}
73static inline u32 flush_l2_system_invalidate_outstanding_true_v(void)
74{
75 return 0x00000001;
76}
77static inline u32 flush_l2_flush_dirty_r(void)
78{
79 return 0x00070010;
80}
81static inline u32 flush_l2_flush_dirty_pending_v(u32 r)
82{
83 return (r >> 0) & 0x1;
84}
85static inline u32 flush_l2_flush_dirty_pending_empty_v(void)
86{
87 return 0x00000000;
88}
89static inline u32 flush_l2_flush_dirty_pending_empty_f(void)
90{
91 return 0x0;
92}
93static inline u32 flush_l2_flush_dirty_pending_busy_v(void)
94{
95 return 0x00000001;
96}
97static inline u32 flush_l2_flush_dirty_pending_busy_f(void)
98{
99 return 0x1;
100}
101static inline u32 flush_l2_flush_dirty_outstanding_v(u32 r)
102{
103 return (r >> 1) & 0x1;
104}
105static inline u32 flush_l2_flush_dirty_outstanding_false_v(void)
106{
107 return 0x00000000;
108}
109static inline u32 flush_l2_flush_dirty_outstanding_false_f(void)
110{
111 return 0x0;
112}
113static inline u32 flush_l2_flush_dirty_outstanding_true_v(void)
114{
115 return 0x00000001;
116}
117static inline u32 flush_fb_flush_r(void)
118{
119 return 0x00070000;
120}
121static inline u32 flush_fb_flush_pending_v(u32 r)
122{
123 return (r >> 0) & 0x1;
124}
125static inline u32 flush_fb_flush_pending_busy_v(void)
126{
127 return 0x00000001;
128}
129static inline u32 flush_fb_flush_pending_busy_f(void)
130{
131 return 0x1;
132}
133static inline u32 flush_fb_flush_outstanding_v(u32 r)
134{
135 return (r >> 1) & 0x1;
136}
137static inline u32 flush_fb_flush_outstanding_true_v(void)
138{
139 return 0x00000001;
140}
141#endif
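These flush registers are typically used as kick-and-poll pairs: a _f() constant (for example flush_fb_flush_pending_busy_f()) is written to start the operation, and the corresponding _v() pair is compared against until the hardware reports idle. A minimal sketch of the poll predicate for the FB flush, assuming the caller has already read back flush_fb_flush_r(); the helper name and includes are illustrative:

#include <linux/types.h>
#include "hw_flush_gk20a.h"

/* True while an FB flush is still in flight, judging from a raw
 * value read back from flush_fb_flush_r(). */
static bool fb_flush_in_flight(u32 val)
{
	return flush_fb_flush_pending_v(val) ==
			flush_fb_flush_pending_busy_v() ||
	       flush_fb_flush_outstanding_v(val) ==
			flush_fb_flush_outstanding_true_v();
}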
diff --git a/drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h
new file mode 100644
index 00000000..e0118946
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h
@@ -0,0 +1,1141 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_gmmu_gk20a_h_
51#define _hw_gmmu_gk20a_h_
52
53static inline u32 gmmu_pde_aperture_big_w(void)
54{
55 return 0;
56}
57static inline u32 gmmu_pde_aperture_big_invalid_f(void)
58{
59 return 0x0;
60}
61static inline u32 gmmu_pde_aperture_big_video_memory_f(void)
62{
63 return 0x1;
64}
65static inline u32 gmmu_pde_size_w(void)
66{
67 return 0;
68}
69static inline u32 gmmu_pde_size_full_f(void)
70{
71 return 0x0;
72}
73static inline u32 gmmu_pde_address_big_sys_f(u32 v)
74{
75 return (v & 0xfffffff) << 4;
76}
77static inline u32 gmmu_pde_address_big_sys_w(void)
78{
79 return 0;
80}
81static inline u32 gmmu_pde_aperture_small_w(void)
82{
83 return 1;
84}
85static inline u32 gmmu_pde_aperture_small_invalid_f(void)
86{
87 return 0x0;
88}
89static inline u32 gmmu_pde_aperture_small_video_memory_f(void)
90{
91 return 0x1;
92}
93static inline u32 gmmu_pde_vol_small_w(void)
94{
95 return 1;
96}
97static inline u32 gmmu_pde_vol_small_true_f(void)
98{
99 return 0x4;
100}
101static inline u32 gmmu_pde_vol_small_false_f(void)
102{
103 return 0x0;
104}
105static inline u32 gmmu_pde_vol_big_w(void)
106{
107 return 1;
108}
109static inline u32 gmmu_pde_vol_big_true_f(void)
110{
111 return 0x8;
112}
113static inline u32 gmmu_pde_vol_big_false_f(void)
114{
115 return 0x0;
116}
117static inline u32 gmmu_pde_address_small_sys_f(u32 v)
118{
119 return (v & 0xfffffff) << 4;
120}
121static inline u32 gmmu_pde_address_small_sys_w(void)
122{
123 return 1;
124}
125static inline u32 gmmu_pde_address_shift_v(void)
126{
127 return 0x0000000c;
128}
129static inline u32 gmmu_pde__size_v(void)
130{
131 return 0x00000008;
132}
133static inline u32 gmmu_pte__size_v(void)
134{
135 return 0x00000008;
136}
137static inline u32 gmmu_pte_valid_w(void)
138{
139 return 0;
140}
141static inline u32 gmmu_pte_valid_true_f(void)
142{
143 return 0x1;
144}
145static inline u32 gmmu_pte_address_sys_f(u32 v)
146{
147 return (v & 0xfffffff) << 4;
148}
149static inline u32 gmmu_pte_address_sys_w(void)
150{
151 return 0;
152}
153static inline u32 gmmu_pte_vol_w(void)
154{
155 return 1;
156}
157static inline u32 gmmu_pte_vol_true_f(void)
158{
159 return 0x1;
160}
161static inline u32 gmmu_pte_vol_false_f(void)
162{
163 return 0x0;
164}
165static inline u32 gmmu_pte_aperture_w(void)
166{
167 return 1;
168}
169static inline u32 gmmu_pte_aperture_video_memory_f(void)
170{
171 return 0x0;
172}
173static inline u32 gmmu_pte_read_only_w(void)
174{
175 return 0;
176}
177static inline u32 gmmu_pte_read_only_true_f(void)
178{
179 return 0x4;
180}
181static inline u32 gmmu_pte_write_disable_w(void)
182{
183 return 1;
184}
185static inline u32 gmmu_pte_write_disable_true_f(void)
186{
187 return 0x80000000;
188}
189static inline u32 gmmu_pte_read_disable_w(void)
190{
191 return 1;
192}
193static inline u32 gmmu_pte_read_disable_true_f(void)
194{
195 return 0x40000000;
196}
197static inline u32 gmmu_pte_comptagline_f(u32 v)
198{
199 return (v & 0x1ffff) << 12;
200}
201static inline u32 gmmu_pte_comptagline_w(void)
202{
203 return 1;
204}
205static inline u32 gmmu_pte_address_shift_v(void)
206{
207 return 0x0000000c;
208}
209static inline u32 gmmu_pte_kind_f(u32 v)
210{
211 return (v & 0xff) << 4;
212}
213static inline u32 gmmu_pte_kind_w(void)
214{
215 return 1;
216}
217static inline u32 gmmu_pte_kind_invalid_v(void)
218{
219 return 0x000000ff;
220}
221static inline u32 gmmu_pte_kind_pitch_v(void)
222{
223 return 0x00000000;
224}
225static inline u32 gmmu_pte_kind_z16_v(void)
226{
227 return 0x00000001;
228}
229static inline u32 gmmu_pte_kind_z16_2c_v(void)
230{
231 return 0x00000002;
232}
233static inline u32 gmmu_pte_kind_z16_ms2_2c_v(void)
234{
235 return 0x00000003;
236}
237static inline u32 gmmu_pte_kind_z16_ms4_2c_v(void)
238{
239 return 0x00000004;
240}
241static inline u32 gmmu_pte_kind_z16_ms8_2c_v(void)
242{
243 return 0x00000005;
244}
245static inline u32 gmmu_pte_kind_z16_ms16_2c_v(void)
246{
247 return 0x00000006;
248}
249static inline u32 gmmu_pte_kind_z16_2z_v(void)
250{
251 return 0x00000007;
252}
253static inline u32 gmmu_pte_kind_z16_ms2_2z_v(void)
254{
255 return 0x00000008;
256}
257static inline u32 gmmu_pte_kind_z16_ms4_2z_v(void)
258{
259 return 0x00000009;
260}
261static inline u32 gmmu_pte_kind_z16_ms8_2z_v(void)
262{
263 return 0x0000000a;
264}
265static inline u32 gmmu_pte_kind_z16_ms16_2z_v(void)
266{
267 return 0x0000000b;
268}
269static inline u32 gmmu_pte_kind_z16_4cz_v(void)
270{
271 return 0x0000000c;
272}
273static inline u32 gmmu_pte_kind_z16_ms2_4cz_v(void)
274{
275 return 0x0000000d;
276}
277static inline u32 gmmu_pte_kind_z16_ms4_4cz_v(void)
278{
279 return 0x0000000e;
280}
281static inline u32 gmmu_pte_kind_z16_ms8_4cz_v(void)
282{
283 return 0x0000000f;
284}
285static inline u32 gmmu_pte_kind_z16_ms16_4cz_v(void)
286{
287 return 0x00000010;
288}
289static inline u32 gmmu_pte_kind_s8z24_v(void)
290{
291 return 0x00000011;
292}
293static inline u32 gmmu_pte_kind_s8z24_1z_v(void)
294{
295 return 0x00000012;
296}
297static inline u32 gmmu_pte_kind_s8z24_ms2_1z_v(void)
298{
299 return 0x00000013;
300}
301static inline u32 gmmu_pte_kind_s8z24_ms4_1z_v(void)
302{
303 return 0x00000014;
304}
305static inline u32 gmmu_pte_kind_s8z24_ms8_1z_v(void)
306{
307 return 0x00000015;
308}
309static inline u32 gmmu_pte_kind_s8z24_ms16_1z_v(void)
310{
311 return 0x00000016;
312}
313static inline u32 gmmu_pte_kind_s8z24_2cz_v(void)
314{
315 return 0x00000017;
316}
317static inline u32 gmmu_pte_kind_s8z24_ms2_2cz_v(void)
318{
319 return 0x00000018;
320}
321static inline u32 gmmu_pte_kind_s8z24_ms4_2cz_v(void)
322{
323 return 0x00000019;
324}
325static inline u32 gmmu_pte_kind_s8z24_ms8_2cz_v(void)
326{
327 return 0x0000001a;
328}
329static inline u32 gmmu_pte_kind_s8z24_ms16_2cz_v(void)
330{
331 return 0x0000001b;
332}
333static inline u32 gmmu_pte_kind_s8z24_2cs_v(void)
334{
335 return 0x0000001c;
336}
337static inline u32 gmmu_pte_kind_s8z24_ms2_2cs_v(void)
338{
339 return 0x0000001d;
340}
341static inline u32 gmmu_pte_kind_s8z24_ms4_2cs_v(void)
342{
343 return 0x0000001e;
344}
345static inline u32 gmmu_pte_kind_s8z24_ms8_2cs_v(void)
346{
347 return 0x0000001f;
348}
349static inline u32 gmmu_pte_kind_s8z24_ms16_2cs_v(void)
350{
351 return 0x00000020;
352}
353static inline u32 gmmu_pte_kind_s8z24_4cszv_v(void)
354{
355 return 0x00000021;
356}
357static inline u32 gmmu_pte_kind_s8z24_ms2_4cszv_v(void)
358{
359 return 0x00000022;
360}
361static inline u32 gmmu_pte_kind_s8z24_ms4_4cszv_v(void)
362{
363 return 0x00000023;
364}
365static inline u32 gmmu_pte_kind_s8z24_ms8_4cszv_v(void)
366{
367 return 0x00000024;
368}
369static inline u32 gmmu_pte_kind_s8z24_ms16_4cszv_v(void)
370{
371 return 0x00000025;
372}
373static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_v(void)
374{
375 return 0x00000026;
376}
377static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_v(void)
378{
379 return 0x00000027;
380}
381static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_v(void)
382{
383 return 0x00000028;
384}
385static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_v(void)
386{
387 return 0x00000029;
388}
389static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_1zv_v(void)
390{
391 return 0x0000002e;
392}
393static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_1zv_v(void)
394{
395 return 0x0000002f;
396}
397static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_1zv_v(void)
398{
399 return 0x00000030;
400}
401static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_1zv_v(void)
402{
403 return 0x00000031;
404}
405static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_2cs_v(void)
406{
407 return 0x00000032;
408}
409static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_2cs_v(void)
410{
411 return 0x00000033;
412}
413static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_2cs_v(void)
414{
415 return 0x00000034;
416}
417static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_2cs_v(void)
418{
419 return 0x00000035;
420}
421static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_2czv_v(void)
422{
423 return 0x0000003a;
424}
425static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_2czv_v(void)
426{
427 return 0x0000003b;
428}
429static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_2czv_v(void)
430{
431 return 0x0000003c;
432}
433static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_2czv_v(void)
434{
435 return 0x0000003d;
436}
437static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_2zv_v(void)
438{
439 return 0x0000003e;
440}
441static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_2zv_v(void)
442{
443 return 0x0000003f;
444}
445static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_2zv_v(void)
446{
447 return 0x00000040;
448}
449static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_2zv_v(void)
450{
451 return 0x00000041;
452}
453static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_4cszv_v(void)
454{
455 return 0x00000042;
456}
457static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_4cszv_v(void)
458{
459 return 0x00000043;
460}
461static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_4cszv_v(void)
462{
463 return 0x00000044;
464}
465static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_4cszv_v(void)
466{
467 return 0x00000045;
468}
469static inline u32 gmmu_pte_kind_z24s8_v(void)
470{
471 return 0x00000046;
472}
473static inline u32 gmmu_pte_kind_z24s8_1z_v(void)
474{
475 return 0x00000047;
476}
477static inline u32 gmmu_pte_kind_z24s8_ms2_1z_v(void)
478{
479 return 0x00000048;
480}
481static inline u32 gmmu_pte_kind_z24s8_ms4_1z_v(void)
482{
483 return 0x00000049;
484}
485static inline u32 gmmu_pte_kind_z24s8_ms8_1z_v(void)
486{
487 return 0x0000004a;
488}
489static inline u32 gmmu_pte_kind_z24s8_ms16_1z_v(void)
490{
491 return 0x0000004b;
492}
493static inline u32 gmmu_pte_kind_z24s8_2cs_v(void)
494{
495 return 0x0000004c;
496}
497static inline u32 gmmu_pte_kind_z24s8_ms2_2cs_v(void)
498{
499 return 0x0000004d;
500}
501static inline u32 gmmu_pte_kind_z24s8_ms4_2cs_v(void)
502{
503 return 0x0000004e;
504}
505static inline u32 gmmu_pte_kind_z24s8_ms8_2cs_v(void)
506{
507 return 0x0000004f;
508}
509static inline u32 gmmu_pte_kind_z24s8_ms16_2cs_v(void)
510{
511 return 0x00000050;
512}
513static inline u32 gmmu_pte_kind_z24s8_2cz_v(void)
514{
515 return 0x00000051;
516}
517static inline u32 gmmu_pte_kind_z24s8_ms2_2cz_v(void)
518{
519 return 0x00000052;
520}
521static inline u32 gmmu_pte_kind_z24s8_ms4_2cz_v(void)
522{
523 return 0x00000053;
524}
525static inline u32 gmmu_pte_kind_z24s8_ms8_2cz_v(void)
526{
527 return 0x00000054;
528}
529static inline u32 gmmu_pte_kind_z24s8_ms16_2cz_v(void)
530{
531 return 0x00000055;
532}
533static inline u32 gmmu_pte_kind_z24s8_4cszv_v(void)
534{
535 return 0x00000056;
536}
537static inline u32 gmmu_pte_kind_z24s8_ms2_4cszv_v(void)
538{
539 return 0x00000057;
540}
541static inline u32 gmmu_pte_kind_z24s8_ms4_4cszv_v(void)
542{
543 return 0x00000058;
544}
545static inline u32 gmmu_pte_kind_z24s8_ms8_4cszv_v(void)
546{
547 return 0x00000059;
548}
549static inline u32 gmmu_pte_kind_z24s8_ms16_4cszv_v(void)
550{
551 return 0x0000005a;
552}
553static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_v(void)
554{
555 return 0x0000005b;
556}
557static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_v(void)
558{
559 return 0x0000005c;
560}
561static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_v(void)
562{
563 return 0x0000005d;
564}
565static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_v(void)
566{
567 return 0x0000005e;
568}
569static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_1zv_v(void)
570{
571 return 0x00000063;
572}
573static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_1zv_v(void)
574{
575 return 0x00000064;
576}
577static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_1zv_v(void)
578{
579 return 0x00000065;
580}
581static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_1zv_v(void)
582{
583 return 0x00000066;
584}
585static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_2cs_v(void)
586{
587 return 0x00000067;
588}
589static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_2cs_v(void)
590{
591 return 0x00000068;
592}
593static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_2cs_v(void)
594{
595 return 0x00000069;
596}
597static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_2cs_v(void)
598{
599 return 0x0000006a;
600}
601static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_2czv_v(void)
602{
603 return 0x0000006f;
604}
605static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_2czv_v(void)
606{
607 return 0x00000070;
608}
609static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_2czv_v(void)
610{
611 return 0x00000071;
612}
613static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_2czv_v(void)
614{
615 return 0x00000072;
616}
617static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_2zv_v(void)
618{
619 return 0x00000073;
620}
621static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_2zv_v(void)
622{
623 return 0x00000074;
624}
625static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_2zv_v(void)
626{
627 return 0x00000075;
628}
629static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_2zv_v(void)
630{
631 return 0x00000076;
632}
633static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_4cszv_v(void)
634{
635 return 0x00000077;
636}
637static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_4cszv_v(void)
638{
639 return 0x00000078;
640}
641static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_4cszv_v(void)
642{
643 return 0x00000079;
644}
645static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_4cszv_v(void)
646{
647 return 0x0000007a;
648}
649static inline u32 gmmu_pte_kind_zf32_v(void)
650{
651 return 0x0000007b;
652}
653static inline u32 gmmu_pte_kind_zf32_1z_v(void)
654{
655 return 0x0000007c;
656}
657static inline u32 gmmu_pte_kind_zf32_ms2_1z_v(void)
658{
659 return 0x0000007d;
660}
661static inline u32 gmmu_pte_kind_zf32_ms4_1z_v(void)
662{
663 return 0x0000007e;
664}
665static inline u32 gmmu_pte_kind_zf32_ms8_1z_v(void)
666{
667 return 0x0000007f;
668}
669static inline u32 gmmu_pte_kind_zf32_ms16_1z_v(void)
670{
671 return 0x00000080;
672}
673static inline u32 gmmu_pte_kind_zf32_2cs_v(void)
674{
675 return 0x00000081;
676}
677static inline u32 gmmu_pte_kind_zf32_ms2_2cs_v(void)
678{
679 return 0x00000082;
680}
681static inline u32 gmmu_pte_kind_zf32_ms4_2cs_v(void)
682{
683 return 0x00000083;
684}
685static inline u32 gmmu_pte_kind_zf32_ms8_2cs_v(void)
686{
687 return 0x00000084;
688}
689static inline u32 gmmu_pte_kind_zf32_ms16_2cs_v(void)
690{
691 return 0x00000085;
692}
693static inline u32 gmmu_pte_kind_zf32_2cz_v(void)
694{
695 return 0x00000086;
696}
697static inline u32 gmmu_pte_kind_zf32_ms2_2cz_v(void)
698{
699 return 0x00000087;
700}
701static inline u32 gmmu_pte_kind_zf32_ms4_2cz_v(void)
702{
703 return 0x00000088;
704}
705static inline u32 gmmu_pte_kind_zf32_ms8_2cz_v(void)
706{
707 return 0x00000089;
708}
709static inline u32 gmmu_pte_kind_zf32_ms16_2cz_v(void)
710{
711 return 0x0000008a;
712}
713static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v(void)
714{
715 return 0x0000008b;
716}
717static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_v(void)
718{
719 return 0x0000008c;
720}
721static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_v(void)
722{
723 return 0x0000008d;
724}
725static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v(void)
726{
727 return 0x0000008e;
728}
729static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1cs_v(void)
730{
731 return 0x0000008f;
732}
733static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_1cs_v(void)
734{
735 return 0x00000090;
736}
737static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_1cs_v(void)
738{
739 return 0x00000091;
740}
741static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v(void)
742{
743 return 0x00000092;
744}
745static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1zv_v(void)
746{
747 return 0x00000097;
748}
749static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_1zv_v(void)
750{
751 return 0x00000098;
752}
753static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_1zv_v(void)
754{
755 return 0x00000099;
756}
757static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1zv_v(void)
758{
759 return 0x0000009a;
760}
761static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1czv_v(void)
762{
763 return 0x0000009b;
764}
765static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_1czv_v(void)
766{
767 return 0x0000009c;
768}
769static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_1czv_v(void)
770{
771 return 0x0000009d;
772}
773static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1czv_v(void)
774{
775 return 0x0000009e;
776}
777static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cs_v(void)
778{
779 return 0x0000009f;
780}
781static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cs_v(void)
782{
783 return 0x000000a0;
784}
785static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cs_v(void)
786{
787 return 0x000000a1;
788}
789static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cs_v(void)
790{
791 return 0x000000a2;
792}
793static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cszv_v(void)
794{
795 return 0x000000a3;
796}
797static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cszv_v(void)
798{
799 return 0x000000a4;
800}
801static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cszv_v(void)
802{
803 return 0x000000a5;
804}
805static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v(void)
806{
807 return 0x000000a6;
808}
809static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_v(void)
810{
811 return 0x000000a7;
812}
813static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_v(void)
814{
815 return 0x000000a8;
816}
817static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_v(void)
818{
819 return 0x000000a9;
820}
821static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v(void)
822{
823 return 0x000000aa;
824}
825static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1cs_v(void)
826{
827 return 0x000000ab;
828}
829static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_1cs_v(void)
830{
831 return 0x000000ac;
832}
833static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_1cs_v(void)
834{
835 return 0x000000ad;
836}
837static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v(void)
838{
839 return 0x000000ae;
840}
841static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1zv_v(void)
842{
843 return 0x000000b3;
844}
845static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_1zv_v(void)
846{
847 return 0x000000b4;
848}
849static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_1zv_v(void)
850{
851 return 0x000000b5;
852}
853static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1zv_v(void)
854{
855 return 0x000000b6;
856}
857static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1czv_v(void)
858{
859 return 0x000000b7;
860}
861static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_1czv_v(void)
862{
863 return 0x000000b8;
864}
865static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_1czv_v(void)
866{
867 return 0x000000b9;
868}
869static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1czv_v(void)
870{
871 return 0x000000ba;
872}
873static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cs_v(void)
874{
875 return 0x000000bb;
876}
877static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cs_v(void)
878{
879 return 0x000000bc;
880}
881static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cs_v(void)
882{
883 return 0x000000bd;
884}
885static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cs_v(void)
886{
887 return 0x000000be;
888}
889static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cszv_v(void)
890{
891 return 0x000000bf;
892}
893static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cszv_v(void)
894{
895 return 0x000000c0;
896}
897static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cszv_v(void)
898{
899 return 0x000000c1;
900}
901static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v(void)
902{
903 return 0x000000c2;
904}
905static inline u32 gmmu_pte_kind_zf32_x24s8_v(void)
906{
907 return 0x000000c3;
908}
909static inline u32 gmmu_pte_kind_zf32_x24s8_1cs_v(void)
910{
911 return 0x000000c4;
912}
913static inline u32 gmmu_pte_kind_zf32_x24s8_ms2_1cs_v(void)
914{
915 return 0x000000c5;
916}
917static inline u32 gmmu_pte_kind_zf32_x24s8_ms4_1cs_v(void)
918{
919 return 0x000000c6;
920}
921static inline u32 gmmu_pte_kind_zf32_x24s8_ms8_1cs_v(void)
922{
923 return 0x000000c7;
924}
925static inline u32 gmmu_pte_kind_zf32_x24s8_ms16_1cs_v(void)
926{
927 return 0x000000c8;
928}
929static inline u32 gmmu_pte_kind_zf32_x24s8_2cszv_v(void)
930{
931 return 0x000000ce;
932}
933static inline u32 gmmu_pte_kind_zf32_x24s8_ms2_2cszv_v(void)
934{
935 return 0x000000cf;
936}
937static inline u32 gmmu_pte_kind_zf32_x24s8_ms4_2cszv_v(void)
938{
939 return 0x000000d0;
940}
941static inline u32 gmmu_pte_kind_zf32_x24s8_ms8_2cszv_v(void)
942{
943 return 0x000000d1;
944}
945static inline u32 gmmu_pte_kind_zf32_x24s8_ms16_2cszv_v(void)
946{
947 return 0x000000d2;
948}
949static inline u32 gmmu_pte_kind_zf32_x24s8_2cs_v(void)
950{
951 return 0x000000d3;
952}
953static inline u32 gmmu_pte_kind_zf32_x24s8_ms2_2cs_v(void)
954{
955 return 0x000000d4;
956}
957static inline u32 gmmu_pte_kind_zf32_x24s8_ms4_2cs_v(void)
958{
959 return 0x000000d5;
960}
961static inline u32 gmmu_pte_kind_zf32_x24s8_ms8_2cs_v(void)
962{
963 return 0x000000d6;
964}
965static inline u32 gmmu_pte_kind_zf32_x24s8_ms16_2cs_v(void)
966{
967 return 0x000000d7;
968}
969static inline u32 gmmu_pte_kind_generic_16bx2_v(void)
970{
971 return 0x000000fe;
972}
973static inline u32 gmmu_pte_kind_c32_2c_v(void)
974{
975 return 0x000000d8;
976}
977static inline u32 gmmu_pte_kind_c32_2cbr_v(void)
978{
979 return 0x000000d9;
980}
981static inline u32 gmmu_pte_kind_c32_2cba_v(void)
982{
983 return 0x000000da;
984}
985static inline u32 gmmu_pte_kind_c32_2cra_v(void)
986{
987 return 0x000000db;
988}
989static inline u32 gmmu_pte_kind_c32_2bra_v(void)
990{
991 return 0x000000dc;
992}
993static inline u32 gmmu_pte_kind_c32_ms2_2c_v(void)
994{
995 return 0x000000dd;
996}
997static inline u32 gmmu_pte_kind_c32_ms2_2cbr_v(void)
998{
999 return 0x000000de;
1000}
1001static inline u32 gmmu_pte_kind_c32_ms2_2cra_v(void)
1002{
1003 return 0x000000cc;
1004}
1005static inline u32 gmmu_pte_kind_c32_ms4_2c_v(void)
1006{
1007 return 0x000000df;
1008}
1009static inline u32 gmmu_pte_kind_c32_ms4_2cbr_v(void)
1010{
1011 return 0x000000e0;
1012}
1013static inline u32 gmmu_pte_kind_c32_ms4_2cba_v(void)
1014{
1015 return 0x000000e1;
1016}
1017static inline u32 gmmu_pte_kind_c32_ms4_2cra_v(void)
1018{
1019 return 0x000000e2;
1020}
1021static inline u32 gmmu_pte_kind_c32_ms4_2bra_v(void)
1022{
1023 return 0x000000e3;
1024}
1025static inline u32 gmmu_pte_kind_c32_ms8_ms16_2c_v(void)
1026{
1027 return 0x000000e4;
1028}
1029static inline u32 gmmu_pte_kind_c32_ms8_ms16_2cra_v(void)
1030{
1031 return 0x000000e5;
1032}
1033static inline u32 gmmu_pte_kind_c64_2c_v(void)
1034{
1035 return 0x000000e6;
1036}
1037static inline u32 gmmu_pte_kind_c64_2cbr_v(void)
1038{
1039 return 0x000000e7;
1040}
1041static inline u32 gmmu_pte_kind_c64_2cba_v(void)
1042{
1043 return 0x000000e8;
1044}
1045static inline u32 gmmu_pte_kind_c64_2cra_v(void)
1046{
1047 return 0x000000e9;
1048}
1049static inline u32 gmmu_pte_kind_c64_2bra_v(void)
1050{
1051 return 0x000000ea;
1052}
1053static inline u32 gmmu_pte_kind_c64_ms2_2c_v(void)
1054{
1055 return 0x000000eb;
1056}
1057static inline u32 gmmu_pte_kind_c64_ms2_2cbr_v(void)
1058{
1059 return 0x000000ec;
1060}
1061static inline u32 gmmu_pte_kind_c64_ms2_2cra_v(void)
1062{
1063 return 0x000000cd;
1064}
1065static inline u32 gmmu_pte_kind_c64_ms4_2c_v(void)
1066{
1067 return 0x000000ed;
1068}
1069static inline u32 gmmu_pte_kind_c64_ms4_2cbr_v(void)
1070{
1071 return 0x000000ee;
1072}
1073static inline u32 gmmu_pte_kind_c64_ms4_2cba_v(void)
1074{
1075 return 0x000000ef;
1076}
1077static inline u32 gmmu_pte_kind_c64_ms4_2cra_v(void)
1078{
1079 return 0x000000f0;
1080}
1081static inline u32 gmmu_pte_kind_c64_ms4_2bra_v(void)
1082{
1083 return 0x000000f1;
1084}
1085static inline u32 gmmu_pte_kind_c64_ms8_ms16_2c_v(void)
1086{
1087 return 0x000000f2;
1088}
1089static inline u32 gmmu_pte_kind_c64_ms8_ms16_2cra_v(void)
1090{
1091 return 0x000000f3;
1092}
1093static inline u32 gmmu_pte_kind_c128_2c_v(void)
1094{
1095 return 0x000000f4;
1096}
1097static inline u32 gmmu_pte_kind_c128_2cr_v(void)
1098{
1099 return 0x000000f5;
1100}
1101static inline u32 gmmu_pte_kind_c128_ms2_2c_v(void)
1102{
1103 return 0x000000f6;
1104}
1105static inline u32 gmmu_pte_kind_c128_ms2_2cr_v(void)
1106{
1107 return 0x000000f7;
1108}
1109static inline u32 gmmu_pte_kind_c128_ms4_2c_v(void)
1110{
1111 return 0x000000f8;
1112}
1113static inline u32 gmmu_pte_kind_c128_ms4_2cr_v(void)
1114{
1115 return 0x000000f9;
1116}
1117static inline u32 gmmu_pte_kind_c128_ms8_ms16_2c_v(void)
1118{
1119 return 0x000000fa;
1120}
1121static inline u32 gmmu_pte_kind_c128_ms8_ms16_2cr_v(void)
1122{
1123 return 0x000000fb;
1124}
1125static inline u32 gmmu_pte_kind_x8c24_v(void)
1126{
1127 return 0x000000fc;
1128}
1129static inline u32 gmmu_pte_kind_pitch_no_swizzle_v(void)
1130{
1131 return 0x000000fd;
1132}
1133static inline u32 gmmu_pte_kind_smsked_message_v(void)
1134{
1135 return 0x000000ca;
1136}
1137static inline u32 gmmu_pte_kind_smhost_message_v(void)
1138{
1139 return 0x000000cb;
1140}
1141#endif
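The _w() accessors give the 32-bit word index inside the 8-byte PDE/PTE (see gmmu_pte__size_v() above), and the _f() values are OR'd together within each word. A minimal sketch of packing one small-page PTE for a video-memory, pitch-kind, read-only, volatile page; the pte_fill() name, the 4 KB alignment assumption and the includes are illustrative and not taken from the patch:

#include <linux/types.h>
#include "hw_gmmu_gk20a.h"

/* Pack the two 32-bit words of one small-page PTE. phys_addr is
 * assumed 4 KB aligned; gmmu_pte_address_shift_v() == 12 strips the
 * page offset before gmmu_pte_address_sys_f() places the field. */
static void pte_fill(u32 pte_w[2], u64 phys_addr)
{
	pte_w[gmmu_pte_valid_w()] =
		gmmu_pte_valid_true_f() |
		gmmu_pte_address_sys_f((u32)(phys_addr >>
					     gmmu_pte_address_shift_v())) |
		gmmu_pte_read_only_true_f();

	pte_w[gmmu_pte_aperture_w()] =
		gmmu_pte_aperture_video_memory_f() |
		gmmu_pte_vol_true_f() |
		gmmu_pte_kind_f(gmmu_pte_kind_pitch_v());
}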
diff --git a/drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h
new file mode 100644
index 00000000..ece7602d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h
@@ -0,0 +1,3173 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_gr_gk20a_h_
51#define _hw_gr_gk20a_h_
52
53static inline u32 gr_intr_r(void)
54{
55 return 0x00400100;
56}
57static inline u32 gr_intr_notify_pending_f(void)
58{
59 return 0x1;
60}
61static inline u32 gr_intr_notify_reset_f(void)
62{
63 return 0x1;
64}
65static inline u32 gr_intr_semaphore_pending_f(void)
66{
67 return 0x2;
68}
69static inline u32 gr_intr_semaphore_reset_f(void)
70{
71 return 0x2;
72}
73static inline u32 gr_intr_semaphore_timeout_not_pending_f(void)
74{
75 return 0x0;
76}
77static inline u32 gr_intr_semaphore_timeout_pending_f(void)
78{
79 return 0x4;
80}
81static inline u32 gr_intr_semaphore_timeout_reset_f(void)
82{
83 return 0x4;
84}
85static inline u32 gr_intr_illegal_method_pending_f(void)
86{
87 return 0x10;
88}
89static inline u32 gr_intr_illegal_method_reset_f(void)
90{
91 return 0x10;
92}
93static inline u32 gr_intr_illegal_notify_pending_f(void)
94{
95 return 0x40;
96}
97static inline u32 gr_intr_illegal_notify_reset_f(void)
98{
99 return 0x40;
100}
101static inline u32 gr_intr_illegal_class_pending_f(void)
102{
103 return 0x20;
104}
105static inline u32 gr_intr_illegal_class_reset_f(void)
106{
107 return 0x20;
108}
109static inline u32 gr_intr_class_error_pending_f(void)
110{
111 return 0x100000;
112}
113static inline u32 gr_intr_class_error_reset_f(void)
114{
115 return 0x100000;
116}
117static inline u32 gr_intr_exception_pending_f(void)
118{
119 return 0x200000;
120}
121static inline u32 gr_intr_exception_reset_f(void)
122{
123 return 0x200000;
124}
125static inline u32 gr_intr_firmware_method_pending_f(void)
126{
127 return 0x100;
128}
129static inline u32 gr_intr_firmware_method_reset_f(void)
130{
131 return 0x100;
132}
133static inline u32 gr_intr_nonstall_r(void)
134{
135 return 0x00400120;
136}
137static inline u32 gr_intr_nonstall_trap_pending_f(void)
138{
139 return 0x2;
140}
141static inline u32 gr_intr_en_r(void)
142{
143 return 0x0040013c;
144}
145static inline u32 gr_exception_r(void)
146{
147 return 0x00400108;
148}
149static inline u32 gr_exception_fe_m(void)
150{
151 return 0x1 << 0;
152}
153static inline u32 gr_exception_gpc_m(void)
154{
155 return 0x1 << 24;
156}
157static inline u32 gr_exception1_r(void)
158{
159 return 0x00400118;
160}
161static inline u32 gr_exception1_gpc_0_pending_f(void)
162{
163 return 0x1;
164}
165static inline u32 gr_exception2_r(void)
166{
167 return 0x0040011c;
168}
169static inline u32 gr_exception_en_r(void)
170{
171 return 0x00400138;
172}
173static inline u32 gr_exception_en_fe_m(void)
174{
175 return 0x1 << 0;
176}
177static inline u32 gr_exception1_en_r(void)
178{
179 return 0x00400130;
180}
181static inline u32 gr_exception2_en_r(void)
182{
183 return 0x00400134;
184}
185static inline u32 gr_gpfifo_ctl_r(void)
186{
187 return 0x00400500;
188}
189static inline u32 gr_gpfifo_ctl_access_f(u32 v)
190{
191 return (v & 0x1) << 0;
192}
193static inline u32 gr_gpfifo_ctl_access_disabled_f(void)
194{
195 return 0x0;
196}
197static inline u32 gr_gpfifo_ctl_access_enabled_f(void)
198{
199 return 0x1;
200}
201static inline u32 gr_gpfifo_ctl_semaphore_access_f(u32 v)
202{
203 return (v & 0x1) << 16;
204}
205static inline u32 gr_gpfifo_ctl_semaphore_access_enabled_v(void)
206{
207 return 0x00000001;
208}
209static inline u32 gr_gpfifo_ctl_semaphore_access_enabled_f(void)
210{
211 return 0x10000;
212}
213static inline u32 gr_trapped_addr_r(void)
214{
215 return 0x00400704;
216}
217static inline u32 gr_trapped_addr_mthd_v(u32 r)
218{
219 return (r >> 2) & 0xfff;
220}
221static inline u32 gr_trapped_addr_subch_v(u32 r)
222{
223 return (r >> 16) & 0x7;
224}
225static inline u32 gr_trapped_data_lo_r(void)
226{
227 return 0x00400708;
228}
229static inline u32 gr_trapped_data_hi_r(void)
230{
231 return 0x0040070c;
232}
233static inline u32 gr_status_r(void)
234{
235 return 0x00400700;
236}
237static inline u32 gr_status_fe_method_lower_v(u32 r)
238{
239 return (r >> 2) & 0x1;
240}
241static inline u32 gr_status_fe_method_lower_idle_v(void)
242{
243 return 0x00000000;
244}
245static inline u32 gr_status_mask_r(void)
246{
247 return 0x00400610;
248}
249static inline u32 gr_engine_status_r(void)
250{
251 return 0x0040060c;
252}
253static inline u32 gr_engine_status_value_busy_f(void)
254{
255 return 0x1;
256}
257static inline u32 gr_pipe_bundle_address_r(void)
258{
259 return 0x00400200;
260}
261static inline u32 gr_pipe_bundle_address_value_v(u32 r)
262{
263 return (r >> 0) & 0xffff;
264}
265static inline u32 gr_pipe_bundle_data_r(void)
266{
267 return 0x00400204;
268}
269static inline u32 gr_pipe_bundle_config_r(void)
270{
271 return 0x00400208;
272}
273static inline u32 gr_pipe_bundle_config_override_pipe_mode_disabled_f(void)
274{
275 return 0x0;
276}
277static inline u32 gr_pipe_bundle_config_override_pipe_mode_enabled_f(void)
278{
279 return 0x80000000;
280}
281static inline u32 gr_fe_hww_esr_r(void)
282{
283 return 0x00404000;
284}
285static inline u32 gr_fe_hww_esr_reset_active_f(void)
286{
287 return 0x40000000;
288}
289static inline u32 gr_fe_hww_esr_en_enable_f(void)
290{
291 return 0x80000000;
292}
293static inline u32 gr_fe_go_idle_timeout_r(void)
294{
295 return 0x00404154;
296}
297static inline u32 gr_fe_go_idle_timeout_count_f(u32 v)
298{
299 return (v & 0xffffffff) << 0;
300}
301static inline u32 gr_fe_go_idle_timeout_count_disabled_f(void)
302{
303 return 0x0;
304}
305static inline u32 gr_fe_object_table_r(u32 i)
306{
307 return 0x00404200 + i*4;
308}
309static inline u32 gr_fe_object_table_nvclass_v(u32 r)
310{
311 return (r >> 0) & 0xffff;
312}
313static inline u32 gr_pri_mme_shadow_raw_index_r(void)
314{
315 return 0x00404488;
316}
317static inline u32 gr_pri_mme_shadow_raw_index_write_trigger_f(void)
318{
319 return 0x80000000;
320}
321static inline u32 gr_pri_mme_shadow_raw_data_r(void)
322{
323 return 0x0040448c;
324}
325static inline u32 gr_mme_hww_esr_r(void)
326{
327 return 0x00404490;
328}
329static inline u32 gr_mme_hww_esr_reset_active_f(void)
330{
331 return 0x40000000;
332}
333static inline u32 gr_mme_hww_esr_en_enable_f(void)
334{
335 return 0x80000000;
336}
337static inline u32 gr_memfmt_hww_esr_r(void)
338{
339 return 0x00404600;
340}
341static inline u32 gr_memfmt_hww_esr_reset_active_f(void)
342{
343 return 0x40000000;
344}
345static inline u32 gr_memfmt_hww_esr_en_enable_f(void)
346{
347 return 0x80000000;
348}
349static inline u32 gr_fecs_cpuctl_r(void)
350{
351 return 0x00409100;
352}
353static inline u32 gr_fecs_cpuctl_startcpu_f(u32 v)
354{
355 return (v & 0x1) << 1;
356}
357static inline u32 gr_fecs_dmactl_r(void)
358{
359 return 0x0040910c;
360}
361static inline u32 gr_fecs_dmactl_require_ctx_f(u32 v)
362{
363 return (v & 0x1) << 0;
364}
365static inline u32 gr_fecs_dmactl_dmem_scrubbing_m(void)
366{
367 return 0x1 << 1;
368}
369static inline u32 gr_fecs_dmactl_imem_scrubbing_m(void)
370{
371 return 0x1 << 2;
372}
373static inline u32 gr_fecs_os_r(void)
374{
375 return 0x00409080;
376}
377static inline u32 gr_fecs_idlestate_r(void)
378{
379 return 0x0040904c;
380}
381static inline u32 gr_fecs_mailbox0_r(void)
382{
383 return 0x00409040;
384}
385static inline u32 gr_fecs_mailbox1_r(void)
386{
387 return 0x00409044;
388}
389static inline u32 gr_fecs_irqstat_r(void)
390{
391 return 0x00409008;
392}
393static inline u32 gr_fecs_irqmode_r(void)
394{
395 return 0x0040900c;
396}
397static inline u32 gr_fecs_irqmask_r(void)
398{
399 return 0x00409018;
400}
401static inline u32 gr_fecs_irqdest_r(void)
402{
403 return 0x0040901c;
404}
405static inline u32 gr_fecs_curctx_r(void)
406{
407 return 0x00409050;
408}
409static inline u32 gr_fecs_nxtctx_r(void)
410{
411 return 0x00409054;
412}
413static inline u32 gr_fecs_engctl_r(void)
414{
415 return 0x004090a4;
416}
417static inline u32 gr_fecs_debug1_r(void)
418{
419 return 0x00409090;
420}
421static inline u32 gr_fecs_debuginfo_r(void)
422{
423 return 0x00409094;
424}
425static inline u32 gr_fecs_icd_cmd_r(void)
426{
427 return 0x00409200;
428}
429static inline u32 gr_fecs_icd_cmd_opc_s(void)
430{
431 return 4;
432}
433static inline u32 gr_fecs_icd_cmd_opc_f(u32 v)
434{
435 return (v & 0xf) << 0;
436}
437static inline u32 gr_fecs_icd_cmd_opc_m(void)
438{
439 return 0xf << 0;
440}
441static inline u32 gr_fecs_icd_cmd_opc_v(u32 r)
442{
443 return (r >> 0) & 0xf;
444}
445static inline u32 gr_fecs_icd_cmd_opc_rreg_f(void)
446{
447 return 0x8;
448}
449static inline u32 gr_fecs_icd_cmd_opc_rstat_f(void)
450{
451 return 0xe;
452}
453static inline u32 gr_fecs_icd_cmd_idx_f(u32 v)
454{
455 return (v & 0x1f) << 8;
456}
457static inline u32 gr_fecs_icd_rdata_r(void)
458{
459 return 0x0040920c;
460}
461static inline u32 gr_fecs_imemc_r(u32 i)
462{
463 return 0x00409180 + i*16;
464}
465static inline u32 gr_fecs_imemc_offs_f(u32 v)
466{
467 return (v & 0x3f) << 2;
468}
469static inline u32 gr_fecs_imemc_blk_f(u32 v)
470{
471 return (v & 0xff) << 8;
472}
473static inline u32 gr_fecs_imemc_aincw_f(u32 v)
474{
475 return (v & 0x1) << 24;
476}
477static inline u32 gr_fecs_imemd_r(u32 i)
478{
479 return 0x00409184 + i*16;
480}
481static inline u32 gr_fecs_imemt_r(u32 i)
482{
483 return 0x00409188 + i*16;
484}
485static inline u32 gr_fecs_imemt_tag_f(u32 v)
486{
487 return (v & 0xffff) << 0;
488}
489static inline u32 gr_fecs_dmemc_r(u32 i)
490{
491 return 0x004091c0 + i*8;
492}
493static inline u32 gr_fecs_dmemc_offs_s(void)
494{
495 return 6;
496}
497static inline u32 gr_fecs_dmemc_offs_f(u32 v)
498{
499 return (v & 0x3f) << 2;
500}
501static inline u32 gr_fecs_dmemc_offs_m(void)
502{
503 return 0x3f << 2;
504}
505static inline u32 gr_fecs_dmemc_offs_v(u32 r)
506{
507 return (r >> 2) & 0x3f;
508}
509static inline u32 gr_fecs_dmemc_blk_f(u32 v)
510{
511 return (v & 0xff) << 8;
512}
513static inline u32 gr_fecs_dmemc_aincw_f(u32 v)
514{
515 return (v & 0x1) << 24;
516}
517static inline u32 gr_fecs_dmemd_r(u32 i)
518{
519 return 0x004091c4 + i*8;
520}
521static inline u32 gr_fecs_dmatrfbase_r(void)
522{
523 return 0x00409110;
524}
525static inline u32 gr_fecs_dmatrfmoffs_r(void)
526{
527 return 0x00409114;
528}
529static inline u32 gr_fecs_dmatrffboffs_r(void)
530{
531 return 0x0040911c;
532}
533static inline u32 gr_fecs_dmatrfcmd_r(void)
534{
535 return 0x00409118;
536}
537static inline u32 gr_fecs_dmatrfcmd_imem_f(u32 v)
538{
539 return (v & 0x1) << 4;
540}
541static inline u32 gr_fecs_dmatrfcmd_write_f(u32 v)
542{
543 return (v & 0x1) << 5;
544}
545static inline u32 gr_fecs_dmatrfcmd_size_f(u32 v)
546{
547 return (v & 0x7) << 8;
548}
549static inline u32 gr_fecs_dmatrfcmd_ctxdma_f(u32 v)
550{
551 return (v & 0x7) << 12;
552}
553static inline u32 gr_fecs_bootvec_r(void)
554{
555 return 0x00409104;
556}
557static inline u32 gr_fecs_bootvec_vec_f(u32 v)
558{
559 return (v & 0xffffffff) << 0;
560}
561static inline u32 gr_fecs_falcon_hwcfg_r(void)
562{
563 return 0x00409108;
564}
565static inline u32 gr_gpcs_gpccs_falcon_hwcfg_r(void)
566{
567 return 0x0041a108;
568}
569static inline u32 gr_fecs_falcon_rm_r(void)
570{
571 return 0x00409084;
572}
573static inline u32 gr_fecs_current_ctx_r(void)
574{
575 return 0x00409b00;
576}
577static inline u32 gr_fecs_current_ctx_ptr_f(u32 v)
578{
579 return (v & 0xfffffff) << 0;
580}
581static inline u32 gr_fecs_current_ctx_ptr_v(u32 r)
582{
583 return (r >> 0) & 0xfffffff;
584}
585static inline u32 gr_fecs_current_ctx_target_s(void)
586{
587 return 2;
588}
589static inline u32 gr_fecs_current_ctx_target_f(u32 v)
590{
591 return (v & 0x3) << 28;
592}
593static inline u32 gr_fecs_current_ctx_target_m(void)
594{
595 return 0x3 << 28;
596}
597static inline u32 gr_fecs_current_ctx_target_v(u32 r)
598{
599 return (r >> 28) & 0x3;
600}
601static inline u32 gr_fecs_current_ctx_target_vid_mem_f(void)
602{
603 return 0x0;
604}
605static inline u32 gr_fecs_current_ctx_valid_s(void)
606{
607 return 1;
608}
609static inline u32 gr_fecs_current_ctx_valid_f(u32 v)
610{
611 return (v & 0x1) << 31;
612}
613static inline u32 gr_fecs_current_ctx_valid_m(void)
614{
615 return 0x1 << 31;
616}
617static inline u32 gr_fecs_current_ctx_valid_v(u32 r)
618{
619 return (r >> 31) & 0x1;
620}
621static inline u32 gr_fecs_current_ctx_valid_false_f(void)
622{
623 return 0x0;
624}
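The gr_fecs_current_ctx accessors above combine into a single packed word: a 28-bit pointer, a 2-bit aperture target and a valid bit. A minimal sketch of building that word for a context image in video memory; the fecs_current_ctx_data() name, the 12-bit (4 KB) pointer shift and the includes are assumptions for illustration only:

#include <linux/types.h>
#include "hw_gr_gk20a.h"

/* Pack a 4 KB aligned context-image address into the layout used by
 * gr_fecs_current_ctx_r(): pointer in bits 27:0, target in bits
 * 29:28, valid in bit 31. */
static u32 fecs_current_ctx_data(u64 ctx_addr)
{
	return gr_fecs_current_ctx_ptr_f((u32)(ctx_addr >> 12)) |
	       gr_fecs_current_ctx_target_vid_mem_f() |
	       gr_fecs_current_ctx_valid_f(1);
}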
625static inline u32 gr_fecs_method_data_r(void)
626{
627 return 0x00409500;
628}
629static inline u32 gr_fecs_method_push_r(void)
630{
631 return 0x00409504;
632}
633static inline u32 gr_fecs_method_push_adr_f(u32 v)
634{
635 return (v & 0xfff) << 0;
636}
637static inline u32 gr_fecs_method_push_adr_bind_pointer_v(void)
638{
639 return 0x00000003;
640}
641static inline u32 gr_fecs_method_push_adr_bind_pointer_f(void)
642{
643 return 0x3;
644}
645static inline u32 gr_fecs_method_push_adr_discover_image_size_v(void)
646{
647 return 0x00000010;
648}
649static inline u32 gr_fecs_method_push_adr_wfi_golden_save_v(void)
650{
651 return 0x00000009;
652}
653static inline u32 gr_fecs_method_push_adr_restore_golden_v(void)
654{
655 return 0x00000015;
656}
657static inline u32 gr_fecs_method_push_adr_discover_zcull_image_size_v(void)
658{
659 return 0x00000016;
660}
661static inline u32 gr_fecs_method_push_adr_discover_pm_image_size_v(void)
662{
663 return 0x00000025;
664}
665static inline u32 gr_fecs_method_push_adr_discover_reglist_image_size_v(void)
666{
667 return 0x00000030;
668}
669static inline u32 gr_fecs_method_push_adr_set_reglist_bind_instance_v(void)
670{
671 return 0x00000031;
672}
673static inline u32 gr_fecs_method_push_adr_set_reglist_virtual_address_v(void)
674{
675 return 0x00000032;
676}
677static inline u32 gr_fecs_method_push_adr_stop_ctxsw_v(void)
678{
679 return 0x00000038;
680}
681static inline u32 gr_fecs_method_push_adr_start_ctxsw_v(void)
682{
683 return 0x00000039;
684}
685static inline u32 gr_fecs_method_push_adr_set_watchdog_timeout_f(void)
686{
687 return 0x21;
688}
689static inline u32 gr_fecs_host_int_enable_r(void)
690{
691 return 0x00409c24;
692}
693static inline u32 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f(void)
694{
695 return 0x10000;
696}
697static inline u32 gr_fecs_host_int_enable_umimp_firmware_method_enable_f(void)
698{
699 return 0x20000;
700}
701static inline u32 gr_fecs_host_int_enable_umimp_illegal_method_enable_f(void)
702{
703 return 0x40000;
704}
705static inline u32 gr_fecs_host_int_enable_watchdog_enable_f(void)
706{
707 return 0x80000;
708}
709static inline u32 gr_fecs_ctxsw_reset_ctl_r(void)
710{
711 return 0x00409614;
712}
713static inline u32 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f(void)
714{
715 return 0x0;
716}
717static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f(void)
718{
719 return 0x0;
720}
721static inline u32 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f(void)
722{
723 return 0x0;
724}
725static inline u32 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f(void)
726{
727 return 0x10;
728}
729static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f(void)
730{
731 return 0x20;
732}
733static inline u32 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f(void)
734{
735 return 0x40;
736}
737static inline u32 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f(void)
738{
739 return 0x0;
740}
741static inline u32 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f(void)
742{
743 return 0x100;
744}
745static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f(void)
746{
747 return 0x0;
748}
749static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f(void)
750{
751 return 0x200;
752}
753static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_s(void)
754{
755 return 1;
756}
757static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_f(u32 v)
758{
759 return (v & 0x1) << 10;
760}
761static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_m(void)
762{
763 return 0x1 << 10;
764}
765static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_v(u32 r)
766{
767 return (r >> 10) & 0x1;
768}
769static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f(void)
770{
771 return 0x0;
772}
773static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f(void)
774{
775 return 0x400;
776}
777static inline u32 gr_fecs_ctx_state_store_major_rev_id_r(void)
778{
779 return 0x0040960c;
780}
781static inline u32 gr_fecs_ctxsw_mailbox_r(u32 i)
782{
783 return 0x00409800 + i*4;
784}
785static inline u32 gr_fecs_ctxsw_mailbox__size_1_v(void)
786{
787 return 0x00000008;
788}
789static inline u32 gr_fecs_ctxsw_mailbox_value_f(u32 v)
790{
791 return (v & 0xffffffff) << 0;
792}
793static inline u32 gr_fecs_ctxsw_mailbox_value_pass_v(void)
794{
795 return 0x00000001;
796}
797static inline u32 gr_fecs_ctxsw_mailbox_value_fail_v(void)
798{
799 return 0x00000002;
800}
801static inline u32 gr_fecs_ctxsw_mailbox_set_r(u32 i)
802{
803 return 0x00409820 + i*4;
804}
805static inline u32 gr_fecs_ctxsw_mailbox_set_value_f(u32 v)
806{
807 return (v & 0xffffffff) << 0;
808}
809static inline u32 gr_fecs_ctxsw_mailbox_clear_r(u32 i)
810{
811 return 0x00409840 + i*4;
812}
813static inline u32 gr_fecs_ctxsw_mailbox_clear_value_f(u32 v)
814{
815 return (v & 0xffffffff) << 0;
816}
817static inline u32 gr_fecs_fs_r(void)
818{
819 return 0x00409604;
820}
821static inline u32 gr_fecs_fs_num_available_gpcs_s(void)
822{
823 return 5;
824}
825static inline u32 gr_fecs_fs_num_available_gpcs_f(u32 v)
826{
827 return (v & 0x1f) << 0;
828}
829static inline u32 gr_fecs_fs_num_available_gpcs_m(void)
830{
831 return 0x1f << 0;
832}
833static inline u32 gr_fecs_fs_num_available_gpcs_v(u32 r)
834{
835 return (r >> 0) & 0x1f;
836}
837static inline u32 gr_fecs_fs_num_available_fbps_s(void)
838{
839 return 5;
840}
841static inline u32 gr_fecs_fs_num_available_fbps_f(u32 v)
842{
843 return (v & 0x1f) << 16;
844}
845static inline u32 gr_fecs_fs_num_available_fbps_m(void)
846{
847 return 0x1f << 16;
848}
849static inline u32 gr_fecs_fs_num_available_fbps_v(u32 r)
850{
851 return (r >> 16) & 0x1f;
852}
853static inline u32 gr_fecs_cfg_r(void)
854{
855 return 0x00409620;
856}
857static inline u32 gr_fecs_cfg_imem_sz_v(u32 r)
858{
859 return (r >> 0) & 0xff;
860}
861static inline u32 gr_fecs_rc_lanes_r(void)
862{
863 return 0x00409880;
864}
865static inline u32 gr_fecs_rc_lanes_num_chains_s(void)
866{
867 return 6;
868}
869static inline u32 gr_fecs_rc_lanes_num_chains_f(u32 v)
870{
871 return (v & 0x3f) << 0;
872}
873static inline u32 gr_fecs_rc_lanes_num_chains_m(void)
874{
875 return 0x3f << 0;
876}
877static inline u32 gr_fecs_rc_lanes_num_chains_v(u32 r)
878{
879 return (r >> 0) & 0x3f;
880}
881static inline u32 gr_fecs_ctxsw_status_1_r(void)
882{
883 return 0x00409400;
884}
885static inline u32 gr_fecs_ctxsw_status_1_arb_busy_s(void)
886{
887 return 1;
888}
889static inline u32 gr_fecs_ctxsw_status_1_arb_busy_f(u32 v)
890{
891 return (v & 0x1) << 12;
892}
893static inline u32 gr_fecs_ctxsw_status_1_arb_busy_m(void)
894{
895 return 0x1 << 12;
896}
897static inline u32 gr_fecs_ctxsw_status_1_arb_busy_v(u32 r)
898{
899 return (r >> 12) & 0x1;
900}
901static inline u32 gr_fecs_arb_ctx_adr_r(void)
902{
903 return 0x00409a24;
904}
905static inline u32 gr_fecs_new_ctx_r(void)
906{
907 return 0x00409b04;
908}
909static inline u32 gr_fecs_new_ctx_ptr_s(void)
910{
911 return 28;
912}
913static inline u32 gr_fecs_new_ctx_ptr_f(u32 v)
914{
915 return (v & 0xfffffff) << 0;
916}
917static inline u32 gr_fecs_new_ctx_ptr_m(void)
918{
919 return 0xfffffff << 0;
920}
921static inline u32 gr_fecs_new_ctx_ptr_v(u32 r)
922{
923 return (r >> 0) & 0xfffffff;
924}
925static inline u32 gr_fecs_new_ctx_target_s(void)
926{
927 return 2;
928}
929static inline u32 gr_fecs_new_ctx_target_f(u32 v)
930{
931 return (v & 0x3) << 28;
932}
933static inline u32 gr_fecs_new_ctx_target_m(void)
934{
935 return 0x3 << 28;
936}
937static inline u32 gr_fecs_new_ctx_target_v(u32 r)
938{
939 return (r >> 28) & 0x3;
940}
941static inline u32 gr_fecs_new_ctx_valid_s(void)
942{
943 return 1;
944}
945static inline u32 gr_fecs_new_ctx_valid_f(u32 v)
946{
947 return (v & 0x1) << 31;
948}
949static inline u32 gr_fecs_new_ctx_valid_m(void)
950{
951 return 0x1 << 31;
952}
953static inline u32 gr_fecs_new_ctx_valid_v(u32 r)
954{
955 return (r >> 31) & 0x1;
956}
957static inline u32 gr_fecs_arb_ctx_ptr_r(void)
958{
959 return 0x00409a0c;
960}
961static inline u32 gr_fecs_arb_ctx_ptr_ptr_s(void)
962{
963 return 28;
964}
965static inline u32 gr_fecs_arb_ctx_ptr_ptr_f(u32 v)
966{
967 return (v & 0xfffffff) << 0;
968}
969static inline u32 gr_fecs_arb_ctx_ptr_ptr_m(void)
970{
971 return 0xfffffff << 0;
972}
973static inline u32 gr_fecs_arb_ctx_ptr_ptr_v(u32 r)
974{
975 return (r >> 0) & 0xfffffff;
976}
977static inline u32 gr_fecs_arb_ctx_ptr_target_s(void)
978{
979 return 2;
980}
981static inline u32 gr_fecs_arb_ctx_ptr_target_f(u32 v)
982{
983 return (v & 0x3) << 28;
984}
985static inline u32 gr_fecs_arb_ctx_ptr_target_m(void)
986{
987 return 0x3 << 28;
988}
989static inline u32 gr_fecs_arb_ctx_ptr_target_v(u32 r)
990{
991 return (r >> 28) & 0x3;
992}
993static inline u32 gr_fecs_arb_ctx_cmd_r(void)
994{
995 return 0x00409a10;
996}
997static inline u32 gr_fecs_arb_ctx_cmd_cmd_s(void)
998{
999 return 5;
1000}
1001static inline u32 gr_fecs_arb_ctx_cmd_cmd_f(u32 v)
1002{
1003 return (v & 0x1f) << 0;
1004}
1005static inline u32 gr_fecs_arb_ctx_cmd_cmd_m(void)
1006{
1007 return 0x1f << 0;
1008}
1009static inline u32 gr_fecs_arb_ctx_cmd_cmd_v(u32 r)
1010{
1011 return (r >> 0) & 0x1f;
1012}
1013static inline u32 gr_rstr2d_gpc_map0_r(void)
1014{
1015 return 0x0040780c;
1016}
1017static inline u32 gr_rstr2d_gpc_map1_r(void)
1018{
1019 return 0x00407810;
1020}
1021static inline u32 gr_rstr2d_gpc_map2_r(void)
1022{
1023 return 0x00407814;
1024}
1025static inline u32 gr_rstr2d_gpc_map3_r(void)
1026{
1027 return 0x00407818;
1028}
1029static inline u32 gr_rstr2d_gpc_map4_r(void)
1030{
1031 return 0x0040781c;
1032}
1033static inline u32 gr_rstr2d_gpc_map5_r(void)
1034{
1035 return 0x00407820;
1036}
1037static inline u32 gr_rstr2d_map_table_cfg_r(void)
1038{
1039 return 0x004078bc;
1040}
1041static inline u32 gr_rstr2d_map_table_cfg_row_offset_f(u32 v)
1042{
1043 return (v & 0xff) << 0;
1044}
1045static inline u32 gr_rstr2d_map_table_cfg_num_entries_f(u32 v)
1046{
1047 return (v & 0xff) << 8;
1048}
1049static inline u32 gr_pd_hww_esr_r(void)
1050{
1051 return 0x00406018;
1052}
1053static inline u32 gr_pd_hww_esr_reset_active_f(void)
1054{
1055 return 0x40000000;
1056}
1057static inline u32 gr_pd_hww_esr_en_enable_f(void)
1058{
1059 return 0x80000000;
1060}
1061static inline u32 gr_pd_num_tpc_per_gpc_r(u32 i)
1062{
1063 return 0x00406028 + i*4;
1064}
1065static inline u32 gr_pd_num_tpc_per_gpc__size_1_v(void)
1066{
1067 return 0x00000004;
1068}
1069static inline u32 gr_pd_num_tpc_per_gpc_count0_f(u32 v)
1070{
1071 return (v & 0xf) << 0;
1072}
1073static inline u32 gr_pd_num_tpc_per_gpc_count1_f(u32 v)
1074{
1075 return (v & 0xf) << 4;
1076}
1077static inline u32 gr_pd_num_tpc_per_gpc_count2_f(u32 v)
1078{
1079 return (v & 0xf) << 8;
1080}
1081static inline u32 gr_pd_num_tpc_per_gpc_count3_f(u32 v)
1082{
1083 return (v & 0xf) << 12;
1084}
1085static inline u32 gr_pd_num_tpc_per_gpc_count4_f(u32 v)
1086{
1087 return (v & 0xf) << 16;
1088}
1089static inline u32 gr_pd_num_tpc_per_gpc_count5_f(u32 v)
1090{
1091 return (v & 0xf) << 20;
1092}
1093static inline u32 gr_pd_num_tpc_per_gpc_count6_f(u32 v)
1094{
1095 return (v & 0xf) << 24;
1096}
1097static inline u32 gr_pd_num_tpc_per_gpc_count7_f(u32 v)
1098{
1099 return (v & 0xf) << 28;
1100}
1101static inline u32 gr_pd_ab_dist_cfg0_r(void)
1102{
1103 return 0x004064c0;
1104}
1105static inline u32 gr_pd_ab_dist_cfg0_timeslice_enable_en_f(void)
1106{
1107 return 0x80000000;
1108}
1109static inline u32 gr_pd_ab_dist_cfg0_timeslice_enable_dis_f(void)
1110{
1111 return 0x0;
1112}
1113static inline u32 gr_pd_ab_dist_cfg1_r(void)
1114{
1115 return 0x004064c4;
1116}
1117static inline u32 gr_pd_ab_dist_cfg1_max_batches_init_f(void)
1118{
1119 return 0xffff;
1120}
1121static inline u32 gr_pd_ab_dist_cfg1_max_output_f(u32 v)
1122{
1123 return (v & 0x7ff) << 16;
1124}
1125static inline u32 gr_pd_ab_dist_cfg1_max_output_granularity_v(void)
1126{
1127 return 0x00000080;
1128}
1129static inline u32 gr_pd_ab_dist_cfg2_r(void)
1130{
1131 return 0x004064c8;
1132}
1133static inline u32 gr_pd_ab_dist_cfg2_token_limit_f(u32 v)
1134{
1135 return (v & 0xfff) << 0;
1136}
1137static inline u32 gr_pd_ab_dist_cfg2_token_limit_init_v(void)
1138{
1139 return 0x00000100;
1140}
1141static inline u32 gr_pd_ab_dist_cfg2_state_limit_f(u32 v)
1142{
1143 return (v & 0xfff) << 16;
1144}
1145static inline u32 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v(void)
1146{
1147 return 0x00000020;
1148}
1149static inline u32 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v(void)
1150{
1151 return 0x00000062;
1152}
1153static inline u32 gr_pd_pagepool_r(void)
1154{
1155 return 0x004064cc;
1156}
1157static inline u32 gr_pd_pagepool_total_pages_f(u32 v)
1158{
1159 return (v & 0xff) << 0;
1160}
1161static inline u32 gr_pd_pagepool_valid_true_f(void)
1162{
1163 return 0x80000000;
1164}
1165static inline u32 gr_pd_dist_skip_table_r(u32 i)
1166{
1167 return 0x004064d0 + i*4;
1168}
1169static inline u32 gr_pd_dist_skip_table__size_1_v(void)
1170{
1171 return 0x00000008;
1172}
1173static inline u32 gr_pd_dist_skip_table_gpc_4n0_mask_f(u32 v)
1174{
1175 return (v & 0xff) << 0;
1176}
1177static inline u32 gr_pd_dist_skip_table_gpc_4n1_mask_f(u32 v)
1178{
1179 return (v & 0xff) << 8;
1180}
1181static inline u32 gr_pd_dist_skip_table_gpc_4n2_mask_f(u32 v)
1182{
1183 return (v & 0xff) << 16;
1184}
1185static inline u32 gr_pd_dist_skip_table_gpc_4n3_mask_f(u32 v)
1186{
1187 return (v & 0xff) << 24;
1188}
1189static inline u32 gr_pd_alpha_ratio_table_r(u32 i)
1190{
1191 return 0x00406800 + i*4;
1192}
1193static inline u32 gr_pd_alpha_ratio_table__size_1_v(void)
1194{
1195 return 0x00000100;
1196}
1197static inline u32 gr_pd_alpha_ratio_table_gpc_4n0_mask_f(u32 v)
1198{
1199 return (v & 0xff) << 0;
1200}
1201static inline u32 gr_pd_alpha_ratio_table_gpc_4n1_mask_f(u32 v)
1202{
1203 return (v & 0xff) << 8;
1204}
1205static inline u32 gr_pd_alpha_ratio_table_gpc_4n2_mask_f(u32 v)
1206{
1207 return (v & 0xff) << 16;
1208}
1209static inline u32 gr_pd_alpha_ratio_table_gpc_4n3_mask_f(u32 v)
1210{
1211 return (v & 0xff) << 24;
1212}
1213static inline u32 gr_pd_beta_ratio_table_r(u32 i)
1214{
1215 return 0x00406c00 + i*4;
1216}
1217static inline u32 gr_pd_beta_ratio_table__size_1_v(void)
1218{
1219 return 0x00000100;
1220}
1221static inline u32 gr_pd_beta_ratio_table_gpc_4n0_mask_f(u32 v)
1222{
1223 return (v & 0xff) << 0;
1224}
1225static inline u32 gr_pd_beta_ratio_table_gpc_4n1_mask_f(u32 v)
1226{
1227 return (v & 0xff) << 8;
1228}
1229static inline u32 gr_pd_beta_ratio_table_gpc_4n2_mask_f(u32 v)
1230{
1231 return (v & 0xff) << 16;
1232}
1233static inline u32 gr_pd_beta_ratio_table_gpc_4n3_mask_f(u32 v)
1234{
1235 return (v & 0xff) << 24;
1236}
1237static inline u32 gr_ds_debug_r(void)
1238{
1239 return 0x00405800;
1240}
1241static inline u32 gr_ds_debug_timeslice_mode_disable_f(void)
1242{
1243 return 0x0;
1244}
1245static inline u32 gr_ds_debug_timeslice_mode_enable_f(void)
1246{
1247 return 0x8000000;
1248}
1249static inline u32 gr_ds_zbc_color_r_r(void)
1250{
1251 return 0x00405804;
1252}
1253static inline u32 gr_ds_zbc_color_r_val_f(u32 v)
1254{
1255 return (v & 0xffffffff) << 0;
1256}
1257static inline u32 gr_ds_zbc_color_g_r(void)
1258{
1259 return 0x00405808;
1260}
1261static inline u32 gr_ds_zbc_color_g_val_f(u32 v)
1262{
1263 return (v & 0xffffffff) << 0;
1264}
1265static inline u32 gr_ds_zbc_color_b_r(void)
1266{
1267 return 0x0040580c;
1268}
1269static inline u32 gr_ds_zbc_color_b_val_f(u32 v)
1270{
1271 return (v & 0xffffffff) << 0;
1272}
1273static inline u32 gr_ds_zbc_color_a_r(void)
1274{
1275 return 0x00405810;
1276}
1277static inline u32 gr_ds_zbc_color_a_val_f(u32 v)
1278{
1279 return (v & 0xffffffff) << 0;
1280}
1281static inline u32 gr_ds_zbc_color_fmt_r(void)
1282{
1283 return 0x00405814;
1284}
1285static inline u32 gr_ds_zbc_color_fmt_val_f(u32 v)
1286{
1287 return (v & 0x7f) << 0;
1288}
1289static inline u32 gr_ds_zbc_color_fmt_val_invalid_f(void)
1290{
1291 return 0x0;
1292}
1293static inline u32 gr_ds_zbc_color_fmt_val_zero_v(void)
1294{
1295 return 0x00000001;
1296}
1297static inline u32 gr_ds_zbc_color_fmt_val_unorm_one_v(void)
1298{
1299 return 0x00000002;
1300}
1301static inline u32 gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v(void)
1302{
1303 return 0x00000004;
1304}
1305static inline u32 gr_ds_zbc_z_r(void)
1306{
1307 return 0x00405818;
1308}
1309static inline u32 gr_ds_zbc_z_val_s(void)
1310{
1311 return 32;
1312}
1313static inline u32 gr_ds_zbc_z_val_f(u32 v)
1314{
1315 return (v & 0xffffffff) << 0;
1316}
1317static inline u32 gr_ds_zbc_z_val_m(void)
1318{
1319 return 0xffffffff << 0;
1320}
1321static inline u32 gr_ds_zbc_z_val_v(u32 r)
1322{
1323 return (r >> 0) & 0xffffffff;
1324}
1325static inline u32 gr_ds_zbc_z_val__init_v(void)
1326{
1327 return 0x00000000;
1328}
1329static inline u32 gr_ds_zbc_z_val__init_f(void)
1330{
1331 return 0x0;
1332}
1333static inline u32 gr_ds_zbc_z_fmt_r(void)
1334{
1335 return 0x0040581c;
1336}
1337static inline u32 gr_ds_zbc_z_fmt_val_f(u32 v)
1338{
1339 return (v & 0x1) << 0;
1340}
1341static inline u32 gr_ds_zbc_z_fmt_val_invalid_f(void)
1342{
1343 return 0x0;
1344}
1345static inline u32 gr_ds_zbc_z_fmt_val_fp32_v(void)
1346{
1347 return 0x00000001;
1348}
1349static inline u32 gr_ds_zbc_tbl_index_r(void)
1350{
1351 return 0x00405820;
1352}
1353static inline u32 gr_ds_zbc_tbl_index_val_f(u32 v)
1354{
1355 return (v & 0xf) << 0;
1356}
1357static inline u32 gr_ds_zbc_tbl_ld_r(void)
1358{
1359 return 0x00405824;
1360}
1361static inline u32 gr_ds_zbc_tbl_ld_select_c_f(void)
1362{
1363 return 0x0;
1364}
1365static inline u32 gr_ds_zbc_tbl_ld_select_z_f(void)
1366{
1367 return 0x1;
1368}
1369static inline u32 gr_ds_zbc_tbl_ld_action_write_f(void)
1370{
1371 return 0x0;
1372}
1373static inline u32 gr_ds_zbc_tbl_ld_trigger_active_f(void)
1374{
1375 return 0x4;
1376}
1377static inline u32 gr_ds_tga_constraintlogic_r(void)
1378{
1379 return 0x00405830;
1380}
1381static inline u32 gr_ds_tga_constraintlogic_beta_cbsize_f(u32 v)
1382{
1383 return (v & 0xfff) << 16;
1384}
1385static inline u32 gr_ds_tga_constraintlogic_alpha_cbsize_f(u32 v)
1386{
1387 return (v & 0xfff) << 0;
1388}
1389static inline u32 gr_ds_hww_esr_r(void)
1390{
1391 return 0x00405840;
1392}
1393static inline u32 gr_ds_hww_esr_reset_s(void)
1394{
1395 return 1;
1396}
1397static inline u32 gr_ds_hww_esr_reset_f(u32 v)
1398{
1399 return (v & 0x1) << 30;
1400}
1401static inline u32 gr_ds_hww_esr_reset_m(void)
1402{
1403 return 0x1 << 30;
1404}
1405static inline u32 gr_ds_hww_esr_reset_v(u32 r)
1406{
1407 return (r >> 30) & 0x1;
1408}
1409static inline u32 gr_ds_hww_esr_reset_task_v(void)
1410{
1411 return 0x00000001;
1412}
1413static inline u32 gr_ds_hww_esr_reset_task_f(void)
1414{
1415 return 0x40000000;
1416}
1417static inline u32 gr_ds_hww_esr_en_enabled_f(void)
1418{
1419 return 0x80000000;
1420}
1421static inline u32 gr_ds_hww_report_mask_r(void)
1422{
1423 return 0x00405844;
1424}
1425static inline u32 gr_ds_hww_report_mask_sph0_err_report_f(void)
1426{
1427 return 0x1;
1428}
1429static inline u32 gr_ds_hww_report_mask_sph1_err_report_f(void)
1430{
1431 return 0x2;
1432}
1433static inline u32 gr_ds_hww_report_mask_sph2_err_report_f(void)
1434{
1435 return 0x4;
1436}
1437static inline u32 gr_ds_hww_report_mask_sph3_err_report_f(void)
1438{
1439 return 0x8;
1440}
1441static inline u32 gr_ds_hww_report_mask_sph4_err_report_f(void)
1442{
1443 return 0x10;
1444}
1445static inline u32 gr_ds_hww_report_mask_sph5_err_report_f(void)
1446{
1447 return 0x20;
1448}
1449static inline u32 gr_ds_hww_report_mask_sph6_err_report_f(void)
1450{
1451 return 0x40;
1452}
1453static inline u32 gr_ds_hww_report_mask_sph7_err_report_f(void)
1454{
1455 return 0x80;
1456}
1457static inline u32 gr_ds_hww_report_mask_sph8_err_report_f(void)
1458{
1459 return 0x100;
1460}
1461static inline u32 gr_ds_hww_report_mask_sph9_err_report_f(void)
1462{
1463 return 0x200;
1464}
1465static inline u32 gr_ds_hww_report_mask_sph10_err_report_f(void)
1466{
1467 return 0x400;
1468}
1469static inline u32 gr_ds_hww_report_mask_sph11_err_report_f(void)
1470{
1471 return 0x800;
1472}
1473static inline u32 gr_ds_hww_report_mask_sph12_err_report_f(void)
1474{
1475 return 0x1000;
1476}
1477static inline u32 gr_ds_hww_report_mask_sph13_err_report_f(void)
1478{
1479 return 0x2000;
1480}
1481static inline u32 gr_ds_hww_report_mask_sph14_err_report_f(void)
1482{
1483 return 0x4000;
1484}
1485static inline u32 gr_ds_hww_report_mask_sph15_err_report_f(void)
1486{
1487 return 0x8000;
1488}
1489static inline u32 gr_ds_hww_report_mask_sph16_err_report_f(void)
1490{
1491 return 0x10000;
1492}
1493static inline u32 gr_ds_hww_report_mask_sph17_err_report_f(void)
1494{
1495 return 0x20000;
1496}
1497static inline u32 gr_ds_hww_report_mask_sph18_err_report_f(void)
1498{
1499 return 0x40000;
1500}
1501static inline u32 gr_ds_hww_report_mask_sph19_err_report_f(void)
1502{
1503 return 0x80000;
1504}
1505static inline u32 gr_ds_hww_report_mask_sph20_err_report_f(void)
1506{
1507 return 0x100000;
1508}
1509static inline u32 gr_ds_hww_report_mask_sph21_err_report_f(void)
1510{
1511 return 0x200000;
1512}
1513static inline u32 gr_ds_hww_report_mask_sph22_err_report_f(void)
1514{
1515 return 0x400000;
1516}
1517static inline u32 gr_ds_hww_report_mask_sph23_err_report_f(void)
1518{
1519 return 0x800000;
1520}
1521static inline u32 gr_ds_num_tpc_per_gpc_r(u32 i)
1522{
1523 return 0x00405870 + i*4;
1524}
1525static inline u32 gr_scc_bundle_cb_base_r(void)
1526{
1527 return 0x00408004;
1528}
1529static inline u32 gr_scc_bundle_cb_base_addr_39_8_f(u32 v)
1530{
1531 return (v & 0xffffffff) << 0;
1532}
1533static inline u32 gr_scc_bundle_cb_base_addr_39_8_align_bits_v(void)
1534{
1535 return 0x00000008;
1536}
1537static inline u32 gr_scc_bundle_cb_size_r(void)
1538{
1539 return 0x00408008;
1540}
1541static inline u32 gr_scc_bundle_cb_size_div_256b_f(u32 v)
1542{
1543 return (v & 0x7ff) << 0;
1544}
1545static inline u32 gr_scc_bundle_cb_size_div_256b__prod_v(void)
1546{
1547 return 0x00000018;
1548}
1549static inline u32 gr_scc_bundle_cb_size_div_256b_byte_granularity_v(void)
1550{
1551 return 0x00000100;
1552}
1553static inline u32 gr_scc_bundle_cb_size_valid_false_v(void)
1554{
1555 return 0x00000000;
1556}
1557static inline u32 gr_scc_bundle_cb_size_valid_false_f(void)
1558{
1559 return 0x0;
1560}
1561static inline u32 gr_scc_bundle_cb_size_valid_true_f(void)
1562{
1563 return 0x80000000;
1564}
1565static inline u32 gr_scc_pagepool_base_r(void)
1566{
1567 return 0x0040800c;
1568}
1569static inline u32 gr_scc_pagepool_base_addr_39_8_f(u32 v)
1570{
1571 return (v & 0xffffffff) << 0;
1572}
1573static inline u32 gr_scc_pagepool_base_addr_39_8_align_bits_v(void)
1574{
1575 return 0x00000008;
1576}
1577static inline u32 gr_scc_pagepool_r(void)
1578{
1579 return 0x00408010;
1580}
1581static inline u32 gr_scc_pagepool_total_pages_f(u32 v)
1582{
1583 return (v & 0xff) << 0;
1584}
1585static inline u32 gr_scc_pagepool_total_pages_hwmax_v(void)
1586{
1587 return 0x00000000;
1588}
1589static inline u32 gr_scc_pagepool_total_pages_hwmax_value_v(void)
1590{
1591 return 0x00000080;
1592}
1593static inline u32 gr_scc_pagepool_total_pages_byte_granularity_v(void)
1594{
1595 return 0x00000100;
1596}
1597static inline u32 gr_scc_pagepool_max_valid_pages_s(void)
1598{
1599 return 8;
1600}
1601static inline u32 gr_scc_pagepool_max_valid_pages_f(u32 v)
1602{
1603 return (v & 0xff) << 8;
1604}
1605static inline u32 gr_scc_pagepool_max_valid_pages_m(void)
1606{
1607 return 0xff << 8;
1608}
1609static inline u32 gr_scc_pagepool_max_valid_pages_v(u32 r)
1610{
1611 return (r >> 8) & 0xff;
1612}
1613static inline u32 gr_scc_pagepool_valid_true_f(void)
1614{
1615 return 0x80000000;
1616}
1617static inline u32 gr_scc_init_r(void)
1618{
1619 return 0x0040802c;
1620}
1621static inline u32 gr_scc_init_ram_trigger_f(void)
1622{
1623 return 0x1;
1624}
1625static inline u32 gr_scc_hww_esr_r(void)
1626{
1627 return 0x00408030;
1628}
1629static inline u32 gr_scc_hww_esr_reset_active_f(void)
1630{
1631 return 0x40000000;
1632}
1633static inline u32 gr_scc_hww_esr_en_enable_f(void)
1634{
1635 return 0x80000000;
1636}
1637static inline u32 gr_sked_hww_esr_r(void)
1638{
1639 return 0x00407020;
1640}
1641static inline u32 gr_sked_hww_esr_reset_active_f(void)
1642{
1643 return 0x40000000;
1644}
1645static inline u32 gr_cwd_fs_r(void)
1646{
1647 return 0x00405b00;
1648}
1649static inline u32 gr_cwd_fs_num_gpcs_f(u32 v)
1650{
1651 return (v & 0xff) << 0;
1652}
1653static inline u32 gr_cwd_fs_num_tpcs_f(u32 v)
1654{
1655 return (v & 0xff) << 8;
1656}
1657static inline u32 gr_gpc0_fs_gpc_r(void)
1658{
1659 return 0x00502608;
1660}
1661static inline u32 gr_gpc0_fs_gpc_num_available_tpcs_v(u32 r)
1662{
1663 return (r >> 0) & 0x1f;
1664}
1665static inline u32 gr_gpc0_fs_gpc_num_available_zculls_v(u32 r)
1666{
1667 return (r >> 16) & 0x1f;
1668}
1669static inline u32 gr_gpc0_cfg_r(void)
1670{
1671 return 0x00502620;
1672}
1673static inline u32 gr_gpc0_cfg_imem_sz_v(u32 r)
1674{
1675 return (r >> 0) & 0xff;
1676}
1677static inline u32 gr_gpccs_rc_lanes_r(void)
1678{
1679 return 0x00502880;
1680}
1681static inline u32 gr_gpccs_rc_lanes_num_chains_s(void)
1682{
1683 return 6;
1684}
1685static inline u32 gr_gpccs_rc_lanes_num_chains_f(u32 v)
1686{
1687 return (v & 0x3f) << 0;
1688}
1689static inline u32 gr_gpccs_rc_lanes_num_chains_m(void)
1690{
1691 return 0x3f << 0;
1692}
1693static inline u32 gr_gpccs_rc_lanes_num_chains_v(u32 r)
1694{
1695 return (r >> 0) & 0x3f;
1696}
1697static inline u32 gr_gpccs_rc_lane_size_r(u32 i)
1698{
1699 return 0x00502910 + i*0;
1700}
1701static inline u32 gr_gpccs_rc_lane_size__size_1_v(void)
1702{
1703 return 0x00000010;
1704}
1705static inline u32 gr_gpccs_rc_lane_size_v_s(void)
1706{
1707 return 24;
1708}
1709static inline u32 gr_gpccs_rc_lane_size_v_f(u32 v)
1710{
1711 return (v & 0xffffff) << 0;
1712}
1713static inline u32 gr_gpccs_rc_lane_size_v_m(void)
1714{
1715 return 0xffffff << 0;
1716}
1717static inline u32 gr_gpccs_rc_lane_size_v_v(u32 r)
1718{
1719 return (r >> 0) & 0xffffff;
1720}
1721static inline u32 gr_gpccs_rc_lane_size_v_0_v(void)
1722{
1723 return 0x00000000;
1724}
1725static inline u32 gr_gpccs_rc_lane_size_v_0_f(void)
1726{
1727 return 0x0;
1728}
1729static inline u32 gr_gpc0_zcull_fs_r(void)
1730{
1731 return 0x00500910;
1732}
1733static inline u32 gr_gpc0_zcull_fs_num_sms_f(u32 v)
1734{
1735 return (v & 0x1ff) << 0;
1736}
1737static inline u32 gr_gpc0_zcull_fs_num_active_banks_f(u32 v)
1738{
1739 return (v & 0xf) << 16;
1740}
1741static inline u32 gr_gpc0_zcull_ram_addr_r(void)
1742{
1743 return 0x00500914;
1744}
1745static inline u32 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(u32 v)
1746{
1747 return (v & 0xf) << 0;
1748}
1749static inline u32 gr_gpc0_zcull_ram_addr_row_offset_f(u32 v)
1750{
1751 return (v & 0xf) << 8;
1752}
1753static inline u32 gr_gpc0_zcull_sm_num_rcp_r(void)
1754{
1755 return 0x00500918;
1756}
1757static inline u32 gr_gpc0_zcull_sm_num_rcp_conservative_f(u32 v)
1758{
1759 return (v & 0xffffff) << 0;
1760}
1761static inline u32 gr_gpc0_zcull_sm_num_rcp_conservative__max_v(void)
1762{
1763 return 0x00800000;
1764}
1765static inline u32 gr_gpc0_zcull_total_ram_size_r(void)
1766{
1767 return 0x00500920;
1768}
1769static inline u32 gr_gpc0_zcull_total_ram_size_num_aliquots_f(u32 v)
1770{
1771 return (v & 0xffff) << 0;
1772}
1773static inline u32 gr_gpc0_zcull_zcsize_r(u32 i)
1774{
1775 return 0x00500a04 + i*32;
1776}
1777static inline u32 gr_gpc0_zcull_zcsize_height_subregion__multiple_v(void)
1778{
1779 return 0x00000040;
1780}
1781static inline u32 gr_gpc0_zcull_zcsize_width_subregion__multiple_v(void)
1782{
1783 return 0x00000010;
1784}
1785static inline u32 gr_gpc0_gpm_pd_active_tpcs_r(void)
1786{
1787 return 0x00500c08;
1788}
1789static inline u32 gr_gpc0_gpm_pd_active_tpcs_num_f(u32 v)
1790{
1791 return (v & 0x7) << 0;
1792}
1793static inline u32 gr_gpc0_gpm_pd_sm_id_r(u32 i)
1794{
1795 return 0x00500c10 + i*4;
1796}
1797static inline u32 gr_gpc0_gpm_pd_sm_id_id_f(u32 v)
1798{
1799 return (v & 0xff) << 0;
1800}
1801static inline u32 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(u32 i)
1802{
1803 return 0x00500c30 + i*4;
1804}
1805static inline u32 gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(u32 r)
1806{
1807 return (r >> 0) & 0xff;
1808}
1809static inline u32 gr_gpc0_gpm_sd_active_tpcs_r(void)
1810{
1811 return 0x00500c8c;
1812}
1813static inline u32 gr_gpc0_gpm_sd_active_tpcs_num_f(u32 v)
1814{
1815 return (v & 0x7) << 0;
1816}
1817static inline u32 gr_gpc0_tpc0_pe_cfg_smid_r(void)
1818{
1819 return 0x00504088;
1820}
1821static inline u32 gr_gpc0_tpc0_pe_cfg_smid_value_f(u32 v)
1822{
1823 return (v & 0xffff) << 0;
1824}
1825static inline u32 gr_gpc0_tpc0_l1c_cfg_smid_r(void)
1826{
1827 return 0x005044e8;
1828}
1829static inline u32 gr_gpc0_tpc0_l1c_cfg_smid_value_f(u32 v)
1830{
1831 return (v & 0xffff) << 0;
1832}
1833static inline u32 gr_gpc0_tpc0_sm_cfg_r(void)
1834{
1835 return 0x00504698;
1836}
1837static inline u32 gr_gpc0_tpc0_sm_cfg_sm_id_f(u32 v)
1838{
1839 return (v & 0xffff) << 0;
1840}
1841static inline u32 gr_gpc0_ppc0_pes_vsc_strem_r(void)
1842{
1843 return 0x00503018;
1844}
1845static inline u32 gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(void)
1846{
1847 return 0x1 << 0;
1848}
1849static inline u32 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f(void)
1850{
1851 return 0x1;
1852}
1853static inline u32 gr_gpc0_ppc0_cbm_cfg_r(void)
1854{
1855 return 0x005030c0;
1856}
1857static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_f(u32 v)
1858{
1859 return (v & 0xffff) << 0;
1860}
1861static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_m(void)
1862{
1863 return 0xffff << 0;
1864}
1865static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_v(u32 r)
1866{
1867 return (r >> 0) & 0xffff;
1868}
1869static inline u32 gr_gpc0_ppc0_cbm_cfg_size_f(u32 v)
1870{
1871 return (v & 0xfff) << 16;
1872}
1873static inline u32 gr_gpc0_ppc0_cbm_cfg_size_m(void)
1874{
1875 return 0xfff << 16;
1876}
1877static inline u32 gr_gpc0_ppc0_cbm_cfg_size_v(u32 r)
1878{
1879 return (r >> 16) & 0xfff;
1880}
1881static inline u32 gr_gpc0_ppc0_cbm_cfg_size_default_v(void)
1882{
1883 return 0x00000240;
1884}
1885static inline u32 gr_gpc0_ppc0_cbm_cfg_size_granularity_v(void)
1886{
1887 return 0x00000020;
1888}
1889static inline u32 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(u32 v)
1890{
1891 return (v & 0x1) << 28;
1892}
1893static inline u32 gr_gpc0_ppc0_cbm_cfg2_r(void)
1894{
1895 return 0x005030e4;
1896}
1897static inline u32 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(u32 v)
1898{
1899 return (v & 0xffff) << 0;
1900}
1901static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_f(u32 v)
1902{
1903 return (v & 0xfff) << 16;
1904}
1905static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_m(void)
1906{
1907 return 0xfff << 16;
1908}
1909static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_v(u32 r)
1910{
1911 return (r >> 16) & 0xfff;
1912}
1913static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_default_v(void)
1914{
1915 return 0x00000648;
1916}
1917static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_granularity_v(void)
1918{
1919 return 0x00000020;
1920}
1921static inline u32 gr_gpccs_falcon_addr_r(void)
1922{
1923 return 0x0041a0ac;
1924}
1925static inline u32 gr_gpccs_falcon_addr_lsb_s(void)
1926{
1927 return 6;
1928}
1929static inline u32 gr_gpccs_falcon_addr_lsb_f(u32 v)
1930{
1931 return (v & 0x3f) << 0;
1932}
1933static inline u32 gr_gpccs_falcon_addr_lsb_m(void)
1934{
1935 return 0x3f << 0;
1936}
1937static inline u32 gr_gpccs_falcon_addr_lsb_v(u32 r)
1938{
1939 return (r >> 0) & 0x3f;
1940}
1941static inline u32 gr_gpccs_falcon_addr_lsb_init_v(void)
1942{
1943 return 0x00000000;
1944}
1945static inline u32 gr_gpccs_falcon_addr_lsb_init_f(void)
1946{
1947 return 0x0;
1948}
1949static inline u32 gr_gpccs_falcon_addr_msb_s(void)
1950{
1951 return 6;
1952}
1953static inline u32 gr_gpccs_falcon_addr_msb_f(u32 v)
1954{
1955 return (v & 0x3f) << 6;
1956}
1957static inline u32 gr_gpccs_falcon_addr_msb_m(void)
1958{
1959 return 0x3f << 6;
1960}
1961static inline u32 gr_gpccs_falcon_addr_msb_v(u32 r)
1962{
1963 return (r >> 6) & 0x3f;
1964}
1965static inline u32 gr_gpccs_falcon_addr_msb_init_v(void)
1966{
1967 return 0x00000000;
1968}
1969static inline u32 gr_gpccs_falcon_addr_msb_init_f(void)
1970{
1971 return 0x0;
1972}
1973static inline u32 gr_gpccs_falcon_addr_ext_s(void)
1974{
1975 return 12;
1976}
1977static inline u32 gr_gpccs_falcon_addr_ext_f(u32 v)
1978{
1979 return (v & 0xfff) << 0;
1980}
1981static inline u32 gr_gpccs_falcon_addr_ext_m(void)
1982{
1983 return 0xfff << 0;
1984}
1985static inline u32 gr_gpccs_falcon_addr_ext_v(u32 r)
1986{
1987 return (r >> 0) & 0xfff;
1988}
1989static inline u32 gr_gpccs_cpuctl_r(void)
1990{
1991 return 0x0041a100;
1992}
1993static inline u32 gr_gpccs_cpuctl_startcpu_f(u32 v)
1994{
1995 return (v & 0x1) << 1;
1996}
1997static inline u32 gr_gpccs_dmactl_r(void)
1998{
1999 return 0x0041a10c;
2000}
2001static inline u32 gr_gpccs_dmactl_require_ctx_f(u32 v)
2002{
2003 return (v & 0x1) << 0;
2004}
2005static inline u32 gr_gpccs_dmactl_dmem_scrubbing_m(void)
2006{
2007 return 0x1 << 1;
2008}
2009static inline u32 gr_gpccs_dmactl_imem_scrubbing_m(void)
2010{
2011 return 0x1 << 2;
2012}
2013static inline u32 gr_gpccs_imemc_r(u32 i)
2014{
2015 return 0x0041a180 + i*16;
2016}
2017static inline u32 gr_gpccs_imemc_offs_f(u32 v)
2018{
2019 return (v & 0x3f) << 2;
2020}
2021static inline u32 gr_gpccs_imemc_blk_f(u32 v)
2022{
2023 return (v & 0xff) << 8;
2024}
2025static inline u32 gr_gpccs_imemc_aincw_f(u32 v)
2026{
2027 return (v & 0x1) << 24;
2028}
2029static inline u32 gr_gpccs_imemd_r(u32 i)
2030{
2031 return 0x0041a184 + i*16;
2032}
2033static inline u32 gr_gpccs_imemt_r(u32 i)
2034{
2035 return 0x0041a188 + i*16;
2036}
2037static inline u32 gr_gpccs_imemt__size_1_v(void)
2038{
2039 return 0x00000004;
2040}
2041static inline u32 gr_gpccs_imemt_tag_f(u32 v)
2042{
2043 return (v & 0xffff) << 0;
2044}
2045static inline u32 gr_gpccs_dmemc_r(u32 i)
2046{
2047 return 0x0041a1c0 + i*8;
2048}
2049static inline u32 gr_gpccs_dmemc_offs_f(u32 v)
2050{
2051 return (v & 0x3f) << 2;
2052}
2053static inline u32 gr_gpccs_dmemc_blk_f(u32 v)
2054{
2055 return (v & 0xff) << 8;
2056}
2057static inline u32 gr_gpccs_dmemc_aincw_f(u32 v)
2058{
2059 return (v & 0x1) << 24;
2060}
2061static inline u32 gr_gpccs_dmemd_r(u32 i)
2062{
2063 return 0x0041a1c4 + i*8;
2064}
2065static inline u32 gr_gpccs_ctxsw_mailbox_r(u32 i)
2066{
2067 return 0x0041a800 + i*4;
2068}
2069static inline u32 gr_gpccs_ctxsw_mailbox_value_f(u32 v)
2070{
2071 return (v & 0xffffffff) << 0;
2072}
2073static inline u32 gr_gpcs_setup_bundle_cb_base_r(void)
2074{
2075 return 0x00418808;
2076}
2077static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_s(void)
2078{
2079 return 32;
2080}
2081static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(u32 v)
2082{
2083 return (v & 0xffffffff) << 0;
2084}
2085static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_m(void)
2086{
2087 return 0xffffffff << 0;
2088}
2089static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_v(u32 r)
2090{
2091 return (r >> 0) & 0xffffffff;
2092}
2093static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_init_v(void)
2094{
2095 return 0x00000000;
2096}
2097static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_init_f(void)
2098{
2099 return 0x0;
2100}
2101static inline u32 gr_gpcs_setup_bundle_cb_size_r(void)
2102{
2103 return 0x0041880c;
2104}
2105static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_s(void)
2106{
2107 return 11;
2108}
2109static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_f(u32 v)
2110{
2111 return (v & 0x7ff) << 0;
2112}
2113static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_m(void)
2114{
2115 return 0x7ff << 0;
2116}
2117static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_v(u32 r)
2118{
2119 return (r >> 0) & 0x7ff;
2120}
2121static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_init_v(void)
2122{
2123 return 0x00000000;
2124}
2125static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_init_f(void)
2126{
2127 return 0x0;
2128}
2129static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b__prod_v(void)
2130{
2131 return 0x00000018;
2132}
2133static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b__prod_f(void)
2134{
2135 return 0x18;
2136}
2137static inline u32 gr_gpcs_setup_bundle_cb_size_valid_s(void)
2138{
2139 return 1;
2140}
2141static inline u32 gr_gpcs_setup_bundle_cb_size_valid_f(u32 v)
2142{
2143 return (v & 0x1) << 31;
2144}
2145static inline u32 gr_gpcs_setup_bundle_cb_size_valid_m(void)
2146{
2147 return 0x1 << 31;
2148}
2149static inline u32 gr_gpcs_setup_bundle_cb_size_valid_v(u32 r)
2150{
2151 return (r >> 31) & 0x1;
2152}
2153static inline u32 gr_gpcs_setup_bundle_cb_size_valid_false_v(void)
2154{
2155 return 0x00000000;
2156}
2157static inline u32 gr_gpcs_setup_bundle_cb_size_valid_false_f(void)
2158{
2159 return 0x0;
2160}
2161static inline u32 gr_gpcs_setup_bundle_cb_size_valid_true_v(void)
2162{
2163 return 0x00000001;
2164}
2165static inline u32 gr_gpcs_setup_bundle_cb_size_valid_true_f(void)
2166{
2167 return 0x80000000;
2168}
2169static inline u32 gr_gpcs_setup_attrib_cb_base_r(void)
2170{
2171 return 0x00418810;
2172}
2173static inline u32 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(u32 v)
2174{
2175 return (v & 0xfffffff) << 0;
2176}
2177static inline u32 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v(void)
2178{
2179 return 0x0000000c;
2180}
2181static inline u32 gr_gpcs_setup_attrib_cb_base_valid_true_f(void)
2182{
2183 return 0x80000000;
2184}
2185static inline u32 gr_crstr_gpc_map0_r(void)
2186{
2187 return 0x00418b08;
2188}
2189static inline u32 gr_crstr_gpc_map0_tile0_f(u32 v)
2190{
2191 return (v & 0x7) << 0;
2192}
2193static inline u32 gr_crstr_gpc_map0_tile1_f(u32 v)
2194{
2195 return (v & 0x7) << 5;
2196}
2197static inline u32 gr_crstr_gpc_map0_tile2_f(u32 v)
2198{
2199 return (v & 0x7) << 10;
2200}
2201static inline u32 gr_crstr_gpc_map0_tile3_f(u32 v)
2202{
2203 return (v & 0x7) << 15;
2204}
2205static inline u32 gr_crstr_gpc_map0_tile4_f(u32 v)
2206{
2207 return (v & 0x7) << 20;
2208}
2209static inline u32 gr_crstr_gpc_map0_tile5_f(u32 v)
2210{
2211 return (v & 0x7) << 25;
2212}
2213static inline u32 gr_crstr_gpc_map1_r(void)
2214{
2215 return 0x00418b0c;
2216}
2217static inline u32 gr_crstr_gpc_map1_tile6_f(u32 v)
2218{
2219 return (v & 0x7) << 0;
2220}
2221static inline u32 gr_crstr_gpc_map1_tile7_f(u32 v)
2222{
2223 return (v & 0x7) << 5;
2224}
2225static inline u32 gr_crstr_gpc_map1_tile8_f(u32 v)
2226{
2227 return (v & 0x7) << 10;
2228}
2229static inline u32 gr_crstr_gpc_map1_tile9_f(u32 v)
2230{
2231 return (v & 0x7) << 15;
2232}
2233static inline u32 gr_crstr_gpc_map1_tile10_f(u32 v)
2234{
2235 return (v & 0x7) << 20;
2236}
2237static inline u32 gr_crstr_gpc_map1_tile11_f(u32 v)
2238{
2239 return (v & 0x7) << 25;
2240}
2241static inline u32 gr_crstr_gpc_map2_r(void)
2242{
2243 return 0x00418b10;
2244}
2245static inline u32 gr_crstr_gpc_map2_tile12_f(u32 v)
2246{
2247 return (v & 0x7) << 0;
2248}
2249static inline u32 gr_crstr_gpc_map2_tile13_f(u32 v)
2250{
2251 return (v & 0x7) << 5;
2252}
2253static inline u32 gr_crstr_gpc_map2_tile14_f(u32 v)
2254{
2255 return (v & 0x7) << 10;
2256}
2257static inline u32 gr_crstr_gpc_map2_tile15_f(u32 v)
2258{
2259 return (v & 0x7) << 15;
2260}
2261static inline u32 gr_crstr_gpc_map2_tile16_f(u32 v)
2262{
2263 return (v & 0x7) << 20;
2264}
2265static inline u32 gr_crstr_gpc_map2_tile17_f(u32 v)
2266{
2267 return (v & 0x7) << 25;
2268}
2269static inline u32 gr_crstr_gpc_map3_r(void)
2270{
2271 return 0x00418b14;
2272}
2273static inline u32 gr_crstr_gpc_map3_tile18_f(u32 v)
2274{
2275 return (v & 0x7) << 0;
2276}
2277static inline u32 gr_crstr_gpc_map3_tile19_f(u32 v)
2278{
2279 return (v & 0x7) << 5;
2280}
2281static inline u32 gr_crstr_gpc_map3_tile20_f(u32 v)
2282{
2283 return (v & 0x7) << 10;
2284}
2285static inline u32 gr_crstr_gpc_map3_tile21_f(u32 v)
2286{
2287 return (v & 0x7) << 15;
2288}
2289static inline u32 gr_crstr_gpc_map3_tile22_f(u32 v)
2290{
2291 return (v & 0x7) << 20;
2292}
2293static inline u32 gr_crstr_gpc_map3_tile23_f(u32 v)
2294{
2295 return (v & 0x7) << 25;
2296}
2297static inline u32 gr_crstr_gpc_map4_r(void)
2298{
2299 return 0x00418b18;
2300}
2301static inline u32 gr_crstr_gpc_map4_tile24_f(u32 v)
2302{
2303 return (v & 0x7) << 0;
2304}
2305static inline u32 gr_crstr_gpc_map4_tile25_f(u32 v)
2306{
2307 return (v & 0x7) << 5;
2308}
2309static inline u32 gr_crstr_gpc_map4_tile26_f(u32 v)
2310{
2311 return (v & 0x7) << 10;
2312}
2313static inline u32 gr_crstr_gpc_map4_tile27_f(u32 v)
2314{
2315 return (v & 0x7) << 15;
2316}
2317static inline u32 gr_crstr_gpc_map4_tile28_f(u32 v)
2318{
2319 return (v & 0x7) << 20;
2320}
2321static inline u32 gr_crstr_gpc_map4_tile29_f(u32 v)
2322{
2323 return (v & 0x7) << 25;
2324}
2325static inline u32 gr_crstr_gpc_map5_r(void)
2326{
2327 return 0x00418b1c;
2328}
2329static inline u32 gr_crstr_gpc_map5_tile30_f(u32 v)
2330{
2331 return (v & 0x7) << 0;
2332}
2333static inline u32 gr_crstr_gpc_map5_tile31_f(u32 v)
2334{
2335 return (v & 0x7) << 5;
2336}
2337static inline u32 gr_crstr_gpc_map5_tile32_f(u32 v)
2338{
2339 return (v & 0x7) << 10;
2340}
2341static inline u32 gr_crstr_gpc_map5_tile33_f(u32 v)
2342{
2343 return (v & 0x7) << 15;
2344}
2345static inline u32 gr_crstr_gpc_map5_tile34_f(u32 v)
2346{
2347 return (v & 0x7) << 20;
2348}
2349static inline u32 gr_crstr_gpc_map5_tile35_f(u32 v)
2350{
2351 return (v & 0x7) << 25;
2352}
2353static inline u32 gr_crstr_map_table_cfg_r(void)
2354{
2355 return 0x00418bb8;
2356}
2357static inline u32 gr_crstr_map_table_cfg_row_offset_f(u32 v)
2358{
2359 return (v & 0xff) << 0;
2360}
2361static inline u32 gr_crstr_map_table_cfg_num_entries_f(u32 v)
2362{
2363 return (v & 0xff) << 8;
2364}
2365static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_r(void)
2366{
2367 return 0x00418980;
2368}
2369static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(u32 v)
2370{
2371 return (v & 0x7) << 0;
2372}
2373static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(u32 v)
2374{
2375 return (v & 0x7) << 4;
2376}
2377static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(u32 v)
2378{
2379 return (v & 0x7) << 8;
2380}
2381static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(u32 v)
2382{
2383 return (v & 0x7) << 12;
2384}
2385static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(u32 v)
2386{
2387 return (v & 0x7) << 16;
2388}
2389static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(u32 v)
2390{
2391 return (v & 0x7) << 20;
2392}
2393static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(u32 v)
2394{
2395 return (v & 0x7) << 24;
2396}
2397static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(u32 v)
2398{
2399 return (v & 0x7) << 28;
2400}
2401static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_r(void)
2402{
2403 return 0x00418984;
2404}
2405static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(u32 v)
2406{
2407 return (v & 0x7) << 0;
2408}
2409static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(u32 v)
2410{
2411 return (v & 0x7) << 4;
2412}
2413static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(u32 v)
2414{
2415 return (v & 0x7) << 8;
2416}
2417static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(u32 v)
2418{
2419 return (v & 0x7) << 12;
2420}
2421static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(u32 v)
2422{
2423 return (v & 0x7) << 16;
2424}
2425static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(u32 v)
2426{
2427 return (v & 0x7) << 20;
2428}
2429static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(u32 v)
2430{
2431 return (v & 0x7) << 24;
2432}
2433static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(u32 v)
2434{
2435 return (v & 0x7) << 28;
2436}
2437static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_r(void)
2438{
2439 return 0x00418988;
2440}
2441static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(u32 v)
2442{
2443 return (v & 0x7) << 0;
2444}
2445static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(u32 v)
2446{
2447 return (v & 0x7) << 4;
2448}
2449static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(u32 v)
2450{
2451 return (v & 0x7) << 8;
2452}
2453static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(u32 v)
2454{
2455 return (v & 0x7) << 12;
2456}
2457static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(u32 v)
2458{
2459 return (v & 0x7) << 16;
2460}
2461static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(u32 v)
2462{
2463 return (v & 0x7) << 20;
2464}
2465static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(u32 v)
2466{
2467 return (v & 0x7) << 24;
2468}
2469static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_s(void)
2470{
2471 return 3;
2472}
2473static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(u32 v)
2474{
2475 return (v & 0x7) << 28;
2476}
2477static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_m(void)
2478{
2479 return 0x7 << 28;
2480}
2481static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_v(u32 r)
2482{
2483 return (r >> 28) & 0x7;
2484}
2485static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_r(void)
2486{
2487 return 0x0041898c;
2488}
2489static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(u32 v)
2490{
2491 return (v & 0x7) << 0;
2492}
2493static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(u32 v)
2494{
2495 return (v & 0x7) << 4;
2496}
2497static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(u32 v)
2498{
2499 return (v & 0x7) << 8;
2500}
2501static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(u32 v)
2502{
2503 return (v & 0x7) << 12;
2504}
2505static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(u32 v)
2506{
2507 return (v & 0x7) << 16;
2508}
2509static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(u32 v)
2510{
2511 return (v & 0x7) << 20;
2512}
2513static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(u32 v)
2514{
2515 return (v & 0x7) << 24;
2516}
2517static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(u32 v)
2518{
2519 return (v & 0x7) << 28;
2520}
2521static inline u32 gr_gpcs_gpm_pd_cfg_r(void)
2522{
2523 return 0x00418c6c;
2524}
2525static inline u32 gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f(void)
2526{
2527 return 0x0;
2528}
2529static inline u32 gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f(void)
2530{
2531 return 0x1;
2532}
2533static inline u32 gr_gpcs_gcc_pagepool_base_r(void)
2534{
2535 return 0x00419004;
2536}
2537static inline u32 gr_gpcs_gcc_pagepool_base_addr_39_8_f(u32 v)
2538{
2539 return (v & 0xffffffff) << 0;
2540}
2541static inline u32 gr_gpcs_gcc_pagepool_r(void)
2542{
2543 return 0x00419008;
2544}
2545static inline u32 gr_gpcs_gcc_pagepool_total_pages_f(u32 v)
2546{
2547 return (v & 0xff) << 0;
2548}
2549static inline u32 gr_gpcs_tpcs_pe_vaf_r(void)
2550{
2551 return 0x0041980c;
2552}
2553static inline u32 gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f(void)
2554{
2555 return 0x10;
2556}
2557static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(void)
2558{
2559 return 0x00419848;
2560}
2561static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(u32 v)
2562{
2563 return (v & 0xfffffff) << 0;
2564}
2565static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_f(u32 v)
2566{
2567 return (v & 0x1) << 28;
2568}
2569static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(void)
2570{
2571 return 0x10000000;
2572}
2573static inline u32 gr_gpcs_tpcs_l1c_pm_r(void)
2574{
2575 return 0x00419ca8;
2576}
2577static inline u32 gr_gpcs_tpcs_l1c_pm_enable_m(void)
2578{
2579 return 0x1 << 31;
2580}
2581static inline u32 gr_gpcs_tpcs_l1c_pm_enable_enable_f(void)
2582{
2583 return 0x80000000;
2584}
2585static inline u32 gr_gpcs_tpcs_l1c_cfg_r(void)
2586{
2587 return 0x00419cb8;
2588}
2589static inline u32 gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_m(void)
2590{
2591 return 0x1 << 31;
2592}
2593static inline u32 gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_enable_f(void)
2594{
2595 return 0x80000000;
2596}
2597static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_r(void)
2598{
2599 return 0x00419c00;
2600}
2601static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f(void)
2602{
2603 return 0x0;
2604}
2605static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f(void)
2606{
2607 return 0x8;
2608}
2609static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_r(void)
2610{
2611 return 0x00419e00;
2612}
2613static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_core_enable_m(void)
2614{
2615 return 0x1 << 7;
2616}
2617static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_core_enable_enable_f(void)
2618{
2619 return 0x80;
2620}
2621static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_m(void)
2622{
2623 return 0x1 << 15;
2624}
2625static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_enable_f(void)
2626{
2627 return 0x8000;
2628}
2629static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(void)
2630{
2631 return 0x00419e44;
2632}
2633static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f(void)
2634{
2635 return 0x2;
2636}
2637static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f(void)
2638{
2639 return 0x4;
2640}
2641static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f(void)
2642{
2643 return 0x8;
2644}
2645static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f(void)
2646{
2647 return 0x10;
2648}
2649static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f(void)
2650{
2651 return 0x20;
2652}
2653static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f(void)
2654{
2655 return 0x40;
2656}
2657static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f(void)
2658{
2659 return 0x80;
2660}
2661static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f(void)
2662{
2663 return 0x100;
2664}
2665static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f(void)
2666{
2667 return 0x200;
2668}
2669static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f(void)
2670{
2671 return 0x400;
2672}
2673static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f(void)
2674{
2675 return 0x800;
2676}
2677static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f(void)
2678{
2679 return 0x1000;
2680}
2681static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f(void)
2682{
2683 return 0x2000;
2684}
2685static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f(void)
2686{
2687 return 0x4000;
2688}
2689static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f(void)
2690{
2691 return 0x8000;
2692}
2693static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f(void)
2694{
2695 return 0x10000;
2696}
2697static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f(void)
2698{
2699 return 0x20000;
2700}
2701static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f(void)
2702{
2703 return 0x40000;
2704}
2705static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f(void)
2706{
2707 return 0x80000;
2708}
2709static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f(void)
2710{
2711 return 0x100000;
2712}
2713static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(void)
2714{
2715 return 0x00419e4c;
2716}
2717static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f(void)
2718{
2719 return 0x1;
2720}
2721static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f(void)
2722{
2723 return 0x2;
2724}
2725static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f(void)
2726{
2727 return 0x4;
2728}
2729static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f(void)
2730{
2731 return 0x8;
2732}
2733static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f(void)
2734{
2735 return 0x10;
2736}
2737static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f(void)
2738{
2739 return 0x20;
2740}
2741static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f(void)
2742{
2743 return 0x40;
2744}
2745static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_r(void)
2746{
2747 return 0x0050450c;
2748}
2749static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f(void)
2750{
2751 return 0x2;
2752}
2753static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_disabled_f(void)
2754{
2755 return 0x0;
2756}
2757static inline u32 gr_gpc0_gpccs_gpc_exception_en_r(void)
2758{
2759 return 0x00502c94;
2760}
2761static inline u32 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f(void)
2762{
2763 return 0x10000;
2764}
2765static inline u32 gr_gpc0_gpccs_gpc_exception_en_tpc_0_disabled_f(void)
2766{
2767 return 0x0;
2768}
2769static inline u32 gr_gpcs_gpccs_gpc_exception_r(void)
2770{
2771 return 0x0041ac90;
2772}
2773static inline u32 gr_gpcs_gpccs_gpc_exception_tpc_v(u32 r)
2774{
2775 return (r >> 16) & 0xff;
2776}
2777static inline u32 gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v(void)
2778{
2779 return 0x00000001;
2780}
2781static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_r(void)
2782{
2783 return 0x00419d08;
2784}
2785static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(u32 r)
2786{
2787 return (r >> 1) & 0x1;
2788}
2789static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v(void)
2790{
2791 return 0x00000001;
2792}
2793static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_r(void)
2794{
2795 return 0x00504610;
2796}
2797static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(u32 r)
2798{
2799 return (r >> 0) & 0x1;
2800}
2801static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v(void)
2802{
2803 return 0x00000001;
2804}
2805static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f(void)
2806{
2807 return 0x80000000;
2808}
2809static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_r(void)
2810{
2811 return 0x0050460c;
2812}
2813static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(u32 r)
2814{
2815 return (r >> 4) & 0x1;
2816}
2817static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v(void)
2818{
2819 return 0x00000001;
2820}
2821static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_r(void)
2822{
2823 return 0x00504650;
2824}
2825static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f(void)
2826{
2827 return 0x10;
2828}
2829static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f(void)
2830{
2831 return 0x20;
2832}
2833static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f(void)
2834{
2835 return 0x40;
2836}
2837static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_r(void)
2838{
2839 return 0x00504648;
2840}
2841static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_v(u32 r)
2842{
2843 return (r >> 0) & 0xffff;
2844}
2845static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v(void)
2846{
2847 return 0x00000000;
2848}
2849static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f(void)
2850{
2851 return 0x0;
2852}
2853static inline u32 gr_gpc0_tpc0_sm_halfctl_ctrl_r(void)
2854{
2855 return 0x00504770;
2856}
2857static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_r(void)
2858{
2859 return 0x00419f70;
2860}
2861static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_m(void)
2862{
2863 return 0x1 << 1;
2864}
2865static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_enable_f(void)
2866{
2867 return 0x2;
2868}
2869static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(void)
2870{
2871 return 0x1 << 4;
2872}
2873static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(u32 v)
2874{
2875 return (v & 0x1) << 4;
2876}
2877static inline u32 gr_gpc0_tpc0_sm_debug_sfe_control_r(void)
2878{
2879 return 0x0050477c;
2880}
2881static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_r(void)
2882{
2883 return 0x00419f7c;
2884}
2885static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(void)
2886{
2887 return 0x1 << 0;
2888}
2889static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(u32 v)
2890{
2891 return (v & 0x1) << 0;
2892}
2893static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_m(void)
2894{
2895 return 0x1 << 16;
2896}
2897static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_enable_f(void)
2898{
2899 return 0x10000;
2900}
2901static inline u32 gr_gpcs_tpcs_sm_power_throttle_r(void)
2902{
2903 return 0x00419ed0;
2904}
2905static inline u32 gr_gpcs_tpcs_pes_vsc_vpc_r(void)
2906{
2907 return 0x0041be08;
2908}
2909static inline u32 gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f(void)
2910{
2911 return 0x4;
2912}
2913static inline u32 gr_ppcs_wwdx_map_gpc_map0_r(void)
2914{
2915 return 0x0041bf00;
2916}
2917static inline u32 gr_ppcs_wwdx_map_gpc_map1_r(void)
2918{
2919 return 0x0041bf04;
2920}
2921static inline u32 gr_ppcs_wwdx_map_gpc_map2_r(void)
2922{
2923 return 0x0041bf08;
2924}
2925static inline u32 gr_ppcs_wwdx_map_gpc_map3_r(void)
2926{
2927 return 0x0041bf0c;
2928}
2929static inline u32 gr_ppcs_wwdx_map_gpc_map4_r(void)
2930{
2931 return 0x0041bf10;
2932}
2933static inline u32 gr_ppcs_wwdx_map_gpc_map5_r(void)
2934{
2935 return 0x0041bf14;
2936}
2937static inline u32 gr_ppcs_wwdx_map_table_cfg_r(void)
2938{
2939 return 0x0041bfd0;
2940}
2941static inline u32 gr_ppcs_wwdx_map_table_cfg_row_offset_f(u32 v)
2942{
2943 return (v & 0xff) << 0;
2944}
2945static inline u32 gr_ppcs_wwdx_map_table_cfg_num_entries_f(u32 v)
2946{
2947 return (v & 0xff) << 8;
2948}
2949static inline u32 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(u32 v)
2950{
2951 return (v & 0x1f) << 16;
2952}
2953static inline u32 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(u32 v)
2954{
2955 return (v & 0x7) << 21;
2956}
2957static inline u32 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(u32 v)
2958{
2959 return (v & 0x1f) << 24;
2960}
2961static inline u32 gr_gpcs_ppcs_wwdx_sm_num_rcp_r(void)
2962{
2963 return 0x0041bfd4;
2964}
2965static inline u32 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(u32 v)
2966{
2967 return (v & 0xffffff) << 0;
2968}
2969static inline u32 gr_ppcs_wwdx_map_table_cfg2_r(void)
2970{
2971 return 0x0041bfe4;
2972}
2973static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(u32 v)
2974{
2975 return (v & 0x1f) << 0;
2976}
2977static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(u32 v)
2978{
2979 return (v & 0x1f) << 5;
2980}
2981static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(u32 v)
2982{
2983 return (v & 0x1f) << 10;
2984}
2985static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(u32 v)
2986{
2987 return (v & 0x1f) << 15;
2988}
2989static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(u32 v)
2990{
2991 return (v & 0x1f) << 20;
2992}
2993static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(u32 v)
2994{
2995 return (v & 0x1f) << 25;
2996}
2997static inline u32 gr_gpcs_ppcs_cbm_cfg_r(void)
2998{
2999 return 0x0041bec0;
3000}
3001static inline u32 gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v(void)
3002{
3003 return 0x00000001;
3004}
3005static inline u32 gr_bes_zrop_settings_r(void)
3006{
3007 return 0x00408850;
3008}
3009static inline u32 gr_bes_zrop_settings_num_active_fbps_f(u32 v)
3010{
3011 return (v & 0xf) << 0;
3012}
3013static inline u32 gr_bes_crop_settings_r(void)
3014{
3015 return 0x00408958;
3016}
3017static inline u32 gr_bes_crop_settings_num_active_fbps_f(u32 v)
3018{
3019 return (v & 0xf) << 0;
3020}
3021static inline u32 gr_zcull_bytes_per_aliquot_per_gpu_v(void)
3022{
3023 return 0x00000020;
3024}
3025static inline u32 gr_zcull_save_restore_header_bytes_per_gpc_v(void)
3026{
3027 return 0x00000020;
3028}
3029static inline u32 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v(void)
3030{
3031 return 0x000000c0;
3032}
3033static inline u32 gr_zcull_subregion_qty_v(void)
3034{
3035 return 0x00000010;
3036}
3037static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r(void)
3038{
3039 return 0x00504604;
3040}
3041static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r(void)
3042{
3043 return 0x00504608;
3044}
3045static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r(void)
3046{
3047 return 0x0050465c;
3048}
3049static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r(void)
3050{
3051 return 0x00504660;
3052}
3053static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r(void)
3054{
3055 return 0x00504664;
3056}
3057static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r(void)
3058{
3059 return 0x00504668;
3060}
3061static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r(void)
3062{
3063 return 0x0050466c;
3064}
3065static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r(void)
3066{
3067 return 0x00504658;
3068}
3069static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r(void)
3070{
3071 return 0x00504670;
3072}
3073static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r(void)
3074{
3075 return 0x00504694;
3076}
3077static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r(void)
3078{
3079 return 0x00504730;
3080}
3081static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r(void)
3082{
3083 return 0x00504734;
3084}
3085static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r(void)
3086{
3087 return 0x00504738;
3088}
3089static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r(void)
3090{
3091 return 0x0050473c;
3092}
3093static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r(void)
3094{
3095 return 0x00504740;
3096}
3097static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r(void)
3098{
3099 return 0x00504744;
3100}
3101static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r(void)
3102{
3103 return 0x00504748;
3104}
3105static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r(void)
3106{
3107 return 0x0050474c;
3108}
3109static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r(void)
3110{
3111 return 0x00504674;
3112}
3113static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r(void)
3114{
3115 return 0x00504678;
3116}
3117static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r(void)
3118{
3119 return 0x0050467c;
3120}
3121static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r(void)
3122{
3123 return 0x00504680;
3124}
3125static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r(void)
3126{
3127 return 0x00504684;
3128}
3129static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r(void)
3130{
3131 return 0x00504688;
3132}
3133static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r(void)
3134{
3135 return 0x0050468c;
3136}
3137static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r(void)
3138{
3139 return 0x00504690;
3140}
3141static inline u32 gr_fe_pwr_mode_r(void)
3142{
3143 return 0x00404170;
3144}
3145static inline u32 gr_fe_pwr_mode_mode_auto_f(void)
3146{
3147 return 0x0;
3148}
3149static inline u32 gr_fe_pwr_mode_mode_force_on_f(void)
3150{
3151 return 0x2;
3152}
3153static inline u32 gr_fe_pwr_mode_req_v(u32 r)
3154{
3155 return (r >> 4) & 0x1;
3156}
3157static inline u32 gr_fe_pwr_mode_req_send_f(void)
3158{
3159 return 0x10;
3160}
3161static inline u32 gr_fe_pwr_mode_req_done_v(void)
3162{
3163 return 0x00000000;
3164}
3165static inline u32 gr_gpc0_tpc0_l1c_dbg_r(void)
3166{
3167 return 0x005044b0;
3168}
3169static inline u32 gr_gpc0_tpc0_l1c_dbg_cya15_en_f(void)
3170{
3171 return 0x8000000;
3172}
3173#endif
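
The gr_* helpers above are meant to be composed: the _f() forms are OR'd together to build a full register value, and the _v() forms pull a field back out of a raw read. A minimal usage sketch follows; it is not part of this patch, and the gk20a_writel()/gk20a_readl() MMIO helpers and the struct gk20a pointer g are assumed from the rest of the driver, not from this header:

	/* program the SCC page pool: size field OR'd with the valid bit */
	gk20a_writel(g, gr_scc_pagepool_r(),
		     gr_scc_pagepool_total_pages_f(num_pages) |
		     gr_scc_pagepool_valid_true_f());

	/* decode how many TPCs GPC0 exposes from a raw register read */
	u32 tpcs = gr_gpc0_fs_gpc_num_available_tpcs_v(
			gk20a_readl(g, gr_gpc0_fs_gpc_r()));
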
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
new file mode 100644
index 00000000..65221b59
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
@@ -0,0 +1,221 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_ltc_gk20a_h_
51#define _hw_ltc_gk20a_h_
52
53static inline u32 ltc_ltcs_lts0_cbc_ctrl1_r(void)
54{
55 return 0x001410c8;
56}
57static inline u32 ltc_ltc0_lts0_tstg_cfg1_r(void)
58{
59 return 0x00141104;
60}
61static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_ways_v(u32 r)
62{
63 return (r >> 0) & 0xffff;
64}
65static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_v(u32 r)
66{
67 return (r >> 16) & 0x3;
68}
69static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_all_v(void)
70{
71 return 0x00000000;
72}
73static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_half_v(void)
74{
75 return 0x00000001;
76}
77static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_quarter_v(void)
78{
79 return 0x00000002;
80}
81static inline u32 ltc_ltcs_ltss_cbc_ctrl1_r(void)
82{
83 return 0x0017e8c8;
84}
85static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_v(u32 r)
86{
87 return (r >> 2) & 0x1;
88}
89static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_v(void)
90{
91 return 0x00000001;
92}
93static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_f(void)
94{
95 return 0x4;
96}
97static inline u32 ltc_ltc0_lts0_cbc_ctrl1_r(void)
98{
99 return 0x0017e8c8;
100}
101static inline u32 ltc_ltcs_ltss_cbc_ctrl2_r(void)
102{
103 return 0x0017e8cc;
104}
105static inline u32 ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(u32 v)
106{
107 return (v & 0x1ffff) << 0;
108}
109static inline u32 ltc_ltcs_ltss_cbc_ctrl3_r(void)
110{
111 return 0x0017e8d0;
112}
113static inline u32 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(u32 v)
114{
115 return (v & 0x1ffff) << 0;
116}
117static inline u32 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v(void)
118{
119 return 0x0001ffff;
120}
121static inline u32 ltc_ltcs_ltss_cbc_base_r(void)
122{
123 return 0x0017e8d4;
124}
125static inline u32 ltc_ltcs_ltss_cbc_base_alignment_shift_v(void)
126{
127 return 0x0000000b;
128}
129static inline u32 ltc_ltcs_ltss_cbc_base_address_v(u32 r)
130{
131 return (r >> 0) & 0x3ffffff;
132}
133static inline u32 ltc_ltcs_ltss_cbc_param_r(void)
134{
135 return 0x0017e8dc;
136}
137static inline u32 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(u32 r)
138{
139 return (r >> 0) & 0xffff;
140}
141static inline u32 ltc_ltcs_ltss_cbc_param_cache_line_size_v(u32 r)
142{
143 return (r >> 24) & 0xf;
144}
145static inline u32 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(u32 r)
146{
147 return (r >> 28) & 0xf;
148}
149static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_r(void)
150{
151 return 0x0017e91c;
152}
153static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(u32 v)
154{
155 return (v & 0x1f) << 16;
156}
157static inline u32 ltc_ltcs_ltss_dstg_zbc_index_r(void)
158{
159 return 0x0017ea44;
160}
161static inline u32 ltc_ltcs_ltss_dstg_zbc_index_address_f(u32 v)
162{
163 return (v & 0xf) << 0;
164}
165static inline u32 ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(u32 i)
166{
167 return 0x0017ea48 + i*4;
168}
169static inline u32 ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(void)
170{
171 return 0x00000004;
172}
173static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(void)
174{
175 return 0x0017ea58;
176}
177static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_s(void)
178{
179 return 32;
180}
181static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_f(u32 v)
182{
183 return (v & 0xffffffff) << 0;
184}
185static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_m(void)
186{
187 return 0xffffffff << 0;
188}
189static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_v(u32 r)
190{
191 return (r >> 0) & 0xffffffff;
192}
193static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_2_r(void)
194{
195 return 0x0017e924;
196}
197static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_2_l2_bypass_mode_enabled_f(void)
198{
199 return 0x10000000;
200}
201static inline u32 ltc_ltss_g_elpg_r(void)
202{
203 return 0x0017e828;
204}
205static inline u32 ltc_ltss_g_elpg_flush_v(u32 r)
206{
207 return (r >> 0) & 0x1;
208}
209static inline u32 ltc_ltss_g_elpg_flush_pending_v(void)
210{
211 return 0x00000001;
212}
213static inline u32 ltc_ltss_g_elpg_flush_pending_f(void)
214{
215 return 0x1;
216}
217static inline u32 ltc_ltc0_ltss_intr_r(void)
218{
219 return 0x00140820;
220}
221#endif
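The naming scheme documented at the top of each of these generated headers is easiest to see in use. Below is a minimal standalone sketch (ordinary user-space C, not driver code) of how the _f(), _v() and _<z>_v() helpers from hw_ltc_gk20a.h compose into the usual set-then-poll pattern; the local ctrl1 variable merely stands in for an MMIO read/write, and the condensed helper copies exist only so the sketch compiles on its own. The same composition applies to every other hw_*_gk20a.h header added by this patch.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Condensed copies of three helpers from hw_ltc_gk20a.h above, so the
 * sketch builds on its own. */
static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_f(void) { return 0x4; }
static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_v(u32 r)       { return (r >> 2) & 0x1; }
static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_v(void) { return 0x00000001; }

int main(void)
{
        u32 ctrl1 = 0;  /* stand-in for the CBC_CTRL1 register contents */

        /* Start a compbit clear: OR in the shifted ACTIVE constant (_f form). */
        ctrl1 |= ltc_ltcs_ltss_cbc_ctrl1_clear_active_f();

        /* Poll-style check: extract the field (_v form) and compare it with
         * the unshifted ACTIVE constant (_<z>_v form). */
        if (ltc_ltcs_ltss_cbc_ctrl1_clear_v(ctrl1) ==
            ltc_ltcs_ltss_cbc_ctrl1_clear_active_v())
                printf("clear still in progress\n");

        return 0;
}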
diff --git a/drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h
new file mode 100644
index 00000000..1692bb54
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h
@@ -0,0 +1,253 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_mc_gk20a_h_
51#define _hw_mc_gk20a_h_
52
53static inline u32 mc_boot_0_r(void)
54{
55 return 0x00000000;
56}
57static inline u32 mc_boot_0_architecture_v(u32 r)
58{
59 return (r >> 24) & 0x1f;
60}
61static inline u32 mc_boot_0_implementation_v(u32 r)
62{
63 return (r >> 20) & 0xf;
64}
65static inline u32 mc_boot_0_major_revision_v(u32 r)
66{
67 return (r >> 4) & 0xf;
68}
69static inline u32 mc_boot_0_minor_revision_v(u32 r)
70{
71 return (r >> 0) & 0xf;
72}
73static inline u32 mc_intr_0_r(void)
74{
75 return 0x00000100;
76}
77static inline u32 mc_intr_0_pfifo_pending_f(void)
78{
79 return 0x100;
80}
81static inline u32 mc_intr_0_pgraph_pending_f(void)
82{
83 return 0x1000;
84}
85static inline u32 mc_intr_0_pmu_pending_f(void)
86{
87 return 0x1000000;
88}
89static inline u32 mc_intr_0_ltc_pending_f(void)
90{
91 return 0x2000000;
92}
93static inline u32 mc_intr_0_priv_ring_pending_f(void)
94{
95 return 0x40000000;
96}
97static inline u32 mc_intr_0_pbus_pending_f(void)
98{
99 return 0x10000000;
100}
101static inline u32 mc_intr_1_r(void)
102{
103 return 0x00000104;
104}
105static inline u32 mc_intr_mask_0_r(void)
106{
107 return 0x00000640;
108}
109static inline u32 mc_intr_mask_0_pmu_enabled_f(void)
110{
111 return 0x1000000;
112}
113static inline u32 mc_intr_mask_1_r(void)
114{
115 return 0x00000644;
116}
117static inline u32 mc_intr_mask_1_pmu_enabled_f(void)
118{
119 return 0x1000000;
120}
121static inline u32 mc_intr_en_0_r(void)
122{
123 return 0x00000140;
124}
125static inline u32 mc_intr_en_0_inta_disabled_f(void)
126{
127 return 0x0;
128}
129static inline u32 mc_intr_en_0_inta_hardware_f(void)
130{
131 return 0x1;
132}
133static inline u32 mc_intr_en_1_r(void)
134{
135 return 0x00000144;
136}
137static inline u32 mc_intr_en_1_inta_disabled_f(void)
138{
139 return 0x0;
140}
141static inline u32 mc_intr_en_1_inta_hardware_f(void)
142{
143 return 0x1;
144}
145static inline u32 mc_enable_r(void)
146{
147 return 0x00000200;
148}
149static inline u32 mc_enable_xbar_enabled_f(void)
150{
151 return 0x4;
152}
153static inline u32 mc_enable_l2_enabled_f(void)
154{
155 return 0x8;
156}
157static inline u32 mc_enable_pmedia_s(void)
158{
159 return 1;
160}
161static inline u32 mc_enable_pmedia_f(u32 v)
162{
163 return (v & 0x1) << 4;
164}
165static inline u32 mc_enable_pmedia_m(void)
166{
167 return 0x1 << 4;
168}
169static inline u32 mc_enable_pmedia_v(u32 r)
170{
171 return (r >> 4) & 0x1;
172}
173static inline u32 mc_enable_priv_ring_enabled_f(void)
174{
175 return 0x20;
176}
177static inline u32 mc_enable_ce0_m(void)
178{
179 return 0x1 << 6;
180}
181static inline u32 mc_enable_pfifo_enabled_f(void)
182{
183 return 0x100;
184}
185static inline u32 mc_enable_pgraph_enabled_f(void)
186{
187 return 0x1000;
188}
189static inline u32 mc_enable_pwr_v(u32 r)
190{
191 return (r >> 13) & 0x1;
192}
193static inline u32 mc_enable_pwr_disabled_v(void)
194{
195 return 0x00000000;
196}
197static inline u32 mc_enable_pwr_enabled_f(void)
198{
199 return 0x2000;
200}
201static inline u32 mc_enable_pfb_enabled_f(void)
202{
203 return 0x100000;
204}
205static inline u32 mc_enable_ce2_m(void)
206{
207 return 0x1 << 21;
208}
209static inline u32 mc_enable_ce2_enabled_f(void)
210{
211 return 0x200000;
212}
213static inline u32 mc_enable_blg_enabled_f(void)
214{
215 return 0x8000000;
216}
217static inline u32 mc_enable_perfmon_enabled_f(void)
218{
219 return 0x10000000;
220}
221static inline u32 mc_enable_hub_enabled_f(void)
222{
223 return 0x20000000;
224}
225static inline u32 mc_enable_pb_r(void)
226{
227 return 0x00000204;
228}
229static inline u32 mc_enable_pb_0_s(void)
230{
231 return 1;
232}
233static inline u32 mc_enable_pb_0_f(u32 v)
234{
235 return (v & 0x1) << 0;
236}
237static inline u32 mc_enable_pb_0_m(void)
238{
239 return 0x1 << 0;
240}
241static inline u32 mc_enable_pb_0_v(u32 r)
242{
243 return (r >> 0) & 0x1;
244}
245static inline u32 mc_enable_pb_0_enabled_v(void)
246{
247 return 0x00000001;
248}
249static inline u32 mc_enable_pb_sel_f(u32 v, u32 i)
250{
251 return (v & 0x1) << (0 + i*1);
252}
253#endif
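As a second usage sketch, the mc_boot_0_*_v() extractors above split the boot-0 chip identification register into architecture, implementation and revision fields. The snippet is standalone; the hard-coded boot0 word is an invented example value, not a real readout.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Field extractors copied from hw_mc_gk20a.h above, condensed for brevity. */
static inline u32 mc_boot_0_architecture_v(u32 r)   { return (r >> 24) & 0x1f; }
static inline u32 mc_boot_0_implementation_v(u32 r) { return (r >> 20) & 0xf; }
static inline u32 mc_boot_0_major_revision_v(u32 r) { return (r >> 4) & 0xf; }
static inline u32 mc_boot_0_minor_revision_v(u32 r) { return (r >> 0) & 0xf; }

int main(void)
{
        u32 boot0 = 0x0ea000a1;  /* illustrative value only, not a real chip ID */

        printf("arch 0x%x impl 0x%x rev %u.%u\n",
               mc_boot_0_architecture_v(boot0),
               mc_boot_0_implementation_v(boot0),
               mc_boot_0_major_revision_v(boot0),
               mc_boot_0_minor_revision_v(boot0));
        return 0;
}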
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h
new file mode 100644
index 00000000..df1a6d48
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h
@@ -0,0 +1,469 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_pbdma_gk20a_h_
51#define _hw_pbdma_gk20a_h_
52
53static inline u32 pbdma_gp_entry1_r(void)
54{
55 return 0x10000004;
56}
57static inline u32 pbdma_gp_entry1_get_hi_v(u32 r)
58{
59 return (r >> 0) & 0xff;
60}
61static inline u32 pbdma_gp_entry1_length_f(u32 v)
62{
63 return (v & 0x1fffff) << 10;
64}
65static inline u32 pbdma_gp_entry1_length_v(u32 r)
66{
67 return (r >> 10) & 0x1fffff;
68}
69static inline u32 pbdma_gp_base_r(u32 i)
70{
71 return 0x00040048 + i*8192;
72}
73static inline u32 pbdma_gp_base__size_1_v(void)
74{
75 return 0x00000001;
76}
77static inline u32 pbdma_gp_base_offset_f(u32 v)
78{
79 return (v & 0x1fffffff) << 3;
80}
81static inline u32 pbdma_gp_base_rsvd_s(void)
82{
83 return 3;
84}
85static inline u32 pbdma_gp_base_hi_r(u32 i)
86{
87 return 0x0004004c + i*8192;
88}
89static inline u32 pbdma_gp_base_hi_offset_f(u32 v)
90{
91 return (v & 0xff) << 0;
92}
93static inline u32 pbdma_gp_base_hi_limit2_f(u32 v)
94{
95 return (v & 0x1f) << 16;
96}
97static inline u32 pbdma_gp_fetch_r(u32 i)
98{
99 return 0x00040050 + i*8192;
100}
101static inline u32 pbdma_gp_get_r(u32 i)
102{
103 return 0x00040014 + i*8192;
104}
105static inline u32 pbdma_gp_put_r(u32 i)
106{
107 return 0x00040000 + i*8192;
108}
109static inline u32 pbdma_pb_fetch_r(u32 i)
110{
111 return 0x00040054 + i*8192;
112}
113static inline u32 pbdma_pb_fetch_hi_r(u32 i)
114{
115 return 0x00040058 + i*8192;
116}
117static inline u32 pbdma_get_r(u32 i)
118{
119 return 0x00040018 + i*8192;
120}
121static inline u32 pbdma_get_hi_r(u32 i)
122{
123 return 0x0004001c + i*8192;
124}
125static inline u32 pbdma_put_r(u32 i)
126{
127 return 0x0004005c + i*8192;
128}
129static inline u32 pbdma_put_hi_r(u32 i)
130{
131 return 0x00040060 + i*8192;
132}
133static inline u32 pbdma_formats_r(u32 i)
134{
135 return 0x0004009c + i*8192;
136}
137static inline u32 pbdma_formats_gp_fermi0_f(void)
138{
139 return 0x0;
140}
141static inline u32 pbdma_formats_pb_fermi1_f(void)
142{
143 return 0x100;
144}
145static inline u32 pbdma_formats_mp_fermi0_f(void)
146{
147 return 0x0;
148}
149static inline u32 pbdma_syncpointa_r(u32 i)
150{
151 return 0x000400a4 + i*8192;
152}
153static inline u32 pbdma_syncpointa_payload_v(u32 r)
154{
155 return (r >> 0) & 0xffffffff;
156}
157static inline u32 pbdma_syncpointb_r(u32 i)
158{
159 return 0x000400a8 + i*8192;
160}
161static inline u32 pbdma_syncpointb_op_v(u32 r)
162{
163 return (r >> 0) & 0x3;
164}
165static inline u32 pbdma_syncpointb_op_wait_v(void)
166{
167 return 0x00000000;
168}
169static inline u32 pbdma_syncpointb_wait_switch_v(u32 r)
170{
171 return (r >> 4) & 0x1;
172}
173static inline u32 pbdma_syncpointb_wait_switch_en_v(void)
174{
175 return 0x00000001;
176}
177static inline u32 pbdma_syncpointb_syncpt_index_v(u32 r)
178{
179 return (r >> 8) & 0xff;
180}
181static inline u32 pbdma_pb_header_r(u32 i)
182{
183 return 0x00040084 + i*8192;
184}
185static inline u32 pbdma_pb_header_priv_user_f(void)
186{
187 return 0x0;
188}
189static inline u32 pbdma_pb_header_method_zero_f(void)
190{
191 return 0x0;
192}
193static inline u32 pbdma_pb_header_subchannel_zero_f(void)
194{
195 return 0x0;
196}
197static inline u32 pbdma_pb_header_level_main_f(void)
198{
199 return 0x0;
200}
201static inline u32 pbdma_pb_header_first_true_f(void)
202{
203 return 0x400000;
204}
205static inline u32 pbdma_pb_header_type_inc_f(void)
206{
207 return 0x20000000;
208}
209static inline u32 pbdma_subdevice_r(u32 i)
210{
211 return 0x00040094 + i*8192;
212}
213static inline u32 pbdma_subdevice_id_f(u32 v)
214{
215 return (v & 0xfff) << 0;
216}
217static inline u32 pbdma_subdevice_status_active_f(void)
218{
219 return 0x10000000;
220}
221static inline u32 pbdma_subdevice_channel_dma_enable_f(void)
222{
223 return 0x20000000;
224}
225static inline u32 pbdma_method0_r(u32 i)
226{
227 return 0x000400c0 + i*8192;
228}
229static inline u32 pbdma_data0_r(u32 i)
230{
231 return 0x000400c4 + i*8192;
232}
233static inline u32 pbdma_target_r(u32 i)
234{
235 return 0x000400ac + i*8192;
236}
237static inline u32 pbdma_target_engine_sw_f(void)
238{
239 return 0x1f;
240}
241static inline u32 pbdma_acquire_r(u32 i)
242{
243 return 0x00040030 + i*8192;
244}
245static inline u32 pbdma_acquire_retry_man_2_f(void)
246{
247 return 0x2;
248}
249static inline u32 pbdma_acquire_retry_exp_2_f(void)
250{
251 return 0x100;
252}
253static inline u32 pbdma_acquire_timeout_exp_max_f(void)
254{
255 return 0x7800;
256}
257static inline u32 pbdma_acquire_timeout_man_max_f(void)
258{
259 return 0x7fff8000;
260}
261static inline u32 pbdma_acquire_timeout_en_disable_f(void)
262{
263 return 0x0;
264}
265static inline u32 pbdma_status_r(u32 i)
266{
267 return 0x00040100 + i*8192;
268}
269static inline u32 pbdma_channel_r(u32 i)
270{
271 return 0x00040120 + i*8192;
272}
273static inline u32 pbdma_signature_r(u32 i)
274{
275 return 0x00040010 + i*8192;
276}
277static inline u32 pbdma_signature_hw_valid_f(void)
278{
279 return 0xface;
280}
281static inline u32 pbdma_signature_sw_zero_f(void)
282{
283 return 0x0;
284}
285static inline u32 pbdma_userd_r(u32 i)
286{
287 return 0x00040008 + i*8192;
288}
289static inline u32 pbdma_userd_target_vid_mem_f(void)
290{
291 return 0x0;
292}
293static inline u32 pbdma_userd_addr_f(u32 v)
294{
295 return (v & 0x7fffff) << 9;
296}
297static inline u32 pbdma_userd_hi_r(u32 i)
298{
299 return 0x0004000c + i*8192;
300}
301static inline u32 pbdma_userd_hi_addr_f(u32 v)
302{
303 return (v & 0xff) << 0;
304}
305static inline u32 pbdma_hce_ctrl_r(u32 i)
306{
307 return 0x000400e4 + i*8192;
308}
309static inline u32 pbdma_hce_ctrl_hce_priv_mode_yes_f(void)
310{
311 return 0x20;
312}
313static inline u32 pbdma_intr_0_r(u32 i)
314{
315 return 0x00040108 + i*8192;
316}
317static inline u32 pbdma_intr_0_memreq_v(u32 r)
318{
319 return (r >> 0) & 0x1;
320}
321static inline u32 pbdma_intr_0_memreq_pending_f(void)
322{
323 return 0x1;
324}
325static inline u32 pbdma_intr_0_memack_timeout_pending_f(void)
326{
327 return 0x2;
328}
329static inline u32 pbdma_intr_0_memack_extra_pending_f(void)
330{
331 return 0x4;
332}
333static inline u32 pbdma_intr_0_memdat_timeout_pending_f(void)
334{
335 return 0x8;
336}
337static inline u32 pbdma_intr_0_memdat_extra_pending_f(void)
338{
339 return 0x10;
340}
341static inline u32 pbdma_intr_0_memflush_pending_f(void)
342{
343 return 0x20;
344}
345static inline u32 pbdma_intr_0_memop_pending_f(void)
346{
347 return 0x40;
348}
349static inline u32 pbdma_intr_0_lbconnect_pending_f(void)
350{
351 return 0x80;
352}
353static inline u32 pbdma_intr_0_lbreq_pending_f(void)
354{
355 return 0x100;
356}
357static inline u32 pbdma_intr_0_lback_timeout_pending_f(void)
358{
359 return 0x200;
360}
361static inline u32 pbdma_intr_0_lback_extra_pending_f(void)
362{
363 return 0x400;
364}
365static inline u32 pbdma_intr_0_lbdat_timeout_pending_f(void)
366{
367 return 0x800;
368}
369static inline u32 pbdma_intr_0_lbdat_extra_pending_f(void)
370{
371 return 0x1000;
372}
373static inline u32 pbdma_intr_0_gpfifo_pending_f(void)
374{
375 return 0x2000;
376}
377static inline u32 pbdma_intr_0_gpptr_pending_f(void)
378{
379 return 0x4000;
380}
381static inline u32 pbdma_intr_0_gpentry_pending_f(void)
382{
383 return 0x8000;
384}
385static inline u32 pbdma_intr_0_gpcrc_pending_f(void)
386{
387 return 0x10000;
388}
389static inline u32 pbdma_intr_0_pbptr_pending_f(void)
390{
391 return 0x20000;
392}
393static inline u32 pbdma_intr_0_pbentry_pending_f(void)
394{
395 return 0x40000;
396}
397static inline u32 pbdma_intr_0_pbcrc_pending_f(void)
398{
399 return 0x80000;
400}
401static inline u32 pbdma_intr_0_xbarconnect_pending_f(void)
402{
403 return 0x100000;
404}
405static inline u32 pbdma_intr_0_method_pending_f(void)
406{
407 return 0x200000;
408}
409static inline u32 pbdma_intr_0_methodcrc_pending_f(void)
410{
411 return 0x400000;
412}
413static inline u32 pbdma_intr_0_device_pending_f(void)
414{
415 return 0x800000;
416}
417static inline u32 pbdma_intr_0_semaphore_pending_f(void)
418{
419 return 0x2000000;
420}
421static inline u32 pbdma_intr_0_acquire_pending_f(void)
422{
423 return 0x4000000;
424}
425static inline u32 pbdma_intr_0_pri_pending_f(void)
426{
427 return 0x8000000;
428}
429static inline u32 pbdma_intr_0_no_ctxsw_seg_pending_f(void)
430{
431 return 0x20000000;
432}
433static inline u32 pbdma_intr_0_pbseg_pending_f(void)
434{
435 return 0x40000000;
436}
437static inline u32 pbdma_intr_0_signature_pending_f(void)
438{
439 return 0x80000000;
440}
441static inline u32 pbdma_intr_1_r(u32 i)
442{
443 return 0x00040148 + i*8192;
444}
445static inline u32 pbdma_intr_en_0_r(u32 i)
446{
447 return 0x0004010c + i*8192;
448}
449static inline u32 pbdma_intr_en_0_lbreq_enabled_f(void)
450{
451 return 0x100;
452}
453static inline u32 pbdma_intr_en_1_r(u32 i)
454{
455 return 0x0004014c + i*8192;
456}
457static inline u32 pbdma_intr_stall_r(u32 i)
458{
459 return 0x0004013c + i*8192;
460}
461static inline u32 pbdma_intr_stall_lbreq_enabled_f(void)
462{
463 return 0x100;
464}
465static inline u32 pbdma_udma_nop_r(void)
466{
467 return 0x00000008;
468}
469#endif
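Registers in this file are per-PBDMA: each _r(i) helper takes an instance index and applies an 8192-byte (0x2000) stride. A small standalone sketch of how those helpers resolve to byte offsets; gk20a itself exposes a single PBDMA (see proj_host_num_pbdma_v() later in this patch), so index 1 appears only to illustrate the stride.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Per-instance register helpers copied from hw_pbdma_gk20a.h above. */
static inline u32 pbdma_gp_get_r(u32 i) { return 0x00040014 + i*8192; }
static inline u32 pbdma_gp_put_r(u32 i) { return 0x00040000 + i*8192; }

int main(void)
{
        /* Print the GPFIFO get/put offsets for two hypothetical instances. */
        for (u32 i = 0; i < 2; i++)
                printf("pbdma%u: GP_GET @ 0x%08x, GP_PUT @ 0x%08x\n",
                       i, pbdma_gp_get_r(i), pbdma_gp_put_r(i));
        return 0;
}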
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h
new file mode 100644
index 00000000..d4007613
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_pri_ringmaster_gk20a_h_
51#define _hw_pri_ringmaster_gk20a_h_
52
53static inline u32 pri_ringmaster_command_r(void)
54{
55 return 0x0012004c;
56}
57static inline u32 pri_ringmaster_command_cmd_m(void)
58{
59 return 0x3f << 0;
60}
61static inline u32 pri_ringmaster_command_cmd_v(u32 r)
62{
63 return (r >> 0) & 0x3f;
64}
65static inline u32 pri_ringmaster_command_cmd_no_cmd_v(void)
66{
67 return 0x00000000;
68}
69static inline u32 pri_ringmaster_command_cmd_start_ring_f(void)
70{
71 return 0x1;
72}
73static inline u32 pri_ringmaster_command_cmd_ack_interrupt_f(void)
74{
75 return 0x2;
76}
77static inline u32 pri_ringmaster_command_cmd_enumerate_stations_f(void)
78{
79 return 0x3;
80}
81static inline u32 pri_ringmaster_command_cmd_enumerate_stations_bc_grp_all_f(void)
82{
83 return 0x0;
84}
85static inline u32 pri_ringmaster_command_data_r(void)
86{
87 return 0x00120048;
88}
89static inline u32 pri_ringmaster_start_results_r(void)
90{
91 return 0x00120050;
92}
93static inline u32 pri_ringmaster_start_results_connectivity_v(u32 r)
94{
95 return (r >> 0) & 0x1;
96}
97static inline u32 pri_ringmaster_start_results_connectivity_pass_v(void)
98{
99 return 0x00000001;
100}
101static inline u32 pri_ringmaster_intr_status0_r(void)
102{
103 return 0x00120058;
104}
105static inline u32 pri_ringmaster_intr_status1_r(void)
106{
107 return 0x0012005c;
108}
109static inline u32 pri_ringmaster_global_ctl_r(void)
110{
111 return 0x00120060;
112}
113static inline u32 pri_ringmaster_global_ctl_ring_reset_asserted_f(void)
114{
115 return 0x1;
116}
117static inline u32 pri_ringmaster_global_ctl_ring_reset_deasserted_f(void)
118{
119 return 0x0;
120}
121static inline u32 pri_ringmaster_enum_fbp_r(void)
122{
123 return 0x00120074;
124}
125static inline u32 pri_ringmaster_enum_fbp_count_v(u32 r)
126{
127 return (r >> 0) & 0x1f;
128}
129static inline u32 pri_ringmaster_enum_gpc_r(void)
130{
131 return 0x00120078;
132}
133static inline u32 pri_ringmaster_enum_gpc_count_v(u32 r)
134{
135 return (r >> 0) & 0x1f;
136}
137#endif
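A brief sketch of how the ringmaster command/result helpers pair up: an _f() constant forms the command word to write, and the matching _v() extractor plus _pass_v() constant check the result. The read and write below are faked with local variables; this illustrates the accessor pattern only and is not the driver's actual priv-ring bring-up code.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Copied from hw_pri_ringmaster_gk20a.h above, condensed. */
static inline u32 pri_ringmaster_command_cmd_start_ring_f(void)          { return 0x1; }
static inline u32 pri_ringmaster_start_results_connectivity_v(u32 r)     { return (r >> 0) & 0x1; }
static inline u32 pri_ringmaster_start_results_connectivity_pass_v(void) { return 0x00000001; }

int main(void)
{
        u32 command, start_results;

        /* Form the start_ring command word ... */
        command = pri_ringmaster_command_cmd_start_ring_f();

        /* ... then check the result register; here the readback is faked. */
        start_results = 0x1;
        if (pri_ringmaster_start_results_connectivity_v(start_results) ==
            pri_ringmaster_start_results_connectivity_pass_v())
                printf("pri ring started (command word 0x%x)\n", command);
        return 0;
}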
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h
new file mode 100644
index 00000000..db16a8de
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h
@@ -0,0 +1,226 @@
1/*
2 * drivers/video/tegra/host/gk20a/hw_pri_ringstation_fbp_gk20a.h
3 *
4 * Copyright (c) 2012-2013, NVIDIA Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 /*
21 * Function naming determines intended use:
22 *
23 * <x>_r(void) : Returns the offset for register <x>.
24 *
25 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
26 *
27 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
28 *
29 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
30 * and masked to place it at field <y> of register <x>. This value
31 * can be |'d with others to produce a full register value for
32 * register <x>.
33 *
34 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
35 * value can be ~'d and then &'d to clear the value of field <y> for
36 * register <x>.
37 *
38 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
39 * to place it at field <y> of register <x>. This value can be |'d
40 * with others to produce a full register value for <x>.
41 *
42 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
43 * <x> value 'r' after being shifted to place its LSB at bit 0.
44 * This value is suitable for direct comparison with other unshifted
45 * values appropriate for use in field <y> of register <x>.
46 *
47 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
48 * field <y> of register <x>. This value is suitable for direct
49 * comparison with unshifted values appropriate for use in field <y>
50 * of register <x>.
51 */
52
53#ifndef __hw_pri_ringstation_fbp_gk20a_h__
54#define __hw_pri_ringstation_fbp_gk20a_h__
 55/* This file is autogenerated. Do not edit. */
56
57static inline u32 pri_ringstation_fbp_master_config_r(u32 i)
58{
59 return 0x00124300+((i)*4);
60}
61static inline u32 pri_ringstation_fbp_master_config__size_1_v(void)
62{
63 return 64;
64}
65static inline u32 pri_ringstation_fbp_master_config_timeout_s(void)
66{
67 return 18;
68}
69static inline u32 pri_ringstation_fbp_master_config_timeout_f(u32 v)
70{
71 return (v & 0x3ffff) << 0;
72}
73static inline u32 pri_ringstation_fbp_master_config_timeout_m(void)
74{
75 return 0x3ffff << 0;
76}
77static inline u32 pri_ringstation_fbp_master_config_timeout_v(u32 r)
78{
79 return (r >> 0) & 0x3ffff;
80}
81static inline u32 pri_ringstation_fbp_master_config_timeout_i_v(void)
82{
83 return 0x00000064;
84}
85static inline u32 pri_ringstation_fbp_master_config_timeout_i_f(void)
86{
87 return 0x64;
88}
89static inline u32 pri_ringstation_fbp_master_config_fs_action_s(void)
90{
91 return 1;
92}
93static inline u32 pri_ringstation_fbp_master_config_fs_action_f(u32 v)
94{
95 return (v & 0x1) << 30;
96}
97static inline u32 pri_ringstation_fbp_master_config_fs_action_m(void)
98{
99 return 0x1 << 30;
100}
101static inline u32 pri_ringstation_fbp_master_config_fs_action_v(u32 r)
102{
103 return (r >> 30) & 0x1;
104}
105static inline u32 pri_ringstation_fbp_master_config_fs_action_error_v(void)
106{
107 return 0x00000000;
108}
109static inline u32 pri_ringstation_fbp_master_config_fs_action_error_f(void)
110{
111 return 0x0;
112}
113static inline u32 pri_ringstation_fbp_master_config_fs_action_soldier_on_v(void)
114{
115 return 0x00000001;
116}
117static inline u32 pri_ringstation_fbp_master_config_fs_action_soldier_on_f(void)
118{
119 return 0x40000000;
120}
121static inline u32 pri_ringstation_fbp_master_config_reset_action_s(void)
122{
123 return 1;
124}
125static inline u32 pri_ringstation_fbp_master_config_reset_action_f(u32 v)
126{
127 return (v & 0x1) << 31;
128}
129static inline u32 pri_ringstation_fbp_master_config_reset_action_m(void)
130{
131 return 0x1 << 31;
132}
133static inline u32 pri_ringstation_fbp_master_config_reset_action_v(u32 r)
134{
135 return (r >> 31) & 0x1;
136}
137static inline u32 pri_ringstation_fbp_master_config_reset_action_error_v(void)
138{
139 return 0x00000000;
140}
141static inline u32 pri_ringstation_fbp_master_config_reset_action_error_f(void)
142{
143 return 0x0;
144}
145static inline u32 pri_ringstation_fbp_master_config_reset_action_soldier_on_v(void)
146{
147 return 0x00000001;
148}
149static inline u32 pri_ringstation_fbp_master_config_reset_action_soldier_on_f(void)
150{
151 return 0x80000000;
152}
153static inline u32 pri_ringstation_fbp_master_config_setup_clocks_s(void)
154{
155 return 3;
156}
157static inline u32 pri_ringstation_fbp_master_config_setup_clocks_f(u32 v)
158{
159 return (v & 0x7) << 20;
160}
161static inline u32 pri_ringstation_fbp_master_config_setup_clocks_m(void)
162{
163 return 0x7 << 20;
164}
165static inline u32 pri_ringstation_fbp_master_config_setup_clocks_v(u32 r)
166{
167 return (r >> 20) & 0x7;
168}
169static inline u32 pri_ringstation_fbp_master_config_setup_clocks_i_v(void)
170{
171 return 0x00000000;
172}
173static inline u32 pri_ringstation_fbp_master_config_setup_clocks_i_f(void)
174{
175 return 0x0;
176}
177static inline u32 pri_ringstation_fbp_master_config_wait_clocks_s(void)
178{
179 return 3;
180}
181static inline u32 pri_ringstation_fbp_master_config_wait_clocks_f(u32 v)
182{
183 return (v & 0x7) << 24;
184}
185static inline u32 pri_ringstation_fbp_master_config_wait_clocks_m(void)
186{
187 return 0x7 << 24;
188}
189static inline u32 pri_ringstation_fbp_master_config_wait_clocks_v(u32 r)
190{
191 return (r >> 24) & 0x7;
192}
193static inline u32 pri_ringstation_fbp_master_config_wait_clocks_i_v(void)
194{
195 return 0x00000000;
196}
197static inline u32 pri_ringstation_fbp_master_config_wait_clocks_i_f(void)
198{
199 return 0x0;
200}
201static inline u32 pri_ringstation_fbp_master_config_hold_clocks_s(void)
202{
203 return 3;
204}
205static inline u32 pri_ringstation_fbp_master_config_hold_clocks_f(u32 v)
206{
207 return (v & 0x7) << 27;
208}
209static inline u32 pri_ringstation_fbp_master_config_hold_clocks_m(void)
210{
211 return 0x7 << 27;
212}
213static inline u32 pri_ringstation_fbp_master_config_hold_clocks_v(u32 r)
214{
215 return (r >> 27) & 0x7;
216}
217static inline u32 pri_ringstation_fbp_master_config_hold_clocks_i_v(void)
218{
219 return 0x00000000;
220}
221static inline u32 pri_ringstation_fbp_master_config_hold_clocks_i_f(void)
222{
223 return 0x0;
224}
225
226#endif /* __hw_pri_ringstation_fbp_gk20a_h__ */
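The _m()/_f() pair is what the header comments describe for read-modify-write: clear a field with the complement of its mask, then OR in a new shifted value. A standalone sketch using the FBP ringstation timeout field; the 0xdeadbeef starting value and the 0x1000 timeout are arbitrary illustrative numbers.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Copied from hw_pri_ringstation_fbp_gk20a.h above, condensed. */
static inline u32 pri_ringstation_fbp_master_config_timeout_m(void)   { return 0x3ffff << 0; }
static inline u32 pri_ringstation_fbp_master_config_timeout_f(u32 v)  { return (v & 0x3ffff) << 0; }
static inline u32 pri_ringstation_fbp_master_config_timeout_i_f(void) { return 0x64; }

int main(void)
{
        u32 config = 0xdeadbeef;  /* stand-in for a register read */

        /* Clear the timeout field with ~_m(), then OR in a new value via _f(). */
        config = (config & ~pri_ringstation_fbp_master_config_timeout_m()) |
                 pri_ringstation_fbp_master_config_timeout_f(0x1000);

        printf("new config 0x%08x (reset default timeout is 0x%x)\n",
               config, pri_ringstation_fbp_master_config_timeout_i_f());
        return 0;
}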
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h
new file mode 100644
index 00000000..e8aad933
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h
@@ -0,0 +1,226 @@
1/*
2 * drivers/video/tegra/host/gk20a/hw_pri_ringstation_gpc_gk20a.h
3 *
4 * Copyright (c) 2012-2013, NVIDIA Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 /*
21 * Function naming determines intended use:
22 *
23 * <x>_r(void) : Returns the offset for register <x>.
24 *
25 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
26 *
27 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
28 *
29 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
30 * and masked to place it at field <y> of register <x>. This value
31 * can be |'d with others to produce a full register value for
32 * register <x>.
33 *
34 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
35 * value can be ~'d and then &'d to clear the value of field <y> for
36 * register <x>.
37 *
38 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
39 * to place it at field <y> of register <x>. This value can be |'d
40 * with others to produce a full register value for <x>.
41 *
42 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
43 * <x> value 'r' after being shifted to place its LSB at bit 0.
44 * This value is suitable for direct comparison with other unshifted
45 * values appropriate for use in field <y> of register <x>.
46 *
47 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
48 * field <y> of register <x>. This value is suitable for direct
49 * comparison with unshifted values appropriate for use in field <y>
50 * of register <x>.
51 */
52
53#ifndef __hw_pri_ringstation_gpc_gk20a_h__
54#define __hw_pri_ringstation_gpc_gk20a_h__
 55/* This file is autogenerated. Do not edit. */
56
57static inline u32 pri_ringstation_gpc_master_config_r(u32 i)
58{
59 return 0x00128300+((i)*4);
60}
61static inline u32 pri_ringstation_gpc_master_config__size_1_v(void)
62{
63 return 64;
64}
65static inline u32 pri_ringstation_gpc_master_config_timeout_s(void)
66{
67 return 18;
68}
69static inline u32 pri_ringstation_gpc_master_config_timeout_f(u32 v)
70{
71 return (v & 0x3ffff) << 0;
72}
73static inline u32 pri_ringstation_gpc_master_config_timeout_m(void)
74{
75 return 0x3ffff << 0;
76}
77static inline u32 pri_ringstation_gpc_master_config_timeout_v(u32 r)
78{
79 return (r >> 0) & 0x3ffff;
80}
81static inline u32 pri_ringstation_gpc_master_config_timeout_i_v(void)
82{
83 return 0x00000064;
84}
85static inline u32 pri_ringstation_gpc_master_config_timeout_i_f(void)
86{
87 return 0x64;
88}
89static inline u32 pri_ringstation_gpc_master_config_fs_action_s(void)
90{
91 return 1;
92}
93static inline u32 pri_ringstation_gpc_master_config_fs_action_f(u32 v)
94{
95 return (v & 0x1) << 30;
96}
97static inline u32 pri_ringstation_gpc_master_config_fs_action_m(void)
98{
99 return 0x1 << 30;
100}
101static inline u32 pri_ringstation_gpc_master_config_fs_action_v(u32 r)
102{
103 return (r >> 30) & 0x1;
104}
105static inline u32 pri_ringstation_gpc_master_config_fs_action_error_v(void)
106{
107 return 0x00000000;
108}
109static inline u32 pri_ringstation_gpc_master_config_fs_action_error_f(void)
110{
111 return 0x0;
112}
113static inline u32 pri_ringstation_gpc_master_config_fs_action_soldier_on_v(void)
114{
115 return 0x00000001;
116}
117static inline u32 pri_ringstation_gpc_master_config_fs_action_soldier_on_f(void)
118{
119 return 0x40000000;
120}
121static inline u32 pri_ringstation_gpc_master_config_reset_action_s(void)
122{
123 return 1;
124}
125static inline u32 pri_ringstation_gpc_master_config_reset_action_f(u32 v)
126{
127 return (v & 0x1) << 31;
128}
129static inline u32 pri_ringstation_gpc_master_config_reset_action_m(void)
130{
131 return 0x1 << 31;
132}
133static inline u32 pri_ringstation_gpc_master_config_reset_action_v(u32 r)
134{
135 return (r >> 31) & 0x1;
136}
137static inline u32 pri_ringstation_gpc_master_config_reset_action_error_v(void)
138{
139 return 0x00000000;
140}
141static inline u32 pri_ringstation_gpc_master_config_reset_action_error_f(void)
142{
143 return 0x0;
144}
145static inline u32 pri_ringstation_gpc_master_config_reset_action_soldier_on_v(void)
146{
147 return 0x00000001;
148}
149static inline u32 pri_ringstation_gpc_master_config_reset_action_soldier_on_f(void)
150{
151 return 0x80000000;
152}
153static inline u32 pri_ringstation_gpc_master_config_setup_clocks_s(void)
154{
155 return 3;
156}
157static inline u32 pri_ringstation_gpc_master_config_setup_clocks_f(u32 v)
158{
159 return (v & 0x7) << 20;
160}
161static inline u32 pri_ringstation_gpc_master_config_setup_clocks_m(void)
162{
163 return 0x7 << 20;
164}
165static inline u32 pri_ringstation_gpc_master_config_setup_clocks_v(u32 r)
166{
167 return (r >> 20) & 0x7;
168}
169static inline u32 pri_ringstation_gpc_master_config_setup_clocks_i_v(void)
170{
171 return 0x00000000;
172}
173static inline u32 pri_ringstation_gpc_master_config_setup_clocks_i_f(void)
174{
175 return 0x0;
176}
177static inline u32 pri_ringstation_gpc_master_config_wait_clocks_s(void)
178{
179 return 3;
180}
181static inline u32 pri_ringstation_gpc_master_config_wait_clocks_f(u32 v)
182{
183 return (v & 0x7) << 24;
184}
185static inline u32 pri_ringstation_gpc_master_config_wait_clocks_m(void)
186{
187 return 0x7 << 24;
188}
189static inline u32 pri_ringstation_gpc_master_config_wait_clocks_v(u32 r)
190{
191 return (r >> 24) & 0x7;
192}
193static inline u32 pri_ringstation_gpc_master_config_wait_clocks_i_v(void)
194{
195 return 0x00000000;
196}
197static inline u32 pri_ringstation_gpc_master_config_wait_clocks_i_f(void)
198{
199 return 0x0;
200}
201static inline u32 pri_ringstation_gpc_master_config_hold_clocks_s(void)
202{
203 return 3;
204}
205static inline u32 pri_ringstation_gpc_master_config_hold_clocks_f(u32 v)
206{
207 return (v & 0x7) << 27;
208}
209static inline u32 pri_ringstation_gpc_master_config_hold_clocks_m(void)
210{
211 return 0x7 << 27;
212}
213static inline u32 pri_ringstation_gpc_master_config_hold_clocks_v(u32 r)
214{
215 return (r >> 27) & 0x7;
216}
217static inline u32 pri_ringstation_gpc_master_config_hold_clocks_i_v(void)
218{
219 return 0x00000000;
220}
221static inline u32 pri_ringstation_gpc_master_config_hold_clocks_i_f(void)
222{
223 return 0x0;
224}
225
226#endif /* __hw_pri_ringstation_gpc_gk20a_h__ */
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h
new file mode 100644
index 00000000..c281dd54
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h
@@ -0,0 +1,69 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_pri_ringstation_sys_gk20a_h_
51#define _hw_pri_ringstation_sys_gk20a_h_
52
53static inline u32 pri_ringstation_sys_master_config_r(u32 i)
54{
55 return 0x00122300 + i*4;
56}
57static inline u32 pri_ringstation_sys_decode_config_r(void)
58{
59 return 0x00122204;
60}
61static inline u32 pri_ringstation_sys_decode_config_ring_m(void)
62{
63 return 0x7 << 0;
64}
65static inline u32 pri_ringstation_sys_decode_config_ring_drop_on_ring_not_started_f(void)
66{
67 return 0x1;
68}
69#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h
new file mode 100644
index 00000000..93c55c30
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h
@@ -0,0 +1,141 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_proj_gk20a_h_
51#define _hw_proj_gk20a_h_
52
53static inline u32 proj_gpc_base_v(void)
54{
55 return 0x00500000;
56}
57static inline u32 proj_gpc_shared_base_v(void)
58{
59 return 0x00418000;
60}
61static inline u32 proj_gpc_stride_v(void)
62{
63 return 0x00008000;
64}
65static inline u32 proj_ltc_stride_v(void)
66{
67 return 0x00002000;
68}
69static inline u32 proj_lts_stride_v(void)
70{
71 return 0x00000400;
72}
73static inline u32 proj_ppc_in_gpc_base_v(void)
74{
75 return 0x00003000;
76}
77static inline u32 proj_ppc_in_gpc_stride_v(void)
78{
79 return 0x00000200;
80}
81static inline u32 proj_rop_base_v(void)
82{
83 return 0x00410000;
84}
85static inline u32 proj_rop_shared_base_v(void)
86{
87 return 0x00408800;
88}
89static inline u32 proj_rop_stride_v(void)
90{
91 return 0x00000400;
92}
93static inline u32 proj_tpc_in_gpc_base_v(void)
94{
95 return 0x00004000;
96}
97static inline u32 proj_tpc_in_gpc_stride_v(void)
98{
99 return 0x00000800;
100}
101static inline u32 proj_tpc_in_gpc_shared_base_v(void)
102{
103 return 0x00001800;
104}
105static inline u32 proj_host_num_pbdma_v(void)
106{
107 return 0x00000001;
108}
109static inline u32 proj_scal_litter_num_tpc_per_gpc_v(void)
110{
111 return 0x00000001;
112}
113static inline u32 proj_scal_litter_num_fbps_v(void)
114{
115 return 0x00000001;
116}
117static inline u32 proj_scal_litter_num_gpcs_v(void)
118{
119 return 0x00000001;
120}
121static inline u32 proj_scal_litter_num_pes_per_gpc_v(void)
122{
123 return 0x00000001;
124}
125static inline u32 proj_scal_litter_num_tpcs_per_pes_v(void)
126{
127 return 0x00000001;
128}
129static inline u32 proj_scal_litter_num_zcull_banks_v(void)
130{
131 return 0x00000004;
132}
133static inline u32 proj_scal_max_gpcs_v(void)
134{
135 return 0x00000020;
136}
137static inline u32 proj_scal_max_tpc_per_gpc_v(void)
138{
139 return 0x00000008;
140}
141#endif
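hw_proj_gk20a.h collects chip layout constants rather than register fields: unit base addresses, per-unit strides and litter counts. One common use is computing the unicast address of a register defined relative to GPC0/TPC0 by adding per-GPC and per-TPC strides. The sketch below shows that arithmetic with a hypothetical tpc_reg_addr() helper; gk20a has a single GPC with a single TPC, so only index (0, 0) is real.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Copied from hw_proj_gk20a.h above, condensed. */
static inline u32 proj_gpc_base_v(void)          { return 0x00500000; }
static inline u32 proj_gpc_stride_v(void)        { return 0x00008000; }
static inline u32 proj_tpc_in_gpc_base_v(void)   { return 0x00004000; }
static inline u32 proj_tpc_in_gpc_stride_v(void) { return 0x00000800; }

/* Hypothetical helper: per-unit address of a register defined for GPC0/TPC0. */
static u32 tpc_reg_addr(u32 reg, u32 gpc, u32 tpc)
{
        return reg + gpc * proj_gpc_stride_v() +
               tpc * proj_tpc_in_gpc_stride_v();
}

int main(void)
{
        /* Placeholder register at gpc_base + tpc_in_gpc_base (0x00504000). */
        u32 reg = proj_gpc_base_v() + proj_tpc_in_gpc_base_v();

        printf("GPC0/TPC0 copy of the register: 0x%08x\n",
               tpc_reg_addr(reg, 0, 0));
        return 0;
}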
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h
new file mode 100644
index 00000000..d7d26b80
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h
@@ -0,0 +1,737 @@
1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_pwr_gk20a_h_
51#define _hw_pwr_gk20a_h_
52
53static inline u32 pwr_falcon_irqsset_r(void)
54{
55 return 0x0010a000;
56}
57static inline u32 pwr_falcon_irqsset_swgen0_set_f(void)
58{
59 return 0x40;
60}
61static inline u32 pwr_falcon_irqsclr_r(void)
62{
63 return 0x0010a004;
64}
65static inline u32 pwr_falcon_irqstat_r(void)
66{
67 return 0x0010a008;
68}
69static inline u32 pwr_falcon_irqstat_halt_true_f(void)
70{
71 return 0x10;
72}
73static inline u32 pwr_falcon_irqstat_exterr_true_f(void)
74{
75 return 0x20;
76}
77static inline u32 pwr_falcon_irqstat_swgen0_true_f(void)
78{
79 return 0x40;
80}
81static inline u32 pwr_falcon_irqmode_r(void)
82{
83 return 0x0010a00c;
84}
85static inline u32 pwr_falcon_irqmset_r(void)
86{
87 return 0x0010a010;
88}
89static inline u32 pwr_falcon_irqmset_gptmr_f(u32 v)
90{
91 return (v & 0x1) << 0;
92}
93static inline u32 pwr_falcon_irqmset_wdtmr_f(u32 v)
94{
95 return (v & 0x1) << 1;
96}
97static inline u32 pwr_falcon_irqmset_mthd_f(u32 v)
98{
99 return (v & 0x1) << 2;
100}
101static inline u32 pwr_falcon_irqmset_ctxsw_f(u32 v)
102{
103 return (v & 0x1) << 3;
104}
105static inline u32 pwr_falcon_irqmset_halt_f(u32 v)
106{
107 return (v & 0x1) << 4;
108}
109static inline u32 pwr_falcon_irqmset_exterr_f(u32 v)
110{
111 return (v & 0x1) << 5;
112}
113static inline u32 pwr_falcon_irqmset_swgen0_f(u32 v)
114{
115 return (v & 0x1) << 6;
116}
117static inline u32 pwr_falcon_irqmset_swgen1_f(u32 v)
118{
119 return (v & 0x1) << 7;
120}
121static inline u32 pwr_falcon_irqmclr_r(void)
122{
123 return 0x0010a014;
124}
125static inline u32 pwr_falcon_irqmclr_gptmr_f(u32 v)
126{
127 return (v & 0x1) << 0;
128}
129static inline u32 pwr_falcon_irqmclr_wdtmr_f(u32 v)
130{
131 return (v & 0x1) << 1;
132}
133static inline u32 pwr_falcon_irqmclr_mthd_f(u32 v)
134{
135 return (v & 0x1) << 2;
136}
137static inline u32 pwr_falcon_irqmclr_ctxsw_f(u32 v)
138{
139 return (v & 0x1) << 3;
140}
141static inline u32 pwr_falcon_irqmclr_halt_f(u32 v)
142{
143 return (v & 0x1) << 4;
144}
145static inline u32 pwr_falcon_irqmclr_exterr_f(u32 v)
146{
147 return (v & 0x1) << 5;
148}
149static inline u32 pwr_falcon_irqmclr_swgen0_f(u32 v)
150{
151 return (v & 0x1) << 6;
152}
153static inline u32 pwr_falcon_irqmclr_swgen1_f(u32 v)
154{
155 return (v & 0x1) << 7;
156}
157static inline u32 pwr_falcon_irqmclr_ext_f(u32 v)
158{
159 return (v & 0xff) << 8;
160}
161static inline u32 pwr_falcon_irqmask_r(void)
162{
163 return 0x0010a018;
164}
165static inline u32 pwr_falcon_irqdest_r(void)
166{
167 return 0x0010a01c;
168}
169static inline u32 pwr_falcon_irqdest_host_gptmr_f(u32 v)
170{
171 return (v & 0x1) << 0;
172}
173static inline u32 pwr_falcon_irqdest_host_wdtmr_f(u32 v)
174{
175 return (v & 0x1) << 1;
176}
177static inline u32 pwr_falcon_irqdest_host_mthd_f(u32 v)
178{
179 return (v & 0x1) << 2;
180}
181static inline u32 pwr_falcon_irqdest_host_ctxsw_f(u32 v)
182{
183 return (v & 0x1) << 3;
184}
185static inline u32 pwr_falcon_irqdest_host_halt_f(u32 v)
186{
187 return (v & 0x1) << 4;
188}
189static inline u32 pwr_falcon_irqdest_host_exterr_f(u32 v)
190{
191 return (v & 0x1) << 5;
192}
193static inline u32 pwr_falcon_irqdest_host_swgen0_f(u32 v)
194{
195 return (v & 0x1) << 6;
196}
197static inline u32 pwr_falcon_irqdest_host_swgen1_f(u32 v)
198{
199 return (v & 0x1) << 7;
200}
201static inline u32 pwr_falcon_irqdest_host_ext_f(u32 v)
202{
203 return (v & 0xff) << 8;
204}
205static inline u32 pwr_falcon_irqdest_target_gptmr_f(u32 v)
206{
207 return (v & 0x1) << 16;
208}
209static inline u32 pwr_falcon_irqdest_target_wdtmr_f(u32 v)
210{
211 return (v & 0x1) << 17;
212}
213static inline u32 pwr_falcon_irqdest_target_mthd_f(u32 v)
214{
215 return (v & 0x1) << 18;
216}
217static inline u32 pwr_falcon_irqdest_target_ctxsw_f(u32 v)
218{
219 return (v & 0x1) << 19;
220}
221static inline u32 pwr_falcon_irqdest_target_halt_f(u32 v)
222{
223 return (v & 0x1) << 20;
224}
225static inline u32 pwr_falcon_irqdest_target_exterr_f(u32 v)
226{
227 return (v & 0x1) << 21;
228}
229static inline u32 pwr_falcon_irqdest_target_swgen0_f(u32 v)
230{
231 return (v & 0x1) << 22;
232}
233static inline u32 pwr_falcon_irqdest_target_swgen1_f(u32 v)
234{
235 return (v & 0x1) << 23;
236}
237static inline u32 pwr_falcon_irqdest_target_ext_f(u32 v)
238{
239 return (v & 0xff) << 24;
240}
241static inline u32 pwr_falcon_curctx_r(void)
242{
243 return 0x0010a050;
244}
245static inline u32 pwr_falcon_nxtctx_r(void)
246{
247 return 0x0010a054;
248}
249static inline u32 pwr_falcon_mailbox0_r(void)
250{
251 return 0x0010a040;
252}
253static inline u32 pwr_falcon_mailbox1_r(void)
254{
255 return 0x0010a044;
256}
257static inline u32 pwr_falcon_itfen_r(void)
258{
259 return 0x0010a048;
260}
261static inline u32 pwr_falcon_itfen_ctxen_enable_f(void)
262{
263 return 0x1;
264}
265static inline u32 pwr_falcon_idlestate_r(void)
266{
267 return 0x0010a04c;
268}
269static inline u32 pwr_falcon_idlestate_falcon_busy_v(u32 r)
270{
271 return (r >> 0) & 0x1;
272}
273static inline u32 pwr_falcon_idlestate_ext_busy_v(u32 r)
274{
275 return (r >> 1) & 0x7fff;
276}
277static inline u32 pwr_falcon_os_r(void)
278{
279 return 0x0010a080;
280}
281static inline u32 pwr_falcon_engctl_r(void)
282{
283 return 0x0010a0a4;
284}
285static inline u32 pwr_falcon_cpuctl_r(void)
286{
287 return 0x0010a100;
288}
289static inline u32 pwr_falcon_cpuctl_startcpu_f(u32 v)
290{
291 return (v & 0x1) << 1;
292}
293static inline u32 pwr_falcon_bootvec_r(void)
294{
295 return 0x0010a104;
296}
297static inline u32 pwr_falcon_bootvec_vec_f(u32 v)
298{
299 return (v & 0xffffffff) << 0;
300}
301static inline u32 pwr_falcon_dmactl_r(void)
302{
303 return 0x0010a10c;
304}
305static inline u32 pwr_falcon_dmactl_dmem_scrubbing_m(void)
306{
307 return 0x1 << 1;
308}
309static inline u32 pwr_falcon_dmactl_imem_scrubbing_m(void)
310{
311 return 0x1 << 2;
312}
313static inline u32 pwr_falcon_hwcfg_r(void)
314{
315 return 0x0010a108;
316}
317static inline u32 pwr_falcon_hwcfg_imem_size_v(u32 r)
318{
319 return (r >> 0) & 0x1ff;
320}
321static inline u32 pwr_falcon_hwcfg_dmem_size_v(u32 r)
322{
323 return (r >> 9) & 0x1ff;
324}
325static inline u32 pwr_falcon_dmatrfbase_r(void)
326{
327 return 0x0010a110;
328}
329static inline u32 pwr_falcon_dmatrfmoffs_r(void)
330{
331 return 0x0010a114;
332}
333static inline u32 pwr_falcon_dmatrfcmd_r(void)
334{
335 return 0x0010a118;
336}
337static inline u32 pwr_falcon_dmatrfcmd_imem_f(u32 v)
338{
339 return (v & 0x1) << 4;
340}
341static inline u32 pwr_falcon_dmatrfcmd_write_f(u32 v)
342{
343 return (v & 0x1) << 5;
344}
345static inline u32 pwr_falcon_dmatrfcmd_size_f(u32 v)
346{
347 return (v & 0x7) << 8;
348}
349static inline u32 pwr_falcon_dmatrfcmd_ctxdma_f(u32 v)
350{
351 return (v & 0x7) << 12;
352}
353static inline u32 pwr_falcon_dmatrffboffs_r(void)
354{
355 return 0x0010a11c;
356}
357static inline u32 pwr_falcon_exterraddr_r(void)
358{
359 return 0x0010a168;
360}
361static inline u32 pwr_falcon_exterrstat_r(void)
362{
363 return 0x0010a16c;
364}
365static inline u32 pwr_falcon_exterrstat_valid_m(void)
366{
367 return 0x1 << 31;
368}
369static inline u32 pwr_falcon_exterrstat_valid_v(u32 r)
370{
371 return (r >> 31) & 0x1;
372}
373static inline u32 pwr_falcon_exterrstat_valid_true_v(void)
374{
375 return 0x00000001;
376}
377static inline u32 pwr_pmu_falcon_icd_cmd_r(void)
378{
379 return 0x0010a200;
380}
381static inline u32 pwr_pmu_falcon_icd_cmd_opc_s(void)
382{
383 return 4;
384}
385static inline u32 pwr_pmu_falcon_icd_cmd_opc_f(u32 v)
386{
387 return (v & 0xf) << 0;
388}
389static inline u32 pwr_pmu_falcon_icd_cmd_opc_m(void)
390{
391 return 0xf << 0;
392}
393static inline u32 pwr_pmu_falcon_icd_cmd_opc_v(u32 r)
394{
395 return (r >> 0) & 0xf;
396}
397static inline u32 pwr_pmu_falcon_icd_cmd_opc_rreg_f(void)
398{
399 return 0x8;
400}
401static inline u32 pwr_pmu_falcon_icd_cmd_opc_rstat_f(void)
402{
403 return 0xe;
404}
405static inline u32 pwr_pmu_falcon_icd_cmd_idx_f(u32 v)
406{
407 return (v & 0x1f) << 8;
408}
409static inline u32 pwr_pmu_falcon_icd_rdata_r(void)
410{
411 return 0x0010a20c;
412}
413static inline u32 pwr_falcon_dmemc_r(u32 i)
414{
415 return 0x0010a1c0 + i*8;
416}
417static inline u32 pwr_falcon_dmemc_offs_f(u32 v)
418{
419 return (v & 0x3f) << 2;
420}
421static inline u32 pwr_falcon_dmemc_offs_m(void)
422{
423 return 0x3f << 2;
424}
425static inline u32 pwr_falcon_dmemc_blk_f(u32 v)
426{
427 return (v & 0xff) << 8;
428}
429static inline u32 pwr_falcon_dmemc_blk_m(void)
430{
431 return 0xff << 8;
432}
433static inline u32 pwr_falcon_dmemc_aincw_f(u32 v)
434{
435 return (v & 0x1) << 24;
436}
437static inline u32 pwr_falcon_dmemc_aincr_f(u32 v)
438{
439 return (v & 0x1) << 25;
440}
441static inline u32 pwr_falcon_dmemd_r(u32 i)
442{
443 return 0x0010a1c4 + i*8;
444}
445static inline u32 pwr_pmu_new_instblk_r(void)
446{
447 return 0x0010a480;
448}
449static inline u32 pwr_pmu_new_instblk_ptr_f(u32 v)
450{
451 return (v & 0xfffffff) << 0;
452}
453static inline u32 pwr_pmu_new_instblk_target_fb_f(void)
454{
455 return 0x0;
456}
457static inline u32 pwr_pmu_new_instblk_target_sys_coh_f(void)
458{
459 return 0x20000000;
460}
461static inline u32 pwr_pmu_new_instblk_valid_f(u32 v)
462{
463 return (v & 0x1) << 30;
464}
465static inline u32 pwr_pmu_mutex_id_r(void)
466{
467 return 0x0010a488;
468}
469static inline u32 pwr_pmu_mutex_id_value_v(u32 r)
470{
471 return (r >> 0) & 0xff;
472}
473static inline u32 pwr_pmu_mutex_id_value_init_v(void)
474{
475 return 0x00000000;
476}
477static inline u32 pwr_pmu_mutex_id_value_not_avail_v(void)
478{
479 return 0x000000ff;
480}
481static inline u32 pwr_pmu_mutex_id_release_r(void)
482{
483 return 0x0010a48c;
484}
485static inline u32 pwr_pmu_mutex_id_release_value_f(u32 v)
486{
487 return (v & 0xff) << 0;
488}
489static inline u32 pwr_pmu_mutex_id_release_value_m(void)
490{
491 return 0xff << 0;
492}
493static inline u32 pwr_pmu_mutex_id_release_value_init_v(void)
494{
495 return 0x00000000;
496}
497static inline u32 pwr_pmu_mutex_id_release_value_init_f(void)
498{
499 return 0x0;
500}
501static inline u32 pwr_pmu_mutex_r(u32 i)
502{
503 return 0x0010a580 + i*4;
504}
505static inline u32 pwr_pmu_mutex__size_1_v(void)
506{
507 return 0x00000010;
508}
509static inline u32 pwr_pmu_mutex_value_f(u32 v)
510{
511 return (v & 0xff) << 0;
512}
513static inline u32 pwr_pmu_mutex_value_v(u32 r)
514{
515 return (r >> 0) & 0xff;
516}
517static inline u32 pwr_pmu_mutex_value_initial_lock_f(void)
518{
519 return 0x0;
520}
521static inline u32 pwr_pmu_queue_head_r(u32 i)
522{
523 return 0x0010a4a0 + i*4;
524}
525static inline u32 pwr_pmu_queue_head__size_1_v(void)
526{
527 return 0x00000004;
528}
529static inline u32 pwr_pmu_queue_head_address_f(u32 v)
530{
531 return (v & 0xffffffff) << 0;
532}
533static inline u32 pwr_pmu_queue_head_address_v(u32 r)
534{
535 return (r >> 0) & 0xffffffff;
536}
537static inline u32 pwr_pmu_queue_tail_r(u32 i)
538{
539 return 0x0010a4b0 + i*4;
540}
541static inline u32 pwr_pmu_queue_tail__size_1_v(void)
542{
543 return 0x00000004;
544}
545static inline u32 pwr_pmu_queue_tail_address_f(u32 v)
546{
547 return (v & 0xffffffff) << 0;
548}
549static inline u32 pwr_pmu_queue_tail_address_v(u32 r)
550{
551 return (r >> 0) & 0xffffffff;
552}
553static inline u32 pwr_pmu_msgq_head_r(void)
554{
555 return 0x0010a4c8;
556}
557static inline u32 pwr_pmu_msgq_head_val_f(u32 v)
558{
559 return (v & 0xffffffff) << 0;
560}
561static inline u32 pwr_pmu_msgq_head_val_v(u32 r)
562{
563 return (r >> 0) & 0xffffffff;
564}
565static inline u32 pwr_pmu_msgq_tail_r(void)
566{
567 return 0x0010a4cc;
568}
569static inline u32 pwr_pmu_msgq_tail_val_f(u32 v)
570{
571 return (v & 0xffffffff) << 0;
572}
573static inline u32 pwr_pmu_msgq_tail_val_v(u32 r)
574{
575 return (r >> 0) & 0xffffffff;
576}
577static inline u32 pwr_pmu_idle_mask_r(u32 i)
578{
579 return 0x0010a504 + i*16;
580}
581static inline u32 pwr_pmu_idle_mask_gr_enabled_f(void)
582{
583 return 0x1;
584}
585static inline u32 pwr_pmu_idle_mask_ce_2_enabled_f(void)
586{
587 return 0x200000;
588}
589static inline u32 pwr_pmu_idle_count_r(u32 i)
590{
591 return 0x0010a508 + i*16;
592}
593static inline u32 pwr_pmu_idle_count_value_f(u32 v)
594{
595 return (v & 0x7fffffff) << 0;
596}
597static inline u32 pwr_pmu_idle_count_value_v(u32 r)
598{
599 return (r >> 0) & 0x7fffffff;
600}
601static inline u32 pwr_pmu_idle_count_reset_f(u32 v)
602{
603 return (v & 0x1) << 31;
604}
605static inline u32 pwr_pmu_idle_ctrl_r(u32 i)
606{
607 return 0x0010a50c + i*16;
608}
609static inline u32 pwr_pmu_idle_ctrl_value_m(void)
610{
611 return 0x3 << 0;
612}
613static inline u32 pwr_pmu_idle_ctrl_value_busy_f(void)
614{
615 return 0x2;
616}
617static inline u32 pwr_pmu_idle_ctrl_value_always_f(void)
618{
619 return 0x3;
620}
621static inline u32 pwr_pmu_idle_ctrl_filter_m(void)
622{
623 return 0x1 << 2;
624}
625static inline u32 pwr_pmu_idle_ctrl_filter_disabled_f(void)
626{
627 return 0x0;
628}
629static inline u32 pwr_pmu_idle_mask_supp_r(u32 i)
630{
631 return 0x0010a9f0 + i*8;
632}
633static inline u32 pwr_pmu_idle_mask_1_supp_r(u32 i)
634{
635 return 0x0010a9f4 + i*8;
636}
637static inline u32 pwr_pmu_idle_ctrl_supp_r(u32 i)
638{
639 return 0x0010aa30 + i*8;
640}
641static inline u32 pwr_pmu_debug_r(u32 i)
642{
643 return 0x0010a5c0 + i*4;
644}
645static inline u32 pwr_pmu_debug__size_1_v(void)
646{
647 return 0x00000004;
648}
649static inline u32 pwr_pmu_mailbox_r(u32 i)
650{
651 return 0x0010a450 + i*4;
652}
653static inline u32 pwr_pmu_mailbox__size_1_v(void)
654{
655 return 0x0000000c;
656}
657static inline u32 pwr_pmu_bar0_addr_r(void)
658{
659 return 0x0010a7a0;
660}
661static inline u32 pwr_pmu_bar0_data_r(void)
662{
663 return 0x0010a7a4;
664}
665static inline u32 pwr_pmu_bar0_ctl_r(void)
666{
667 return 0x0010a7ac;
668}
669static inline u32 pwr_pmu_bar0_timeout_r(void)
670{
671 return 0x0010a7a8;
672}
673static inline u32 pwr_pmu_bar0_fecs_error_r(void)
674{
675 return 0x0010a988;
676}
677static inline u32 pwr_pmu_bar0_error_status_r(void)
678{
679 return 0x0010a7b0;
680}
681static inline u32 pwr_pmu_pg_idlefilth_r(u32 i)
682{
683 return 0x0010a6c0 + i*4;
684}
685static inline u32 pwr_pmu_pg_ppuidlefilth_r(u32 i)
686{
687 return 0x0010a6e8 + i*4;
688}
689static inline u32 pwr_pmu_pg_idle_cnt_r(u32 i)
690{
691 return 0x0010a710 + i*4;
692}
693static inline u32 pwr_pmu_pg_intren_r(u32 i)
694{
695 return 0x0010a760 + i*4;
696}
697static inline u32 pwr_fbif_transcfg_r(u32 i)
698{
699 return 0x0010a600 + i*4;
700}
701static inline u32 pwr_fbif_transcfg_target_local_fb_f(void)
702{
703 return 0x0;
704}
705static inline u32 pwr_fbif_transcfg_target_coherent_sysmem_f(void)
706{
707 return 0x1;
708}
709static inline u32 pwr_fbif_transcfg_target_noncoherent_sysmem_f(void)
710{
711 return 0x2;
712}
713static inline u32 pwr_fbif_transcfg_mem_type_s(void)
714{
715 return 1;
716}
717static inline u32 pwr_fbif_transcfg_mem_type_f(u32 v)
718{
719 return (v & 0x1) << 2;
720}
721static inline u32 pwr_fbif_transcfg_mem_type_m(void)
722{
723 return 0x1 << 2;
724}
725static inline u32 pwr_fbif_transcfg_mem_type_v(u32 r)
726{
727 return (r >> 2) & 0x1;
728}
729static inline u32 pwr_fbif_transcfg_mem_type_virtual_f(void)
730{
731 return 0x0;
732}
733static inline u32 pwr_fbif_transcfg_mem_type_physical_f(void)
734{
735 return 0x4;
736}
737#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h
new file mode 100644
index 00000000..7eff3881
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h
@@ -0,0 +1,389 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_ram_gk20a_h_
51#define _hw_ram_gk20a_h_
52
53static inline u32 ram_in_ramfc_s(void)
54{
55 return 4096;
56}
57static inline u32 ram_in_ramfc_w(void)
58{
59 return 0;
60}
61static inline u32 ram_in_page_dir_base_target_f(u32 v)
62{
63 return (v & 0x3) << 0;
64}
65static inline u32 ram_in_page_dir_base_target_w(void)
66{
67 return 128;
68}
69static inline u32 ram_in_page_dir_base_target_vid_mem_f(void)
70{
71 return 0x0;
72}
73static inline u32 ram_in_page_dir_base_vol_w(void)
74{
75 return 128;
76}
77static inline u32 ram_in_page_dir_base_vol_true_f(void)
78{
79 return 0x4;
80}
81static inline u32 ram_in_page_dir_base_lo_f(u32 v)
82{
83 return (v & 0xfffff) << 12;
84}
85static inline u32 ram_in_page_dir_base_lo_w(void)
86{
87 return 128;
88}
89static inline u32 ram_in_page_dir_base_hi_f(u32 v)
90{
91 return (v & 0xff) << 0;
92}
93static inline u32 ram_in_page_dir_base_hi_w(void)
94{
95 return 129;
96}
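/*
 * Illustrative sketch only, not part of the autogenerated register list:
 * one way a caller could combine the _w() word offsets and _f() field
 * builders defined above (per the naming convention documented at the top
 * of this file) when filling the page directory base words of an instance
 * block mapped as a u32 array. The function name and the inst_blk/pdb_addr
 * parameters are hypothetical stand-ins; the driver's real code may use
 * different accessors.
 */
static inline void ram_in_example_set_page_dir_base(u32 *inst_blk,
						    u64 pdb_addr)
{
	/* Word 128: aperture target, volatile bit, address bits 31:12. */
	inst_blk[ram_in_page_dir_base_lo_w()] =
		ram_in_page_dir_base_target_vid_mem_f() |
		ram_in_page_dir_base_vol_true_f() |
		ram_in_page_dir_base_lo_f((u32)(pdb_addr >> 12));
	/* Word 129: address bits 39:32. */
	inst_blk[ram_in_page_dir_base_hi_w()] =
		ram_in_page_dir_base_hi_f((u32)(pdb_addr >> 32));
}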
97static inline u32 ram_in_adr_limit_lo_f(u32 v)
98{
99 return (v & 0xfffff) << 12;
100}
101static inline u32 ram_in_adr_limit_lo_w(void)
102{
103 return 130;
104}
105static inline u32 ram_in_adr_limit_hi_f(u32 v)
106{
107 return (v & 0xff) << 0;
108}
109static inline u32 ram_in_adr_limit_hi_w(void)
110{
111 return 131;
112}
113static inline u32 ram_in_engine_cs_w(void)
114{
115 return 132;
116}
117static inline u32 ram_in_engine_cs_wfi_v(void)
118{
119 return 0x00000000;
120}
121static inline u32 ram_in_engine_cs_wfi_f(void)
122{
123 return 0x0;
124}
125static inline u32 ram_in_engine_cs_fg_v(void)
126{
127 return 0x00000001;
128}
129static inline u32 ram_in_engine_cs_fg_f(void)
130{
131 return 0x8;
132}
133static inline u32 ram_in_gr_cs_w(void)
134{
135 return 132;
136}
137static inline u32 ram_in_gr_cs_wfi_f(void)
138{
139 return 0x0;
140}
141static inline u32 ram_in_gr_wfi_target_w(void)
142{
143 return 132;
144}
145static inline u32 ram_in_gr_wfi_mode_w(void)
146{
147 return 132;
148}
149static inline u32 ram_in_gr_wfi_mode_physical_v(void)
150{
151 return 0x00000000;
152}
153static inline u32 ram_in_gr_wfi_mode_physical_f(void)
154{
155 return 0x0;
156}
157static inline u32 ram_in_gr_wfi_mode_virtual_v(void)
158{
159 return 0x00000001;
160}
161static inline u32 ram_in_gr_wfi_mode_virtual_f(void)
162{
163 return 0x4;
164}
165static inline u32 ram_in_gr_wfi_ptr_lo_f(u32 v)
166{
167 return (v & 0xfffff) << 12;
168}
169static inline u32 ram_in_gr_wfi_ptr_lo_w(void)
170{
171 return 132;
172}
173static inline u32 ram_in_gr_wfi_ptr_hi_f(u32 v)
174{
175 return (v & 0xff) << 0;
176}
177static inline u32 ram_in_gr_wfi_ptr_hi_w(void)
178{
179 return 133;
180}
181static inline u32 ram_in_base_shift_v(void)
182{
183 return 0x0000000c;
184}
185static inline u32 ram_in_alloc_size_v(void)
186{
187 return 0x00001000;
188}
189static inline u32 ram_fc_size_val_v(void)
190{
191 return 0x00000200;
192}
193static inline u32 ram_fc_gp_put_w(void)
194{
195 return 0;
196}
197static inline u32 ram_fc_userd_w(void)
198{
199 return 2;
200}
201static inline u32 ram_fc_userd_hi_w(void)
202{
203 return 3;
204}
205static inline u32 ram_fc_signature_w(void)
206{
207 return 4;
208}
209static inline u32 ram_fc_gp_get_w(void)
210{
211 return 5;
212}
213static inline u32 ram_fc_pb_get_w(void)
214{
215 return 6;
216}
217static inline u32 ram_fc_pb_get_hi_w(void)
218{
219 return 7;
220}
221static inline u32 ram_fc_pb_top_level_get_w(void)
222{
223 return 8;
224}
225static inline u32 ram_fc_pb_top_level_get_hi_w(void)
226{
227 return 9;
228}
229static inline u32 ram_fc_acquire_w(void)
230{
231 return 12;
232}
233static inline u32 ram_fc_semaphorea_w(void)
234{
235 return 14;
236}
237static inline u32 ram_fc_semaphoreb_w(void)
238{
239 return 15;
240}
241static inline u32 ram_fc_semaphorec_w(void)
242{
243 return 16;
244}
245static inline u32 ram_fc_semaphored_w(void)
246{
247 return 17;
248}
249static inline u32 ram_fc_gp_base_w(void)
250{
251 return 18;
252}
253static inline u32 ram_fc_gp_base_hi_w(void)
254{
255 return 19;
256}
257static inline u32 ram_fc_gp_fetch_w(void)
258{
259 return 20;
260}
261static inline u32 ram_fc_pb_fetch_w(void)
262{
263 return 21;
264}
265static inline u32 ram_fc_pb_fetch_hi_w(void)
266{
267 return 22;
268}
269static inline u32 ram_fc_pb_put_w(void)
270{
271 return 23;
272}
273static inline u32 ram_fc_pb_put_hi_w(void)
274{
275 return 24;
276}
277static inline u32 ram_fc_pb_header_w(void)
278{
279 return 33;
280}
281static inline u32 ram_fc_pb_count_w(void)
282{
283 return 34;
284}
285static inline u32 ram_fc_subdevice_w(void)
286{
287 return 37;
288}
289static inline u32 ram_fc_formats_w(void)
290{
291 return 39;
292}
293static inline u32 ram_fc_syncpointa_w(void)
294{
295 return 41;
296}
297static inline u32 ram_fc_syncpointb_w(void)
298{
299 return 42;
300}
301static inline u32 ram_fc_target_w(void)
302{
303 return 43;
304}
305static inline u32 ram_fc_hce_ctrl_w(void)
306{
307 return 57;
308}
309static inline u32 ram_fc_chid_w(void)
310{
311 return 58;
312}
313static inline u32 ram_fc_chid_id_f(u32 v)
314{
315 return (v & 0xfff) << 0;
316}
317static inline u32 ram_fc_chid_id_w(void)
318{
319 return 0;
320}
321static inline u32 ram_fc_eng_timeslice_w(void)
322{
323 return 62;
324}
325static inline u32 ram_fc_pb_timeslice_w(void)
326{
327 return 63;
328}
329static inline u32 ram_userd_base_shift_v(void)
330{
331 return 0x00000009;
332}
333static inline u32 ram_userd_chan_size_v(void)
334{
335 return 0x00000200;
336}
337static inline u32 ram_userd_put_w(void)
338{
339 return 16;
340}
341static inline u32 ram_userd_get_w(void)
342{
343 return 17;
344}
345static inline u32 ram_userd_ref_w(void)
346{
347 return 18;
348}
349static inline u32 ram_userd_put_hi_w(void)
350{
351 return 19;
352}
353static inline u32 ram_userd_ref_threshold_w(void)
354{
355 return 20;
356}
357static inline u32 ram_userd_top_level_get_w(void)
358{
359 return 22;
360}
361static inline u32 ram_userd_top_level_get_hi_w(void)
362{
363 return 23;
364}
365static inline u32 ram_userd_get_hi_w(void)
366{
367 return 24;
368}
369static inline u32 ram_userd_gp_get_w(void)
370{
371 return 34;
372}
373static inline u32 ram_userd_gp_put_w(void)
374{
375 return 35;
376}
377static inline u32 ram_userd_gp_top_level_get_w(void)
378{
379 return 22;
380}
381static inline u32 ram_userd_gp_top_level_get_hi_w(void)
382{
383 return 23;
384}
385static inline u32 ram_rl_entry_size_v(void)
386{
387 return 0x00000008;
388}
389#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h
new file mode 100644
index 00000000..b1e6658d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h
@@ -0,0 +1,2150 @@
1/*
2 * drivers/video/tegra/host/gk20a/hw_sim_gk20a.h
3 *
4 * Copyright (c) 2012, NVIDIA Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 /*
21 * Function naming determines intended use:
22 *
23 * <x>_r(void) : Returns the offset for register <x>.
24 *
25 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
26 *
27 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
28 *
29 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
30 * and masked to place it at field <y> of register <x>. This value
31 * can be |'d with others to produce a full register value for
32 * register <x>.
33 *
34 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
35 * value can be ~'d and then &'d to clear the value of field <y> for
36 * register <x>.
37 *
38 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
39 * to place it at field <y> of register <x>. This value can be |'d
40 * with others to produce a full register value for <x>.
41 *
42 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
43 * <x> value 'r' after being shifted to place its LSB at bit 0.
44 * This value is suitable for direct comparison with other unshifted
45 * values appropriate for use in field <y> of register <x>.
46 *
47 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
48 * field <y> of register <x>. This value is suitable for direct
49 * comparison with unshifted values appropriate for use in field <y>
50 * of register <x>.
51 */
52
53#ifndef __hw_sim_gk20a_h__
54#define __hw_sim_gk20a_h__
55/*This file is autogenerated. Do not edit. */
56
57static inline u32 sim_send_ring_r(void)
58{
59 return 0x00000000;
60}
61static inline u32 sim_send_ring_target_s(void)
62{
63 return 2;
64}
65static inline u32 sim_send_ring_target_f(u32 v)
66{
67 return (v & 0x3) << 0;
68}
69static inline u32 sim_send_ring_target_m(void)
70{
71 return 0x3 << 0;
72}
73static inline u32 sim_send_ring_target_v(u32 r)
74{
75 return (r >> 0) & 0x3;
76}
77static inline u32 sim_send_ring_target_phys_init_v(void)
78{
79 return 0x00000001;
80}
81static inline u32 sim_send_ring_target_phys_init_f(void)
82{
83 return 0x1;
84}
85static inline u32 sim_send_ring_target_phys__init_v(void)
86{
87 return 0x00000001;
88}
89static inline u32 sim_send_ring_target_phys__init_f(void)
90{
91 return 0x1;
92}
93static inline u32 sim_send_ring_target_phys__prod_v(void)
94{
95 return 0x00000001;
96}
97static inline u32 sim_send_ring_target_phys__prod_f(void)
98{
99 return 0x1;
100}
101static inline u32 sim_send_ring_target_phys_nvm_v(void)
102{
103 return 0x00000001;
104}
105static inline u32 sim_send_ring_target_phys_nvm_f(void)
106{
107 return 0x1;
108}
109static inline u32 sim_send_ring_target_phys_pci_v(void)
110{
111 return 0x00000002;
112}
113static inline u32 sim_send_ring_target_phys_pci_f(void)
114{
115 return 0x2;
116}
117static inline u32 sim_send_ring_target_phys_pci_coherent_v(void)
118{
119 return 0x00000003;
120}
121static inline u32 sim_send_ring_target_phys_pci_coherent_f(void)
122{
123 return 0x3;
124}
125static inline u32 sim_send_ring_status_s(void)
126{
127 return 1;
128}
129static inline u32 sim_send_ring_status_f(u32 v)
130{
131 return (v & 0x1) << 3;
132}
133static inline u32 sim_send_ring_status_m(void)
134{
135 return 0x1 << 3;
136}
137static inline u32 sim_send_ring_status_v(u32 r)
138{
139 return (r >> 3) & 0x1;
140}
141static inline u32 sim_send_ring_status_init_v(void)
142{
143 return 0x00000000;
144}
145static inline u32 sim_send_ring_status_init_f(void)
146{
147 return 0x0;
148}
149static inline u32 sim_send_ring_status__init_v(void)
150{
151 return 0x00000000;
152}
153static inline u32 sim_send_ring_status__init_f(void)
154{
155 return 0x0;
156}
157static inline u32 sim_send_ring_status__prod_v(void)
158{
159 return 0x00000000;
160}
161static inline u32 sim_send_ring_status__prod_f(void)
162{
163 return 0x0;
164}
165static inline u32 sim_send_ring_status_invalid_v(void)
166{
167 return 0x00000000;
168}
169static inline u32 sim_send_ring_status_invalid_f(void)
170{
171 return 0x0;
172}
173static inline u32 sim_send_ring_status_valid_v(void)
174{
175 return 0x00000001;
176}
177static inline u32 sim_send_ring_status_valid_f(void)
178{
179 return 0x8;
180}
181static inline u32 sim_send_ring_size_s(void)
182{
183 return 2;
184}
185static inline u32 sim_send_ring_size_f(u32 v)
186{
187 return (v & 0x3) << 4;
188}
189static inline u32 sim_send_ring_size_m(void)
190{
191 return 0x3 << 4;
192}
193static inline u32 sim_send_ring_size_v(u32 r)
194{
195 return (r >> 4) & 0x3;
196}
197static inline u32 sim_send_ring_size_init_v(void)
198{
199 return 0x00000000;
200}
201static inline u32 sim_send_ring_size_init_f(void)
202{
203 return 0x0;
204}
205static inline u32 sim_send_ring_size__init_v(void)
206{
207 return 0x00000000;
208}
209static inline u32 sim_send_ring_size__init_f(void)
210{
211 return 0x0;
212}
213static inline u32 sim_send_ring_size__prod_v(void)
214{
215 return 0x00000000;
216}
217static inline u32 sim_send_ring_size__prod_f(void)
218{
219 return 0x0;
220}
221static inline u32 sim_send_ring_size_4kb_v(void)
222{
223 return 0x00000000;
224}
225static inline u32 sim_send_ring_size_4kb_f(void)
226{
227 return 0x0;
228}
229static inline u32 sim_send_ring_size_8kb_v(void)
230{
231 return 0x00000001;
232}
233static inline u32 sim_send_ring_size_8kb_f(void)
234{
235 return 0x10;
236}
237static inline u32 sim_send_ring_size_12kb_v(void)
238{
239 return 0x00000002;
240}
241static inline u32 sim_send_ring_size_12kb_f(void)
242{
243 return 0x20;
244}
245static inline u32 sim_send_ring_size_16kb_v(void)
246{
247 return 0x00000003;
248}
249static inline u32 sim_send_ring_size_16kb_f(void)
250{
251 return 0x30;
252}
253static inline u32 sim_send_ring_gp_in_ring_s(void)
254{
255 return 1;
256}
257static inline u32 sim_send_ring_gp_in_ring_f(u32 v)
258{
259 return (v & 0x1) << 11;
260}
261static inline u32 sim_send_ring_gp_in_ring_m(void)
262{
263 return 0x1 << 11;
264}
265static inline u32 sim_send_ring_gp_in_ring_v(u32 r)
266{
267 return (r >> 11) & 0x1;
268}
269static inline u32 sim_send_ring_gp_in_ring__init_v(void)
270{
271 return 0x00000000;
272}
273static inline u32 sim_send_ring_gp_in_ring__init_f(void)
274{
275 return 0x0;
276}
277static inline u32 sim_send_ring_gp_in_ring__prod_v(void)
278{
279 return 0x00000000;
280}
281static inline u32 sim_send_ring_gp_in_ring__prod_f(void)
282{
283 return 0x0;
284}
285static inline u32 sim_send_ring_gp_in_ring_no_v(void)
286{
287 return 0x00000000;
288}
289static inline u32 sim_send_ring_gp_in_ring_no_f(void)
290{
291 return 0x0;
292}
293static inline u32 sim_send_ring_gp_in_ring_yes_v(void)
294{
295 return 0x00000001;
296}
297static inline u32 sim_send_ring_gp_in_ring_yes_f(void)
298{
299 return 0x800;
300}
301static inline u32 sim_send_ring_addr_lo_s(void)
302{
303 return 20;
304}
305static inline u32 sim_send_ring_addr_lo_f(u32 v)
306{
307 return (v & 0xfffff) << 12;
308}
309static inline u32 sim_send_ring_addr_lo_m(void)
310{
311 return 0xfffff << 12;
312}
313static inline u32 sim_send_ring_addr_lo_v(u32 r)
314{
315 return (r >> 12) & 0xfffff;
316}
317static inline u32 sim_send_ring_addr_lo__init_v(void)
318{
319 return 0x00000000;
320}
321static inline u32 sim_send_ring_addr_lo__init_f(void)
322{
323 return 0x0;
324}
325static inline u32 sim_send_ring_addr_lo__prod_v(void)
326{
327 return 0x00000000;
328}
329static inline u32 sim_send_ring_addr_lo__prod_f(void)
330{
331 return 0x0;
332}
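/*
 * Illustrative sketch only, not part of the autogenerated register list:
 * how the _f() constants above can be OR'd into a full SIM send-ring
 * register value, and how a _v() extractor pairs with the matching
 * unshifted _..._v() constant when decoding it, following the naming
 * convention documented at the top of this file. The function names and
 * the ring_base_pa parameter are hypothetical; the choice of a physical
 * PCI target is just for illustration.
 */
static inline u32 sim_send_ring_example_value(u64 ring_base_pa)
{
	/* Target, valid status, 4 KB ring size, no GPFIFO in ring,
	 * plus bits 31:12 of the ring base address. */
	return sim_send_ring_target_phys_pci_f() |
	       sim_send_ring_status_valid_f() |
	       sim_send_ring_size_4kb_f() |
	       sim_send_ring_gp_in_ring_no_f() |
	       sim_send_ring_addr_lo_f((u32)(ring_base_pa >> 12));
}

static inline u32 sim_send_ring_example_is_4kb(u32 reg)
{
	/* _v() shifts the field down to bit 0 so it can be compared
	 * directly against the unshifted _4kb_v() constant. */
	return sim_send_ring_size_v(reg) == sim_send_ring_size_4kb_v();
}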
333static inline u32 sim_send_ring_hi_r(void)
334{
335 return 0x00000004;
336}
337static inline u32 sim_send_ring_hi_addr_s(void)
338{
339 return 20;
340}
341static inline u32 sim_send_ring_hi_addr_f(u32 v)
342{
343 return (v & 0xfffff) << 0;
344}
345static inline u32 sim_send_ring_hi_addr_m(void)
346{
347 return 0xfffff << 0;
348}
349static inline u32 sim_send_ring_hi_addr_v(u32 r)
350{
351 return (r >> 0) & 0xfffff;
352}
353static inline u32 sim_send_ring_hi_addr__init_v(void)
354{
355 return 0x00000000;
356}
357static inline u32 sim_send_ring_hi_addr__init_f(void)
358{
359 return 0x0;
360}
361static inline u32 sim_send_ring_hi_addr__prod_v(void)
362{
363 return 0x00000000;
364}
365static inline u32 sim_send_ring_hi_addr__prod_f(void)
366{
367 return 0x0;
368}
369static inline u32 sim_send_put_r(void)
370{
371 return 0x00000008;
372}
373static inline u32 sim_send_put_pointer_s(void)
374{
375 return 29;
376}
377static inline u32 sim_send_put_pointer_f(u32 v)
378{
379 return (v & 0x1fffffff) << 3;
380}
381static inline u32 sim_send_put_pointer_m(void)
382{
383 return 0x1fffffff << 3;
384}
385static inline u32 sim_send_put_pointer_v(u32 r)
386{
387 return (r >> 3) & 0x1fffffff;
388}
389static inline u32 sim_send_get_r(void)
390{
391 return 0x0000000c;
392}
393static inline u32 sim_send_get_pointer_s(void)
394{
395 return 29;
396}
397static inline u32 sim_send_get_pointer_f(u32 v)
398{
399 return (v & 0x1fffffff) << 3;
400}
401static inline u32 sim_send_get_pointer_m(void)
402{
403 return 0x1fffffff << 3;
404}
405static inline u32 sim_send_get_pointer_v(u32 r)
406{
407 return (r >> 3) & 0x1fffffff;
408}
409static inline u32 sim_recv_ring_r(void)
410{
411 return 0x00000010;
412}
413static inline u32 sim_recv_ring_target_s(void)
414{
415 return 2;
416}
417static inline u32 sim_recv_ring_target_f(u32 v)
418{
419 return (v & 0x3) << 0;
420}
421static inline u32 sim_recv_ring_target_m(void)
422{
423 return 0x3 << 0;
424}
425static inline u32 sim_recv_ring_target_v(u32 r)
426{
427 return (r >> 0) & 0x3;
428}
429static inline u32 sim_recv_ring_target_phys_init_v(void)
430{
431 return 0x00000001;
432}
433static inline u32 sim_recv_ring_target_phys_init_f(void)
434{
435 return 0x1;
436}
437static inline u32 sim_recv_ring_target_phys__init_v(void)
438{
439 return 0x00000001;
440}
441static inline u32 sim_recv_ring_target_phys__init_f(void)
442{
443 return 0x1;
444}
445static inline u32 sim_recv_ring_target_phys__prod_v(void)
446{
447 return 0x00000001;
448}
449static inline u32 sim_recv_ring_target_phys__prod_f(void)
450{
451 return 0x1;
452}
453static inline u32 sim_recv_ring_target_phys_nvm_v(void)
454{
455 return 0x00000001;
456}
457static inline u32 sim_recv_ring_target_phys_nvm_f(void)
458{
459 return 0x1;
460}
461static inline u32 sim_recv_ring_target_phys_pci_v(void)
462{
463 return 0x00000002;
464}
465static inline u32 sim_recv_ring_target_phys_pci_f(void)
466{
467 return 0x2;
468}
469static inline u32 sim_recv_ring_target_phys_pci_coherent_v(void)
470{
471 return 0x00000003;
472}
473static inline u32 sim_recv_ring_target_phys_pci_coherent_f(void)
474{
475 return 0x3;
476}
477static inline u32 sim_recv_ring_status_s(void)
478{
479 return 1;
480}
481static inline u32 sim_recv_ring_status_f(u32 v)
482{
483 return (v & 0x1) << 3;
484}
485static inline u32 sim_recv_ring_status_m(void)
486{
487 return 0x1 << 3;
488}
489static inline u32 sim_recv_ring_status_v(u32 r)
490{
491 return (r >> 3) & 0x1;
492}
493static inline u32 sim_recv_ring_status_init_v(void)
494{
495 return 0x00000000;
496}
497static inline u32 sim_recv_ring_status_init_f(void)
498{
499 return 0x0;
500}
501static inline u32 sim_recv_ring_status__init_v(void)
502{
503 return 0x00000000;
504}
505static inline u32 sim_recv_ring_status__init_f(void)
506{
507 return 0x0;
508}
509static inline u32 sim_recv_ring_status__prod_v(void)
510{
511 return 0x00000000;
512}
513static inline u32 sim_recv_ring_status__prod_f(void)
514{
515 return 0x0;
516}
517static inline u32 sim_recv_ring_status_invalid_v(void)
518{
519 return 0x00000000;
520}
521static inline u32 sim_recv_ring_status_invalid_f(void)
522{
523 return 0x0;
524}
525static inline u32 sim_recv_ring_status_valid_v(void)
526{
527 return 0x00000001;
528}
529static inline u32 sim_recv_ring_status_valid_f(void)
530{
531 return 0x8;
532}
533static inline u32 sim_recv_ring_size_s(void)
534{
535 return 2;
536}
537static inline u32 sim_recv_ring_size_f(u32 v)
538{
539 return (v & 0x3) << 4;
540}
541static inline u32 sim_recv_ring_size_m(void)
542{
543 return 0x3 << 4;
544}
545static inline u32 sim_recv_ring_size_v(u32 r)
546{
547 return (r >> 4) & 0x3;
548}
549static inline u32 sim_recv_ring_size_init_v(void)
550{
551 return 0x00000000;
552}
553static inline u32 sim_recv_ring_size_init_f(void)
554{
555 return 0x0;
556}
557static inline u32 sim_recv_ring_size__init_v(void)
558{
559 return 0x00000000;
560}
561static inline u32 sim_recv_ring_size__init_f(void)
562{
563 return 0x0;
564}
565static inline u32 sim_recv_ring_size__prod_v(void)
566{
567 return 0x00000000;
568}
569static inline u32 sim_recv_ring_size__prod_f(void)
570{
571 return 0x0;
572}
573static inline u32 sim_recv_ring_size_4kb_v(void)
574{
575 return 0x00000000;
576}
577static inline u32 sim_recv_ring_size_4kb_f(void)
578{
579 return 0x0;
580}
581static inline u32 sim_recv_ring_size_8kb_v(void)
582{
583 return 0x00000001;
584}
585static inline u32 sim_recv_ring_size_8kb_f(void)
586{
587 return 0x10;
588}
589static inline u32 sim_recv_ring_size_12kb_v(void)
590{
591 return 0x00000002;
592}
593static inline u32 sim_recv_ring_size_12kb_f(void)
594{
595 return 0x20;
596}
597static inline u32 sim_recv_ring_size_16kb_v(void)
598{
599 return 0x00000003;
600}
601static inline u32 sim_recv_ring_size_16kb_f(void)
602{
603 return 0x30;
604}
605static inline u32 sim_recv_ring_gp_in_ring_s(void)
606{
607 return 1;
608}
609static inline u32 sim_recv_ring_gp_in_ring_f(u32 v)
610{
611 return (v & 0x1) << 11;
612}
613static inline u32 sim_recv_ring_gp_in_ring_m(void)
614{
615 return 0x1 << 11;
616}
617static inline u32 sim_recv_ring_gp_in_ring_v(u32 r)
618{
619 return (r >> 11) & 0x1;
620}
621static inline u32 sim_recv_ring_gp_in_ring__init_v(void)
622{
623 return 0x00000000;
624}
625static inline u32 sim_recv_ring_gp_in_ring__init_f(void)
626{
627 return 0x0;
628}
629static inline u32 sim_recv_ring_gp_in_ring__prod_v(void)
630{
631 return 0x00000000;
632}
633static inline u32 sim_recv_ring_gp_in_ring__prod_f(void)
634{
635 return 0x0;
636}
637static inline u32 sim_recv_ring_gp_in_ring_no_v(void)
638{
639 return 0x00000000;
640}
641static inline u32 sim_recv_ring_gp_in_ring_no_f(void)
642{
643 return 0x0;
644}
645static inline u32 sim_recv_ring_gp_in_ring_yes_v(void)
646{
647 return 0x00000001;
648}
649static inline u32 sim_recv_ring_gp_in_ring_yes_f(void)
650{
651 return 0x800;
652}
653static inline u32 sim_recv_ring_addr_lo_s(void)
654{
655 return 20;
656}
657static inline u32 sim_recv_ring_addr_lo_f(u32 v)
658{
659 return (v & 0xfffff) << 12;
660}
661static inline u32 sim_recv_ring_addr_lo_m(void)
662{
663 return 0xfffff << 12;
664}
665static inline u32 sim_recv_ring_addr_lo_v(u32 r)
666{
667 return (r >> 12) & 0xfffff;
668}
669static inline u32 sim_recv_ring_addr_lo__init_v(void)
670{
671 return 0x00000000;
672}
673static inline u32 sim_recv_ring_addr_lo__init_f(void)
674{
675 return 0x0;
676}
677static inline u32 sim_recv_ring_addr_lo__prod_v(void)
678{
679 return 0x00000000;
680}
681static inline u32 sim_recv_ring_addr_lo__prod_f(void)
682{
683 return 0x0;
684}
685static inline u32 sim_recv_ring_hi_r(void)
686{
687 return 0x00000014;
688}
689static inline u32 sim_recv_ring_hi_addr_s(void)
690{
691 return 20;
692}
693static inline u32 sim_recv_ring_hi_addr_f(u32 v)
694{
695 return (v & 0xfffff) << 0;
696}
697static inline u32 sim_recv_ring_hi_addr_m(void)
698{
699 return 0xfffff << 0;
700}
701static inline u32 sim_recv_ring_hi_addr_v(u32 r)
702{
703 return (r >> 0) & 0xfffff;
704}
705static inline u32 sim_recv_ring_hi_addr__init_v(void)
706{
707 return 0x00000000;
708}
709static inline u32 sim_recv_ring_hi_addr__init_f(void)
710{
711 return 0x0;
712}
713static inline u32 sim_recv_ring_hi_addr__prod_v(void)
714{
715 return 0x00000000;
716}
717static inline u32 sim_recv_ring_hi_addr__prod_f(void)
718{
719 return 0x0;
720}
721static inline u32 sim_recv_put_r(void)
722{
723 return 0x00000018;
724}
725static inline u32 sim_recv_put_pointer_s(void)
726{
727 return 11;
728}
729static inline u32 sim_recv_put_pointer_f(u32 v)
730{
731 return (v & 0x7ff) << 3;
732}
733static inline u32 sim_recv_put_pointer_m(void)
734{
735 return 0x7ff << 3;
736}
737static inline u32 sim_recv_put_pointer_v(u32 r)
738{
739 return (r >> 3) & 0x7ff;
740}
741static inline u32 sim_recv_get_r(void)
742{
743 return 0x0000001c;
744}
745static inline u32 sim_recv_get_pointer_s(void)
746{
747 return 11;
748}
749static inline u32 sim_recv_get_pointer_f(u32 v)
750{
751 return (v & 0x7ff) << 3;
752}
753static inline u32 sim_recv_get_pointer_m(void)
754{
755 return 0x7ff << 3;
756}
757static inline u32 sim_recv_get_pointer_v(u32 r)
758{
759 return (r >> 3) & 0x7ff;
760}
761static inline u32 sim_config_r(void)
762{
763 return 0x00000020;
764}
765static inline u32 sim_config_mode_s(void)
766{
767 return 1;
768}
769static inline u32 sim_config_mode_f(u32 v)
770{
771 return (v & 0x1) << 0;
772}
773static inline u32 sim_config_mode_m(void)
774{
775 return 0x1 << 0;
776}
777static inline u32 sim_config_mode_v(u32 r)
778{
779 return (r >> 0) & 0x1;
780}
781static inline u32 sim_config_mode_disabled_v(void)
782{
783 return 0x00000000;
784}
785static inline u32 sim_config_mode_disabled_f(void)
786{
787 return 0x0;
788}
789static inline u32 sim_config_mode_enabled_v(void)
790{
791 return 0x00000001;
792}
793static inline u32 sim_config_mode_enabled_f(void)
794{
795 return 0x1;
796}
797static inline u32 sim_config_channels_s(void)
798{
799 return 7;
800}
801static inline u32 sim_config_channels_f(u32 v)
802{
803 return (v & 0x7f) << 1;
804}
805static inline u32 sim_config_channels_m(void)
806{
807 return 0x7f << 1;
808}
809static inline u32 sim_config_channels_v(u32 r)
810{
811 return (r >> 1) & 0x7f;
812}
813static inline u32 sim_config_channels_none_v(void)
814{
815 return 0x00000000;
816}
817static inline u32 sim_config_channels_none_f(void)
818{
819 return 0x0;
820}
821static inline u32 sim_config_cached_only_s(void)
822{
823 return 1;
824}
825static inline u32 sim_config_cached_only_f(u32 v)
826{
827 return (v & 0x1) << 8;
828}
829static inline u32 sim_config_cached_only_m(void)
830{
831 return 0x1 << 8;
832}
833static inline u32 sim_config_cached_only_v(u32 r)
834{
835 return (r >> 8) & 0x1;
836}
837static inline u32 sim_config_cached_only_disabled_v(void)
838{
839 return 0x00000000;
840}
841static inline u32 sim_config_cached_only_disabled_f(void)
842{
843 return 0x0;
844}
845static inline u32 sim_config_cached_only_enabled_v(void)
846{
847 return 0x00000001;
848}
849static inline u32 sim_config_cached_only_enabled_f(void)
850{
851 return 0x100;
852}
853static inline u32 sim_config_validity_s(void)
854{
855 return 2;
856}
857static inline u32 sim_config_validity_f(u32 v)
858{
859 return (v & 0x3) << 9;
860}
861static inline u32 sim_config_validity_m(void)
862{
863 return 0x3 << 9;
864}
865static inline u32 sim_config_validity_v(u32 r)
866{
867 return (r >> 9) & 0x3;
868}
869static inline u32 sim_config_validity__init_v(void)
870{
871 return 0x00000001;
872}
873static inline u32 sim_config_validity__init_f(void)
874{
875 return 0x200;
876}
877static inline u32 sim_config_validity_valid_v(void)
878{
879 return 0x00000001;
880}
881static inline u32 sim_config_validity_valid_f(void)
882{
883 return 0x200;
884}
885static inline u32 sim_config_simulation_s(void)
886{
887 return 2;
888}
889static inline u32 sim_config_simulation_f(u32 v)
890{
891 return (v & 0x3) << 12;
892}
893static inline u32 sim_config_simulation_m(void)
894{
895 return 0x3 << 12;
896}
897static inline u32 sim_config_simulation_v(u32 r)
898{
899 return (r >> 12) & 0x3;
900}
901static inline u32 sim_config_simulation_disabled_v(void)
902{
903 return 0x00000000;
904}
905static inline u32 sim_config_simulation_disabled_f(void)
906{
907 return 0x0;
908}
909static inline u32 sim_config_simulation_fmodel_v(void)
910{
911 return 0x00000001;
912}
913static inline u32 sim_config_simulation_fmodel_f(void)
914{
915 return 0x1000;
916}
917static inline u32 sim_config_simulation_rtlsim_v(void)
918{
919 return 0x00000002;
920}
921static inline u32 sim_config_simulation_rtlsim_f(void)
922{
923 return 0x2000;
924}
925static inline u32 sim_config_secondary_display_s(void)
926{
927 return 1;
928}
929static inline u32 sim_config_secondary_display_f(u32 v)
930{
931 return (v & 0x1) << 14;
932}
933static inline u32 sim_config_secondary_display_m(void)
934{
935 return 0x1 << 14;
936}
937static inline u32 sim_config_secondary_display_v(u32 r)
938{
939 return (r >> 14) & 0x1;
940}
941static inline u32 sim_config_secondary_display_disabled_v(void)
942{
943 return 0x00000000;
944}
945static inline u32 sim_config_secondary_display_disabled_f(void)
946{
947 return 0x0;
948}
949static inline u32 sim_config_secondary_display_enabled_v(void)
950{
951 return 0x00000001;
952}
953static inline u32 sim_config_secondary_display_enabled_f(void)
954{
955 return 0x4000;
956}
957static inline u32 sim_config_num_heads_s(void)
958{
959 return 8;
960}
961static inline u32 sim_config_num_heads_f(u32 v)
962{
963 return (v & 0xff) << 17;
964}
965static inline u32 sim_config_num_heads_m(void)
966{
967 return 0xff << 17;
968}
969static inline u32 sim_config_num_heads_v(u32 r)
970{
971 return (r >> 17) & 0xff;
972}
973static inline u32 sim_event_ring_r(void)
974{
975 return 0x00000030;
976}
977static inline u32 sim_event_ring_target_s(void)
978{
979 return 2;
980}
981static inline u32 sim_event_ring_target_f(u32 v)
982{
983 return (v & 0x3) << 0;
984}
985static inline u32 sim_event_ring_target_m(void)
986{
987 return 0x3 << 0;
988}
989static inline u32 sim_event_ring_target_v(u32 r)
990{
991 return (r >> 0) & 0x3;
992}
993static inline u32 sim_event_ring_target_phys_init_v(void)
994{
995 return 0x00000001;
996}
997static inline u32 sim_event_ring_target_phys_init_f(void)
998{
999 return 0x1;
1000}
1001static inline u32 sim_event_ring_target_phys__init_v(void)
1002{
1003 return 0x00000001;
1004}
1005static inline u32 sim_event_ring_target_phys__init_f(void)
1006{
1007 return 0x1;
1008}
1009static inline u32 sim_event_ring_target_phys__prod_v(void)
1010{
1011 return 0x00000001;
1012}
1013static inline u32 sim_event_ring_target_phys__prod_f(void)
1014{
1015 return 0x1;
1016}
1017static inline u32 sim_event_ring_target_phys_nvm_v(void)
1018{
1019 return 0x00000001;
1020}
1021static inline u32 sim_event_ring_target_phys_nvm_f(void)
1022{
1023 return 0x1;
1024}
1025static inline u32 sim_event_ring_target_phys_pci_v(void)
1026{
1027 return 0x00000002;
1028}
1029static inline u32 sim_event_ring_target_phys_pci_f(void)
1030{
1031 return 0x2;
1032}
1033static inline u32 sim_event_ring_target_phys_pci_coherent_v(void)
1034{
1035 return 0x00000003;
1036}
1037static inline u32 sim_event_ring_target_phys_pci_coherent_f(void)
1038{
1039 return 0x3;
1040}
1041static inline u32 sim_event_ring_status_s(void)
1042{
1043 return 1;
1044}
1045static inline u32 sim_event_ring_status_f(u32 v)
1046{
1047 return (v & 0x1) << 3;
1048}
1049static inline u32 sim_event_ring_status_m(void)
1050{
1051 return 0x1 << 3;
1052}
1053static inline u32 sim_event_ring_status_v(u32 r)
1054{
1055 return (r >> 3) & 0x1;
1056}
1057static inline u32 sim_event_ring_status_init_v(void)
1058{
1059 return 0x00000000;
1060}
1061static inline u32 sim_event_ring_status_init_f(void)
1062{
1063 return 0x0;
1064}
1065static inline u32 sim_event_ring_status__init_v(void)
1066{
1067 return 0x00000000;
1068}
1069static inline u32 sim_event_ring_status__init_f(void)
1070{
1071 return 0x0;
1072}
1073static inline u32 sim_event_ring_status__prod_v(void)
1074{
1075 return 0x00000000;
1076}
1077static inline u32 sim_event_ring_status__prod_f(void)
1078{
1079 return 0x0;
1080}
1081static inline u32 sim_event_ring_status_invalid_v(void)
1082{
1083 return 0x00000000;
1084}
1085static inline u32 sim_event_ring_status_invalid_f(void)
1086{
1087 return 0x0;
1088}
1089static inline u32 sim_event_ring_status_valid_v(void)
1090{
1091 return 0x00000001;
1092}
1093static inline u32 sim_event_ring_status_valid_f(void)
1094{
1095 return 0x8;
1096}
1097static inline u32 sim_event_ring_size_s(void)
1098{
1099 return 2;
1100}
1101static inline u32 sim_event_ring_size_f(u32 v)
1102{
1103 return (v & 0x3) << 4;
1104}
1105static inline u32 sim_event_ring_size_m(void)
1106{
1107 return 0x3 << 4;
1108}
1109static inline u32 sim_event_ring_size_v(u32 r)
1110{
1111 return (r >> 4) & 0x3;
1112}
1113static inline u32 sim_event_ring_size_init_v(void)
1114{
1115 return 0x00000000;
1116}
1117static inline u32 sim_event_ring_size_init_f(void)
1118{
1119 return 0x0;
1120}
1121static inline u32 sim_event_ring_size__init_v(void)
1122{
1123 return 0x00000000;
1124}
1125static inline u32 sim_event_ring_size__init_f(void)
1126{
1127 return 0x0;
1128}
1129static inline u32 sim_event_ring_size__prod_v(void)
1130{
1131 return 0x00000000;
1132}
1133static inline u32 sim_event_ring_size__prod_f(void)
1134{
1135 return 0x0;
1136}
1137static inline u32 sim_event_ring_size_4kb_v(void)
1138{
1139 return 0x00000000;
1140}
1141static inline u32 sim_event_ring_size_4kb_f(void)
1142{
1143 return 0x0;
1144}
1145static inline u32 sim_event_ring_size_8kb_v(void)
1146{
1147 return 0x00000001;
1148}
1149static inline u32 sim_event_ring_size_8kb_f(void)
1150{
1151 return 0x10;
1152}
1153static inline u32 sim_event_ring_size_12kb_v(void)
1154{
1155 return 0x00000002;
1156}
1157static inline u32 sim_event_ring_size_12kb_f(void)
1158{
1159 return 0x20;
1160}
1161static inline u32 sim_event_ring_size_16kb_v(void)
1162{
1163 return 0x00000003;
1164}
1165static inline u32 sim_event_ring_size_16kb_f(void)
1166{
1167 return 0x30;
1168}
1169static inline u32 sim_event_ring_gp_in_ring_s(void)
1170{
1171 return 1;
1172}
1173static inline u32 sim_event_ring_gp_in_ring_f(u32 v)
1174{
1175 return (v & 0x1) << 11;
1176}
1177static inline u32 sim_event_ring_gp_in_ring_m(void)
1178{
1179 return 0x1 << 11;
1180}
1181static inline u32 sim_event_ring_gp_in_ring_v(u32 r)
1182{
1183 return (r >> 11) & 0x1;
1184}
1185static inline u32 sim_event_ring_gp_in_ring__init_v(void)
1186{
1187 return 0x00000000;
1188}
1189static inline u32 sim_event_ring_gp_in_ring__init_f(void)
1190{
1191 return 0x0;
1192}
1193static inline u32 sim_event_ring_gp_in_ring__prod_v(void)
1194{
1195 return 0x00000000;
1196}
1197static inline u32 sim_event_ring_gp_in_ring__prod_f(void)
1198{
1199 return 0x0;
1200}
1201static inline u32 sim_event_ring_gp_in_ring_no_v(void)
1202{
1203 return 0x00000000;
1204}
1205static inline u32 sim_event_ring_gp_in_ring_no_f(void)
1206{
1207 return 0x0;
1208}
1209static inline u32 sim_event_ring_gp_in_ring_yes_v(void)
1210{
1211 return 0x00000001;
1212}
1213static inline u32 sim_event_ring_gp_in_ring_yes_f(void)
1214{
1215 return 0x800;
1216}
1217static inline u32 sim_event_ring_addr_lo_s(void)
1218{
1219 return 20;
1220}
1221static inline u32 sim_event_ring_addr_lo_f(u32 v)
1222{
1223 return (v & 0xfffff) << 12;
1224}
1225static inline u32 sim_event_ring_addr_lo_m(void)
1226{
1227 return 0xfffff << 12;
1228}
1229static inline u32 sim_event_ring_addr_lo_v(u32 r)
1230{
1231 return (r >> 12) & 0xfffff;
1232}
1233static inline u32 sim_event_ring_addr_lo__init_v(void)
1234{
1235 return 0x00000000;
1236}
1237static inline u32 sim_event_ring_addr_lo__init_f(void)
1238{
1239 return 0x0;
1240}
1241static inline u32 sim_event_ring_addr_lo__prod_v(void)
1242{
1243 return 0x00000000;
1244}
1245static inline u32 sim_event_ring_addr_lo__prod_f(void)
1246{
1247 return 0x0;
1248}
1249static inline u32 sim_event_ring_hi_v(void)
1250{
1251 return 0x00000034;
1252}
1253static inline u32 sim_event_ring_hi_addr_s(void)
1254{
1255 return 20;
1256}
1257static inline u32 sim_event_ring_hi_addr_f(u32 v)
1258{
1259 return (v & 0xfffff) << 0;
1260}
1261static inline u32 sim_event_ring_hi_addr_m(void)
1262{
1263 return 0xfffff << 0;
1264}
1265static inline u32 sim_event_ring_hi_addr_v(u32 r)
1266{
1267 return (r >> 0) & 0xfffff;
1268}
1269static inline u32 sim_event_ring_hi_addr__init_v(void)
1270{
1271 return 0x00000000;
1272}
1273static inline u32 sim_event_ring_hi_addr__init_f(void)
1274{
1275 return 0x0;
1276}
1277static inline u32 sim_event_ring_hi_addr__prod_v(void)
1278{
1279 return 0x00000000;
1280}
1281static inline u32 sim_event_ring_hi_addr__prod_f(void)
1282{
1283 return 0x0;
1284}
1285static inline u32 sim_event_put_r(void)
1286{
1287 return 0x00000038;
1288}
1289static inline u32 sim_event_put_pointer_s(void)
1290{
1291 return 30;
1292}
1293static inline u32 sim_event_put_pointer_f(u32 v)
1294{
1295 return (v & 0x3fffffff) << 2;
1296}
1297static inline u32 sim_event_put_pointer_m(void)
1298{
1299 return 0x3fffffff << 2;
1300}
1301static inline u32 sim_event_put_pointer_v(u32 r)
1302{
1303 return (r >> 2) & 0x3fffffff;
1304}
1305static inline u32 sim_event_get_r(void)
1306{
1307 return 0x0000003c;
1308}
1309static inline u32 sim_event_get_pointer_s(void)
1310{
1311 return 30;
1312}
1313static inline u32 sim_event_get_pointer_f(u32 v)
1314{
1315 return (v & 0x3fffffff) << 2;
1316}
1317static inline u32 sim_event_get_pointer_m(void)
1318{
1319 return 0x3fffffff << 2;
1320}
1321static inline u32 sim_event_get_pointer_v(u32 r)
1322{
1323 return (r >> 2) & 0x3fffffff;
1324}
1325static inline u32 sim_status_r(void)
1326{
1327 return 0x00000028;
1328}
1329static inline u32 sim_status_send_put_s(void)
1330{
1331 return 1;
1332}
1333static inline u32 sim_status_send_put_f(u32 v)
1334{
1335 return (v & 0x1) << 0;
1336}
1337static inline u32 sim_status_send_put_m(void)
1338{
1339 return 0x1 << 0;
1340}
1341static inline u32 sim_status_send_put_v(u32 r)
1342{
1343 return (r >> 0) & 0x1;
1344}
1345static inline u32 sim_status_send_put__init_v(void)
1346{
1347 return 0x00000000;
1348}
1349static inline u32 sim_status_send_put__init_f(void)
1350{
1351 return 0x0;
1352}
1353static inline u32 sim_status_send_put_idle_v(void)
1354{
1355 return 0x00000000;
1356}
1357static inline u32 sim_status_send_put_idle_f(void)
1358{
1359 return 0x0;
1360}
1361static inline u32 sim_status_send_put_pending_v(void)
1362{
1363 return 0x00000001;
1364}
1365static inline u32 sim_status_send_put_pending_f(void)
1366{
1367 return 0x1;
1368}
1369static inline u32 sim_status_send_get_s(void)
1370{
1371 return 1;
1372}
1373static inline u32 sim_status_send_get_f(u32 v)
1374{
1375 return (v & 0x1) << 1;
1376}
1377static inline u32 sim_status_send_get_m(void)
1378{
1379 return 0x1 << 1;
1380}
1381static inline u32 sim_status_send_get_v(u32 r)
1382{
1383 return (r >> 1) & 0x1;
1384}
1385static inline u32 sim_status_send_get__init_v(void)
1386{
1387 return 0x00000000;
1388}
1389static inline u32 sim_status_send_get__init_f(void)
1390{
1391 return 0x0;
1392}
1393static inline u32 sim_status_send_get_idle_v(void)
1394{
1395 return 0x00000000;
1396}
1397static inline u32 sim_status_send_get_idle_f(void)
1398{
1399 return 0x0;
1400}
1401static inline u32 sim_status_send_get_pending_v(void)
1402{
1403 return 0x00000001;
1404}
1405static inline u32 sim_status_send_get_pending_f(void)
1406{
1407 return 0x2;
1408}
1409static inline u32 sim_status_send_get_clear_v(void)
1410{
1411 return 0x00000001;
1412}
1413static inline u32 sim_status_send_get_clear_f(void)
1414{
1415 return 0x2;
1416}
1417static inline u32 sim_status_recv_put_s(void)
1418{
1419 return 1;
1420}
1421static inline u32 sim_status_recv_put_f(u32 v)
1422{
1423 return (v & 0x1) << 2;
1424}
1425static inline u32 sim_status_recv_put_m(void)
1426{
1427 return 0x1 << 2;
1428}
1429static inline u32 sim_status_recv_put_v(u32 r)
1430{
1431 return (r >> 2) & 0x1;
1432}
1433static inline u32 sim_status_recv_put__init_v(void)
1434{
1435 return 0x00000000;
1436}
1437static inline u32 sim_status_recv_put__init_f(void)
1438{
1439 return 0x0;
1440}
1441static inline u32 sim_status_recv_put_idle_v(void)
1442{
1443 return 0x00000000;
1444}
1445static inline u32 sim_status_recv_put_idle_f(void)
1446{
1447 return 0x0;
1448}
1449static inline u32 sim_status_recv_put_pending_v(void)
1450{
1451 return 0x00000001;
1452}
1453static inline u32 sim_status_recv_put_pending_f(void)
1454{
1455 return 0x4;
1456}
1457static inline u32 sim_status_recv_put_clear_v(void)
1458{
1459 return 0x00000001;
1460}
1461static inline u32 sim_status_recv_put_clear_f(void)
1462{
1463 return 0x4;
1464}
1465static inline u32 sim_status_recv_get_s(void)
1466{
1467 return 1;
1468}
1469static inline u32 sim_status_recv_get_f(u32 v)
1470{
1471 return (v & 0x1) << 3;
1472}
1473static inline u32 sim_status_recv_get_m(void)
1474{
1475 return 0x1 << 3;
1476}
1477static inline u32 sim_status_recv_get_v(u32 r)
1478{
1479 return (r >> 3) & 0x1;
1480}
1481static inline u32 sim_status_recv_get__init_v(void)
1482{
1483 return 0x00000000;
1484}
1485static inline u32 sim_status_recv_get__init_f(void)
1486{
1487 return 0x0;
1488}
1489static inline u32 sim_status_recv_get_idle_v(void)
1490{
1491 return 0x00000000;
1492}
1493static inline u32 sim_status_recv_get_idle_f(void)
1494{
1495 return 0x0;
1496}
1497static inline u32 sim_status_recv_get_pending_v(void)
1498{
1499 return 0x00000001;
1500}
1501static inline u32 sim_status_recv_get_pending_f(void)
1502{
1503 return 0x8;
1504}
1505static inline u32 sim_status_event_put_s(void)
1506{
1507 return 1;
1508}
1509static inline u32 sim_status_event_put_f(u32 v)
1510{
1511 return (v & 0x1) << 4;
1512}
1513static inline u32 sim_status_event_put_m(void)
1514{
1515 return 0x1 << 4;
1516}
1517static inline u32 sim_status_event_put_v(u32 r)
1518{
1519 return (r >> 4) & 0x1;
1520}
1521static inline u32 sim_status_event_put__init_v(void)
1522{
1523 return 0x00000000;
1524}
1525static inline u32 sim_status_event_put__init_f(void)
1526{
1527 return 0x0;
1528}
1529static inline u32 sim_status_event_put_idle_v(void)
1530{
1531 return 0x00000000;
1532}
1533static inline u32 sim_status_event_put_idle_f(void)
1534{
1535 return 0x0;
1536}
1537static inline u32 sim_status_event_put_pending_v(void)
1538{
1539 return 0x00000001;
1540}
1541static inline u32 sim_status_event_put_pending_f(void)
1542{
1543 return 0x10;
1544}
1545static inline u32 sim_status_event_put_clear_v(void)
1546{
1547 return 0x00000001;
1548}
1549static inline u32 sim_status_event_put_clear_f(void)
1550{
1551 return 0x10;
1552}
1553static inline u32 sim_status_event_get_s(void)
1554{
1555 return 1;
1556}
1557static inline u32 sim_status_event_get_f(u32 v)
1558{
1559 return (v & 0x1) << 5;
1560}
1561static inline u32 sim_status_event_get_m(void)
1562{
1563 return 0x1 << 5;
1564}
1565static inline u32 sim_status_event_get_v(u32 r)
1566{
1567 return (r >> 5) & 0x1;
1568}
1569static inline u32 sim_status_event_get__init_v(void)
1570{
1571 return 0x00000000;
1572}
1573static inline u32 sim_status_event_get__init_f(void)
1574{
1575 return 0x0;
1576}
1577static inline u32 sim_status_event_get_idle_v(void)
1578{
1579 return 0x00000000;
1580}
1581static inline u32 sim_status_event_get_idle_f(void)
1582{
1583 return 0x0;
1584}
1585static inline u32 sim_status_event_get_pending_v(void)
1586{
1587 return 0x00000001;
1588}
1589static inline u32 sim_status_event_get_pending_f(void)
1590{
1591 return 0x20;
1592}
1593static inline u32 sim_control_r(void)
1594{
1595 return 0x0000002c;
1596}
1597static inline u32 sim_control_send_put_s(void)
1598{
1599 return 1;
1600}
1601static inline u32 sim_control_send_put_f(u32 v)
1602{
1603 return (v & 0x1) << 0;
1604}
1605static inline u32 sim_control_send_put_m(void)
1606{
1607 return 0x1 << 0;
1608}
1609static inline u32 sim_control_send_put_v(u32 r)
1610{
1611 return (r >> 0) & 0x1;
1612}
1613static inline u32 sim_control_send_put__init_v(void)
1614{
1615 return 0x00000000;
1616}
1617static inline u32 sim_control_send_put__init_f(void)
1618{
1619 return 0x0;
1620}
1621static inline u32 sim_control_send_put_disabled_v(void)
1622{
1623 return 0x00000000;
1624}
1625static inline u32 sim_control_send_put_disabled_f(void)
1626{
1627 return 0x0;
1628}
1629static inline u32 sim_control_send_put_enabled_v(void)
1630{
1631 return 0x00000001;
1632}
1633static inline u32 sim_control_send_put_enabled_f(void)
1634{
1635 return 0x1;
1636}
1637static inline u32 sim_control_send_get_s(void)
1638{
1639 return 1;
1640}
1641static inline u32 sim_control_send_get_f(u32 v)
1642{
1643 return (v & 0x1) << 1;
1644}
1645static inline u32 sim_control_send_get_m(void)
1646{
1647 return 0x1 << 1;
1648}
1649static inline u32 sim_control_send_get_v(u32 r)
1650{
1651 return (r >> 1) & 0x1;
1652}
1653static inline u32 sim_control_send_get__init_v(void)
1654{
1655 return 0x00000000;
1656}
1657static inline u32 sim_control_send_get__init_f(void)
1658{
1659 return 0x0;
1660}
1661static inline u32 sim_control_send_get_disabled_v(void)
1662{
1663 return 0x00000000;
1664}
1665static inline u32 sim_control_send_get_disabled_f(void)
1666{
1667 return 0x0;
1668}
1669static inline u32 sim_control_send_get_enabled_v(void)
1670{
1671 return 0x00000001;
1672}
1673static inline u32 sim_control_send_get_enabled_f(void)
1674{
1675 return 0x2;
1676}
1677static inline u32 sim_control_recv_put_s(void)
1678{
1679 return 1;
1680}
1681static inline u32 sim_control_recv_put_f(u32 v)
1682{
1683 return (v & 0x1) << 2;
1684}
1685static inline u32 sim_control_recv_put_m(void)
1686{
1687 return 0x1 << 2;
1688}
1689static inline u32 sim_control_recv_put_v(u32 r)
1690{
1691 return (r >> 2) & 0x1;
1692}
1693static inline u32 sim_control_recv_put__init_v(void)
1694{
1695 return 0x00000000;
1696}
1697static inline u32 sim_control_recv_put__init_f(void)
1698{
1699 return 0x0;
1700}
1701static inline u32 sim_control_recv_put_disabled_v(void)
1702{
1703 return 0x00000000;
1704}
1705static inline u32 sim_control_recv_put_disabled_f(void)
1706{
1707 return 0x0;
1708}
1709static inline u32 sim_control_recv_put_enabled_v(void)
1710{
1711 return 0x00000001;
1712}
1713static inline u32 sim_control_recv_put_enabled_f(void)
1714{
1715 return 0x4;
1716}
1717static inline u32 sim_control_recv_get_s(void)
1718{
1719 return 1;
1720}
1721static inline u32 sim_control_recv_get_f(u32 v)
1722{
1723 return (v & 0x1) << 3;
1724}
1725static inline u32 sim_control_recv_get_m(void)
1726{
1727 return 0x1 << 3;
1728}
1729static inline u32 sim_control_recv_get_v(u32 r)
1730{
1731 return (r >> 3) & 0x1;
1732}
1733static inline u32 sim_control_recv_get__init_v(void)
1734{
1735 return 0x00000000;
1736}
1737static inline u32 sim_control_recv_get__init_f(void)
1738{
1739 return 0x0;
1740}
1741static inline u32 sim_control_recv_get_disabled_v(void)
1742{
1743 return 0x00000000;
1744}
1745static inline u32 sim_control_recv_get_disabled_f(void)
1746{
1747 return 0x0;
1748}
1749static inline u32 sim_control_recv_get_enabled_v(void)
1750{
1751 return 0x00000001;
1752}
1753static inline u32 sim_control_recv_get_enabled_f(void)
1754{
1755 return 0x8;
1756}
1757static inline u32 sim_control_event_put_s(void)
1758{
1759 return 1;
1760}
1761static inline u32 sim_control_event_put_f(u32 v)
1762{
1763 return (v & 0x1) << 4;
1764}
1765static inline u32 sim_control_event_put_m(void)
1766{
1767 return 0x1 << 4;
1768}
1769static inline u32 sim_control_event_put_v(u32 r)
1770{
1771 return (r >> 4) & 0x1;
1772}
1773static inline u32 sim_control_event_put__init_v(void)
1774{
1775 return 0x00000000;
1776}
1777static inline u32 sim_control_event_put__init_f(void)
1778{
1779 return 0x0;
1780}
1781static inline u32 sim_control_event_put_disabled_v(void)
1782{
1783 return 0x00000000;
1784}
1785static inline u32 sim_control_event_put_disabled_f(void)
1786{
1787 return 0x0;
1788}
1789static inline u32 sim_control_event_put_enabled_v(void)
1790{
1791 return 0x00000001;
1792}
1793static inline u32 sim_control_event_put_enabled_f(void)
1794{
1795 return 0x10;
1796}
1797static inline u32 sim_control_event_get_s(void)
1798{
1799 return 1;
1800}
1801static inline u32 sim_control_event_get_f(u32 v)
1802{
1803 return (v & 0x1) << 5;
1804}
1805static inline u32 sim_control_event_get_m(void)
1806{
1807 return 0x1 << 5;
1808}
1809static inline u32 sim_control_event_get_v(u32 r)
1810{
1811 return (r >> 5) & 0x1;
1812}
1813static inline u32 sim_control_event_get__init_v(void)
1814{
1815 return 0x00000000;
1816}
1817static inline u32 sim_control_event_get__init_f(void)
1818{
1819 return 0x0;
1820}
1821static inline u32 sim_control_event_get_disabled_v(void)
1822{
1823 return 0x00000000;
1824}
1825static inline u32 sim_control_event_get_disabled_f(void)
1826{
1827 return 0x0;
1828}
1829static inline u32 sim_control_event_get_enabled_v(void)
1830{
1831 return 0x00000001;
1832}
1833static inline u32 sim_control_event_get_enabled_f(void)
1834{
1835 return 0x20;
1836}
1837static inline u32 sim_dma_r(void)
1838{
1839 return 0x00000000;
1840}
1841static inline u32 sim_dma_target_s(void)
1842{
1843 return 2;
1844}
1845static inline u32 sim_dma_target_f(u32 v)
1846{
1847 return (v & 0x3) << 0;
1848}
1849static inline u32 sim_dma_target_m(void)
1850{
1851 return 0x3 << 0;
1852}
1853static inline u32 sim_dma_target_v(u32 r)
1854{
1855 return (r >> 0) & 0x3;
1856}
1857static inline u32 sim_dma_target_phys_init_v(void)
1858{
1859 return 0x00000001;
1860}
1861static inline u32 sim_dma_target_phys_init_f(void)
1862{
1863 return 0x1;
1864}
1865static inline u32 sim_dma_target_phys__init_v(void)
1866{
1867 return 0x00000001;
1868}
1869static inline u32 sim_dma_target_phys__init_f(void)
1870{
1871 return 0x1;
1872}
1873static inline u32 sim_dma_target_phys__prod_v(void)
1874{
1875 return 0x00000001;
1876}
1877static inline u32 sim_dma_target_phys__prod_f(void)
1878{
1879 return 0x1;
1880}
1881static inline u32 sim_dma_target_phys_nvm_v(void)
1882{
1883 return 0x00000001;
1884}
1885static inline u32 sim_dma_target_phys_nvm_f(void)
1886{
1887 return 0x1;
1888}
1889static inline u32 sim_dma_target_phys_pci_v(void)
1890{
1891 return 0x00000002;
1892}
1893static inline u32 sim_dma_target_phys_pci_f(void)
1894{
1895 return 0x2;
1896}
1897static inline u32 sim_dma_target_phys_pci_coherent_v(void)
1898{
1899 return 0x00000003;
1900}
1901static inline u32 sim_dma_target_phys_pci_coherent_f(void)
1902{
1903 return 0x3;
1904}
1905static inline u32 sim_dma_status_s(void)
1906{
1907 return 1;
1908}
1909static inline u32 sim_dma_status_f(u32 v)
1910{
1911 return (v & 0x1) << 3;
1912}
1913static inline u32 sim_dma_status_m(void)
1914{
1915 return 0x1 << 3;
1916}
1917static inline u32 sim_dma_status_v(u32 r)
1918{
1919 return (r >> 3) & 0x1;
1920}
1921static inline u32 sim_dma_status_init_v(void)
1922{
1923 return 0x00000000;
1924}
1925static inline u32 sim_dma_status_init_f(void)
1926{
1927 return 0x0;
1928}
1929static inline u32 sim_dma_status__init_v(void)
1930{
1931 return 0x00000000;
1932}
1933static inline u32 sim_dma_status__init_f(void)
1934{
1935 return 0x0;
1936}
1937static inline u32 sim_dma_status__prod_v(void)
1938{
1939 return 0x00000000;
1940}
1941static inline u32 sim_dma_status__prod_f(void)
1942{
1943 return 0x0;
1944}
1945static inline u32 sim_dma_status_invalid_v(void)
1946{
1947 return 0x00000000;
1948}
1949static inline u32 sim_dma_status_invalid_f(void)
1950{
1951 return 0x0;
1952}
1953static inline u32 sim_dma_status_valid_v(void)
1954{
1955 return 0x00000001;
1956}
1957static inline u32 sim_dma_status_valid_f(void)
1958{
1959 return 0x8;
1960}
1961static inline u32 sim_dma_size_s(void)
1962{
1963 return 2;
1964}
1965static inline u32 sim_dma_size_f(u32 v)
1966{
1967 return (v & 0x3) << 4;
1968}
1969static inline u32 sim_dma_size_m(void)
1970{
1971 return 0x3 << 4;
1972}
1973static inline u32 sim_dma_size_v(u32 r)
1974{
1975 return (r >> 4) & 0x3;
1976}
1977static inline u32 sim_dma_size_init_v(void)
1978{
1979 return 0x00000000;
1980}
1981static inline u32 sim_dma_size_init_f(void)
1982{
1983 return 0x0;
1984}
1985static inline u32 sim_dma_size__init_v(void)
1986{
1987 return 0x00000000;
1988}
1989static inline u32 sim_dma_size__init_f(void)
1990{
1991 return 0x0;
1992}
1993static inline u32 sim_dma_size__prod_v(void)
1994{
1995 return 0x00000000;
1996}
1997static inline u32 sim_dma_size__prod_f(void)
1998{
1999 return 0x0;
2000}
2001static inline u32 sim_dma_size_4kb_v(void)
2002{
2003 return 0x00000000;
2004}
2005static inline u32 sim_dma_size_4kb_f(void)
2006{
2007 return 0x0;
2008}
2009static inline u32 sim_dma_size_8kb_v(void)
2010{
2011 return 0x00000001;
2012}
2013static inline u32 sim_dma_size_8kb_f(void)
2014{
2015 return 0x10;
2016}
2017static inline u32 sim_dma_size_12kb_v(void)
2018{
2019 return 0x00000002;
2020}
2021static inline u32 sim_dma_size_12kb_f(void)
2022{
2023 return 0x20;
2024}
2025static inline u32 sim_dma_size_16kb_v(void)
2026{
2027 return 0x00000003;
2028}
2029static inline u32 sim_dma_size_16kb_f(void)
2030{
2031 return 0x30;
2032}
2033static inline u32 sim_dma_addr_lo_s(void)
2034{
2035 return 20;
2036}
2037static inline u32 sim_dma_addr_lo_f(u32 v)
2038{
2039 return (v & 0xfffff) << 12;
2040}
2041static inline u32 sim_dma_addr_lo_m(void)
2042{
2043 return 0xfffff << 12;
2044}
2045static inline u32 sim_dma_addr_lo_v(u32 r)
2046{
2047 return (r >> 12) & 0xfffff;
2048}
2049static inline u32 sim_dma_addr_lo__init_v(void)
2050{
2051 return 0x00000000;
2052}
2053static inline u32 sim_dma_addr_lo__init_f(void)
2054{
2055 return 0x0;
2056}
2057static inline u32 sim_dma_addr_lo__prod_v(void)
2058{
2059 return 0x00000000;
2060}
2061static inline u32 sim_dma_addr_lo__prod_f(void)
2062{
2063 return 0x0;
2064}
2065static inline u32 sim_dma_hi_r(void)
2066{
2067 return 0x00000004;
2068}
2069static inline u32 sim_dma_hi_addr_s(void)
2070{
2071 return 20;
2072}
2073static inline u32 sim_dma_hi_addr_f(u32 v)
2074{
2075 return (v & 0xfffff) << 0;
2076}
2077static inline u32 sim_dma_hi_addr_m(void)
2078{
2079 return 0xfffff << 0;
2080}
2081static inline u32 sim_dma_hi_addr_v(u32 r)
2082{
2083 return (r >> 0) & 0xfffff;
2084}
2085static inline u32 sim_dma_hi_addr__init_v(void)
2086{
2087 return 0x00000000;
2088}
2089static inline u32 sim_dma_hi_addr__init_f(void)
2090{
2091 return 0x0;
2092}
2093static inline u32 sim_dma_hi_addr__prod_v(void)
2094{
2095 return 0x00000000;
2096}
2097static inline u32 sim_dma_hi_addr__prod_f(void)
2098{
2099 return 0x0;
2100}
2101static inline u32 sim_msg_signature_r(void)
2102{
2103 return 0x00000000;
2104}
2105static inline u32 sim_msg_signature_valid_v(void)
2106{
2107 return 0x43505256;
2108}
2109static inline u32 sim_msg_length_r(void)
2110{
2111 return 0x00000004;
2112}
2113static inline u32 sim_msg_function_r(void)
2114{
2115 return 0x00000008;
2116}
2117static inline u32 sim_msg_function_sim_escape_read_v(void)
2118{
2119 return 0x00000023;
2120}
2121static inline u32 sim_msg_function_sim_escape_write_v(void)
2122{
2123 return 0x00000024;
2124}
2125static inline u32 sim_msg_result_r(void)
2126{
2127 return 0x0000000c;
2128}
2129static inline u32 sim_msg_result_success_v(void)
2130{
2131 return 0x00000000;
2132}
2133static inline u32 sim_msg_result_rpc_pending_v(void)
2134{
2135 return 0xFFFFFFFF;
2136}
2137static inline u32 sim_msg_sequence_r(void)
2138{
2139 return 0x00000010;
2140}
2141static inline u32 sim_msg_spare_r(void)
2142{
2143 return 0x00000014;
2144}
2145static inline u32 sim_msg_spare__init_v(void)
2146{
2147 return 0x00000000;
2148}
2149
2150#endif /* __hw_sim_gk20a_h__ */
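
The sim_msg_* values above appear to be byte offsets into the in-memory simulator message header rather than MMIO register offsets (they are consecutive 4-byte slots starting at 0). A minimal sketch, not part of the patch, of filling such a header; the helper name and the caller-provided buffer are illustrative assumptions:

static inline void sketch_sim_msg_header_init(u8 *msg, u32 function, u32 length)
{
	/* each *_r() value is used here as a byte offset into the message area */
	*(u32 *)(msg + sim_msg_signature_r()) = sim_msg_signature_valid_v();
	*(u32 *)(msg + sim_msg_length_r()) = length;
	*(u32 *)(msg + sim_msg_function_r()) = function;
	*(u32 *)(msg + sim_msg_result_r()) = sim_msg_result_rpc_pending_v();
}
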
diff --git a/drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h
new file mode 100644
index 00000000..5d6397b4
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h
@@ -0,0 +1,225 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_therm_gk20a_h_
51#define _hw_therm_gk20a_h_
52
53static inline u32 therm_use_a_r(void)
54{
55 return 0x00020798;
56}
57static inline u32 therm_evt_ext_therm_0_r(void)
58{
59 return 0x00020700;
60}
61static inline u32 therm_evt_ext_therm_1_r(void)
62{
63 return 0x00020704;
64}
65static inline u32 therm_evt_ext_therm_2_r(void)
66{
67 return 0x00020708;
68}
69static inline u32 therm_evt_ba_w0_t1h_r(void)
70{
71 return 0x00020750;
72}
73static inline u32 therm_weight_1_r(void)
74{
75 return 0x00020024;
76}
77static inline u32 therm_peakpower_config1_r(u32 i)
78{
79 return 0x00020154 + i*4;
80}
81static inline u32 therm_peakpower_config1_window_period_2m_v(void)
82{
83 return 0x0000000f;
84}
85static inline u32 therm_peakpower_config1_window_period_2m_f(void)
86{
87 return 0xf;
88}
89static inline u32 therm_peakpower_config1_ba_sum_shift_s(void)
90{
91 return 6;
92}
93static inline u32 therm_peakpower_config1_ba_sum_shift_f(u32 v)
94{
95 return (v & 0x3f) << 8;
96}
97static inline u32 therm_peakpower_config1_ba_sum_shift_m(void)
98{
99 return 0x3f << 8;
100}
101static inline u32 therm_peakpower_config1_ba_sum_shift_v(u32 r)
102{
103 return (r >> 8) & 0x3f;
104}
105static inline u32 therm_peakpower_config1_ba_sum_shift_20_f(void)
106{
107 return 0x1400;
108}
109static inline u32 therm_peakpower_config1_window_en_enabled_f(void)
110{
111 return 0x80000000;
112}
113static inline u32 therm_peakpower_config2_r(u32 i)
114{
115 return 0x00020170 + i*4;
116}
117static inline u32 therm_peakpower_config4_r(u32 i)
118{
119 return 0x000201c0 + i*4;
120}
121static inline u32 therm_peakpower_config6_r(u32 i)
122{
123 return 0x00020270 + i*4;
124}
125static inline u32 therm_peakpower_config8_r(u32 i)
126{
127 return 0x000202e8 + i*4;
128}
129static inline u32 therm_peakpower_config9_r(u32 i)
130{
131 return 0x000202f4 + i*4;
132}
133static inline u32 therm_config1_r(void)
134{
135 return 0x00020050;
136}
137static inline u32 therm_gate_ctrl_r(u32 i)
138{
139 return 0x00020200 + i*4;
140}
141static inline u32 therm_gate_ctrl_eng_clk_m(void)
142{
143 return 0x3 << 0;
144}
145static inline u32 therm_gate_ctrl_eng_clk_run_f(void)
146{
147 return 0x0;
148}
149static inline u32 therm_gate_ctrl_eng_clk_auto_f(void)
150{
151 return 0x1;
152}
153static inline u32 therm_gate_ctrl_eng_clk_stop_f(void)
154{
155 return 0x2;
156}
157static inline u32 therm_gate_ctrl_blk_clk_m(void)
158{
159 return 0x3 << 2;
160}
161static inline u32 therm_gate_ctrl_blk_clk_run_f(void)
162{
163 return 0x0;
164}
165static inline u32 therm_gate_ctrl_blk_clk_auto_f(void)
166{
167 return 0x4;
168}
169static inline u32 therm_gate_ctrl_eng_pwr_m(void)
170{
171 return 0x3 << 4;
172}
173static inline u32 therm_gate_ctrl_eng_pwr_auto_f(void)
174{
175 return 0x10;
176}
177static inline u32 therm_gate_ctrl_eng_pwr_off_v(void)
178{
179 return 0x00000002;
180}
181static inline u32 therm_gate_ctrl_eng_pwr_off_f(void)
182{
183 return 0x20;
184}
185static inline u32 therm_gate_ctrl_eng_idle_filt_exp_f(u32 v)
186{
187 return (v & 0x1f) << 8;
188}
189static inline u32 therm_gate_ctrl_eng_idle_filt_exp_m(void)
190{
191 return 0x1f << 8;
192}
193static inline u32 therm_gate_ctrl_eng_idle_filt_mant_f(u32 v)
194{
195 return (v & 0x7) << 13;
196}
197static inline u32 therm_gate_ctrl_eng_idle_filt_mant_m(void)
198{
199 return 0x7 << 13;
200}
201static inline u32 therm_gate_ctrl_eng_delay_after_f(u32 v)
202{
203 return (v & 0xf) << 20;
204}
205static inline u32 therm_gate_ctrl_eng_delay_after_m(void)
206{
207 return 0xf << 20;
208}
209static inline u32 therm_fecs_idle_filter_r(void)
210{
211 return 0x00020288;
212}
213static inline u32 therm_fecs_idle_filter_value_m(void)
214{
215 return 0xffffffff << 0;
216}
217static inline u32 therm_hubmmu_idle_filter_r(void)
218{
219 return 0x0002028c;
220}
221static inline u32 therm_hubmmu_idle_filter_value_m(void)
222{
223 return 0xffffffff << 0;
224}
225#endif
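
As a concrete illustration of the accessor convention documented at the top of this header, the sketch below (not part of the patch) performs a read-modify-write of one gate-control register: _m() masks a field out, _f() installs either a pre-shifted constant or a caller-supplied value. It assumes the gk20a_readl()/gk20a_writel() helpers used elsewhere in this series; the function name and the filt_exp parameter are illustrative.

static void sketch_therm_gate_auto(struct gk20a *g, u32 i, u32 filt_exp)
{
	u32 v = gk20a_readl(g, therm_gate_ctrl_r(i));

	v &= ~therm_gate_ctrl_eng_clk_m();	/* clear the 2-bit clk field */
	v |= therm_gate_ctrl_eng_clk_auto_f();	/* pre-shifted constant value */

	v &= ~therm_gate_ctrl_eng_idle_filt_exp_m();
	v |= therm_gate_ctrl_eng_idle_filt_exp_f(filt_exp);

	gk20a_writel(g, therm_gate_ctrl_r(i), v);
}
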
diff --git a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
new file mode 100644
index 00000000..22bc50ac
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_timer_gk20a_h_
51#define _hw_timer_gk20a_h_
52
53static inline u32 timer_pri_timeout_r(void)
54{
55 return 0x00009080;
56}
57static inline u32 timer_pri_timeout_period_f(u32 v)
58{
59 return (v & 0xffffff) << 0;
60}
61static inline u32 timer_pri_timeout_period_m(void)
62{
63 return 0xffffff << 0;
64}
65static inline u32 timer_pri_timeout_period_v(u32 r)
66{
67 return (r >> 0) & 0xffffff;
68}
69static inline u32 timer_pri_timeout_en_f(u32 v)
70{
71 return (v & 0x1) << 31;
72}
73static inline u32 timer_pri_timeout_en_m(void)
74{
75 return 0x1 << 31;
76}
77static inline u32 timer_pri_timeout_en_v(u32 r)
78{
79 return (r >> 31) & 0x1;
80}
81static inline u32 timer_pri_timeout_en_en_enabled_f(void)
82{
83 return 0x80000000;
84}
85static inline u32 timer_pri_timeout_en_en_disabled_f(void)
86{
87 return 0x0;
88}
89static inline u32 timer_pri_timeout_save_0_r(void)
90{
91 return 0x00009084;
92}
93static inline u32 timer_pri_timeout_save_1_r(void)
94{
95 return 0x00009088;
96}
97static inline u32 timer_pri_timeout_fecs_errcode_r(void)
98{
99 return 0x0000908c;
100}
101#endif
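
A minimal sketch (not part of the patch) of composing a full register value from these accessors: the period goes through the value-shifting _f() helper and is OR'd with the pre-shifted enable constant. gk20a_writel() is the register write helper used elsewhere in this series; the helper name and the units of "period" (hardware-defined) are assumptions.

static void sketch_timer_pri_timeout_enable(struct gk20a *g, u32 period)
{
	gk20a_writel(g, timer_pri_timeout_r(),
		     timer_pri_timeout_period_f(period) |
		     timer_pri_timeout_en_en_enabled_f());
}
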
diff --git a/drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h
new file mode 100644
index 00000000..c2922814
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_top_gk20a_h_
51#define _hw_top_gk20a_h_
52
53static inline u32 top_num_gpcs_r(void)
54{
55 return 0x00022430;
56}
57static inline u32 top_num_gpcs_value_v(u32 r)
58{
59 return (r >> 0) & 0x1f;
60}
61static inline u32 top_tpc_per_gpc_r(void)
62{
63 return 0x00022434;
64}
65static inline u32 top_tpc_per_gpc_value_v(u32 r)
66{
67 return (r >> 0) & 0x1f;
68}
69static inline u32 top_num_fbps_r(void)
70{
71 return 0x00022438;
72}
73static inline u32 top_num_fbps_value_v(u32 r)
74{
75 return (r >> 0) & 0x1f;
76}
77static inline u32 top_fs_status_r(void)
78{
79 return 0x00022500;
80}
81static inline u32 top_device_info_r(u32 i)
82{
83 return 0x00022700 + i*4;
84}
85static inline u32 top_device_info__size_1_v(void)
86{
87 return 0x00000040;
88}
89static inline u32 top_device_info_chain_v(u32 r)
90{
91 return (r >> 31) & 0x1;
92}
93static inline u32 top_device_info_chain_enable_v(void)
94{
95 return 0x00000001;
96}
97static inline u32 top_device_info_engine_enum_v(u32 r)
98{
99 return (r >> 26) & 0xf;
100}
101static inline u32 top_device_info_runlist_enum_v(u32 r)
102{
103 return (r >> 21) & 0xf;
104}
105static inline u32 top_device_info_type_enum_v(u32 r)
106{
107 return (r >> 2) & 0x1fffffff;
108}
109static inline u32 top_device_info_type_enum_graphics_v(void)
110{
111 return 0x00000000;
112}
113static inline u32 top_device_info_type_enum_graphics_f(void)
114{
115 return 0x0;
116}
117static inline u32 top_device_info_type_enum_copy0_v(void)
118{
119 return 0x00000001;
120}
121static inline u32 top_device_info_type_enum_copy0_f(void)
122{
123 return 0x4;
124}
125static inline u32 top_device_info_entry_v(u32 r)
126{
127 return (r >> 0) & 0x3;
128}
129static inline u32 top_device_info_entry_not_valid_v(void)
130{
131 return 0x00000000;
132}
133static inline u32 top_device_info_entry_enum_v(void)
134{
135 return 0x00000002;
136}
137#endif
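
A sketch (not part of the patch) of walking the device_info table with these accessors. It only inspects entries whose low bits mark them as enum entries and, for brevity, ignores the chain bit that flags multi-word entries; the helper name is illustrative and gk20a_readl() comes from elsewhere in this series.

static u32 sketch_count_copy0_entries(struct gk20a *g)
{
	u32 i, count = 0;

	for (i = 0; i < top_device_info__size_1_v(); i++) {
		u32 entry = gk20a_readl(g, top_device_info_r(i));

		if (top_device_info_entry_v(entry) !=
		    top_device_info_entry_enum_v())
			continue;
		if (top_device_info_type_enum_v(entry) ==
		    top_device_info_type_enum_copy0_v())
			count++;
	}
	return count;
}
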
diff --git a/drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h
new file mode 100644
index 00000000..826e9bd1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h
@@ -0,0 +1,301 @@
1/*
2 * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 */
16/*
17 * Function naming determines intended use:
18 *
19 * <x>_r(void) : Returns the offset for register <x>.
20 *
21 * <x>_o(void) : Returns the offset for element <x>.
22 *
23 * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
24 *
25 * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
26 *
27 * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
28 * and masked to place it at field <y> of register <x>. This value
29 * can be |'d with others to produce a full register value for
30 * register <x>.
31 *
32 * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
33 * value can be ~'d and then &'d to clear the value of field <y> for
34 * register <x>.
35 *
36 * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
37 * to place it at field <y> of register <x>. This value can be |'d
38 * with others to produce a full register value for <x>.
39 *
40 * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
41 * <x> value 'r' after being shifted to place its LSB at bit 0.
42 * This value is suitable for direct comparison with other unshifted
43 * values appropriate for use in field <y> of register <x>.
44 *
45 * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
46 * field <y> of register <x>. This value is suitable for direct
47 * comparison with unshifted values appropriate for use in field <y>
48 * of register <x>.
49 */
50#ifndef _hw_trim_gk20a_h_
51#define _hw_trim_gk20a_h_
52
53static inline u32 trim_sys_gpcpll_cfg_r(void)
54{
55 return 0x00137000;
56}
57static inline u32 trim_sys_gpcpll_cfg_enable_m(void)
58{
59 return 0x1 << 0;
60}
61static inline u32 trim_sys_gpcpll_cfg_enable_v(u32 r)
62{
63 return (r >> 0) & 0x1;
64}
65static inline u32 trim_sys_gpcpll_cfg_enable_no_f(void)
66{
67 return 0x0;
68}
69static inline u32 trim_sys_gpcpll_cfg_enable_yes_f(void)
70{
71 return 0x1;
72}
73static inline u32 trim_sys_gpcpll_cfg_iddq_m(void)
74{
75 return 0x1 << 1;
76}
77static inline u32 trim_sys_gpcpll_cfg_iddq_v(u32 r)
78{
79 return (r >> 1) & 0x1;
80}
81static inline u32 trim_sys_gpcpll_cfg_iddq_power_on_v(void)
82{
83 return 0x00000000;
84}
85static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_m(void)
86{
87 return 0x1 << 4;
88}
89static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_power_on_f(void)
90{
91 return 0x0;
92}
93static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_power_off_f(void)
94{
95 return 0x10;
96}
97static inline u32 trim_sys_gpcpll_cfg_pll_lock_v(u32 r)
98{
99 return (r >> 17) & 0x1;
100}
101static inline u32 trim_sys_gpcpll_cfg_pll_lock_true_f(void)
102{
103 return 0x20000;
104}
105static inline u32 trim_sys_gpcpll_coeff_r(void)
106{
107 return 0x00137004;
108}
109static inline u32 trim_sys_gpcpll_coeff_mdiv_f(u32 v)
110{
111 return (v & 0xff) << 0;
112}
113static inline u32 trim_sys_gpcpll_coeff_mdiv_v(u32 r)
114{
115 return (r >> 0) & 0xff;
116}
117static inline u32 trim_sys_gpcpll_coeff_ndiv_f(u32 v)
118{
119 return (v & 0xff) << 8;
120}
121static inline u32 trim_sys_gpcpll_coeff_ndiv_m(void)
122{
123 return 0xff << 8;
124}
125static inline u32 trim_sys_gpcpll_coeff_ndiv_v(u32 r)
126{
127 return (r >> 8) & 0xff;
128}
129static inline u32 trim_sys_gpcpll_coeff_pldiv_f(u32 v)
130{
131 return (v & 0x3f) << 16;
132}
133static inline u32 trim_sys_gpcpll_coeff_pldiv_v(u32 r)
134{
135 return (r >> 16) & 0x3f;
136}
137static inline u32 trim_sys_sel_vco_r(void)
138{
139 return 0x00137100;
140}
141static inline u32 trim_sys_sel_vco_gpc2clk_out_m(void)
142{
143 return 0x1 << 0;
144}
145static inline u32 trim_sys_sel_vco_gpc2clk_out_init_v(void)
146{
147 return 0x00000000;
148}
149static inline u32 trim_sys_sel_vco_gpc2clk_out_init_f(void)
150{
151 return 0x0;
152}
153static inline u32 trim_sys_sel_vco_gpc2clk_out_bypass_f(void)
154{
155 return 0x0;
156}
157static inline u32 trim_sys_sel_vco_gpc2clk_out_vco_f(void)
158{
159 return 0x1;
160}
161static inline u32 trim_sys_gpc2clk_out_r(void)
162{
163 return 0x00137250;
164}
165static inline u32 trim_sys_gpc2clk_out_bypdiv_s(void)
166{
167 return 6;
168}
169static inline u32 trim_sys_gpc2clk_out_bypdiv_f(u32 v)
170{
171 return (v & 0x3f) << 0;
172}
173static inline u32 trim_sys_gpc2clk_out_bypdiv_m(void)
174{
175 return 0x3f << 0;
176}
177static inline u32 trim_sys_gpc2clk_out_bypdiv_v(u32 r)
178{
179 return (r >> 0) & 0x3f;
180}
181static inline u32 trim_sys_gpc2clk_out_bypdiv_by31_f(void)
182{
183 return 0x3c;
184}
185static inline u32 trim_sys_gpc2clk_out_vcodiv_s(void)
186{
187 return 6;
188}
189static inline u32 trim_sys_gpc2clk_out_vcodiv_f(u32 v)
190{
191 return (v & 0x3f) << 8;
192}
193static inline u32 trim_sys_gpc2clk_out_vcodiv_m(void)
194{
195 return 0x3f << 8;
196}
197static inline u32 trim_sys_gpc2clk_out_vcodiv_v(u32 r)
198{
199 return (r >> 8) & 0x3f;
200}
201static inline u32 trim_sys_gpc2clk_out_vcodiv_by1_f(void)
202{
203 return 0x0;
204}
205static inline u32 trim_sys_gpc2clk_out_sdiv14_m(void)
206{
207 return 0x1 << 31;
208}
209static inline u32 trim_sys_gpc2clk_out_sdiv14_indiv4_mode_f(void)
210{
211 return 0x80000000;
212}
213static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_r(u32 i)
214{
215 return 0x00134124 + i*512;
216}
217static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_noofipclks_f(u32 v)
218{
219 return (v & 0x3fff) << 0;
220}
221static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_write_en_asserted_f(void)
222{
223 return 0x10000;
224}
225static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_enable_asserted_f(void)
226{
227 return 0x100000;
228}
229static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_reset_asserted_f(void)
230{
231 return 0x1000000;
232}
233static inline u32 trim_gpc_clk_cntr_ncgpcclk_cnt_r(u32 i)
234{
235 return 0x00134128 + i*512;
236}
237static inline u32 trim_gpc_clk_cntr_ncgpcclk_cnt_value_v(u32 r)
238{
239 return (r >> 0) & 0xfffff;
240}
241static inline u32 trim_sys_gpcpll_cfg2_r(void)
242{
243 return 0x0013700c;
244}
245static inline u32 trim_sys_gpcpll_cfg2_pll_stepa_f(u32 v)
246{
247 return (v & 0xff) << 24;
248}
249static inline u32 trim_sys_gpcpll_cfg2_pll_stepa_m(void)
250{
251 return 0xff << 24;
252}
253static inline u32 trim_sys_gpcpll_cfg3_r(void)
254{
255 return 0x00137018;
256}
257static inline u32 trim_sys_gpcpll_cfg3_pll_stepb_f(u32 v)
258{
259 return (v & 0xff) << 16;
260}
261static inline u32 trim_sys_gpcpll_cfg3_pll_stepb_m(void)
262{
263 return 0xff << 16;
264}
265static inline u32 trim_sys_gpcpll_ndiv_slowdown_r(void)
266{
267 return 0x0013701c;
268}
269static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_m(void)
270{
271 return 0x1 << 22;
272}
273static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_yes_f(void)
274{
275 return 0x400000;
276}
277static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_no_f(void)
278{
279 return 0x0;
280}
281static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_m(void)
282{
283 return 0x1 << 31;
284}
285static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_yes_f(void)
286{
287 return 0x80000000;
288}
289static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_no_f(void)
290{
291 return 0x0;
292}
293static inline u32 trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_r(void)
294{
295 return 0x001328a0;
296}
297static inline u32 trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_pll_dynramp_done_synced_v(u32 r)
298{
299 return (r >> 24) & 0x1;
300}
301#endif
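
A sketch (not part of the patch) of decoding the GPCPLL coefficient register with the _v() accessors. The output relation ref * N / (M * PL) is the conventional PLL formula and is an assumption here, not something this header defines; the helper name is illustrative.

static u32 sketch_gpcpll_out_khz(struct gk20a *g, u32 ref_khz)
{
	u32 coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
	u32 m = trim_sys_gpcpll_coeff_mdiv_v(coeff);
	u32 n = trim_sys_gpcpll_coeff_ndiv_v(coeff);
	u32 pl = trim_sys_gpcpll_coeff_pldiv_v(coeff);

	if (m == 0 || pl == 0)
		return 0;	/* PLL not programmed */

	return ref_khz * n / (m * pl);
}
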
diff --git a/drivers/gpu/nvgpu/gk20a/kind_gk20a.c b/drivers/gpu/nvgpu/gk20a/kind_gk20a.c
new file mode 100644
index 00000000..b0a74056
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/kind_gk20a.c
@@ -0,0 +1,424 @@
1/*
2 * drivers/video/tegra/host/gk20a/kind_gk20a.c
3 *
4 * GK20A memory kind management
5 *
6 * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#include <linux/bitops.h>
22
23#include "hw_gmmu_gk20a.h"
24#include "kind_gk20a.h"
25
26/* TBD: generate these from kind_macros.h */
27
28/* TBD: not sure on the work creation for gk20a, doubtful */
29static inline bool gk20a_kind_work_creation_sked(u8 k)
30{
31 return false;
32}
33static inline bool gk20a_kind_work_creation_host(u8 k)
34{
35 return false;
36}
37
38static inline bool gk20a_kind_work_creation(u8 k)
39{
40 return gk20a_kind_work_creation_sked(k) ||
41 gk20a_kind_work_creation_host(k);
42}
43
44/* note: taken from the !2cs_compression case */
45static inline bool gk20a_kind_supported(u8 k)
46{
47 return gk20a_kind_work_creation(k) ||
48 (k == gmmu_pte_kind_invalid_v()) ||
49 (k == gmmu_pte_kind_pitch_v()) ||
50 (k >= gmmu_pte_kind_z16_v() &&
51 k <= gmmu_pte_kind_z16_ms8_2c_v()) ||
52 (k >= gmmu_pte_kind_z16_2z_v() &&
53 k <= gmmu_pte_kind_z16_ms8_2z_v()) ||
54 (k == gmmu_pte_kind_s8z24_v()) ||
55 (k >= gmmu_pte_kind_s8z24_2cz_v() &&
56 k <= gmmu_pte_kind_s8z24_ms8_2cz_v()) ||
57 (k >= gmmu_pte_kind_v8z24_ms4_vc12_v() &&
58 k <= gmmu_pte_kind_v8z24_ms8_vc24_v()) ||
59 (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
60 k <= gmmu_pte_kind_v8z24_ms8_vc24_2zv_v()) ||
61 (k == gmmu_pte_kind_z24s8_v()) ||
62 (k >= gmmu_pte_kind_z24s8_2cz_v() &&
63 k <= gmmu_pte_kind_z24s8_ms8_2cz_v()) ||
64 (k == gmmu_pte_kind_zf32_v()) ||
65 (k >= gmmu_pte_kind_zf32_2cz_v() &&
66 k <= gmmu_pte_kind_zf32_ms8_2cz_v()) ||
67 (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v() &&
68 k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v()) ||
69 (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cszv_v() &&
70 k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v()) ||
71 (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cszv_v() &&
72 k <= gmmu_pte_kind_zf32_x24s8_v()) ||
73 (k >= gmmu_pte_kind_zf32_x24s8_2cszv_v() &&
74 k <= gmmu_pte_kind_zf32_x24s8_ms8_2cszv_v()) ||
75 (k == gmmu_pte_kind_generic_16bx2_v()) ||
76 (k == gmmu_pte_kind_c32_2c_v()) ||
77 (k == gmmu_pte_kind_c32_2cra_v()) ||
78 (k == gmmu_pte_kind_c32_ms2_2c_v()) ||
79 (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
80 (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
81 k <= gmmu_pte_kind_c32_ms4_2cbr_v()) ||
82 (k >= gmmu_pte_kind_c32_ms4_2cra_v() &&
83 k <= gmmu_pte_kind_c64_2c_v()) ||
84 (k == gmmu_pte_kind_c64_2cra_v()) ||
85 (k == gmmu_pte_kind_c64_ms2_2c_v()) ||
86 (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
87 (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
88 k <= gmmu_pte_kind_c64_ms4_2cbr_v()) ||
89 (k >= gmmu_pte_kind_c64_ms4_2cra_v() &&
90 k <= gmmu_pte_kind_c128_ms8_ms16_2cr_v()) ||
91 (k == gmmu_pte_kind_pitch_no_swizzle_v());
 92}
93
94static inline bool gk20a_kind_z(u8 k)
95{
96 return (k >= gmmu_pte_kind_z16_v() &&
97 k <= gmmu_pte_kind_v8z24_ms8_vc24_v()) ||
98 (k >= gmmu_pte_kind_v8z24_ms4_vc12_1zv_v() &&
99 k <= gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()) ||
100 (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
101 k <= gmmu_pte_kind_z24v8_ms8_vc24_v()) ||
102 (k >= gmmu_pte_kind_z24v8_ms4_vc12_1zv_v() &&
103 k <= gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()) ||
104 (k >= gmmu_pte_kind_z24v8_ms4_vc12_2czv_v() &&
105 k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v()) ||
106 (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1zv_v() &&
107 k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v()) ||
108 (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1zv_v() &&
109 k <= gmmu_pte_kind_zf32_x24s8_ms16_1cs_v())
110 /* ||
111 (k >= gmmu_pte_kind_zv32_x24s8_2cszv_v() &&
112 k <= gmmu_pte_kind_xf32_x24s8_ms16_2cs_v())*/;
113}
114
115static inline bool gk20a_kind_c(u8 k)
116{
117 return gk20a_kind_work_creation(k) ||
118 (k == gmmu_pte_kind_pitch_v()) ||
119 (k == gmmu_pte_kind_generic_16bx2_v()) ||
120 (k >= gmmu_pte_kind_c32_2c_v() &&
121 k <= gmmu_pte_kind_c32_ms2_2cbr_v()) ||
122 (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
123 (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
124 k <= gmmu_pte_kind_c64_ms2_2cbr_v()) ||
125 (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
126 (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
127 k <= gmmu_pte_kind_pitch_no_swizzle_v());
128}
129
130static inline bool gk20a_kind_compressible(u8 k)
131{
132 return (k >= gmmu_pte_kind_z16_2c_v() &&
133 k <= gmmu_pte_kind_z16_ms16_4cz_v()) ||
134 (k >= gmmu_pte_kind_s8z24_1z_v() &&
135 k <= gmmu_pte_kind_s8z24_ms16_4cszv_v()) ||
136 (k >= gmmu_pte_kind_v8z24_ms4_vc12_1zv_v() &&
137 k <= gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()) ||
138 (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
139 k <= gmmu_pte_kind_v8z24_ms8_vc24_4cszv_v()) ||
140 (k >= gmmu_pte_kind_z24s8_1z_v() &&
141 k <= gmmu_pte_kind_z24s8_ms16_4cszv_v()) ||
142 (k >= gmmu_pte_kind_z24v8_ms4_vc12_1zv_v() &&
143 k <= gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()) ||
144 (k >= gmmu_pte_kind_z24v8_ms4_vc12_2czv_v() &&
145 k <= gmmu_pte_kind_z24v8_ms8_vc24_4cszv_v()) ||
146 (k >= gmmu_pte_kind_zf32_1z_v() &&
147 k <= gmmu_pte_kind_zf32_ms16_2cz_v()) ||
148 (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1cs_v() &&
149 k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v()) ||
150 (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1zv_v() &&
151 k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v()) ||
152 (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1cs_v() &&
153 k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v()) ||
154 (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1zv_v() &&
155 k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v()) ||
156 (k >= gmmu_pte_kind_zf32_x24s8_1cs_v() &&
157 k <= gmmu_pte_kind_zf32_x24s8_ms16_1cs_v()) ||
158 (k >= gmmu_pte_kind_zf32_x24s8_2cszv_v() &&
159 k <= gmmu_pte_kind_c32_ms2_2cbr_v()) ||
160 (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
161 (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
162 k <= gmmu_pte_kind_c64_ms2_2cbr_v()) ||
163 (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
164 (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
165 k <= gmmu_pte_kind_c128_ms8_ms16_2cr_v());
166}
167
168static inline bool gk20a_kind_zbc(u8 k)
169{
170 return (k >= gmmu_pte_kind_z16_2c_v() &&
171 k <= gmmu_pte_kind_z16_ms16_2c_v()) ||
172 (k >= gmmu_pte_kind_z16_4cz_v() &&
173 k <= gmmu_pte_kind_z16_ms16_4cz_v()) ||
174 (k >= gmmu_pte_kind_s8z24_2cz_v() &&
175 k <= gmmu_pte_kind_s8z24_ms16_4cszv_v()) ||
176 (k >= gmmu_pte_kind_v8z24_ms4_vc12_2cs_v() &&
177 k <= gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()) ||
178 (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
179 k <= gmmu_pte_kind_v8z24_ms8_vc24_2czv_v()) ||
180 (k >= gmmu_pte_kind_v8z24_ms4_vc12_4cszv_v() &&
181 k <= gmmu_pte_kind_v8z24_ms8_vc24_4cszv_v()) ||
182 (k >= gmmu_pte_kind_z24s8_2cs_v() &&
183 k <= gmmu_pte_kind_z24s8_ms16_4cszv_v()) ||
184 (k >= gmmu_pte_kind_z24v8_ms4_vc12_2cs_v() &&
185 k <= gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()) ||
186 (k >= gmmu_pte_kind_z24v8_ms4_vc12_2czv_v() &&
187 k <= gmmu_pte_kind_z24v8_ms8_vc24_2czv_v()) ||
188 (k >= gmmu_pte_kind_z24v8_ms4_vc12_4cszv_v() &&
189 k <= gmmu_pte_kind_z24v8_ms8_vc24_4cszv_v()) ||
190 (k >= gmmu_pte_kind_zf32_2cs_v() &&
191 k <= gmmu_pte_kind_zf32_ms16_2cz_v()) ||
192 (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1cs_v() &&
193 k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v()) ||
194 (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1czv_v() &&
195 k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v()) ||
196 (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1cs_v() &&
197 k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v()) ||
198 (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1czv_v() &&
199 k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v()) ||
200 (k >= gmmu_pte_kind_zf32_x24s8_1cs_v() &&
201 k <= gmmu_pte_kind_zf32_x24s8_ms16_1cs_v()) ||
202 (k >= gmmu_pte_kind_zf32_x24s8_2cszv_v() &&
203 k <= gmmu_pte_kind_c32_2cra_v()) ||
204 (k >= gmmu_pte_kind_c32_ms2_2c_v() &&
205 k <= gmmu_pte_kind_c32_ms2_2cbr_v()) ||
206 (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
207 (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
208 k <= gmmu_pte_kind_c32_ms4_2cra_v()) ||
209 (k >= gmmu_pte_kind_c32_ms8_ms16_2c_v() &&
210 k <= gmmu_pte_kind_c64_2cra_v()) ||
211 (k >= gmmu_pte_kind_c64_ms2_2c_v() &&
212 k <= gmmu_pte_kind_c64_ms2_2cbr_v()) ||
213 (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
214 (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
215 k <= gmmu_pte_kind_c64_ms4_2cra_v()) ||
216 (k >= gmmu_pte_kind_c64_ms8_ms16_2c_v() &&
217 k <= gmmu_pte_kind_c128_ms8_ms16_2cr_v());
218}
219
220u8 gk20a_uc_kind_map[256];
221void gk20a_init_uncompressed_kind_map(void)
222{
223 int i;
224 for (i = 0; i < 256; i++)
225 gk20a_uc_kind_map[i] = gmmu_pte_kind_invalid_v();
226
227 gk20a_uc_kind_map[gmmu_pte_kind_z16_v()] =
228 gk20a_uc_kind_map[gmmu_pte_kind_z16_2c_v()] =
229 gk20a_uc_kind_map[gmmu_pte_kind_z16_ms2_2c_v()] =
230 gk20a_uc_kind_map[gmmu_pte_kind_z16_ms4_2c_v()] =
231 gk20a_uc_kind_map[gmmu_pte_kind_z16_ms8_2c_v()] =
232 gk20a_uc_kind_map[gmmu_pte_kind_z16_2z_v()] =
233 gk20a_uc_kind_map[gmmu_pte_kind_z16_ms2_2z_v()] =
234 gk20a_uc_kind_map[gmmu_pte_kind_z16_ms4_2z_v()] =
235 gk20a_uc_kind_map[gmmu_pte_kind_z16_ms8_2z_v()] =
236 gmmu_pte_kind_z16_v();
237
238 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_v()] =
239 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_2cz_v()] =
240 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms2_2cz_v()] =
241 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms4_2cz_v()] =
242 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms8_2cz_v()] =
243 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_2cs_v()] =
244 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms2_2cs_v()] =
245 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms4_2cs_v()] =
246 gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms8_2cs_v()] =
247 gmmu_pte_kind_s8z24_v();
248
249 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_v()] =
250 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_2cs_v()] =
251 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_2czv_v()] =
252 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_2zv_v()] =
253 gmmu_pte_kind_v8z24_ms4_vc4_v();
254
255 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_v()] =
256 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_2cs_v()] =
257 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_2czv_v()] =
258 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_2zv_v()] =
259 gmmu_pte_kind_v8z24_ms8_vc8_v();
260
261 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_v()] =
262 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_2cs_v()] =
263 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_2czv_v()] =
264 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_2zv_v()] =
265 gmmu_pte_kind_v8z24_ms4_vc12_v();
266
267 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_v()] =
268 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()] =
269 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_2czv_v()] =
270 gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_2zv_v()] =
271 gmmu_pte_kind_v8z24_ms8_vc24_v();
272
273 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_v()] =
274 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_2cs_v()] =
275 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms2_2cs_v()] =
276 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms4_2cs_v()] =
277 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms8_2cs_v()] =
278 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_2cz_v()] =
279 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms2_2cz_v()] =
280 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms4_2cz_v()] =
281 gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms8_2cz_v()] =
282 gmmu_pte_kind_z24s8_v();
283
284 gk20a_uc_kind_map[gmmu_pte_kind_zf32_v()] =
285 gk20a_uc_kind_map[gmmu_pte_kind_zf32_2cs_v()] =
286 gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms2_2cs_v()] =
287 gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms4_2cs_v()] =
288 gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms8_2cs_v()] =
289 gk20a_uc_kind_map[gmmu_pte_kind_zf32_2cz_v()] =
290 gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms2_2cz_v()] =
291 gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms4_2cz_v()] =
292 gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms8_2cz_v()] =
293 gmmu_pte_kind_zf32_v();
294
295 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v()] =
296 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cs_v()] =
297 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cszv_v()] =
298 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v();
299
300 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_v()] =
301 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cs_v()] =
302 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cszv_v()] =
303 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_v();
304
305 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_v()] =
306 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cs_v()] =
307 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cszv_v()] =
308 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_v();
309
310 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v()] =
311 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cs_v()] =
312 gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v()] =
313 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v();
314
315 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_v()] =
316 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cs_v()] =
317 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cszv_v()] =
318 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_v();
319
320 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_v()] =
321 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cs_v()] =
322 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cszv_v()] =
323 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_v();
324
325 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_v()] =
326 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cs_v()] =
327 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cszv_v()] =
328 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_v();
329
330 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v()] =
331 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cs_v()] =
332 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v()] =
333 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v();
334
335 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_v()] =
336 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_2cszv_v()] =
337 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms2_2cszv_v()] =
338 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms4_2cszv_v()] =
339 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms8_2cszv_v()] =
340 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_2cs_v()] =
341 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms2_2cs_v()] =
342 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms4_2cs_v()] =
343 gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms8_2cs_v()] =
344 gmmu_pte_kind_zf32_x24s8_v();
345
346 gk20a_uc_kind_map[gmmu_pte_kind_c32_2c_v()] =
347 gk20a_uc_kind_map[gmmu_pte_kind_c32_2cba_v()] =
348 gk20a_uc_kind_map[gmmu_pte_kind_c32_2cra_v()] =
349 gk20a_uc_kind_map[gmmu_pte_kind_c32_2bra_v()] =
350 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms2_2c_v()] =
351 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms2_2cra_v()] =
352 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2c_v()] =
353 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2cbr_v()] =
354 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2cba_v()] =
355 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2cra_v()] =
356 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2bra_v()] =
357 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms8_ms16_2c_v()] =
358 gk20a_uc_kind_map[gmmu_pte_kind_c32_ms8_ms16_2cra_v()] =
359 gk20a_uc_kind_map[gmmu_pte_kind_c64_2c_v()] =
360 gk20a_uc_kind_map[gmmu_pte_kind_c64_2cbr_v()] =
361 gk20a_uc_kind_map[gmmu_pte_kind_c64_2cba_v()] =
362 gk20a_uc_kind_map[gmmu_pte_kind_c64_2cra_v()] =
363 gk20a_uc_kind_map[gmmu_pte_kind_c64_2bra_v()] =
364 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms2_2c_v()] =
365 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms2_2cra_v()] =
366 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2c_v()] =
367 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2cbr_v()] =
368 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2cba_v()] =
369 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2cra_v()] =
370 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2bra_v()] =
371 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms8_ms16_2c_v()] =
372 gk20a_uc_kind_map[gmmu_pte_kind_c64_ms8_ms16_2cra_v()] =
373 gk20a_uc_kind_map[gmmu_pte_kind_c128_2c_v()] =
374 gk20a_uc_kind_map[gmmu_pte_kind_c128_2cr_v()] =
375 gk20a_uc_kind_map[gmmu_pte_kind_c128_ms2_2c_v()] =
376 gk20a_uc_kind_map[gmmu_pte_kind_c128_ms2_2cr_v()] =
377 gk20a_uc_kind_map[gmmu_pte_kind_c128_ms4_2c_v()] =
378 gk20a_uc_kind_map[gmmu_pte_kind_c128_ms4_2cr_v()] =
379 gk20a_uc_kind_map[gmmu_pte_kind_c128_ms8_ms16_2c_v()] =
380 gk20a_uc_kind_map[gmmu_pte_kind_c128_ms8_ms16_2cr_v()] =
381 gmmu_pte_kind_generic_16bx2_v();
382
383 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc4_2czv_v()] =
384 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc4_2cs_v()] =
385 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc4_2zv_v()] =
386 gmmu_pte_kind_z24v8_ms4_vc4_v();
387
388 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc12_2czv_v()] =
389 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc12_2cs_v()] =
390 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc12_2zv_v()] =
391 gmmu_pte_kind_z24v8_ms4_vc12_v();
392
393 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc8_2cs_v()] =
394 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc8_2czv_v()] =
395 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc8_2zv_v()] =
396 gmmu_pte_kind_z24v8_ms8_vc8_v();
397
398 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()] =
399 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc24_2czv_v()] =
400 gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc24_2zv_v()] =
401 gmmu_pte_kind_z24v8_ms8_vc24_v();
402
403 gk20a_uc_kind_map[gmmu_pte_kind_x8c24_v()] =
404 gmmu_pte_kind_x8c24_v();
405}
406
407u16 gk20a_kind_attr[256];
408void gk20a_init_kind_attr(void)
409{
410 u16 k;
411 for (k = 0; k < 256; k++) {
412 gk20a_kind_attr[k] = 0;
413 if (gk20a_kind_supported((u8)k))
414 gk20a_kind_attr[k] |= GK20A_KIND_ATTR_SUPPORTED;
415 if (gk20a_kind_compressible((u8)k))
416 gk20a_kind_attr[k] |= GK20A_KIND_ATTR_COMPRESSIBLE;
417 if (gk20a_kind_z((u8)k))
418 gk20a_kind_attr[k] |= GK20A_KIND_ATTR_Z;
419 if (gk20a_kind_c((u8)k))
420 gk20a_kind_attr[k] |= GK20A_KIND_ATTR_C;
421 if (gk20a_kind_zbc((u8)k))
422 gk20a_kind_attr[k] |= GK20A_KIND_ATTR_ZBC;
423 }
424}
diff --git a/drivers/gpu/nvgpu/gk20a/kind_gk20a.h b/drivers/gpu/nvgpu/gk20a/kind_gk20a.h
new file mode 100644
index 00000000..93f011d4
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/kind_gk20a.h
@@ -0,0 +1,67 @@
1/*
2 * drivers/video/tegra/host/gk20a/kind_gk20a.h
3 *
4 * GK20A memory kind management
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef __KIND_GK20A_H__
22#define __KIND_GK20A_H__
23
24
25void gk20a_init_uncompressed_kind_map(void);
26void gk20a_init_kind_attr(void);
27
28extern u16 gk20a_kind_attr[];
29#define NV_KIND_DEFAULT -1
30
31#define GK20A_KIND_ATTR_SUPPORTED BIT(0)
32#define GK20A_KIND_ATTR_COMPRESSIBLE BIT(1)
33#define GK20A_KIND_ATTR_Z BIT(2)
34#define GK20A_KIND_ATTR_C BIT(3)
35#define GK20A_KIND_ATTR_ZBC BIT(4)
36
37static inline bool gk20a_kind_is_supported(u8 k)
38{
39 return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_SUPPORTED);
40}
41static inline bool gk20a_kind_is_compressible(u8 k)
42{
43 return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_COMPRESSIBLE);
44}
45
46static inline bool gk20a_kind_is_z(u8 k)
47{
48 return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_Z);
49}
50
51static inline bool gk20a_kind_is_c(u8 k)
52{
53 return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_C);
54}
55static inline bool gk20a_kind_is_zbc(u8 k)
56{
57 return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_ZBC);
58}
59
60/* maps kind to its uncompressed version */
61extern u8 gk20a_uc_kind_map[];
62static inline u8 gk20a_get_uncompressed_kind(u8 k)
63{
64 return gk20a_uc_kind_map[k];
65}
66
67#endif /* __KIND_GK20A_H__ */
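
A short usage sketch (not part of the patch): the attribute table is a plain 256-entry lookup, so queries are cheap once gk20a_init_uncompressed_kind_map() and gk20a_init_kind_attr() have run (presumably once during driver init). The helper name is illustrative; a compressible kind can then be mapped back to its uncompressed form with gk20a_get_uncompressed_kind().

static bool sketch_kind_needs_comptags(u8 kind)
{
	return gk20a_kind_is_supported(kind) &&
	       gk20a_kind_is_compressible(kind);
}
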
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_common.c b/drivers/gpu/nvgpu/gk20a/ltc_common.c
new file mode 100644
index 00000000..cbb27cc7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ltc_common.c
@@ -0,0 +1,243 @@
1/*
2 * drivers/video/tegra/host/gk20a/ltc_common.c
3 *
4 * GK20A Graphics
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include <linux/dma-mapping.h>
22#include <linux/delay.h>
23
24#include "gk20a.h"
25#include "gr_gk20a.h"
26
27static int gk20a_determine_L2_size_bytes(struct gk20a *g)
28{
29 const u32 gpuid = GK20A_GPUID(g->gpu_characteristics.arch,
30 g->gpu_characteristics.impl);
31 u32 lts_per_ltc;
32 u32 ways;
33 u32 sets;
34 u32 bytes_per_line;
35 u32 active_ltcs;
36 u32 cache_size;
37
38 u32 tmp;
39 u32 active_sets_value;
40
41 tmp = gk20a_readl(g, ltc_ltc0_lts0_tstg_cfg1_r());
42 ways = hweight32(ltc_ltc0_lts0_tstg_cfg1_active_ways_v(tmp));
43
44 active_sets_value = ltc_ltc0_lts0_tstg_cfg1_active_sets_v(tmp);
45 if (active_sets_value == ltc_ltc0_lts0_tstg_cfg1_active_sets_all_v()) {
46 sets = 64;
47 } else if (active_sets_value ==
48 ltc_ltc0_lts0_tstg_cfg1_active_sets_half_v()) {
49 sets = 32;
50 } else if (active_sets_value ==
51 ltc_ltc0_lts0_tstg_cfg1_active_sets_quarter_v()) {
52 sets = 16;
53 } else {
54 dev_err(dev_from_gk20a(g),
55 "Unknown constant %u for active sets",
56 (unsigned)active_sets_value);
57 sets = 0;
58 }
59
60 active_ltcs = g->gr.num_fbps;
61
62 /* chip-specific values */
63 switch (gpuid) {
64 case GK20A_GPUID_GK20A:
65 lts_per_ltc = 1;
66 bytes_per_line = 128;
67 break;
68
69 default:
70 dev_err(dev_from_gk20a(g), "Unknown GPU id 0x%02x\n",
71 (unsigned)gpuid);
72 lts_per_ltc = 0;
73 bytes_per_line = 0;
74 }
75
76 cache_size = active_ltcs * lts_per_ltc * ways * sets * bytes_per_line;
77
78 return cache_size;
79}
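/*
 * Worked example with illustrative numbers (not taken from the patch):
 * 1 active LTC * 1 LTS per LTC * 16 ways * 64 sets * 128-byte lines
 * gives 1 * 1 * 16 * 64 * 128 = 131072 bytes, i.e. a 128 KiB L2.
 */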
80
81/*
 82 * Set the maximum number of ways that can have the "EVICT_LAST" class.
83 */
84static void gk20a_ltc_set_max_ways_evict_last(struct gk20a *g, u32 max_ways)
85{
86 u32 mgmt_reg;
87
88 mgmt_reg = gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_r()) &
89 ~ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(~0);
90 mgmt_reg |= ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(max_ways);
91
92 gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_r(), mgmt_reg);
93}
94
95/*
96 * Sets the ZBC color for the passed index.
97 */
98static void gk20a_ltc_set_zbc_color_entry(struct gk20a *g,
99 struct zbc_entry *color_val,
100 u32 index)
101{
102 u32 i;
103 u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
104
105 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
106 ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
107
108 for (i = 0;
109 i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
110 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i),
111 color_val->color_l2[i]);
112}
113
114/*
115 * Sets the ZBC depth for the passed index.
116 */
117static void gk20a_ltc_set_zbc_depth_entry(struct gk20a *g,
118 struct zbc_entry *depth_val,
119 u32 index)
120{
121 u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
122
123 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
124 ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
125
126 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(),
127 depth_val->depth);
128}
129
130/*
131 * Clear the L2 ZBC color table for the passed index.
132 */
133static void gk20a_ltc_clear_zbc_color_entry(struct gk20a *g, u32 index)
134{
135 u32 i;
136 u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
137
138 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
139 ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
140
141 for (i = 0;
142 i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
143 gk20a_writel(g,
144 ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i), 0);
145}
146
147/*
148 * Clear the L2 ZBC depth entry for the passed index.
149 */
150static void gk20a_ltc_clear_zbc_depth_entry(struct gk20a *g, u32 index)
151{
152 u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
153
154 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
155 ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
156
157 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
158}
159
160static int gk20a_ltc_init_zbc(struct gk20a *g, struct gr_gk20a *gr)
161{
162 u32 i, j;
163
164 /* reset zbc clear */
165 for (i = 0; i < GK20A_SIZEOF_ZBC_TABLE -
166 GK20A_STARTOF_ZBC_TABLE; i++) {
167 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
168 (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
169 ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
170 ltc_ltcs_ltss_dstg_zbc_index_address_f(
171 i + GK20A_STARTOF_ZBC_TABLE));
172 for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++)
173 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
174 gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
175 }
176
177 gr_gk20a_clear_zbc_table(g, gr);
178 gr_gk20a_load_zbc_default_table(g, gr);
179
180 return 0;
181}
182
183static void gk20a_ltc_init_cbc(struct gk20a *g, struct gr_gk20a *gr)
184{
185 u32 compbit_base_post_divide;
186 u64 compbit_base_post_multiply64;
187 u64 compbit_store_base_iova =
188 NV_MC_SMMU_VADDR_TRANSLATE(gr->compbit_store.base_iova);
189 u64 compbit_base_post_divide64 = (compbit_store_base_iova >>
190 ltc_ltcs_ltss_cbc_base_alignment_shift_v());
191
192 do_div(compbit_base_post_divide64, gr->num_fbps);
193 compbit_base_post_divide = u64_lo32(compbit_base_post_divide64);
194
195 compbit_base_post_multiply64 = ((u64)compbit_base_post_divide *
196 gr->num_fbps) << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
197
198 if (compbit_base_post_multiply64 < compbit_store_base_iova)
199 compbit_base_post_divide++;
200
201 gk20a_writel(g, ltc_ltcs_ltss_cbc_base_r(),
202 compbit_base_post_divide);
203
204 gk20a_dbg(gpu_dbg_info | gpu_dbg_map | gpu_dbg_pte,
205 "compbit base.pa: 0x%x,%08x cbc_base:0x%08x\n",
206 (u32)(compbit_store_base_iova >> 32),
207 (u32)(compbit_store_base_iova & 0xffffffff),
208 compbit_base_post_divide);
209}
210
211/* Flushes the compression bit cache as well as "data".
212 * Note: the name here is a bit of a misnomer. ELPG uses this
213 * internally... but ELPG doesn't have to be on to do it manually.
214 */
215static void gk20a_mm_g_elpg_flush_locked(struct gk20a *g)
216{
217 u32 data;
218 s32 retry = 100;
219
220 gk20a_dbg_fn("");
221
222 /* Make sure all previous writes are committed to the L2. There's no
223 guarantee that writes are to DRAM. This will be a sysmembar internal
224 to the L2. */
225 gk20a_writel(g, ltc_ltss_g_elpg_r(),
226 ltc_ltss_g_elpg_flush_pending_f());
227 do {
228 data = gk20a_readl(g, ltc_ltss_g_elpg_r());
229
230 if (ltc_ltss_g_elpg_flush_v(data) ==
231 ltc_ltss_g_elpg_flush_pending_v()) {
232 gk20a_dbg_info("g_elpg_flush 0x%x", data);
233 retry--;
234 usleep_range(20, 40);
235 } else
236 break;
237 } while (retry >= 0 || !tegra_platform_is_silicon());
238
239 if (retry < 0)
240 gk20a_warn(dev_from_gk20a(g),
241 "g_elpg_flush too many retries");
242
243}
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
new file mode 100644
index 00000000..08aedecd
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
@@ -0,0 +1,203 @@
1/*
2 * drivers/video/tegra/host/gk20a/ltc_gk20a.c
3 *
4 * GK20A Graphics
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include <linux/kernel.h>
22
23#include "hw_ltc_gk20a.h"
24#include "hw_proj_gk20a.h"
25
26#include "ltc_common.c"
27
28static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
29{
30 struct device *d = dev_from_gk20a(g);
31 DEFINE_DMA_ATTRS(attrs);
32 dma_addr_t iova;
33
34 /* max memory size (MB) to cover */
35 u32 max_size = gr->max_comptag_mem;
36 /* one tag line covers 128KB */
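	/* i.e. 8 comptag lines per MB of memory (1 MB / 128 KB == 8), hence << 3 */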
37 u32 max_comptag_lines = max_size << 3;
38
39 u32 hw_max_comptag_lines =
40 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v();
41
42 u32 cbc_param =
43 gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r());
44 u32 comptags_per_cacheline =
45 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(cbc_param);
46 u32 slices_per_fbp =
47 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(cbc_param);
48 u32 cacheline_size =
49 512 << ltc_ltcs_ltss_cbc_param_cache_line_size_v(cbc_param);
50
51 u32 compbit_backing_size;
52
53 gk20a_dbg_fn("");
54
55 if (max_comptag_lines == 0) {
56 gr->compbit_store.size = 0;
57 return 0;
58 }
59
60 if (max_comptag_lines > hw_max_comptag_lines)
61 max_comptag_lines = hw_max_comptag_lines;
62
63	/* no hybrid fb */
64 compbit_backing_size =
65 DIV_ROUND_UP(max_comptag_lines, comptags_per_cacheline) *
66 cacheline_size * slices_per_fbp * gr->num_fbps;
67
68 /* aligned to 2KB * num_fbps */
69 compbit_backing_size +=
70 gr->num_fbps << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
71
72 /* must be a multiple of 64KB */
73 compbit_backing_size = roundup(compbit_backing_size, 64*1024);
74
75 max_comptag_lines =
76		(compbit_backing_size * comptags_per_cacheline) /
77		(cacheline_size * slices_per_fbp * gr->num_fbps);
78
79 if (max_comptag_lines > hw_max_comptag_lines)
80 max_comptag_lines = hw_max_comptag_lines;
81
82 gk20a_dbg_info("compbit backing store size : %d",
83 compbit_backing_size);
84 gk20a_dbg_info("max comptag lines : %d",
85 max_comptag_lines);
86
87 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
88 gr->compbit_store.size = compbit_backing_size;
89 gr->compbit_store.pages = dma_alloc_attrs(d, gr->compbit_store.size,
90 &iova, GFP_KERNEL, &attrs);
91 if (!gr->compbit_store.pages) {
92		gk20a_err(dev_from_gk20a(g), "failed to allocate "
93 "backing store for compbit : size %d",
94 compbit_backing_size);
95 return -ENOMEM;
96 }
97 gr->compbit_store.base_iova = iova;
98
99 gk20a_allocator_init(&gr->comp_tags, "comptag",
100 1, /* start */
101			max_comptag_lines - 1, /* length */
102 1); /* align */
103
104 return 0;
105}
106
107static int gk20a_ltc_clear_comptags(struct gk20a *g, u32 min, u32 max)
108{
109 struct gr_gk20a *gr = &g->gr;
110 u32 fbp, slice, ctrl1, val;
111 unsigned long end_jiffies = jiffies +
112 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
113 u32 delay = GR_IDLE_CHECK_DEFAULT;
114 u32 slices_per_fbp =
115 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(
116 gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r()));
117
118 gk20a_dbg_fn("");
119
120 if (gr->compbit_store.size == 0)
121 return 0;
122
123 gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl2_r(),
124 ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(min));
125 gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl3_r(),
126 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(max));
127 gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl1_r(),
128 gk20a_readl(g, ltc_ltcs_ltss_cbc_ctrl1_r()) |
129 ltc_ltcs_ltss_cbc_ctrl1_clear_active_f());
130
131 for (fbp = 0; fbp < gr->num_fbps; fbp++) {
132 for (slice = 0; slice < slices_per_fbp; slice++) {
133
134 delay = GR_IDLE_CHECK_DEFAULT;
135
136 ctrl1 = ltc_ltc0_lts0_cbc_ctrl1_r() +
137 fbp * proj_ltc_stride_v() +
138 slice * proj_lts_stride_v();
139
140 do {
141 val = gk20a_readl(g, ctrl1);
142 if (ltc_ltcs_ltss_cbc_ctrl1_clear_v(val) !=
143 ltc_ltcs_ltss_cbc_ctrl1_clear_active_v())
144 break;
145
146 usleep_range(delay, delay * 2);
147 delay = min_t(u32, delay << 1,
148 GR_IDLE_CHECK_MAX);
149
150 } while (time_before(jiffies, end_jiffies) ||
151 !tegra_platform_is_silicon());
152
153 if (!time_before(jiffies, end_jiffies)) {
154 gk20a_err(dev_from_gk20a(g),
155 "comp tag clear timeout\n");
156 return -EBUSY;
157 }
158 }
159 }
160
161 return 0;
162}
163
164
165#ifdef CONFIG_DEBUG_FS
166static void gk20a_ltc_sync_debugfs(struct gk20a *g)
167{
168 u32 reg_f = ltc_ltcs_ltss_tstg_set_mgmt_2_l2_bypass_mode_enabled_f();
169
170 spin_lock(&g->debugfs_lock);
171 if (g->mm.ltc_enabled != g->mm.ltc_enabled_debug) {
172 u32 reg = gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_2_r());
173 if (g->mm.ltc_enabled_debug)
174 /* bypass disabled (normal caching ops)*/
175 reg &= ~reg_f;
176 else
177 /* bypass enabled (no caching) */
178 reg |= reg_f;
179
180 gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_2_r(), reg);
181 g->mm.ltc_enabled = g->mm.ltc_enabled_debug;
182 }
183 spin_unlock(&g->debugfs_lock);
184}
185#endif
186
187void gk20a_init_ltc(struct gpu_ops *gops)
188{
189 gops->ltc.determine_L2_size_bytes = gk20a_determine_L2_size_bytes;
190 gops->ltc.set_max_ways_evict_last = gk20a_ltc_set_max_ways_evict_last;
191 gops->ltc.init_comptags = gk20a_ltc_init_comptags;
192 gops->ltc.clear_comptags = gk20a_ltc_clear_comptags;
193 gops->ltc.set_zbc_color_entry = gk20a_ltc_set_zbc_color_entry;
194 gops->ltc.set_zbc_depth_entry = gk20a_ltc_set_zbc_depth_entry;
195 gops->ltc.clear_zbc_color_entry = gk20a_ltc_clear_zbc_color_entry;
196 gops->ltc.clear_zbc_depth_entry = gk20a_ltc_clear_zbc_depth_entry;
197 gops->ltc.init_zbc = gk20a_ltc_init_zbc;
198 gops->ltc.init_cbc = gk20a_ltc_init_cbc;
199#ifdef CONFIG_DEBUG_FS
200 gops->ltc.sync_debugfs = gk20a_ltc_sync_debugfs;
201#endif
202 gops->ltc.elpg_flush = gk20a_mm_g_elpg_flush_locked;
203}
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.h b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.h
new file mode 100644
index 00000000..208811b2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.h
@@ -0,0 +1,21 @@
1/*
2 * GK20A L2
3 *
4 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 */
15
16#ifndef _NVHOST_GK20A_LTC
17#define _NVHOST_GK20A_LTC
18struct gk20a;
19
20void gk20a_init_ltc(struct gpu_ops *gops);
21#endif
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
new file mode 100644
index 00000000..b22df5e8
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -0,0 +1,2984 @@
1/*
2 * drivers/video/tegra/host/gk20a/mm_gk20a.c
3 *
4 * GK20A memory management
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21
22#include <linux/delay.h>
23#include <linux/highmem.h>
24#include <linux/log2.h>
25#include <linux/nvhost.h>
26#include <linux/pm_runtime.h>
27#include <linux/scatterlist.h>
28#include <linux/nvmap.h>
29#include <linux/tegra-soc.h>
30#include <linux/vmalloc.h>
31#include <linux/dma-buf.h>
32#include <asm/cacheflush.h>
33
34#include "gk20a.h"
35#include "mm_gk20a.h"
36#include "hw_gmmu_gk20a.h"
37#include "hw_fb_gk20a.h"
38#include "hw_bus_gk20a.h"
39#include "hw_ram_gk20a.h"
40#include "hw_mc_gk20a.h"
41#include "hw_flush_gk20a.h"
42#include "hw_ltc_gk20a.h"
43
44#include "kind_gk20a.h"
45
46#ifdef CONFIG_ARM64
47#define outer_flush_range(a, b)
48#define __cpuc_flush_dcache_area __flush_dcache_area
49#endif
50
51/*
52 * GPU mapping life cycle
53 * ======================
54 *
55 * Kernel mappings
56 * ---------------
57 *
58 * Kernel mappings are created through vm.map(..., false):
59 *
60 * - Mappings to the same allocations are reused and refcounted.
61 * - This path does not support deferred unmapping (i.e. kernel must wait for
62 * all hw operations on the buffer to complete before unmapping).
63 * - References to dmabuf are owned and managed by the (kernel) clients of
64 * the gk20a_vm layer.
65 *
66 *
67 * User space mappings
68 * -------------------
69 *
70 * User space mappings are created through as.map_buffer -> vm.map(..., true):
71 *
72 * - Mappings to the same allocations are reused and refcounted.
73 * - This path supports deferred unmapping (i.e. we delay the actual unmapping
74 * until all hw operations have completed).
75 * - References to dmabuf are owned and managed by the vm_gk20a
76 * layer itself. vm.map acquires these refs, and sets
77 * mapped_buffer->own_mem_ref to record that we must release the refs when we
78 * actually unmap.
79 *
80 */
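/*
 * A rough sketch of a kernel-side caller of this layer (names such as
 * "my_dmabuf" are placeholders and error handling is omitted):
 *
 *	struct sg_table *sgt;
 *	u64 gpu_va = gk20a_vm_map(vm, my_dmabuf, 0, 0, 0, &sgt,
 *				  false, gk20a_mem_flag_none);
 *
 * Passing user_mapped == false takes the kernel path described above:
 * the mapping is refcounted per dmabuf/kind, and the caller must wait
 * for all hw work on the buffer to complete before unmapping.
 */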
81
82static inline int vm_aspace_id(struct vm_gk20a *vm)
83{
84 /* -1 is bar1 or pmu, etc. */
85 return vm->as_share ? vm->as_share->id : -1;
86}
87static inline u32 hi32(u64 f)
88{
89 return (u32)(f >> 32);
90}
91static inline u32 lo32(u64 f)
92{
93 return (u32)(f & 0xffffffff);
94}
95
96#define FLUSH_CPU_DCACHE(va, pa, size) \
97 do { \
98 __cpuc_flush_dcache_area((void *)(va), (size_t)(size)); \
99 outer_flush_range(pa, pa + (size_t)(size)); \
100 } while (0)
101
102static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer);
103static struct mapped_buffer_node *find_mapped_buffer_locked(
104 struct rb_root *root, u64 addr);
105static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
106 struct rb_root *root, struct dma_buf *dmabuf,
107 u32 kind);
108static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
109 enum gmmu_pgsz_gk20a pgsz_idx,
110 struct sg_table *sgt,
111 u64 first_vaddr, u64 last_vaddr,
112 u8 kind_v, u32 ctag_offset, bool cacheable,
113 int rw_flag);
114static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i);
115static void gk20a_vm_remove_support(struct vm_gk20a *vm);
116
117
118/* note: keep the page sizes sorted lowest to highest here */
119static const u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K };
120static const u32 gmmu_page_shifts[gmmu_nr_page_sizes] = { 12, 17 };
121static const u64 gmmu_page_offset_masks[gmmu_nr_page_sizes] = { 0xfffLL,
122 0x1ffffLL };
123static const u64 gmmu_page_masks[gmmu_nr_page_sizes] = { ~0xfffLL, ~0x1ffffLL };
124
125struct gk20a_comptags {
126 u32 offset;
127 u32 lines;
128};
129
130struct gk20a_dmabuf_priv {
131 struct mutex lock;
132
133 struct gk20a_allocator *comptag_allocator;
134 struct gk20a_comptags comptags;
135
136 struct dma_buf_attachment *attach;
137 struct sg_table *sgt;
138
139 int pin_count;
140};
141
142static void gk20a_mm_delete_priv(void *_priv)
143{
144 struct gk20a_dmabuf_priv *priv = _priv;
145 if (!priv)
146 return;
147
148 if (priv->comptags.lines) {
149 BUG_ON(!priv->comptag_allocator);
150 priv->comptag_allocator->free(priv->comptag_allocator,
151 priv->comptags.offset,
152 priv->comptags.lines);
153 }
154
155 kfree(priv);
156}
157
158struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf)
159{
160 struct gk20a_dmabuf_priv *priv;
161
162 priv = dma_buf_get_drvdata(dmabuf, dev);
163 if (WARN_ON(!priv))
164 return ERR_PTR(-EINVAL);
165
166 mutex_lock(&priv->lock);
167
168 if (priv->pin_count == 0) {
169 priv->attach = dma_buf_attach(dmabuf, dev);
170 if (IS_ERR(priv->attach)) {
171 mutex_unlock(&priv->lock);
172 return (struct sg_table *)priv->attach;
173 }
174
175 priv->sgt = dma_buf_map_attachment(priv->attach,
176 DMA_BIDIRECTIONAL);
177 if (IS_ERR(priv->sgt)) {
178 dma_buf_detach(dmabuf, priv->attach);
179 mutex_unlock(&priv->lock);
180 return priv->sgt;
181 }
182 }
183
184 priv->pin_count++;
185 mutex_unlock(&priv->lock);
186 return priv->sgt;
187}
188
189void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
190 struct sg_table *sgt)
191{
192 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
193 dma_addr_t dma_addr;
194
195 if (IS_ERR(priv) || !priv)
196 return;
197
198 mutex_lock(&priv->lock);
199 WARN_ON(priv->sgt != sgt);
200 priv->pin_count--;
201 WARN_ON(priv->pin_count < 0);
202 dma_addr = sg_dma_address(priv->sgt->sgl);
203 if (priv->pin_count == 0) {
204 dma_buf_unmap_attachment(priv->attach, priv->sgt,
205 DMA_BIDIRECTIONAL);
206 dma_buf_detach(dmabuf, priv->attach);
207 }
208 mutex_unlock(&priv->lock);
209}
210
211
212static void gk20a_get_comptags(struct device *dev,
213 struct dma_buf *dmabuf,
214 struct gk20a_comptags *comptags)
215{
216 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
217
218 if (!comptags)
219 return;
220
221 if (!priv) {
222 comptags->lines = 0;
223 comptags->offset = 0;
224 return;
225 }
226
227 *comptags = priv->comptags;
228}
229
230static int gk20a_alloc_comptags(struct device *dev,
231 struct dma_buf *dmabuf,
232 struct gk20a_allocator *allocator,
233 int lines)
234{
235 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
236 u32 offset = 0;
237 int err;
238
239 if (!priv)
240 return -ENOSYS;
241
242 if (!lines)
243 return -EINVAL;
244
245 /* store the allocator so we can use it when we free the ctags */
246 priv->comptag_allocator = allocator;
247 err = allocator->alloc(allocator, &offset, lines);
248 if (!err) {
249 priv->comptags.lines = lines;
250 priv->comptags.offset = offset;
251 }
252 return err;
253}
254
255
256
257
258static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
259{
260 gk20a_dbg_fn("");
261 if (g->ops.fb.reset)
262 g->ops.fb.reset(g);
263
264 if (g->ops.fb.init_fs_state)
265 g->ops.fb.init_fs_state(g);
266
267 return 0;
268}
269
270void gk20a_remove_mm_support(struct mm_gk20a *mm)
271{
272 struct gk20a *g = mm->g;
273 struct device *d = dev_from_gk20a(g);
274 struct vm_gk20a *vm = &mm->bar1.vm;
275 struct inst_desc *inst_block = &mm->bar1.inst_block;
276
277 gk20a_dbg_fn("");
278
279 if (inst_block->cpuva)
280 dma_free_coherent(d, inst_block->size,
281 inst_block->cpuva, inst_block->iova);
282 inst_block->cpuva = NULL;
283 inst_block->iova = 0;
284
285 gk20a_vm_remove_support(vm);
286}
287
288int gk20a_init_mm_setup_sw(struct gk20a *g)
289{
290 struct mm_gk20a *mm = &g->mm;
291 int i;
292
293 gk20a_dbg_fn("");
294
295 if (mm->sw_ready) {
296 gk20a_dbg_fn("skip init");
297 return 0;
298 }
299
300 mm->g = g;
301 mutex_init(&mm->tlb_lock);
302 mutex_init(&mm->l2_op_lock);
303 mm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
304 mm->compression_page_size = gmmu_page_sizes[gmmu_page_size_big];
305 mm->pde_stride = mm->big_page_size << 10;
306 mm->pde_stride_shift = ilog2(mm->pde_stride);
307 BUG_ON(mm->pde_stride_shift > 31); /* we have assumptions about this */
308
309 for (i = 0; i < ARRAY_SIZE(gmmu_page_sizes); i++) {
310
311 u32 num_ptes, pte_space, num_pages;
312
313 /* assuming "full" page tables */
314 num_ptes = mm->pde_stride / gmmu_page_sizes[i];
315
316 pte_space = num_ptes * gmmu_pte__size_v();
317 /* allocate whole pages */
318 pte_space = roundup(pte_space, PAGE_SIZE);
319
320 num_pages = pte_space / PAGE_SIZE;
321 /* make sure "order" is viable */
322 BUG_ON(!is_power_of_2(num_pages));
323
324 mm->page_table_sizing[i].num_ptes = num_ptes;
325 mm->page_table_sizing[i].order = ilog2(num_pages);
326 }
327
328 /*TBD: make channel vm size configurable */
329 mm->channel.size = 1ULL << NV_GMMU_VA_RANGE;
330
331 gk20a_dbg_info("channel vm size: %dMB", (int)(mm->channel.size >> 20));
332
333 gk20a_dbg_info("small page-size (%dKB) pte array: %dKB",
334 gmmu_page_sizes[gmmu_page_size_small] >> 10,
335 (mm->page_table_sizing[gmmu_page_size_small].num_ptes *
336 gmmu_pte__size_v()) >> 10);
337
338 gk20a_dbg_info("big page-size (%dKB) pte array: %dKB",
339 gmmu_page_sizes[gmmu_page_size_big] >> 10,
340 (mm->page_table_sizing[gmmu_page_size_big].num_ptes *
341 gmmu_pte__size_v()) >> 10);
342
343
344 gk20a_init_bar1_vm(mm);
345
346 mm->remove_support = gk20a_remove_mm_support;
347 mm->sw_ready = true;
348
349 gk20a_dbg_fn("done");
350 return 0;
351}
352
353/* make sure gk20a_init_mm_support is called before */
354static int gk20a_init_mm_setup_hw(struct gk20a *g)
355{
356 struct mm_gk20a *mm = &g->mm;
357 struct inst_desc *inst_block = &mm->bar1.inst_block;
358 phys_addr_t inst_pa = inst_block->cpu_pa;
359
360 gk20a_dbg_fn("");
361
362 /* set large page size in fb
363	 * note this is very early on, can we defer it? */
364 {
365 u32 fb_mmu_ctrl = gk20a_readl(g, fb_mmu_ctrl_r());
366
367 if (gmmu_page_sizes[gmmu_page_size_big] == SZ_128K)
368 fb_mmu_ctrl = (fb_mmu_ctrl &
369 ~fb_mmu_ctrl_vm_pg_size_f(~0x0)) |
370 fb_mmu_ctrl_vm_pg_size_128kb_f();
371 else
372 BUG_ON(1); /* no support/testing for larger ones yet */
373
374 gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
375 }
376
377 inst_pa = (u32)(inst_pa >> bar1_instance_block_shift_gk20a());
378 gk20a_dbg_info("bar1 inst block ptr: 0x%08x", (u32)inst_pa);
379
380 /* this is very early in init... can we defer this? */
381 {
382 gk20a_writel(g, bus_bar1_block_r(),
383 bus_bar1_block_target_vid_mem_f() |
384 bus_bar1_block_mode_virtual_f() |
385 bus_bar1_block_ptr_f(inst_pa));
386 }
387
388 gk20a_dbg_fn("done");
389 return 0;
390}
391
392int gk20a_init_mm_support(struct gk20a *g)
393{
394 u32 err;
395
396 err = gk20a_init_mm_reset_enable_hw(g);
397 if (err)
398 return err;
399
400 err = gk20a_init_mm_setup_sw(g);
401 if (err)
402 return err;
403
404 err = gk20a_init_mm_setup_hw(g);
405 if (err)
406 return err;
407
408 return err;
409}
410
411#ifdef CONFIG_GK20A_PHYS_PAGE_TABLES
412static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
413 void **handle,
414 struct sg_table **sgt,
415 size_t *size)
416{
417 u32 num_pages = 1 << order;
418 u32 len = num_pages * PAGE_SIZE;
419 int err;
420 struct page *pages;
421
422 gk20a_dbg_fn("");
423
424 pages = alloc_pages(GFP_KERNEL, order);
425 if (!pages) {
426 gk20a_dbg(gpu_dbg_pte, "alloc_pages failed\n");
427 goto err_out;
428 }
429	*sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
430	if (!(*sgt)) {
431 gk20a_dbg(gpu_dbg_pte, "cannot allocate sg table");
432 goto err_alloced;
433 }
434 err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
435 if (err) {
436 gk20a_dbg(gpu_dbg_pte, "sg_alloc_table failed\n");
437 goto err_sg_table;
438 }
439 sg_set_page((*sgt)->sgl, pages, len, 0);
440 *handle = page_address(pages);
441 memset(*handle, 0, len);
442 *size = len;
443 FLUSH_CPU_DCACHE(*handle, sg_phys((*sgt)->sgl), len);
444
445 return 0;
446
447err_sg_table:
448 kfree(*sgt);
449err_alloced:
450 __free_pages(pages, order);
451err_out:
452 return -ENOMEM;
453}
454
455static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
456 struct sg_table *sgt, u32 order,
457 size_t size)
458{
459 gk20a_dbg_fn("");
460 BUG_ON(sgt == NULL);
461 free_pages((unsigned long)handle, order);
462 sg_free_table(sgt);
463 kfree(sgt);
464}
465
466static int map_gmmu_pages(void *handle, struct sg_table *sgt,
467 void **va, size_t size)
468{
469 FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
470 *va = handle;
471 return 0;
472}
473
474static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
475{
476 FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
477}
478#else
479static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
480 void **handle,
481 struct sg_table **sgt,
482 size_t *size)
483{
484 struct device *d = dev_from_vm(vm);
485 u32 num_pages = 1 << order;
486 u32 len = num_pages * PAGE_SIZE;
487 dma_addr_t iova;
488 DEFINE_DMA_ATTRS(attrs);
489 struct page **pages;
490 int err = 0;
491
492 gk20a_dbg_fn("");
493
494 *size = len;
495 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
496 pages = dma_alloc_attrs(d, len, &iova, GFP_KERNEL, &attrs);
497 if (!pages) {
498 gk20a_err(d, "memory allocation failed\n");
499 goto err_out;
500 }
501
502 err = gk20a_get_sgtable_from_pages(d, sgt, pages,
503 iova, len);
504 if (err) {
505 gk20a_err(d, "sgt allocation failed\n");
506 goto err_free;
507 }
508
509 *handle = (void *)pages;
510
511 return 0;
512
513err_free:
514 dma_free_attrs(d, len, pages, iova, &attrs);
515 pages = NULL;
516 iova = 0;
517err_out:
518 return -ENOMEM;
519}
520
521static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
522 struct sg_table *sgt, u32 order,
523 size_t size)
524{
525 struct device *d = dev_from_vm(vm);
526 u64 iova;
527 DEFINE_DMA_ATTRS(attrs);
528 struct page **pages = (struct page **)handle;
529
530 gk20a_dbg_fn("");
531 BUG_ON(sgt == NULL);
532
533 iova = sg_dma_address(sgt->sgl);
534
535 gk20a_free_sgtable(&sgt);
536
537 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
538 dma_free_attrs(d, size, pages, iova, &attrs);
539 pages = NULL;
540 iova = 0;
541}
542
543static int map_gmmu_pages(void *handle, struct sg_table *sgt,
544 void **kva, size_t size)
545{
546 int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
547 struct page **pages = (struct page **)handle;
548 gk20a_dbg_fn("");
549
550 *kva = vmap(pages, count, 0, pgprot_dmacoherent(PAGE_KERNEL));
551 if (!(*kva))
552 return -ENOMEM;
553
554 return 0;
555}
556
557static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
558{
559 gk20a_dbg_fn("");
560 vunmap(va);
561}
562#endif
563
564/* allocate a phys contig region big enough for a full
565 * sized gmmu page table for the given gmmu_page_size.
566 * the whole range is zeroed so it's "invalid"/will fault
567 */
568
569static int zalloc_gmmu_page_table_gk20a(struct vm_gk20a *vm,
570 enum gmmu_pgsz_gk20a gmmu_pgsz_idx,
571 struct page_table_gk20a *pte)
572{
573 int err;
574 u32 pte_order;
575 void *handle = NULL;
576 struct sg_table *sgt;
577 size_t size;
578
579 gk20a_dbg_fn("");
580
581 /* allocate enough pages for the table */
582 pte_order = vm->mm->page_table_sizing[gmmu_pgsz_idx].order;
583
584 err = alloc_gmmu_pages(vm, pte_order, &handle, &sgt, &size);
585 if (err)
586 return err;
587
588 gk20a_dbg(gpu_dbg_pte, "pte = 0x%p, addr=%08llx, size %d",
589 pte, gk20a_mm_iova_addr(sgt->sgl), pte_order);
590
591 pte->ref = handle;
592 pte->sgt = sgt;
593 pte->size = size;
594
595 return 0;
596}
597
598/* given address range (inclusive) determine the pdes crossed */
599static inline void pde_range_from_vaddr_range(struct vm_gk20a *vm,
600 u64 addr_lo, u64 addr_hi,
601 u32 *pde_lo, u32 *pde_hi)
602{
603 *pde_lo = (u32)(addr_lo >> vm->mm->pde_stride_shift);
604 *pde_hi = (u32)(addr_hi >> vm->mm->pde_stride_shift);
605 gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
606 addr_lo, addr_hi, vm->mm->pde_stride_shift);
607 gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
608 *pde_lo, *pde_hi);
609}
610
611static inline u32 *pde_from_index(struct vm_gk20a *vm, u32 i)
612{
613 return (u32 *) (((u8 *)vm->pdes.kv) + i*gmmu_pde__size_v());
614}
615
616static inline u32 pte_index_from_vaddr(struct vm_gk20a *vm,
617 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
618{
619 u32 ret;
620 /* mask off pde part */
621 addr = addr & ((((u64)1) << vm->mm->pde_stride_shift) - ((u64)1));
622 /* shift over to get pte index. note assumption that pte index
623 * doesn't leak over into the high 32b */
624 ret = (u32)(addr >> gmmu_page_shifts[pgsz_idx]);
625
626 gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
627 return ret;
628}
629
630static inline void pte_space_page_offset_from_index(u32 i, u32 *pte_page,
631 u32 *pte_offset)
632{
633	/* ptes are 8B regardless of pagesize */
634	/* pte space pages are 4KB, so 512 (4096 / 8) ptes per page */
635 *pte_page = i >> 9;
636
637 /* this offset is a pte offset, not a byte offset */
638 *pte_offset = i & ((1<<9)-1);
639
640 gk20a_dbg(gpu_dbg_pte, "i=0x%x pte_page=0x%x pte_offset=0x%x",
641 i, *pte_page, *pte_offset);
642}
643
644
645/*
646 * given a pde index/page table number make sure it has
647 * backing store and if not go ahead allocate it and
648 * record it in the appropriate pde
649 */
650static int validate_gmmu_page_table_gk20a_locked(struct vm_gk20a *vm,
651 u32 i, enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
652{
653 int err;
654 struct page_table_gk20a *pte =
655 vm->pdes.ptes[gmmu_pgsz_idx] + i;
656
657 gk20a_dbg_fn("");
658
659 /* if it's already in place it's valid */
660 if (pte->ref)
661 return 0;
662
663 gk20a_dbg(gpu_dbg_pte, "alloc %dKB ptes for pde %d",
664 gmmu_page_sizes[gmmu_pgsz_idx]/1024, i);
665
666 err = zalloc_gmmu_page_table_gk20a(vm, gmmu_pgsz_idx, pte);
667 if (err)
668 return err;
669
670 /* rewrite pde */
671 update_gmmu_pde_locked(vm, i);
672
673 return 0;
674}
675
676static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
677 u64 addr)
678{
679 struct vm_reserved_va_node *va_node;
680 list_for_each_entry(va_node, &vm->reserved_va_list, reserved_va_list)
681 if (addr >= va_node->vaddr_start &&
682 addr < (u64)va_node->vaddr_start + (u64)va_node->size)
683 return va_node;
684
685 return NULL;
686}
687
688int gk20a_vm_get_buffers(struct vm_gk20a *vm,
689 struct mapped_buffer_node ***mapped_buffers,
690 int *num_buffers)
691{
692 struct mapped_buffer_node *mapped_buffer;
693 struct mapped_buffer_node **buffer_list;
694 struct rb_node *node;
695 int i = 0;
696
697 mutex_lock(&vm->update_gmmu_lock);
698
699 buffer_list = kzalloc(sizeof(*buffer_list) *
700 vm->num_user_mapped_buffers, GFP_KERNEL);
701 if (!buffer_list) {
702 mutex_unlock(&vm->update_gmmu_lock);
703 return -ENOMEM;
704 }
705
706 node = rb_first(&vm->mapped_buffers);
707 while (node) {
708 mapped_buffer =
709 container_of(node, struct mapped_buffer_node, node);
710 if (mapped_buffer->user_mapped) {
711 buffer_list[i] = mapped_buffer;
712 kref_get(&mapped_buffer->ref);
713 i++;
714 }
715 node = rb_next(&mapped_buffer->node);
716 }
717
718 BUG_ON(i != vm->num_user_mapped_buffers);
719
720 *num_buffers = vm->num_user_mapped_buffers;
721 *mapped_buffers = buffer_list;
722
723 mutex_unlock(&vm->update_gmmu_lock);
724
725 return 0;
726}
727
728static void gk20a_vm_unmap_locked_kref(struct kref *ref)
729{
730 struct mapped_buffer_node *mapped_buffer =
731 container_of(ref, struct mapped_buffer_node, ref);
732 gk20a_vm_unmap_locked(mapped_buffer);
733}
734
735void gk20a_vm_put_buffers(struct vm_gk20a *vm,
736 struct mapped_buffer_node **mapped_buffers,
737 int num_buffers)
738{
739 int i;
740
741 mutex_lock(&vm->update_gmmu_lock);
742
743 for (i = 0; i < num_buffers; ++i)
744 kref_put(&mapped_buffers[i]->ref,
745 gk20a_vm_unmap_locked_kref);
746
747 mutex_unlock(&vm->update_gmmu_lock);
748
749 kfree(mapped_buffers);
750}
751
752static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
753{
754 struct device *d = dev_from_vm(vm);
755 int retries;
756 struct mapped_buffer_node *mapped_buffer;
757
758 mutex_lock(&vm->update_gmmu_lock);
759
760 mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
761 if (!mapped_buffer) {
762 mutex_unlock(&vm->update_gmmu_lock);
763 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
764 return;
765 }
766
767 if (mapped_buffer->flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
768 mutex_unlock(&vm->update_gmmu_lock);
769
770 retries = 1000;
771 while (retries) {
772 if (atomic_read(&mapped_buffer->ref.refcount) == 1)
773 break;
774 retries--;
775 udelay(50);
776 }
777 if (!retries)
778 gk20a_err(d, "sync-unmap failed on 0x%llx",
779 offset);
780 mutex_lock(&vm->update_gmmu_lock);
781 }
782
783 mapped_buffer->user_mapped--;
784 if (mapped_buffer->user_mapped == 0)
785 vm->num_user_mapped_buffers--;
786 kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
787
788 mutex_unlock(&vm->update_gmmu_lock);
789}
790
791static u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
792 u64 size,
793 enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
794
795{
796 struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
797 int err;
798 u64 offset;
799 u32 start_page_nr = 0, num_pages;
800 u64 gmmu_page_size = gmmu_page_sizes[gmmu_pgsz_idx];
801
802 if (gmmu_pgsz_idx >= ARRAY_SIZE(gmmu_page_sizes)) {
803 dev_warn(dev_from_vm(vm),
804 "invalid page size requested in gk20a vm alloc");
805 return -EINVAL;
806 }
807
808 if ((gmmu_pgsz_idx == gmmu_page_size_big) && !vm->big_pages) {
809 dev_warn(dev_from_vm(vm),
810 "unsupportd page size requested");
811 return -EINVAL;
812
813 }
814
815 /* be certain we round up to gmmu_page_size if needed */
816 /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
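	/* (size + pgsz - 1) & ~(pgsz - 1) rounds size up to the next multiple
	 * of gmmu_page_size; this works because the gmmu page sizes are powers
	 * of two */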
817 size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
818
819 gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
820 gmmu_page_sizes[gmmu_pgsz_idx]>>10);
821
822 /* The vma allocator represents page accounting. */
823 num_pages = size >> gmmu_page_shifts[gmmu_pgsz_idx];
824
825 err = vma->alloc(vma, &start_page_nr, num_pages);
826
827 if (err) {
828 gk20a_err(dev_from_vm(vm),
829 "%s oom: sz=0x%llx", vma->name, size);
830 return 0;
831 }
832
833 offset = (u64)start_page_nr << gmmu_page_shifts[gmmu_pgsz_idx];
834 gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
835
836 return offset;
837}
838
839static int gk20a_vm_free_va(struct vm_gk20a *vm,
840 u64 offset, u64 size,
841 enum gmmu_pgsz_gk20a pgsz_idx)
842{
843 struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
844 u32 page_size = gmmu_page_sizes[pgsz_idx];
845 u32 page_shift = gmmu_page_shifts[pgsz_idx];
846 u32 start_page_nr, num_pages;
847 int err;
848
849 gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
850 vma->name, offset, size);
851
852 start_page_nr = (u32)(offset >> page_shift);
853 num_pages = (u32)((size + page_size - 1) >> page_shift);
854
855 err = vma->free(vma, start_page_nr, num_pages);
856 if (err) {
857 gk20a_err(dev_from_vm(vm),
858 "not found: offset=0x%llx, sz=0x%llx",
859 offset, size);
860 }
861
862 return err;
863}
864
865static int insert_mapped_buffer(struct rb_root *root,
866 struct mapped_buffer_node *mapped_buffer)
867{
868 struct rb_node **new_node = &(root->rb_node), *parent = NULL;
869
870 /* Figure out where to put new node */
871 while (*new_node) {
872 struct mapped_buffer_node *cmp_with =
873 container_of(*new_node, struct mapped_buffer_node,
874 node);
875
876 parent = *new_node;
877
878 if (cmp_with->addr > mapped_buffer->addr) /* u64 cmp */
879 new_node = &((*new_node)->rb_left);
880 else if (cmp_with->addr != mapped_buffer->addr) /* u64 cmp */
881 new_node = &((*new_node)->rb_right);
882 else
883 return -EINVAL; /* no fair dup'ing */
884 }
885
886 /* Add new node and rebalance tree. */
887 rb_link_node(&mapped_buffer->node, parent, new_node);
888 rb_insert_color(&mapped_buffer->node, root);
889
890 return 0;
891}
892
893static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
894 struct rb_root *root, struct dma_buf *dmabuf,
895 u32 kind)
896{
897 struct rb_node *node = rb_first(root);
898 while (node) {
899 struct mapped_buffer_node *mapped_buffer =
900 container_of(node, struct mapped_buffer_node, node);
901 if (mapped_buffer->dmabuf == dmabuf &&
902 kind == mapped_buffer->kind)
903 return mapped_buffer;
904 node = rb_next(&mapped_buffer->node);
905 }
906 return 0;
907}
908
909static struct mapped_buffer_node *find_mapped_buffer_locked(
910 struct rb_root *root, u64 addr)
911{
912
913 struct rb_node *node = root->rb_node;
914 while (node) {
915 struct mapped_buffer_node *mapped_buffer =
916 container_of(node, struct mapped_buffer_node, node);
917 if (mapped_buffer->addr > addr) /* u64 cmp */
918 node = node->rb_left;
919 else if (mapped_buffer->addr != addr) /* u64 cmp */
920 node = node->rb_right;
921 else
922 return mapped_buffer;
923 }
924 return 0;
925}
926
927static struct mapped_buffer_node *find_mapped_buffer_range_locked(
928 struct rb_root *root, u64 addr)
929{
930 struct rb_node *node = root->rb_node;
931 while (node) {
932 struct mapped_buffer_node *m =
933 container_of(node, struct mapped_buffer_node, node);
934 if (m->addr <= addr && m->addr + m->size > addr)
935 return m;
936 else if (m->addr > addr) /* u64 cmp */
937 node = node->rb_left;
938 else
939 node = node->rb_right;
940 }
941 return 0;
942}
943
944#define BFR_ATTRS (sizeof(nvmap_bfr_param)/sizeof(nvmap_bfr_param[0]))
945
946struct buffer_attrs {
947 struct sg_table *sgt;
948 u64 size;
949 u64 align;
950 u32 ctag_offset;
951 u32 ctag_lines;
952 int pgsz_idx;
953 u8 kind_v;
954 u8 uc_kind_v;
955};
956
957static void gmmu_select_page_size(struct buffer_attrs *bfr)
958{
959 int i;
960 /* choose the biggest first (top->bottom) */
961 for (i = (gmmu_nr_page_sizes-1); i >= 0; i--)
962 if (!(gmmu_page_offset_masks[i] & bfr->align)) {
963 /* would like to add this too but nvmap returns the
964 * original requested size not the allocated size.
965 * (!(gmmu_page_offset_masks[i] & bfr->size)) */
966 bfr->pgsz_idx = i;
967 break;
968 }
969}
970
971static int setup_buffer_kind_and_compression(struct device *d,
972 u32 flags,
973 struct buffer_attrs *bfr,
974 enum gmmu_pgsz_gk20a pgsz_idx)
975{
976 bool kind_compressible;
977
978 if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
979 bfr->kind_v = gmmu_pte_kind_pitch_v();
980
981 if (unlikely(!gk20a_kind_is_supported(bfr->kind_v))) {
982 gk20a_err(d, "kind 0x%x not supported", bfr->kind_v);
983 return -EINVAL;
984 }
985
986 bfr->uc_kind_v = gmmu_pte_kind_invalid_v();
987 /* find a suitable uncompressed kind if it becomes necessary later */
988 kind_compressible = gk20a_kind_is_compressible(bfr->kind_v);
989 if (kind_compressible) {
990 bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v);
991 if (unlikely(bfr->uc_kind_v == gmmu_pte_kind_invalid_v())) {
992 /* shouldn't happen, but it is worth cross-checking */
993 gk20a_err(d, "comptag kind 0x%x can't be"
994 " downgraded to uncompressed kind",
995 bfr->kind_v);
996 return -EINVAL;
997 }
998 }
999 /* comptags only supported for suitable kinds, 128KB pagesize */
1000 if (unlikely(kind_compressible &&
1001 (gmmu_page_sizes[pgsz_idx] != 128*1024))) {
1002 /*
1003 gk20a_warn(d, "comptags specified"
1004 " but pagesize being used doesn't support it");*/
1005 /* it is safe to fall back to uncompressed as
1006 functionality is not harmed */
1007 bfr->kind_v = bfr->uc_kind_v;
1008 kind_compressible = false;
1009 }
1010 if (kind_compressible)
1011 bfr->ctag_lines = ALIGN(bfr->size, COMP_TAG_LINE_SIZE) >>
1012 COMP_TAG_LINE_SIZE_SHIFT;
1013 else
1014 bfr->ctag_lines = 0;
1015
1016 return 0;
1017}
1018
1019static int validate_fixed_buffer(struct vm_gk20a *vm,
1020 struct buffer_attrs *bfr,
1021 u64 map_offset)
1022{
1023 struct device *dev = dev_from_vm(vm);
1024 struct vm_reserved_va_node *va_node;
1025 struct mapped_buffer_node *buffer;
1026
1027 if (map_offset & gmmu_page_offset_masks[bfr->pgsz_idx]) {
1028 gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
1029 map_offset);
1030 return -EINVAL;
1031 }
1032
1033 /* find the space reservation */
1034 va_node = addr_to_reservation(vm, map_offset);
1035 if (!va_node) {
1036 gk20a_warn(dev, "fixed offset mapping without space allocation");
1037 return -EINVAL;
1038 }
1039
1040	/* check that this mapping does not collide with existing
1041 * mappings by checking the overlapping area between the current
1042 * buffer and all other mapped buffers */
1043
1044 list_for_each_entry(buffer,
1045 &va_node->va_buffers_list, va_buffers_list) {
1046 s64 begin = max(buffer->addr, map_offset);
1047 s64 end = min(buffer->addr +
1048 buffer->size, map_offset + bfr->size);
1049 if (end - begin > 0) {
1050 gk20a_warn(dev, "overlapping buffer map requested");
1051 return -EINVAL;
1052 }
1053 }
1054
1055 return 0;
1056}
1057
1058static u64 __locked_gmmu_map(struct vm_gk20a *vm,
1059 u64 map_offset,
1060 struct sg_table *sgt,
1061 u64 size,
1062 int pgsz_idx,
1063 u8 kind_v,
1064 u32 ctag_offset,
1065 u32 flags,
1066 int rw_flag)
1067{
1068 int err = 0, i = 0;
1069 u32 pde_lo, pde_hi;
1070 struct device *d = dev_from_vm(vm);
1071
1072 /* Allocate (or validate when map_offset != 0) the virtual address. */
1073 if (!map_offset) {
1074 map_offset = gk20a_vm_alloc_va(vm, size,
1075 pgsz_idx);
1076 if (!map_offset) {
1077 gk20a_err(d, "failed to allocate va space");
1078 err = -ENOMEM;
1079 goto fail;
1080 }
1081 }
1082
1083 pde_range_from_vaddr_range(vm,
1084 map_offset,
1085 map_offset + size - 1,
1086 &pde_lo, &pde_hi);
1087
1088 /* mark the addr range valid (but with 0 phys addr, which will fault) */
1089 for (i = pde_lo; i <= pde_hi; i++) {
1090 err = validate_gmmu_page_table_gk20a_locked(vm, i,
1091 pgsz_idx);
1092 if (err) {
1093 gk20a_err(d, "failed to validate page table %d: %d",
1094 i, err);
1095 goto fail;
1096 }
1097 }
1098
1099 err = update_gmmu_ptes_locked(vm, pgsz_idx,
1100 sgt,
1101 map_offset, map_offset + size - 1,
1102 kind_v,
1103 ctag_offset,
1104 flags &
1105 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1106 rw_flag);
1107 if (err) {
1108 gk20a_err(d, "failed to update ptes on map");
1109 goto fail;
1110 }
1111
1112 return map_offset;
1113 fail:
1114 gk20a_err(d, "%s: failed with err=%d\n", __func__, err);
1115 return 0;
1116}
1117
1118static void __locked_gmmu_unmap(struct vm_gk20a *vm,
1119 u64 vaddr,
1120 u64 size,
1121 int pgsz_idx,
1122 bool va_allocated,
1123 int rw_flag)
1124{
1125 int err = 0;
1126 struct gk20a *g = gk20a_from_vm(vm);
1127
1128 if (va_allocated) {
1129 err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx);
1130 if (err) {
1131 dev_err(dev_from_vm(vm),
1132 "failed to free va");
1133 return;
1134 }
1135 }
1136
1137 /* unmap here needs to know the page size we assigned at mapping */
1138 err = update_gmmu_ptes_locked(vm,
1139 pgsz_idx,
1140 0, /* n/a for unmap */
1141 vaddr,
1142 vaddr + size - 1,
1143 0, 0, false /* n/a for unmap */,
1144 rw_flag);
1145 if (err)
1146 dev_err(dev_from_vm(vm),
1147 "failed to update gmmu ptes on unmap");
1148
1149 /* detect which if any pdes/ptes can now be released */
1150
1151 /* flush l2 so any dirty lines are written out *now*.
1152 * also as we could potentially be switching this buffer
1153 * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
1154 * some point in the future we need to invalidate l2. e.g. switching
1155 * from a render buffer unmap (here) to later using the same memory
1156 * for gmmu ptes. note the positioning of this relative to any smmu
1157 * unmapping (below). */
1158
1159 gk20a_mm_l2_flush(g, true);
1160}
1161
1162static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
1163 struct dma_buf *dmabuf,
1164 u64 offset_align,
1165 u32 flags,
1166 int kind,
1167 struct sg_table **sgt,
1168 bool user_mapped,
1169 int rw_flag)
1170{
1171 struct mapped_buffer_node *mapped_buffer = 0;
1172
1173 mapped_buffer =
1174 find_mapped_buffer_reverse_locked(&vm->mapped_buffers,
1175 dmabuf, kind);
1176 if (!mapped_buffer)
1177 return 0;
1178
1179 if (mapped_buffer->flags != flags)
1180 return 0;
1181
1182 if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET &&
1183 mapped_buffer->addr != offset_align)
1184 return 0;
1185
1186 BUG_ON(mapped_buffer->vm != vm);
1187
1188 /* mark the buffer as used */
1189 if (user_mapped) {
1190 if (mapped_buffer->user_mapped == 0)
1191 vm->num_user_mapped_buffers++;
1192 mapped_buffer->user_mapped++;
1193
1194		/* If the mapping comes from user space, we own
1195		 * a reference on the dmabuf. Since we reuse an
1196		 * existing mapping here, we must drop that extra
1197		 * ref once so it does not leak.
1198		 */
1199 if (mapped_buffer->own_mem_ref)
1200 dma_buf_put(mapped_buffer->dmabuf);
1201 else
1202 mapped_buffer->own_mem_ref = true;
1203 }
1204 kref_get(&mapped_buffer->ref);
1205
1206 gk20a_dbg(gpu_dbg_map,
1207 "reusing as=%d pgsz=%d flags=0x%x ctags=%d "
1208 "start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x "
1209 "own_mem_ref=%d user_mapped=%d",
1210 vm_aspace_id(vm), mapped_buffer->pgsz_idx,
1211 mapped_buffer->flags,
1212 mapped_buffer->ctag_lines,
1213 mapped_buffer->ctag_offset,
1214 hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
1215 hi32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1216 lo32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1217 hi32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1218 lo32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1219 mapped_buffer->own_mem_ref, user_mapped);
1220
1221 if (sgt)
1222 *sgt = mapped_buffer->sgt;
1223 return mapped_buffer->addr;
1224}
1225
1226u64 gk20a_vm_map(struct vm_gk20a *vm,
1227 struct dma_buf *dmabuf,
1228 u64 offset_align,
1229 u32 flags /*NVHOST_AS_MAP_BUFFER_FLAGS_*/,
1230 int kind,
1231 struct sg_table **sgt,
1232 bool user_mapped,
1233 int rw_flag)
1234{
1235 struct gk20a *g = gk20a_from_vm(vm);
1236 struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
1237 struct device *d = dev_from_vm(vm);
1238 struct mapped_buffer_node *mapped_buffer = 0;
1239 bool inserted = false, va_allocated = false;
1240 u32 gmmu_page_size = 0;
1241 u64 map_offset = 0;
1242 int err = 0;
1243 struct buffer_attrs bfr = {0};
1244 struct gk20a_comptags comptags;
1245
1246 mutex_lock(&vm->update_gmmu_lock);
1247
1248 /* check if this buffer is already mapped */
1249 map_offset = gk20a_vm_map_duplicate_locked(vm, dmabuf, offset_align,
1250 flags, kind, sgt,
1251 user_mapped, rw_flag);
1252 if (map_offset) {
1253 mutex_unlock(&vm->update_gmmu_lock);
1254 return map_offset;
1255 }
1256
1257 /* pin buffer to get phys/iovmm addr */
1258 bfr.sgt = gk20a_mm_pin(d, dmabuf);
1259 if (IS_ERR(bfr.sgt)) {
1260 /* Falling back to physical is actually possible
1261 * here in many cases if we use 4K phys pages in the
1262 * gmmu. However we have some regions which require
1263 * contig regions to work properly (either phys-contig
1264 * or contig through smmu io_vaspace). Until we can
1265 * track the difference between those two cases we have
1266 * to fail the mapping when we run out of SMMU space.
1267 */
1268 gk20a_warn(d, "oom allocating tracking buffer");
1269 goto clean_up;
1270 }
1271
1272 if (sgt)
1273 *sgt = bfr.sgt;
1274
1275 bfr.kind_v = kind;
1276 bfr.size = dmabuf->size;
1277 bfr.align = 1 << __ffs((u64)sg_dma_address(bfr.sgt->sgl));
1278 bfr.pgsz_idx = -1;
1279
1280	/* If FIXED_OFFSET is set, the offset determines the page size.
1281	 * Otherwise, select the page size according to memory alignment */
1282 if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1283 bfr.pgsz_idx = NV_GMMU_VA_IS_UPPER(offset_align) ?
1284 gmmu_page_size_big : gmmu_page_size_small;
1285 } else {
1286 gmmu_select_page_size(&bfr);
1287 }
1288
1289 /* validate/adjust bfr attributes */
1290 if (unlikely(bfr.pgsz_idx == -1)) {
1291 gk20a_err(d, "unsupported page size detected");
1292 goto clean_up;
1293 }
1294
1295 if (unlikely(bfr.pgsz_idx < gmmu_page_size_small ||
1296 bfr.pgsz_idx > gmmu_page_size_big)) {
1297 BUG_ON(1);
1298 err = -EINVAL;
1299 goto clean_up;
1300 }
1301 gmmu_page_size = gmmu_page_sizes[bfr.pgsz_idx];
1302
1303 /* Check if we should use a fixed offset for mapping this buffer */
1304 if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1305 err = validate_fixed_buffer(vm, &bfr, offset_align);
1306 if (err)
1307 goto clean_up;
1308
1309 map_offset = offset_align;
1310 va_allocated = false;
1311 } else
1312 va_allocated = true;
1313
1314 if (sgt)
1315 *sgt = bfr.sgt;
1316
1317 err = setup_buffer_kind_and_compression(d, flags, &bfr, bfr.pgsz_idx);
1318 if (unlikely(err)) {
1319 gk20a_err(d, "failure setting up kind and compression");
1320 goto clean_up;
1321 }
1322
1323 /* bar1 and pmu vm don't need ctag */
1324 if (!vm->enable_ctag)
1325 bfr.ctag_lines = 0;
1326
1327 gk20a_get_comptags(d, dmabuf, &comptags);
1328
1329 if (bfr.ctag_lines && !comptags.lines) {
1330 /* allocate compression resources if needed */
1331 err = gk20a_alloc_comptags(d, dmabuf, ctag_allocator,
1332 bfr.ctag_lines);
1333 if (err) {
1334 /* ok to fall back here if we ran out */
1335 /* TBD: we can partially alloc ctags as well... */
1336 bfr.ctag_lines = bfr.ctag_offset = 0;
1337 bfr.kind_v = bfr.uc_kind_v;
1338 } else {
1339 gk20a_get_comptags(d, dmabuf, &comptags);
1340
1341 /* init/clear the ctag buffer */
1342 g->ops.ltc.clear_comptags(g,
1343 comptags.offset,
1344 comptags.offset + comptags.lines - 1);
1345 }
1346 }
1347
1348 /* store the comptag info */
1349 bfr.ctag_offset = comptags.offset;
1350
1351 /* update gmmu ptes */
1352 map_offset = __locked_gmmu_map(vm, map_offset,
1353 bfr.sgt,
1354 bfr.size,
1355 bfr.pgsz_idx,
1356 bfr.kind_v,
1357 bfr.ctag_offset,
1358 flags, rw_flag);
1359 if (!map_offset)
1360 goto clean_up;
1361
1362 gk20a_dbg(gpu_dbg_map,
1363 "as=%d pgsz=%d "
1364 "kind=0x%x kind_uc=0x%x flags=0x%x "
1365 "ctags=%d start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x",
1366 vm_aspace_id(vm), gmmu_page_size,
1367 bfr.kind_v, bfr.uc_kind_v, flags,
1368 bfr.ctag_lines, bfr.ctag_offset,
1369 hi32(map_offset), lo32(map_offset),
1370 hi32((u64)sg_dma_address(bfr.sgt->sgl)),
1371 lo32((u64)sg_dma_address(bfr.sgt->sgl)),
1372 hi32((u64)sg_phys(bfr.sgt->sgl)),
1373 lo32((u64)sg_phys(bfr.sgt->sgl)));
1374
1375#if defined(NVHOST_DEBUG)
1376 {
1377 int i;
1378 struct scatterlist *sg = NULL;
1379 gk20a_dbg(gpu_dbg_pte, "for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i)");
1380		for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i) {
1381 u64 da = sg_dma_address(sg);
1382 u64 pa = sg_phys(sg);
1383 u64 len = sg->length;
1384 gk20a_dbg(gpu_dbg_pte, "i=%d pa=0x%x,%08x da=0x%x,%08x len=0x%x,%08x",
1385 i, hi32(pa), lo32(pa), hi32(da), lo32(da),
1386 hi32(len), lo32(len));
1387 }
1388 }
1389#endif
1390
1391 /* keep track of the buffer for unmapping */
1392 /* TBD: check for multiple mapping of same buffer */
1393 mapped_buffer = kzalloc(sizeof(*mapped_buffer), GFP_KERNEL);
1394 if (!mapped_buffer) {
1395 gk20a_warn(d, "oom allocating tracking buffer");
1396 goto clean_up;
1397 }
1398 mapped_buffer->dmabuf = dmabuf;
1399 mapped_buffer->sgt = bfr.sgt;
1400 mapped_buffer->addr = map_offset;
1401 mapped_buffer->size = bfr.size;
1402 mapped_buffer->pgsz_idx = bfr.pgsz_idx;
1403 mapped_buffer->ctag_offset = bfr.ctag_offset;
1404 mapped_buffer->ctag_lines = bfr.ctag_lines;
1405 mapped_buffer->vm = vm;
1406 mapped_buffer->flags = flags;
1407 mapped_buffer->kind = kind;
1408 mapped_buffer->va_allocated = va_allocated;
1409 mapped_buffer->user_mapped = user_mapped ? 1 : 0;
1410 mapped_buffer->own_mem_ref = user_mapped;
1411 INIT_LIST_HEAD(&mapped_buffer->unmap_list);
1412 INIT_LIST_HEAD(&mapped_buffer->va_buffers_list);
1413 kref_init(&mapped_buffer->ref);
1414
1415 err = insert_mapped_buffer(&vm->mapped_buffers, mapped_buffer);
1416 if (err) {
1417 gk20a_err(d, "failed to insert into mapped buffer tree");
1418 goto clean_up;
1419 }
1420 inserted = true;
1421 if (user_mapped)
1422 vm->num_user_mapped_buffers++;
1423
1424 gk20a_dbg_info("allocated va @ 0x%llx", map_offset);
1425
1426 if (!va_allocated) {
1427 struct vm_reserved_va_node *va_node;
1428
1429 /* find the space reservation */
1430 va_node = addr_to_reservation(vm, map_offset);
1431 list_add_tail(&mapped_buffer->va_buffers_list,
1432 &va_node->va_buffers_list);
1433 mapped_buffer->va_node = va_node;
1434 }
1435
1436 mutex_unlock(&vm->update_gmmu_lock);
1437
1438 /* Invalidate kernel mappings immediately */
1439 if (vm_aspace_id(vm) == -1)
1440 gk20a_mm_tlb_invalidate(vm);
1441
1442 return map_offset;
1443
1444clean_up:
1445 if (inserted) {
1446 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
1447 if (user_mapped)
1448 vm->num_user_mapped_buffers--;
1449 }
1450 kfree(mapped_buffer);
1451 if (va_allocated)
1452 gk20a_vm_free_va(vm, map_offset, bfr.size, bfr.pgsz_idx);
1453 if (!IS_ERR(bfr.sgt))
1454 gk20a_mm_unpin(d, dmabuf, bfr.sgt);
1455
1456 mutex_unlock(&vm->update_gmmu_lock);
1457 gk20a_dbg_info("err=%d\n", err);
1458 return 0;
1459}
1460
1461u64 gk20a_gmmu_map(struct vm_gk20a *vm,
1462 struct sg_table **sgt,
1463 u64 size,
1464 u32 flags,
1465 int rw_flag)
1466{
1467 u64 vaddr;
1468
1469 mutex_lock(&vm->update_gmmu_lock);
1470 vaddr = __locked_gmmu_map(vm, 0, /* already mapped? - No */
1471 *sgt, /* sg table */
1472 size,
1473 0, /* page size index = 0 i.e. SZ_4K */
1474 0, /* kind */
1475 0, /* ctag_offset */
1476 flags, rw_flag);
1477 mutex_unlock(&vm->update_gmmu_lock);
1478 if (!vaddr) {
1479 gk20a_err(dev_from_vm(vm), "failed to allocate va space");
1480 return 0;
1481 }
1482
1483 /* Invalidate kernel mappings immediately */
1484 gk20a_mm_tlb_invalidate(vm);
1485
1486 return vaddr;
1487}
1488
1489void gk20a_gmmu_unmap(struct vm_gk20a *vm,
1490 u64 vaddr,
1491 u64 size,
1492 int rw_flag)
1493{
1494 mutex_lock(&vm->update_gmmu_lock);
1495 __locked_gmmu_unmap(vm,
1496 vaddr,
1497 size,
1498 0, /* page size 4K */
1499 true, /*va_allocated */
1500 rw_flag);
1501 mutex_unlock(&vm->update_gmmu_lock);
1502}
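/*
 * A rough usage sketch for the pair above (caller names are placeholders;
 * error handling omitted): given an sg_table "sgt" describing "size" bytes,
 *
 *	u64 va = gk20a_gmmu_map(vm, &sgt, size, 0, gk20a_mem_flag_none);
 *	if (!va)
 *		return -ENOMEM;
 *	...
 *	gk20a_gmmu_unmap(vm, va, size, gk20a_mem_flag_none);
 *
 * Note that gk20a_gmmu_map always maps with the small (4K) page size and
 * returns 0 rather than a negative error code on failure.
 */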
1503
1504phys_addr_t gk20a_get_phys_from_iova(struct device *d,
1505 u64 dma_addr)
1506{
1507 phys_addr_t phys;
1508 u64 iova;
1509
1510 struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
1511 if (!mapping)
1512 return dma_addr;
1513
1514 iova = dma_addr & PAGE_MASK;
1515 phys = iommu_iova_to_phys(mapping->domain, iova);
1516 return phys;
1517}
1518
1519/* get sg_table from already allocated buffer */
1520int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
1521 void *cpuva, u64 iova,
1522 size_t size)
1523{
1524 int err = 0;
1525 *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1526 if (!(*sgt)) {
1527 dev_err(d, "failed to allocate memory\n");
1528 err = -ENOMEM;
1529 goto fail;
1530 }
1531 err = dma_get_sgtable(d, *sgt,
1532 cpuva, iova,
1533 size);
1534 if (err) {
1535 dev_err(d, "failed to create sg table\n");
1536 goto fail;
1537 }
1538 sg_dma_address((*sgt)->sgl) = iova;
1539
1540 return 0;
1541 fail:
1542 if (*sgt) {
1543 kfree(*sgt);
1544 *sgt = NULL;
1545 }
1546 return err;
1547}
1548
1549int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
1550 struct page **pages, u64 iova,
1551 size_t size)
1552{
1553 int err = 0;
1554 *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1555 if (!(*sgt)) {
1556 dev_err(d, "failed to allocate memory\n");
1557 err = -ENOMEM;
1558 goto fail;
1559 }
1560 err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
1561 if (err) {
1562 dev_err(d, "failed to allocate sg_table\n");
1563 goto fail;
1564 }
1565 sg_set_page((*sgt)->sgl, *pages, size, 0);
1566 sg_dma_address((*sgt)->sgl) = iova;
1567
1568 return 0;
1569 fail:
1570 if (*sgt) {
1571 kfree(*sgt);
1572 *sgt = NULL;
1573 }
1574 return err;
1575}
1576
1577void gk20a_free_sgtable(struct sg_table **sgt)
1578{
1579 sg_free_table(*sgt);
1580 kfree(*sgt);
1581 *sgt = NULL;
1582}
1583
1584u64 gk20a_mm_iova_addr(struct scatterlist *sgl)
1585{
1586 u64 result = sg_phys(sgl);
1587#ifdef CONFIG_TEGRA_IOMMU_SMMU
1588 if (sg_dma_address(sgl) == DMA_ERROR_CODE)
1589 result = 0;
1590 else if (sg_dma_address(sgl)) {
1591 result = sg_dma_address(sgl) |
1592 1ULL << NV_MC_SMMU_VADDR_TRANSLATION_BIT;
1593 }
1594#endif
1595 return result;
1596}
1597
1598static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
1599 enum gmmu_pgsz_gk20a pgsz_idx,
1600 struct sg_table *sgt,
1601 u64 first_vaddr, u64 last_vaddr,
1602 u8 kind_v, u32 ctag_offset,
1603 bool cacheable,
1604 int rw_flag)
1605{
1606 int err;
1607 u32 pde_lo, pde_hi, pde_i;
1608 struct scatterlist *cur_chunk;
1609 unsigned int cur_offset;
1610 u32 pte_w[2] = {0, 0}; /* invalid pte */
1611 u32 ctag = ctag_offset;
1612 u32 ctag_incr;
1613 u32 page_size = gmmu_page_sizes[pgsz_idx];
1614 u64 addr = 0;
1615
1616 pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
1617 &pde_lo, &pde_hi);
1618
1619 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
1620 pgsz_idx, pde_lo, pde_hi);
1621
1622 /* If ctag_offset !=0 add 1 else add 0. The idea is to avoid a branch
1623 * below (per-pte). Note: this doesn't work unless page size (when
1624 * comptags are active) is 128KB. We have checks elsewhere for that. */
1625 ctag_incr = !!ctag_offset;
1626
1627 if (sgt)
1628 cur_chunk = sgt->sgl;
1629 else
1630 cur_chunk = NULL;
1631
1632 cur_offset = 0;
1633
1634 for (pde_i = pde_lo; pde_i <= pde_hi; pde_i++) {
1635 u32 pte_lo, pte_hi;
1636 u32 pte_cur;
1637 void *pte_kv_cur;
1638
1639 struct page_table_gk20a *pte = vm->pdes.ptes[pgsz_idx] + pde_i;
1640
1641 if (pde_i == pde_lo)
1642 pte_lo = pte_index_from_vaddr(vm, first_vaddr,
1643 pgsz_idx);
1644 else
1645 pte_lo = 0;
1646
1647 if ((pde_i != pde_hi) && (pde_hi != pde_lo))
1648 pte_hi = vm->mm->page_table_sizing[pgsz_idx].num_ptes-1;
1649 else
1650 pte_hi = pte_index_from_vaddr(vm, last_vaddr,
1651 pgsz_idx);
1652
1653 /* get cpu access to the ptes */
1654 err = map_gmmu_pages(pte->ref, pte->sgt, &pte_kv_cur,
1655 pte->size);
1656 if (err) {
1657 gk20a_err(dev_from_vm(vm),
1658 "couldn't map ptes for update as=%d pte_ref_cnt=%d",
1659 vm_aspace_id(vm), pte->ref_cnt);
1660 goto clean_up;
1661 }
1662
1663 gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
1664 for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
1665
1666 if (likely(sgt)) {
1667 u64 new_addr = gk20a_mm_iova_addr(cur_chunk);
1668 if (new_addr) {
1669 addr = new_addr;
1670 addr += cur_offset;
1671 }
1672
1673 pte_w[0] = gmmu_pte_valid_true_f() |
1674 gmmu_pte_address_sys_f(addr
1675 >> gmmu_pte_address_shift_v());
1676 pte_w[1] = gmmu_pte_aperture_video_memory_f() |
1677 gmmu_pte_kind_f(kind_v) |
1678 gmmu_pte_comptagline_f(ctag);
1679
1680 if (rw_flag == gk20a_mem_flag_read_only) {
1681 pte_w[0] |= gmmu_pte_read_only_true_f();
1682 pte_w[1] |=
1683 gmmu_pte_write_disable_true_f();
1684 } else if (rw_flag ==
1685 gk20a_mem_flag_write_only) {
1686 pte_w[1] |=
1687 gmmu_pte_read_disable_true_f();
1688 }
1689
1690 if (!cacheable)
1691 pte_w[1] |= gmmu_pte_vol_true_f();
1692
1693 pte->ref_cnt++;
1694
1695 gk20a_dbg(gpu_dbg_pte,
1696 "pte_cur=%d addr=0x%x,%08x kind=%d"
1697 " ctag=%d vol=%d refs=%d"
1698 " [0x%08x,0x%08x]",
1699 pte_cur, hi32(addr), lo32(addr),
1700 kind_v, ctag, !cacheable,
1701 pte->ref_cnt, pte_w[1], pte_w[0]);
1702
1703 ctag += ctag_incr;
1704 cur_offset += page_size;
1705 addr += page_size;
1706 while (cur_chunk &&
1707 cur_offset >= cur_chunk->length) {
1708 cur_offset -= cur_chunk->length;
1709 cur_chunk = sg_next(cur_chunk);
1710 }
1711
1712 } else {
1713 pte->ref_cnt--;
1714 gk20a_dbg(gpu_dbg_pte,
1715 "pte_cur=%d ref=%d [0x0,0x0]",
1716 pte_cur, pte->ref_cnt);
1717 }
1718
1719 gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 0, pte_w[0]);
1720 gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 1, pte_w[1]);
1721 }
1722
1723 unmap_gmmu_pages(pte->ref, pte->sgt, pte_kv_cur);
1724
1725 if (pte->ref_cnt == 0) {
1726 /* It can make sense to keep around one page table for
1727 * each flavor (empty)... in case a new map is coming
1728 * right back to alloc (and fill it in) again.
1729 * But: deferring unmapping should help with pathologic
1730 * unmap/map/unmap/map cases where we'd trigger pte
1731 * free/alloc/free/alloc.
1732 */
1733 free_gmmu_pages(vm, pte->ref, pte->sgt,
1734 vm->mm->page_table_sizing[pgsz_idx].order,
1735 pte->size);
1736 pte->ref = NULL;
1737
1738 /* rewrite pde */
1739 update_gmmu_pde_locked(vm, pde_i);
1740 }
1741
1742 }
1743
1744 smp_mb();
1745 vm->tlb_dirty = true;
1746 gk20a_dbg_fn("set tlb dirty");
1747
1748 return 0;
1749
1750clean_up:
1751 /*TBD: potentially rewrite above to pre-map everything it needs to
1752 * as that's the only way it can fail */
1753 return err;
1754
1755}
1756
1757
1758/* for gk20a the "video memory" apertures here are misnomers. */
1759static inline u32 big_valid_pde0_bits(u64 pte_addr)
1760{
1761 u32 pde0_bits =
1762 gmmu_pde_aperture_big_video_memory_f() |
1763 gmmu_pde_address_big_sys_f(
1764 (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1765 return pde0_bits;
1766}
1767static inline u32 small_valid_pde1_bits(u64 pte_addr)
1768{
1769 u32 pde1_bits =
1770 gmmu_pde_aperture_small_video_memory_f() |
1771 gmmu_pde_vol_small_true_f() | /* tbd: why? */
1772 gmmu_pde_address_small_sys_f(
1773 (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1774 return pde1_bits;
1775}
1776
1777/* Given the current state of the ptes associated with a pde,
1778 determine its value and write it out. There's no checking
1779 here to determine whether or not a change was actually
1780 made, so superfluous updates will cause unnecessary
1781 pde invalidations.
1782*/
1783static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i)
1784{
1785 bool small_valid, big_valid;
1786 u64 pte_addr[2] = {0, 0};
1787 struct page_table_gk20a *small_pte =
1788 vm->pdes.ptes[gmmu_page_size_small] + i;
1789 struct page_table_gk20a *big_pte =
1790 vm->pdes.ptes[gmmu_page_size_big] + i;
1791 u32 pde_v[2] = {0, 0};
1792 u32 *pde;
1793
1794 small_valid = small_pte && small_pte->ref;
1795 big_valid = big_pte && big_pte->ref;
1796
1797 if (small_valid)
1798 pte_addr[gmmu_page_size_small] =
1799 gk20a_mm_iova_addr(small_pte->sgt->sgl);
1800 if (big_valid)
1801 pte_addr[gmmu_page_size_big] =
1802 gk20a_mm_iova_addr(big_pte->sgt->sgl);
1803
1804 pde_v[0] = gmmu_pde_size_full_f();
1805 pde_v[0] |= big_valid ?
1806 big_valid_pde0_bits(pte_addr[gmmu_page_size_big])
1807 :
1808 (gmmu_pde_aperture_big_invalid_f());
1809
1810 pde_v[1] |= (small_valid ?
1811 small_valid_pde1_bits(pte_addr[gmmu_page_size_small])
1812 :
1813 (gmmu_pde_aperture_small_invalid_f() |
1814 gmmu_pde_vol_small_false_f())
1815 )
1816 |
1817 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1818 gmmu_pde_vol_big_false_f());
1819
1820 pde = pde_from_index(vm, i);
1821
1822 gk20a_mem_wr32(pde, 0, pde_v[0]);
1823 gk20a_mem_wr32(pde, 1, pde_v[1]);
1824
1825 smp_mb();
1826
1827 FLUSH_CPU_DCACHE(pde,
1828 sg_phys(vm->pdes.sgt->sgl) + (i*gmmu_pde__size_v()),
1829 sizeof(u32)*2);
1830
1831 gk20a_mm_l2_invalidate(vm->mm->g);
1832
1833 gk20a_dbg(gpu_dbg_pte, "pde:%d = 0x%x,0x%08x\n", i, pde_v[1], pde_v[0]);
1834
1835 vm->tlb_dirty = true;
1836}
1837
1838
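/* Back the range [vaddr, vaddr + num_pages * pgsz) with the VM's shared
 * zero page. Used for sparse allocations (and when unmapping a buffer that
 * lives inside a sparse reservation) so the range keeps a harmless mapping
 * instead of being left unmapped. */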
1839static int gk20a_vm_put_empty(struct vm_gk20a *vm, u64 vaddr,
1840 u32 num_pages, u32 pgsz_idx)
1841{
1842 struct mm_gk20a *mm = vm->mm;
1843 struct gk20a *g = mm->g;
1844 u32 pgsz = gmmu_page_sizes[pgsz_idx];
1845 u32 i;
1846 dma_addr_t iova;
1847
1848 /* allocate the zero page if the va does not already have one */
1849 if (!vm->zero_page_cpuva) {
1850 int err = 0;
1851 vm->zero_page_cpuva = dma_alloc_coherent(&g->dev->dev,
1852 mm->big_page_size,
1853 &iova,
1854 GFP_KERNEL);
1855 if (!vm->zero_page_cpuva) {
1856 dev_err(&g->dev->dev, "failed to allocate zero page\n");
1857 return -ENOMEM;
1858 }
1859
1860 vm->zero_page_iova = iova;
1861 err = gk20a_get_sgtable(&g->dev->dev, &vm->zero_page_sgt,
1862 vm->zero_page_cpuva, vm->zero_page_iova,
1863 mm->big_page_size);
1864 if (err) {
1865 dma_free_coherent(&g->dev->dev, mm->big_page_size,
1866 vm->zero_page_cpuva,
1867 vm->zero_page_iova);
1868 vm->zero_page_iova = 0;
1869 vm->zero_page_cpuva = NULL;
1870
1871 dev_err(&g->dev->dev, "failed to create sg table for zero page\n");
1872 return -ENOMEM;
1873 }
1874 }
1875
1876 for (i = 0; i < num_pages; i++) {
1877 u64 page_vaddr = __locked_gmmu_map(vm, vaddr,
1878 vm->zero_page_sgt, pgsz, pgsz_idx, 0, 0,
1879 NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET,
1880 gk20a_mem_flag_none);
1881
1882 if (!page_vaddr) {
1883 gk20a_err(dev_from_vm(vm), "failed to remap clean buffers!");
1884 goto err_unmap;
1885 }
1886 vaddr += pgsz;
1887 }
1888
1889 gk20a_mm_l2_flush(mm->g, true);
1890
1891 return 0;
1892
1893err_unmap:
1894
1895 WARN_ON(1);
1896 /* something went wrong. unmap pages */
1897 while (i--) {
1898 vaddr -= pgsz;
1899 __locked_gmmu_unmap(vm, vaddr, pgsz, pgsz_idx, 0,
1900 gk20a_mem_flag_none);
1901 }
1902
1903 return -EINVAL;
1904}
1905
1906/* NOTE! vm->update_gmmu_lock (which protects mapped_buffers) must be held */
1907static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
1908{
1909 struct vm_gk20a *vm = mapped_buffer->vm;
1910
1911 if (mapped_buffer->va_node &&
1912 mapped_buffer->va_node->sparse) {
1913 u64 vaddr = mapped_buffer->addr;
1914 u32 pgsz_idx = mapped_buffer->pgsz_idx;
1915 u32 num_pages = mapped_buffer->size >>
1916 gmmu_page_shifts[pgsz_idx];
1917
1918 /* there is little we can do if this fails... */
1919 gk20a_vm_put_empty(vm, vaddr, num_pages, pgsz_idx);
1920
1921 } else
1922 __locked_gmmu_unmap(vm,
1923 mapped_buffer->addr,
1924 mapped_buffer->size,
1925 mapped_buffer->pgsz_idx,
1926 mapped_buffer->va_allocated,
1927 gk20a_mem_flag_none);
1928
1929 gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
1930 vm_aspace_id(vm), gmmu_page_sizes[mapped_buffer->pgsz_idx],
1931 hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
1932 mapped_buffer->own_mem_ref);
1933
1934 gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
1935 mapped_buffer->sgt);
1936
1937 /* remove from the mapped buffer tree and the va buffers list, then free */
1938 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
1939 if (!list_empty(&mapped_buffer->va_buffers_list))
1940 list_del(&mapped_buffer->va_buffers_list);
1941
1942 /* keep track of mapped buffers */
1943 if (mapped_buffer->user_mapped)
1944 vm->num_user_mapped_buffers--;
1945
1946 if (mapped_buffer->own_mem_ref)
1947 dma_buf_put(mapped_buffer->dmabuf);
1948
1949 kfree(mapped_buffer);
1950
1951 return;
1952}
1953
1954void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset)
1955{
1956 struct device *d = dev_from_vm(vm);
1957 struct mapped_buffer_node *mapped_buffer;
1958
1959 mutex_lock(&vm->update_gmmu_lock);
1960 mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
1961 if (!mapped_buffer) {
1962 mutex_unlock(&vm->update_gmmu_lock);
1963 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
1964 return;
1965 }
1966 kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
1967 mutex_unlock(&vm->update_gmmu_lock);
1968}
1969
1970static void gk20a_vm_remove_support(struct vm_gk20a *vm)
1971{
1972 struct gk20a *g = vm->mm->g;
1973 struct mapped_buffer_node *mapped_buffer;
1974 struct vm_reserved_va_node *va_node, *va_node_tmp;
1975 struct rb_node *node;
1976
1977 gk20a_dbg_fn("");
1978 mutex_lock(&vm->update_gmmu_lock);
1979
1980 /* TBD: add a flag here for the unmap code to recognize teardown
1981 * and short-circuit any otherwise expensive operations. */
1982
1983 node = rb_first(&vm->mapped_buffers);
1984 while (node) {
1985 mapped_buffer =
1986 container_of(node, struct mapped_buffer_node, node);
1987 gk20a_vm_unmap_locked(mapped_buffer);
1988 node = rb_first(&vm->mapped_buffers);
1989 }
1990
1991 /* destroy remaining reserved memory areas */
1992 list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list,
1993 reserved_va_list) {
1994 list_del(&va_node->reserved_va_list);
1995 kfree(va_node);
1996 }
1997
1998 /* TBD: unmapping all buffers above may not actually free
1999 * all vm ptes. jettison them here for certain... */
2000
2001 unmap_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, vm->pdes.kv);
2002 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0, vm->pdes.size);
2003
2004 kfree(vm->pdes.ptes[gmmu_page_size_small]);
2005 kfree(vm->pdes.ptes[gmmu_page_size_big]);
2006 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
2007 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
2008
2009 mutex_unlock(&vm->update_gmmu_lock);
2010
2011 /* release zero page if used */
2012 if (vm->zero_page_cpuva)
2013 dma_free_coherent(&g->dev->dev, vm->mm->big_page_size,
2014 vm->zero_page_cpuva, vm->zero_page_iova);
2015
2016 /* vm is not used anymore. release it. */
2017 kfree(vm);
2018}
2019
2020static void gk20a_vm_remove_support_kref(struct kref *ref)
2021{
2022 struct vm_gk20a *vm = container_of(ref, struct vm_gk20a, ref);
2023 gk20a_vm_remove_support(vm);
2024}
2025
2026void gk20a_vm_get(struct vm_gk20a *vm)
2027{
2028 kref_get(&vm->ref);
2029}
2030
2031void gk20a_vm_put(struct vm_gk20a *vm)
2032{
2033 kref_put(&vm->ref, gk20a_vm_remove_support_kref);
2034}
2035
2036/* address space interfaces for the gk20a module */
2037int gk20a_vm_alloc_share(struct gk20a_as_share *as_share)
2038{
2039 struct gk20a_as *as = as_share->as;
2040 struct gk20a *g = gk20a_from_as(as);
2041 struct mm_gk20a *mm = &g->mm;
2042 struct vm_gk20a *vm;
2043 u64 vma_size;
2044 u32 num_pages, low_hole_pages;
2045 char name[32];
2046 int err;
2047
2048 gk20a_dbg_fn("");
2049
2050 vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2051 if (!vm)
2052 return -ENOMEM;
2053
2054 as_share->vm = vm;
2055
2056 vm->mm = mm;
2057 vm->as_share = as_share;
2058
2059 vm->big_pages = true;
2060
2061 vm->va_start = mm->pde_stride; /* create a one pde hole */
2062 vm->va_limit = mm->channel.size; /* note this means channel.size is
2063 really just the max */
2064 {
2065 u32 pde_lo, pde_hi;
2066 pde_range_from_vaddr_range(vm,
2067 0, vm->va_limit-1,
2068 &pde_lo, &pde_hi);
2069 vm->pdes.num_pdes = pde_hi + 1;
2070 }
2071
2072 vm->pdes.ptes[gmmu_page_size_small] =
2073 kzalloc(sizeof(struct page_table_gk20a) *
2074 vm->pdes.num_pdes, GFP_KERNEL);
2075
2076 vm->pdes.ptes[gmmu_page_size_big] =
2077 kzalloc(sizeof(struct page_table_gk20a) *
2078 vm->pdes.num_pdes, GFP_KERNEL);
2079
2080 if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2081 vm->pdes.ptes[gmmu_page_size_big]))
2082 return -ENOMEM;
2083
2084 gk20a_dbg_info("init space for va_limit=0x%llx num_pdes=%d",
2085 vm->va_limit, vm->pdes.num_pdes);
2086
2087 /* allocate the page table directory */
2088 err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2089 &vm->pdes.sgt, &vm->pdes.size);
2090 if (err)
2091 return -ENOMEM;
2092
2093 err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2094 vm->pdes.size);
2095 if (err) {
2096 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2097 vm->pdes.size);
2098 return -ENOMEM;
2099 }
2100 gk20a_dbg(gpu_dbg_pte, "pdes.kv = 0x%p, pdes.phys = 0x%llx",
2101 vm->pdes.kv,
2102 gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2103 /* we could release vm->pdes.kv but it's only one page... */
2104
2105
2106 /* low-half: alloc small pages */
2107 /* high-half: alloc big pages */
2108 vma_size = mm->channel.size >> 1;
2109
2110 snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2111 gmmu_page_sizes[gmmu_page_size_small]>>10);
2112 num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_small]);
2113
2114 /* num_pages above is without regard to the low-side hole. */
2115 low_hole_pages = (vm->va_start >>
2116 gmmu_page_shifts[gmmu_page_size_small]);
2117
2118 gk20a_allocator_init(&vm->vma[gmmu_page_size_small], name,
2119 low_hole_pages, /* start */
2120 num_pages - low_hole_pages, /* length */
2121 1); /* align */
2122
2123 snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2124 gmmu_page_sizes[gmmu_page_size_big]>>10);
2125
2126 num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_big]);
2127 gk20a_allocator_init(&vm->vma[gmmu_page_size_big], name,
2128 num_pages, /* start */
2129 num_pages, /* length */
2130 1); /* align */
2131
2132 vm->mapped_buffers = RB_ROOT;
2133
2134 mutex_init(&vm->update_gmmu_lock);
2135 kref_init(&vm->ref);
2136 INIT_LIST_HEAD(&vm->reserved_va_list);
2137
2138 vm->enable_ctag = true;
2139
2140 return 0;
2141}
2142
2143
2144int gk20a_vm_release_share(struct gk20a_as_share *as_share)
2145{
2146 struct vm_gk20a *vm = as_share->vm;
2147
2148 gk20a_dbg_fn("");
2149
2150 vm->as_share = NULL;
2151
2152 /* put as reference to vm */
2153 gk20a_vm_put(vm);
2154
2155 as_share->vm = NULL;
2156
2157 return 0;
2158}
2159
2160
2161int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2162 struct nvhost_as_alloc_space_args *args)
2163{
2164 int err = -ENOMEM;
2165 int pgsz_idx;
2166 u32 start_page_nr;
2167 struct gk20a_allocator *vma;
2168 struct vm_gk20a *vm = as_share->vm;
2169 struct vm_reserved_va_node *va_node;
2170 u64 vaddr_start = 0;
2171
2172 gk20a_dbg_fn("flags=0x%x pgsz=0x%x nr_pages=0x%x o/a=0x%llx",
2173 args->flags, args->page_size, args->pages,
2174 args->o_a.offset);
2175
2176 /* determine pagesz idx */
2177 for (pgsz_idx = gmmu_page_size_small;
2178 pgsz_idx < gmmu_nr_page_sizes;
2179 pgsz_idx++) {
2180 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2181 break;
2182 }
2183
2184 if (pgsz_idx >= gmmu_nr_page_sizes) {
2185 err = -EINVAL;
2186 goto clean_up;
2187 }
2188
2189 va_node = kzalloc(sizeof(*va_node), GFP_KERNEL);
2190 if (!va_node) {
2191 err = -ENOMEM;
2192 goto clean_up;
2193 }
2194
2195 if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE &&
2196 pgsz_idx != gmmu_page_size_big) {
2197 err = -ENOSYS;
2198 kfree(va_node);
2199 goto clean_up;
2200 }
2201
2202 start_page_nr = 0;
2203 if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
2204 start_page_nr = (u32)(args->o_a.offset >>
2205 gmmu_page_shifts[pgsz_idx]);
2206
2207 vma = &vm->vma[pgsz_idx];
2208 err = vma->alloc(vma, &start_page_nr, args->pages);
2209 if (err) {
2210 kfree(va_node);
2211 goto clean_up;
2212 }
2213
2214 vaddr_start = (u64)start_page_nr << gmmu_page_shifts[pgsz_idx];
2215
2216 va_node->vaddr_start = vaddr_start;
2217 va_node->size = (u64)args->page_size * (u64)args->pages;
2218 va_node->pgsz_idx = pgsz_idx;
2219 INIT_LIST_HEAD(&va_node->va_buffers_list);
2220 INIT_LIST_HEAD(&va_node->reserved_va_list);
2221
2222 mutex_lock(&vm->update_gmmu_lock);
2223
2224 /* mark that we need to use sparse mappings here */
2225 if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE) {
2226 err = gk20a_vm_put_empty(vm, vaddr_start, args->pages,
2227 pgsz_idx);
2228 if (err) {
2229 mutex_unlock(&vm->update_gmmu_lock);
2230 vma->free(vma, start_page_nr, args->pages);
2231 kfree(va_node);
2232 goto clean_up;
2233 }
2234
2235 va_node->sparse = true;
2236 }
2237
2238 list_add_tail(&va_node->reserved_va_list, &vm->reserved_va_list);
2239
2240 mutex_unlock(&vm->update_gmmu_lock);
2241
2242 args->o_a.offset = vaddr_start;
2243
2244clean_up:
2245 return err;
2246}
2247
2248int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2249 struct nvhost_as_free_space_args *args)
2250{
2251 int err = -ENOMEM;
2252 int pgsz_idx;
2253 u32 start_page_nr;
2254 struct gk20a_allocator *vma;
2255 struct vm_gk20a *vm = as_share->vm;
2256 struct vm_reserved_va_node *va_node;
2257
2258 gk20a_dbg_fn("pgsz=0x%x nr_pages=0x%x o/a=0x%llx", args->page_size,
2259 args->pages, args->offset);
2260
2261 /* determine pagesz idx */
2262 for (pgsz_idx = gmmu_page_size_small;
2263 pgsz_idx < gmmu_nr_page_sizes;
2264 pgsz_idx++) {
2265 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2266 break;
2267 }
2268
2269 if (pgsz_idx >= gmmu_nr_page_sizes) {
2270 err = -EINVAL;
2271 goto clean_up;
2272 }
2273
2274 start_page_nr = (u32)(args->offset >>
2275 gmmu_page_shifts[pgsz_idx]);
2276
2277 vma = &vm->vma[pgsz_idx];
2278 err = vma->free(vma, start_page_nr, args->pages);
2279
2280 if (err)
2281 goto clean_up;
2282
2283 mutex_lock(&vm->update_gmmu_lock);
2284 va_node = addr_to_reservation(vm, args->offset);
2285 if (va_node) {
2286 struct mapped_buffer_node *buffer, *buffer_tmp;
2287
2288 /* there is no need to unmap the buffers in the va range. Just
2289 * convert them into normal buffers */
2290
2291 list_for_each_entry_safe(buffer, buffer_tmp,
2292 &va_node->va_buffers_list, va_buffers_list)
2293 list_del_init(&buffer->va_buffers_list);
2294
2295 list_del(&va_node->reserved_va_list);
2296
2297 /* if this was a sparse mapping, free the va */
2298 if (va_node->sparse)
2299 __locked_gmmu_unmap(vm,
2300 va_node->vaddr_start,
2301 va_node->size,
2302 va_node->pgsz_idx,
2303 false,
2304 gk20a_mem_flag_none);
2305 kfree(va_node);
2306 }
2307 mutex_unlock(&vm->update_gmmu_lock);
2308
2309clean_up:
2310 return err;
2311}
2312
2313int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
2314 struct channel_gk20a *ch)
2315{
2316 int err = 0;
2317 struct vm_gk20a *vm = as_share->vm;
2318
2319 gk20a_dbg_fn("");
2320
2321 ch->vm = vm;
2322 err = channel_gk20a_commit_va(ch);
2323 if (err)
2324 ch->vm = NULL;
2325
2326 return err;
2327}
2328
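/* Attach gk20a driver data to a dmabuf if it is not already present. The
 * first lookup runs unlocked for the common already-attached case; the
 * allocation path re-checks under priv_lock so concurrent callers cannot
 * attach twice (double-checked locking). */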
2329int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev)
2330{
2331 struct gk20a_dmabuf_priv *priv;
2332 static DEFINE_MUTEX(priv_lock);
2333
2334 priv = dma_buf_get_drvdata(dmabuf, dev);
2335 if (likely(priv))
2336 return 0;
2337
2338 mutex_lock(&priv_lock);
2339 priv = dma_buf_get_drvdata(dmabuf, dev);
2340 if (priv)
2341 goto priv_exist_or_err;
2342 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2343 if (!priv) {
2344 priv = ERR_PTR(-ENOMEM);
2345 goto priv_exist_or_err;
2346 }
2347 mutex_init(&priv->lock);
2348 dma_buf_set_drvdata(dmabuf, dev, priv, gk20a_mm_delete_priv);
2349priv_exist_or_err:
2350 mutex_unlock(&priv_lock);
2351 if (IS_ERR(priv))
2352 return -ENOMEM;
2353
2354 return 0;
2355}
2356
2357
2358static int gk20a_dmabuf_get_kind(struct dma_buf *dmabuf)
2359{
2360 int kind = 0;
2361#ifdef CONFIG_TEGRA_NVMAP
2362 int err;
2363 u64 nvmap_param;
2364
2365 err = nvmap_get_dmabuf_param(dmabuf, NVMAP_HANDLE_PARAM_KIND,
2366 &nvmap_param);
2367 kind = err ? kind : nvmap_param;
2368#endif
2369 return kind;
2370}
2371
2372int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
2373 int dmabuf_fd,
2374 u64 *offset_align,
2375 u32 flags, /*NVHOST_AS_MAP_BUFFER_FLAGS_*/
2376 int kind)
2377{
2378 int err = 0;
2379 struct vm_gk20a *vm = as_share->vm;
2380 struct dma_buf *dmabuf;
2381 u64 ret_va;
2382
2383 gk20a_dbg_fn("");
2384
2385 /* get ref to the mem handle (released on unmap_locked) */
2386 dmabuf = dma_buf_get(dmabuf_fd);
2387 if (IS_ERR(dmabuf))
2388 return PTR_ERR(dmabuf);
2389
2390 err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev_from_vm(vm));
2391 if (err) {
2392 dma_buf_put(dmabuf);
2393 return err;
2394 }
2395
2396 if (kind == -1)
2397 kind = gk20a_dmabuf_get_kind(dmabuf);
2398
2399 ret_va = gk20a_vm_map(vm, dmabuf, *offset_align,
2400 flags, kind, NULL, true,
2401 gk20a_mem_flag_none);
2402 *offset_align = ret_va;
2403 if (!ret_va) {
2404 dma_buf_put(dmabuf);
2405 err = -EINVAL;
2406 }
2407
2408 return err;
2409}
2410
2411int gk20a_vm_unmap_buffer(struct gk20a_as_share *as_share, u64 offset)
2412{
2413 struct vm_gk20a *vm = as_share->vm;
2414
2415 gk20a_dbg_fn("");
2416
2417 gk20a_vm_unmap_user(vm, offset);
2418 return 0;
2419}
2420
2421int gk20a_init_bar1_vm(struct mm_gk20a *mm)
2422{
2423 int err;
2424 phys_addr_t inst_pa;
2425 void *inst_ptr;
2426 struct vm_gk20a *vm = &mm->bar1.vm;
2427 struct gk20a *g = gk20a_from_mm(mm);
2428 struct device *d = dev_from_gk20a(g);
2429 struct inst_desc *inst_block = &mm->bar1.inst_block;
2430 u64 pde_addr;
2431 u32 pde_addr_lo;
2432 u32 pde_addr_hi;
2433 dma_addr_t iova;
2434
2435 vm->mm = mm;
2436
2437 mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
2438
2439 gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
2440
2441 vm->va_start = mm->pde_stride * 1;
2442 vm->va_limit = mm->bar1.aperture_size;
2443
2444 {
2445 u32 pde_lo, pde_hi;
2446 pde_range_from_vaddr_range(vm,
2447 0, vm->va_limit-1,
2448 &pde_lo, &pde_hi);
2449 vm->pdes.num_pdes = pde_hi + 1;
2450 }
2451
2452 /* bar1 is likely only ever to use/need small page sizes. */
2453 /* But just in case, arrange for both for now. */
2454 vm->pdes.ptes[gmmu_page_size_small] =
2455 kzalloc(sizeof(struct page_table_gk20a) *
2456 vm->pdes.num_pdes, GFP_KERNEL);
2457
2458 vm->pdes.ptes[gmmu_page_size_big] =
2459 kzalloc(sizeof(struct page_table_gk20a) *
2460 vm->pdes.num_pdes, GFP_KERNEL);
2461
2462 if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2463 vm->pdes.ptes[gmmu_page_size_big]))
2464 return -ENOMEM;
2465
2466 gk20a_dbg_info("init space for bar1 va_limit=0x%llx num_pdes=%d",
2467 vm->va_limit, vm->pdes.num_pdes);
2468
2469
2470 /* allocate the page table directory */
2471 err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2472 &vm->pdes.sgt, &vm->pdes.size);
2473 if (err)
2474 goto clean_up;
2475
2476 err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2477 vm->pdes.size);
2478 if (err) {
2479 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2480 vm->pdes.size);
2481 goto clean_up;
2482 }
2483 gk20a_dbg(gpu_dbg_pte, "bar 1 pdes.kv = 0x%p, pdes.phys = 0x%llx",
2484 vm->pdes.kv, gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2485 /* we could release vm->pdes.kv but it's only one page... */
2486
2487 pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2488 pde_addr_lo = u64_lo32(pde_addr >> 12);
2489 pde_addr_hi = u64_hi32(pde_addr);
2490
2491 gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2492 (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl),
2493 pde_addr_lo, pde_addr_hi);
2494
2495 /* allocate instance mem for bar1 */
2496 inst_block->size = ram_in_alloc_size_v();
2497 inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2498 &iova, GFP_KERNEL);
2499 if (!inst_block->cpuva) {
2500 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2501 err = -ENOMEM;
2502 goto clean_up;
2503 }
2504
2505 inst_block->iova = iova;
2506 inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2507 if (!inst_block->cpu_pa) {
2508 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2509 err = -ENOMEM;
2510 goto clean_up;
2511 }
2512
2513 inst_pa = inst_block->cpu_pa;
2514 inst_ptr = inst_block->cpuva;
2515
2516 gk20a_dbg_info("bar1 inst block physical phys = 0x%llx, kv = 0x%p",
2517 (u64)inst_pa, inst_ptr);
2518
2519 memset(inst_ptr, 0, ram_fc_size_val_v());
2520
2521 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2522 ram_in_page_dir_base_target_vid_mem_f() |
2523 ram_in_page_dir_base_vol_true_f() |
2524 ram_in_page_dir_base_lo_f(pde_addr_lo));
2525
2526 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2527 ram_in_page_dir_base_hi_f(pde_addr_hi));
2528
2529 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2530 u64_lo32(vm->va_limit) | 0xFFF);
2531
2532 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2533 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2534
2535 gk20a_dbg_info("bar1 inst block ptr: %08llx", (u64)inst_pa);
2536 gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_bar1",
2537 1,/*start*/
2538 (vm->va_limit >> 12) - 1 /* length*/,
2539 1); /* align */
2540 /* initialize just in case we try to use it anyway */
2541 gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_bar1-unused",
2542 0x0badc0de, /* start */
2543 1, /* length */
2544 1); /* align */
2545
2546 vm->mapped_buffers = RB_ROOT;
2547
2548 mutex_init(&vm->update_gmmu_lock);
2549 kref_init(&vm->ref);
2550 INIT_LIST_HEAD(&vm->reserved_va_list);
2551
2552 return 0;
2553
2554clean_up:
2555 /* free, etc */
2556 if (inst_block->cpuva)
2557 dma_free_coherent(d, inst_block->size,
2558 inst_block->cpuva, inst_block->iova);
2559 inst_block->cpuva = NULL;
2560 inst_block->iova = 0;
2561 return err;
2562}
2563
2564/* pmu vm, share channel_vm interfaces */
2565int gk20a_init_pmu_vm(struct mm_gk20a *mm)
2566{
2567 int err;
2568 phys_addr_t inst_pa;
2569 void *inst_ptr;
2570 struct vm_gk20a *vm = &mm->pmu.vm;
2571 struct gk20a *g = gk20a_from_mm(mm);
2572 struct device *d = dev_from_gk20a(g);
2573 struct inst_desc *inst_block = &mm->pmu.inst_block;
2574 u64 pde_addr;
2575 u32 pde_addr_lo;
2576 u32 pde_addr_hi;
2577 dma_addr_t iova;
2578
2579 vm->mm = mm;
2580
2581 mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
2582
2583 gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);
2584
2585 vm->va_start = GK20A_PMU_VA_START;
2586 vm->va_limit = vm->va_start + mm->pmu.aperture_size;
2587
2588 {
2589 u32 pde_lo, pde_hi;
2590 pde_range_from_vaddr_range(vm,
2591 0, vm->va_limit-1,
2592 &pde_lo, &pde_hi);
2593 vm->pdes.num_pdes = pde_hi + 1;
2594 }
2595
2596 /* The pmu is likely only ever to use/need small page sizes. */
2597 /* But just in case, arrange for both for now. */
2598 vm->pdes.ptes[gmmu_page_size_small] =
2599 kzalloc(sizeof(struct page_table_gk20a) *
2600 vm->pdes.num_pdes, GFP_KERNEL);
2601
2602 vm->pdes.ptes[gmmu_page_size_big] =
2603 kzalloc(sizeof(struct page_table_gk20a) *
2604 vm->pdes.num_pdes, GFP_KERNEL);
2605
2606 if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2607 vm->pdes.ptes[gmmu_page_size_big]))
2608 return -ENOMEM;
2609
2610 gk20a_dbg_info("init space for pmu va_limit=0x%llx num_pdes=%d",
2611 vm->va_limit, vm->pdes.num_pdes);
2612
2613 /* allocate the page table directory */
2614 err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2615 &vm->pdes.sgt, &vm->pdes.size);
2616 if (err)
2617 goto clean_up;
2618
2619 err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2620 vm->pdes.size);
2621 if (err) {
2622 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2623 vm->pdes.size);
2624 goto clean_up;
2625 }
2626 gk20a_dbg_info("pmu pdes phys @ 0x%llx",
2627 (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2628 /* we could release vm->pdes.kv but it's only one page... */
2629
2630 pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2631 pde_addr_lo = u64_lo32(pde_addr >> 12);
2632 pde_addr_hi = u64_hi32(pde_addr);
2633
2634 gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2635 (u64)pde_addr, pde_addr_lo, pde_addr_hi);
2636
2637 /* allocate instance mem for pmu */
2638 inst_block->size = GK20A_PMU_INST_SIZE;
2639 inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2640 &iova, GFP_KERNEL);
2641 if (!inst_block->cpuva) {
2642 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2643 err = -ENOMEM;
2644 goto clean_up;
2645 }
2646
2647 inst_block->iova = iova;
2648 inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2649 if (!inst_block->cpu_pa) {
2650 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2651 err = -ENOMEM;
2652 goto clean_up;
2653 }
2654
2655 inst_pa = inst_block->cpu_pa;
2656 inst_ptr = inst_block->cpuva;
2657
2658 gk20a_dbg_info("pmu inst block physical addr: 0x%llx", (u64)inst_pa);
2659
2660 memset(inst_ptr, 0, GK20A_PMU_INST_SIZE);
2661
2662 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2663 ram_in_page_dir_base_target_vid_mem_f() |
2664 ram_in_page_dir_base_vol_true_f() |
2665 ram_in_page_dir_base_lo_f(pde_addr_lo));
2666
2667 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2668 ram_in_page_dir_base_hi_f(pde_addr_hi));
2669
2670 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2671 u64_lo32(vm->va_limit) | 0xFFF);
2672
2673 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2674 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2675
2676 gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_pmu",
2677 (vm->va_start >> 12), /* start */
2678 (vm->va_limit - vm->va_start) >> 12, /*length*/
2679 1); /* align */
2680 /* initialize just in case we try to use it anyway */
2681 gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_pmu-unused",
2682 0x0badc0de, /* start */
2683 1, /* length */
2684 1); /* align */
2685
2686
2687 vm->mapped_buffers = RB_ROOT;
2688
2689 mutex_init(&vm->update_gmmu_lock);
2690 kref_init(&vm->ref);
2691 INIT_LIST_HEAD(&vm->reserved_va_list);
2692
2693 return 0;
2694
2695clean_up:
2696 /* free, etc */
2697 if (inst_block->cpuva)
2698 dma_free_coherent(d, inst_block->size,
2699 inst_block->cpuva, inst_block->iova);
2700 inst_block->cpuva = NULL;
2701 inst_block->iova = 0;
2702 return err;
2703}
2704
2705void gk20a_mm_fb_flush(struct gk20a *g)
2706{
2707 struct mm_gk20a *mm = &g->mm;
2708 u32 data;
2709 s32 retry = 100;
2710
2711 gk20a_dbg_fn("");
2712
2713 mutex_lock(&mm->l2_op_lock);
2714
2715 g->ops.ltc.elpg_flush(g);
2716
2717 /* Make sure all previous writes are committed to the L2. There's no
2718 guarantee that writes are to DRAM. This will be a sysmembar internal
2719 to the L2. */
2720 gk20a_writel(g, flush_fb_flush_r(),
2721 flush_fb_flush_pending_busy_f());
2722
2723 do {
2724 data = gk20a_readl(g, flush_fb_flush_r());
2725
2726 if (flush_fb_flush_outstanding_v(data) ==
2727 flush_fb_flush_outstanding_true_v() ||
2728 flush_fb_flush_pending_v(data) ==
2729 flush_fb_flush_pending_busy_v()) {
2730 gk20a_dbg_info("fb_flush 0x%x", data);
2731 retry--;
2732 usleep_range(20, 40);
2733 } else
2734 break;
2735 } while (retry >= 0 || !tegra_platform_is_silicon());
2736
2737 if (retry < 0)
2738 gk20a_warn(dev_from_gk20a(g),
2739 "fb_flush too many retries");
2740
2741 mutex_unlock(&mm->l2_op_lock);
2742}
2743
2744static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
2745{
2746 u32 data;
2747 s32 retry = 200;
2748
2749 /* Invalidate any clean lines from the L2 so subsequent reads go to
2750 DRAM. Dirty lines are not affected by this operation. */
2751 gk20a_writel(g, flush_l2_system_invalidate_r(),
2752 flush_l2_system_invalidate_pending_busy_f());
2753
2754 do {
2755 data = gk20a_readl(g, flush_l2_system_invalidate_r());
2756
2757 if (flush_l2_system_invalidate_outstanding_v(data) ==
2758 flush_l2_system_invalidate_outstanding_true_v() ||
2759 flush_l2_system_invalidate_pending_v(data) ==
2760 flush_l2_system_invalidate_pending_busy_v()) {
2761 gk20a_dbg_info("l2_system_invalidate 0x%x",
2762 data);
2763 retry--;
2764 usleep_range(20, 40);
2765 } else
2766 break;
2767 } while (retry >= 0 || !tegra_platform_is_silicon());
2768
2769 if (retry < 0)
2770 gk20a_warn(dev_from_gk20a(g),
2771 "l2_system_invalidate too many retries");
2772}
2773
2774void gk20a_mm_l2_invalidate(struct gk20a *g)
2775{
2776 struct mm_gk20a *mm = &g->mm;
2777 mutex_lock(&mm->l2_op_lock);
2778 gk20a_mm_l2_invalidate_locked(g);
2779 mutex_unlock(&mm->l2_op_lock);
2780}
2781
2782void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
2783{
2784 struct mm_gk20a *mm = &g->mm;
2785 u32 data;
2786 s32 retry = 200;
2787
2788 gk20a_dbg_fn("");
2789
2790 mutex_lock(&mm->l2_op_lock);
2791
2792 /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
2793 as clean, so subsequent reads might hit in the L2. */
2794 gk20a_writel(g, flush_l2_flush_dirty_r(),
2795 flush_l2_flush_dirty_pending_busy_f());
2796
2797 do {
2798 data = gk20a_readl(g, flush_l2_flush_dirty_r());
2799
2800 if (flush_l2_flush_dirty_outstanding_v(data) ==
2801 flush_l2_flush_dirty_outstanding_true_v() ||
2802 flush_l2_flush_dirty_pending_v(data) ==
2803 flush_l2_flush_dirty_pending_busy_v()) {
2804 gk20a_dbg_info("l2_flush_dirty 0x%x", data);
2805 retry--;
2806 usleep_range(20, 40);
2807 } else
2808 break;
2809 } while (retry >= 0 || !tegra_platform_is_silicon());
2810
2811 if (retry < 0)
2812 gk20a_warn(dev_from_gk20a(g),
2813 "l2_flush_dirty too many retries");
2814
2815 if (invalidate)
2816 gk20a_mm_l2_invalidate_locked(g);
2817
2818 mutex_unlock(&mm->l2_op_lock);
2819}
2820
2821
2822int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
2823 struct dma_buf **dmabuf,
2824 u64 *offset)
2825{
2826 struct mapped_buffer_node *mapped_buffer;
2827
2828 gk20a_dbg_fn("gpu_va=0x%llx", gpu_va);
2829
2830 mutex_lock(&vm->update_gmmu_lock);
2831
2832 mapped_buffer = find_mapped_buffer_range_locked(&vm->mapped_buffers,
2833 gpu_va);
2834 if (!mapped_buffer) {
2835 mutex_unlock(&vm->update_gmmu_lock);
2836 return -EINVAL;
2837 }
2838
2839 *dmabuf = mapped_buffer->dmabuf;
2840 *offset = gpu_va - mapped_buffer->addr;
2841
2842 mutex_unlock(&vm->update_gmmu_lock);
2843
2844 return 0;
2845}
2846
2847void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
2848{
2849 struct mm_gk20a *mm = vm->mm;
2850 struct gk20a *g = gk20a_from_vm(vm);
2851 u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->pdes.sgt->sgl) >> 12);
2852 u32 data;
2853 s32 retry = 200;
2854
2855 gk20a_dbg_fn("");
2856
2857 /* pagetables are considered sw state and are preserved across
2858 prepare_poweroff. When gk20a deinit releases those pagetables,
2859 common code in the vm unmap path calls tlb invalidate, which
2860 touches hw. Use the power_on flag to skip tlb invalidation when
2861 gpu power is turned off */
2862
2863 if (!g->power_on)
2864 return;
2865
2866 /* No need to invalidate if tlb is clean */
2867 mutex_lock(&vm->update_gmmu_lock);
2868 if (!vm->tlb_dirty) {
2869 mutex_unlock(&vm->update_gmmu_lock);
2870 return;
2871 }
2872 vm->tlb_dirty = false;
2873 mutex_unlock(&vm->update_gmmu_lock);
2874
2875 mutex_lock(&mm->tlb_lock);
2876 do {
2877 data = gk20a_readl(g, fb_mmu_ctrl_r());
2878 if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
2879 break;
2880 usleep_range(20, 40);
2881 retry--;
2882 } while (retry >= 0 || !tegra_platform_is_silicon());
2883
2884 if (retry < 0)
2885 gk20a_warn(dev_from_gk20a(g),
2886 "wait mmu fifo space too many retries");
2887
2888 gk20a_writel(g, fb_mmu_invalidate_pdb_r(),
2889 fb_mmu_invalidate_pdb_addr_f(addr_lo) |
2890 fb_mmu_invalidate_pdb_aperture_vid_mem_f());
2891
2892 /* this is a sledgehammer, it would seem */
2893 gk20a_writel(g, fb_mmu_invalidate_r(),
2894 fb_mmu_invalidate_all_pdb_true_f() |
2895 fb_mmu_invalidate_all_va_true_f() |
2896 fb_mmu_invalidate_trigger_true_f());
2897
2898 do {
2899 data = gk20a_readl(g, fb_mmu_ctrl_r());
2900 if (fb_mmu_ctrl_pri_fifo_empty_v(data) !=
2901 fb_mmu_ctrl_pri_fifo_empty_false_f())
2902 break;
2903 retry--;
2904 usleep_range(20, 40);
2905 } while (retry >= 0 || !tegra_platform_is_silicon());
2906
2907 if (retry < 0)
2908 gk20a_warn(dev_from_gk20a(g),
2909 "mmu invalidate too many retries");
2910
2911 mutex_unlock(&mm->tlb_lock);
2912}
2913
2914int gk20a_mm_suspend(struct gk20a *g)
2915{
2916 gk20a_dbg_fn("");
2917
2918 gk20a_mm_fb_flush(g);
2919 gk20a_mm_l2_flush(g, true);
2920
2921 gk20a_dbg_fn("done");
2922 return 0;
2923}
2924
2925void gk20a_mm_ltc_isr(struct gk20a *g)
2926{
2927 u32 intr;
2928
2929 intr = gk20a_readl(g, ltc_ltc0_ltss_intr_r());
2930 gk20a_err(dev_from_gk20a(g), "ltc: %08x\n", intr);
2931 gk20a_writel(g, ltc_ltc0_ltss_intr_r(), intr);
2932}
2933
2934bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g)
2935{
2936 u32 debug_ctrl = gk20a_readl(g, fb_mmu_debug_ctrl_r());
2937 return fb_mmu_debug_ctrl_debug_v(debug_ctrl) ==
2938 fb_mmu_debug_ctrl_debug_enabled_v();
2939}
2940
2941static int gk20a_mm_mmu_vpr_info_fetch_wait(struct gk20a *g,
2942 const unsigned int msec)
2943{
2944 unsigned long timeout;
2945
2946 timeout = jiffies + msecs_to_jiffies(msec);
2947 while (1) {
2948 u32 val;
2949
2950 val = gk20a_readl(g, fb_mmu_vpr_info_r());
2951 if (fb_mmu_vpr_info_fetch_v(val) ==
2952 fb_mmu_vpr_info_fetch_false_v())
2953 break;
2954
2955 if (tegra_platform_is_silicon() &&
2956 WARN_ON(time_after(jiffies, timeout)))
2957 return -ETIME;
2958 }
2959
2960 return 0;
2961}
2962
2963int gk20a_mm_mmu_vpr_info_fetch(struct gk20a *g)
2964{
2965 int ret = 0;
2966
2967 gk20a_busy_noresume(g->dev);
2968 if (!pm_runtime_active(&g->dev->dev))
2969 goto fail;
2970
2971 if (gk20a_mm_mmu_vpr_info_fetch_wait(g, 5)) {
2972 ret = -ETIME;
2973 goto fail;
2974 }
2975
2976 gk20a_writel(g, fb_mmu_vpr_info_r(),
2977 fb_mmu_vpr_info_fetch_true_v());
2978
2979 ret = gk20a_mm_mmu_vpr_info_fetch_wait(g, 5);
2980
2981 fail:
2982 gk20a_idle(g->dev);
2983 return ret;
2984}
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
new file mode 100644
index 00000000..23d15c23
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -0,0 +1,464 @@
1/*
2 * drivers/video/tegra/host/gk20a/mm_gk20a.h
3 *
4 * GK20A memory management
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef __MM_GK20A_H__
22#define __MM_GK20A_H__
23
24#include <linux/scatterlist.h>
25#include <linux/dma-attrs.h>
26#include <linux/iommu.h>
27#include <asm/dma-iommu.h>
28#include "gk20a_allocator.h"
29
30/* This "address bit" in the gmmu ptes (and other gk20a accesses)
31 * signals the address as presented should be translated by the SMMU.
32 * Without this bit present gk20a accesses are *not* translated.
33 */
34/* Hack, get this from manuals somehow... */
35#define NV_MC_SMMU_VADDR_TRANSLATION_BIT 34
36#define NV_MC_SMMU_VADDR_TRANSLATE(x) (x | \
37 (1ULL << NV_MC_SMMU_VADDR_TRANSLATION_BIT))
38
39/* For now keep the size relatively small compared to the full
40 * 40b va: 32GB, consisting of two 16GB spaces. */
41#define NV_GMMU_VA_RANGE 35ULL
42#define NV_GMMU_VA_IS_UPPER(x) ((x) >= ((u64)0x1 << (NV_GMMU_VA_RANGE-1)))
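/* NV_GMMU_VA_RANGE of 35 bits is where the 32GB above comes from; the two
 * 16GB halves split at bit 34, which is what NV_GMMU_VA_IS_UPPER tests.
 * Per gk20a_vm_alloc_share() the low half is used for small-page and the
 * high half for big-page allocations. */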
43
44struct mem_desc {
45 struct dma_buf *ref;
46 struct sg_table *sgt;
47 u32 size;
48};
49
50struct mem_desc_sub {
51 u32 offset;
52 u32 size;
53};
54
55struct gpfifo_desc {
56 size_t size;
57 u32 entry_num;
58
59 u32 get;
60 u32 put;
61
62 bool wrap;
63
64 u64 iova;
65 struct gpfifo *cpu_va;
66 u64 gpu_va;
67};
68
69struct mmu_desc {
70 void *cpuva;
71 u64 iova;
72 size_t size;
73};
74
75struct inst_desc {
76 u64 iova;
77 void *cpuva;
78 phys_addr_t cpu_pa;
79 size_t size;
80};
81
82struct surface_mem_desc {
83 u64 iova;
84 void *cpuva;
85 struct sg_table *sgt;
86 size_t size;
87};
88
89struct userd_desc {
90 struct sg_table *sgt;
91 u64 iova;
92 void *cpuva;
93 size_t size;
94 u64 gpu_va;
95};
96
97struct runlist_mem_desc {
98 u64 iova;
99 void *cpuva;
100 size_t size;
101};
102
103struct patch_desc {
104 struct page **pages;
105 u64 iova;
106 size_t size;
107 void *cpu_va;
108 u64 gpu_va;
109 u32 data_count;
110};
111
112struct pmu_mem_desc {
113 void *cpuva;
114 u64 iova;
115 u64 pmu_va;
116 size_t size;
117};
118
119struct priv_cmd_queue_mem_desc {
120 u64 base_iova;
121 u32 *base_cpuva;
122 size_t size;
123};
124
125struct zcull_ctx_desc {
126 struct mem_desc mem;
127 u64 gpu_va;
128 u32 ctx_attr;
129 u32 ctx_sw_mode;
130};
131
132struct pm_ctx_desc {
133 struct mem_desc mem;
134 u64 gpu_va;
135 u32 ctx_attr;
136 u32 ctx_sw_mode;
137};
138
139struct gr_ctx_buffer_desc;
140struct platform_device;
141struct gr_ctx_buffer_desc {
142 void (*destroy)(struct platform_device *, struct gr_ctx_buffer_desc *);
143 struct sg_table *sgt;
144 struct page **pages;
145 size_t size;
146 u64 iova;
147 struct dma_attrs attrs;
148 void *priv;
149};
150
151struct gr_ctx_desc {
152 struct page **pages;
153 u64 iova;
154 size_t size;
155 u64 gpu_va;
156};
157
158struct compbit_store_desc {
159 struct page **pages;
160 size_t size;
161 u64 base_iova;
162};
163
164struct page_table_gk20a {
165 /* backing store for the page table: */
166 /* either a *page or a *mem_handle */
167 void *ref;
168 /* track mapping cnt on this page table */
169 u32 ref_cnt;
170 struct sg_table *sgt;
171 size_t size;
172};
173
174#ifndef _NVHOST_MEM_MGR_H
175enum gk20a_mem_rw_flag {
176 gk20a_mem_flag_none = 0,
177 gk20a_mem_flag_read_only = 1,
178 gk20a_mem_flag_write_only = 2,
179};
180#endif
181
182enum gmmu_pgsz_gk20a {
183 gmmu_page_size_small = 0,
184 gmmu_page_size_big = 1,
185 gmmu_nr_page_sizes = 2
186};
187
188
189struct page_directory_gk20a {
190 /* backing for */
191 u32 num_pdes;
192 void *kv;
193 /* Either a *page or a *mem_handle */
194 void *ref;
195 struct sg_table *sgt;
196 size_t size;
197 struct page_table_gk20a *ptes[gmmu_nr_page_sizes];
198};
199
200struct priv_cmd_queue {
201 struct priv_cmd_queue_mem_desc mem;
202 u64 base_gpuva; /* gpu_va base */
203 u16 size; /* num of entries in words */
204 u16 put; /* put for priv cmd queue */
205 u16 get; /* get for priv cmd queue */
206 struct list_head free; /* list of pre-allocated free entries */
207 struct list_head head; /* list of used entries */
208};
209
210struct priv_cmd_entry {
211 u32 *ptr;
212 u64 gva;
213 u16 get; /* start of entry in queue */
214 u16 size; /* in words */
215 u32 gp_get; /* gp_get when submitting last priv cmd */
216 u32 gp_put; /* gp_put when submitting last priv cmd */
217 u32 gp_wrap; /* wrap when submitting last priv cmd */
218 bool pre_alloc; /* prealloc entry, free to free list */
219 struct list_head list; /* node for lists */
220};
221
222struct mapped_buffer_node {
223 struct vm_gk20a *vm;
224 struct rb_node node;
225 struct list_head unmap_list;
226 struct list_head va_buffers_list;
227 struct vm_reserved_va_node *va_node;
228 u64 addr;
229 u64 size;
230 struct dma_buf *dmabuf;
231 struct sg_table *sgt;
232 struct kref ref;
233 u32 user_mapped;
234 bool own_mem_ref;
235 u32 pgsz_idx;
236 u32 ctag_offset;
237 u32 ctag_lines;
238 u32 flags;
239 u32 kind;
240 bool va_allocated;
241};
242
243struct vm_reserved_va_node {
244 struct list_head reserved_va_list;
245 struct list_head va_buffers_list;
246 u32 pgsz_idx;
247 u64 vaddr_start;
248 u64 size;
249 bool sparse;
250};
251
252struct vm_gk20a {
253 struct mm_gk20a *mm;
254 struct gk20a_as_share *as_share; /* as_share this represents */
255
256 u64 va_start;
257 u64 va_limit;
258
259 int num_user_mapped_buffers;
260
261 bool big_pages; /* enable large page support */
262 bool enable_ctag;
263 bool tlb_dirty;
264 bool mapped;
265
266 struct kref ref;
267
268 struct mutex update_gmmu_lock;
269
270 struct page_directory_gk20a pdes;
271
272 struct gk20a_allocator vma[gmmu_nr_page_sizes];
273 struct rb_root mapped_buffers;
274
275 struct list_head reserved_va_list;
276
277 dma_addr_t zero_page_iova;
278 void *zero_page_cpuva;
279 struct sg_table *zero_page_sgt;
280};
281
282struct gk20a;
283struct channel_gk20a;
284
285int gk20a_init_mm_support(struct gk20a *g);
286int gk20a_init_mm_setup_sw(struct gk20a *g);
287int gk20a_init_bar1_vm(struct mm_gk20a *mm);
288int gk20a_init_pmu_vm(struct mm_gk20a *mm);
289
290void gk20a_mm_fb_flush(struct gk20a *g);
291void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
292void gk20a_mm_l2_invalidate(struct gk20a *g);
293
294struct mm_gk20a {
295 struct gk20a *g;
296
297 u32 compression_page_size;
298 u32 big_page_size;
299 u32 pde_stride;
300 u32 pde_stride_shift;
301
302 struct {
303 u32 order;
304 u32 num_ptes;
305 } page_table_sizing[gmmu_nr_page_sizes];
306
307
308 struct {
309 u64 size;
310 } channel;
311
312 struct {
313 u32 aperture_size;
314 struct vm_gk20a vm;
315 struct inst_desc inst_block;
316 } bar1;
317
318 struct {
319 u32 aperture_size;
320 struct vm_gk20a vm;
321 struct inst_desc inst_block;
322 } pmu;
323
324 struct mutex tlb_lock;
325 struct mutex l2_op_lock;
326
327 void (*remove_support)(struct mm_gk20a *mm);
328 bool sw_ready;
329#ifdef CONFIG_DEBUG_FS
330 u32 ltc_enabled;
331 u32 ltc_enabled_debug;
332#endif
333};
334
335int gk20a_mm_init(struct mm_gk20a *mm);
336
337#define gk20a_from_mm(mm) ((mm)->g)
338#define gk20a_from_vm(vm) ((vm)->mm->g)
339
340#define dev_from_vm(vm) dev_from_gk20a(vm->mm->g)
341
342#define DEFAULT_ALLOC_ALIGNMENT (4*1024)
343
344static inline int bar1_aperture_size_mb_gk20a(void)
345{
346 return 128; /*TBD read this from fuses?*/
347}
348/* max address bits */
349static inline int max_physaddr_bits_gk20a(void)
350{
351 return 40;/*"old" sys physaddr, meaningful? */
352}
353static inline int max_vid_physaddr_bits_gk20a(void)
354{
355 /* "vid phys" is asid/smmu phys?,
356 * i.e. is this the real sys physaddr? */
357 return 37;
358}
359static inline int max_vaddr_bits_gk20a(void)
360{
361 return 40; /* chopped for area? */
362}
363
364#if 0 /*related to addr bits above, concern below TBD on which is accurate */
365#define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\
366 bus_bar1_block_ptr_s())
367#else
368#define bar1_instance_block_shift_gk20a() bus_bar1_block_ptr_shift_v()
369#endif
370
371void gk20a_mm_dump_vm(struct vm_gk20a *vm,
372 u64 va_begin, u64 va_end, char *label);
373
374int gk20a_mm_suspend(struct gk20a *g);
375
376phys_addr_t gk20a_get_phys_from_iova(struct device *d,
377 u64 dma_addr);
378
379int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
380 void *cpuva, u64 iova,
381 size_t size);
382
383int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
384 struct page **pages, u64 iova,
385 size_t size);
386
387void gk20a_free_sgtable(struct sg_table **sgt);
388
389u64 gk20a_mm_iova_addr(struct scatterlist *sgl);
390
391void gk20a_mm_ltc_isr(struct gk20a *g);
392
393bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g);
394
395int gk20a_mm_mmu_vpr_info_fetch(struct gk20a *g);
396
397u64 gk20a_gmmu_map(struct vm_gk20a *vm,
398 struct sg_table **sgt,
399 u64 size,
400 u32 flags,
401 int rw_flag);
402
403void gk20a_gmmu_unmap(struct vm_gk20a *vm,
404 u64 vaddr,
405 u64 size,
406 int rw_flag);
407
408struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf);
409void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
410 struct sg_table *sgt);
411
412u64 gk20a_vm_map(struct vm_gk20a *vm,
413 struct dma_buf *dmabuf,
414 u64 offset_align,
415 u32 flags /*NVHOST_AS_MAP_BUFFER_FLAGS_*/,
416 int kind,
417 struct sg_table **sgt,
418 bool user_mapped,
419 int rw_flag);
420
421/* unmap handle from kernel */
422void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset);
423
424/* get reference to all currently mapped buffers */
425int gk20a_vm_get_buffers(struct vm_gk20a *vm,
426 struct mapped_buffer_node ***mapped_buffers,
427 int *num_buffers);
428
429/* put references on the given buffers */
430void gk20a_vm_put_buffers(struct vm_gk20a *vm,
431 struct mapped_buffer_node **mapped_buffers,
432 int num_buffers);
433
434/* invalidate tlbs for the vm area */
435void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm);
436
437/* find buffer corresponding to va */
438int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
439 struct dma_buf **dmabuf,
440 u64 *offset);
441
442void gk20a_vm_get(struct vm_gk20a *vm);
443void gk20a_vm_put(struct vm_gk20a *vm);
444
445/* vm-as interface */
446struct nvhost_as_alloc_space_args;
447struct nvhost_as_free_space_args;
448int gk20a_vm_alloc_share(struct gk20a_as_share *as_share);
449int gk20a_vm_release_share(struct gk20a_as_share *as_share);
450int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
451 struct nvhost_as_alloc_space_args *args);
452int gk20a_vm_free_space(struct gk20a_as_share *as_share,
453 struct nvhost_as_free_space_args *args);
454int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
455 struct channel_gk20a *ch);
456int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
457 int dmabuf_fd,
458 u64 *offset_align,
459 u32 flags, /*NVHOST_AS_MAP_BUFFER_FLAGS_*/
460 int kind);
461int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset);
462
463int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev);
464#endif /* __MM_GK20A_H__ */
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
new file mode 100644
index 00000000..09f348cb
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
@@ -0,0 +1,160 @@
1/*
2 * drivers/video/tegra/host/gk20a/soc/platform_gk20a.h
3 *
4 * GK20A Platform (SoC) Interface
5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17
18#ifndef _GK20A_PLATFORM_H_
19#define _GK20A_PLATFORM_H_
20
21#include <linux/platform_device.h>
22#include <linux/pm_domain.h>
23
24struct gk20a;
25struct channel_gk20a;
26struct gr_ctx_buffer_desc;
27struct gk20a_scale_profile;
28
29struct gk20a_platform {
30#ifdef CONFIG_TEGRA_GK20A
31 u32 syncpt_base;
32#endif
33 /* Populated by the gk20a driver before probing the platform. */
34 struct gk20a *g;
35
36 /* Should be populated at probe. */
37 bool can_railgate;
38
39 /* Should be populated at probe. */
40 bool has_syncpoints;
41
42 /* Should be populated by probe. */
43 struct dentry *debugfs;
44
45 /* Clock configuration is stored here. Platform probe is responsible
46 * for filling this data. */
47 struct clk *clk[3];
48 int num_clks;
49
50 /* Delay before rail gated */
51 int railgate_delay;
52
53 /* Delay before clock gated */
54 int clockgate_delay;
55
56 /* Initialize the platform interface of the gk20a driver.
57 *
58 * The platform implementation of this function must
59 * - set the power and clocks of the gk20a device to a known
60 * state, and
61 * - populate the gk20a_platform structure (a pointer to the
62 * structure can be obtained by calling gk20a_get_platform).
63 *
64 * After this function is finished, the driver will initialise
65 * pm runtime and genpd based on the platform configuration.
66 */
67 int (*probe)(struct platform_device *dev);
68
69 /* Second stage initialisation - called once all power management
70 * initialisations are done.
71 */
72 int (*late_probe)(struct platform_device *dev);
73
74 /* Called before submitting work to the gpu. The platform may use this
75 * hook to ensure that any other hw modules that the gpu depends on are
76 * powered. The platform implementation must count refs to this call. */
77 int (*channel_busy)(struct platform_device *dev);
78
79 /* Called after the work on the gpu is completed. The platform may use
80 * this hook to release power refs to any other hw modules that the gpu
81 * depends on. The platform implementation must count refs to this
82 * call. */
83 void (*channel_idle)(struct platform_device *dev);
84
85 /* This function is called to allocate secure memory (memory that the
86 * CPU cannot see). The function should fill the context buffer
87 * descriptor (especially fields destroy, sgt, size).
88 */
89 int (*secure_alloc)(struct platform_device *dev,
90 struct gr_ctx_buffer_desc *desc,
91 size_t size);
92
93 /* Device is going to be suspended */
94 int (*suspend)(struct device *);
95
96 /* Called to turn off the device */
97 int (*railgate)(struct platform_device *dev);
98
99 /* Called to turn on the device */
100 int (*unrailgate)(struct platform_device *dev);
101
102 /* Postscale callback is called after frequency change */
103 void (*postscale)(struct platform_device *pdev,
104 unsigned long freq);
105
106 /* Pre callback is called before frequency change */
107 void (*prescale)(struct platform_device *pdev);
108
109 /* Devfreq governor name. If scaling is enabled, we request
110 * this governor to be used in scaling */
111 const char *devfreq_governor;
112
113 /* Quality of service id. If this is set, the scaling routines
114 * will register a callback to id. Each time we receive a new value,
115 * the postscale callback gets called. */
116 int qos_id;
117
118 /* Called as part of debug dump. If the gpu gets hung, this function
119 * is responsible for delivering all necessary debug data of other
120 * hw units which may interact with the gpu without direct supervision
121 * of the CPU.
122 */
123 void (*dump_platform_dependencies)(struct platform_device *dev);
124};
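/* For a minimal implementation of this interface see gk20a_generic_platform
 * in platform_gk20a_generic.c (added in this patch); it fills in only the
 * .probe hook. The channel_busy/channel_idle wrappers below check for NULL,
 * so those hooks are optional. */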
125
126static inline struct gk20a_platform *gk20a_get_platform(
127 struct platform_device *dev)
128{
129 return (struct gk20a_platform *)platform_get_drvdata(dev);
130}
131
132extern struct gk20a_platform gk20a_generic_platform;
133#ifdef CONFIG_TEGRA_GK20A
134extern struct gk20a_platform gk20a_tegra_platform;
135#endif
136
137static inline int gk20a_platform_channel_busy(struct platform_device *dev)
138{
139 struct gk20a_platform *p = gk20a_get_platform(dev);
140 int ret = 0;
141 if (p->channel_busy)
142 ret = p->channel_busy(dev);
143
144 return ret;
145}
146
147static inline void gk20a_platform_channel_idle(struct platform_device *dev)
148{
149 struct gk20a_platform *p = gk20a_get_platform(dev);
150 if (p->channel_idle)
151 p->channel_idle(dev);
152}
153
154static inline bool gk20a_platform_has_syncpoints(struct platform_device *dev)
155{
156 struct gk20a_platform *p = gk20a_get_platform(dev);
157 return p->has_syncpoints;
158}
159
160#endif
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c
new file mode 100644
index 00000000..7b750df6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c
@@ -0,0 +1,35 @@
1/*
2 * drivers/video/tegra/host/gk20a/platform_gk20a_generic.c
3 *
4 * GK20A Generic Platform Interface
5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21#include "platform_gk20a.h"
22
23static int gk20a_generic_probe(struct platform_device *dev)
24{
25 struct gk20a_platform *platform = gk20a_get_platform(dev);
26
27 /* TODO: Initialize clocks and power */
28 (void)platform;
29
30 return 0;
31}
32
33struct gk20a_platform gk20a_generic_platform = {
34 .probe = gk20a_generic_probe,
35};
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
new file mode 100644
index 00000000..35658f31
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -0,0 +1,561 @@
1/*
2 * drivers/video/tegra/host/gk20a/platform_gk20a_tegra.c
3 *
4 * GK20A Tegra Platform Interface
5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */
17
18#include <linux/debugfs.h>
19#include <linux/tegra-powergate.h>
20#include <linux/platform_data/tegra_edp.h>
21#include <linux/nvhost_ioctl.h>
22#include <linux/dma-buf.h>
23#include <linux/nvmap.h>
24#include <mach/irqs.h>
25#include <mach/pm_domains.h>
26
27#include "../../../arch/arm/mach-tegra/iomap.h"
28
29#include "gk20a.h"
30#include "hal_gk20a.h"
31#include "platform_gk20a.h"
32#include "gk20a_scale.h"
33
34#define TEGRA_GK20A_INTR INT_GPU
35#define TEGRA_GK20A_INTR_NONSTALL INT_GPU_NONSTALL
36
37#define TEGRA_GK20A_SIM_BASE 0x538F0000 /*tbd: get from iomap.h */
38#define TEGRA_GK20A_SIM_SIZE 0x1000 /*tbd: this is a high-side guess */
39
40extern struct device tegra_vpr_dev;
41struct gk20a_platform t132_gk20a_tegra_platform;
42
43struct gk20a_emc_params {
44 long emc_slope;
45 long emc_offset;
46 long emc_dip_slope;
47 long emc_dip_offset;
48 long emc_xmid;
49 bool linear;
50};
51
52/*
53 * 20.12 fixed point arithmetic
54 */
55
56static const int FXFRAC = 12;
57static const int FX_HALF = (1 << 12) / 2;
58
59#define INT_TO_FX(x) ((x) << FXFRAC)
60#define FX_TO_INT(x) ((x) >> FXFRAC)
61
62#define MHZ_TO_HZ(x) ((x) * 1000000)
63#define HZ_TO_MHZ(x) ((x) / 1000000)
64
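/* Worked example of the 20.12 format: INT_TO_FX(3) == 3 << 12 == 12288 and
 * FXMUL(INT_TO_FX(3), INT_TO_FX(2)) == (12288 * 8192) >> 12 == 24576 ==
 * INT_TO_FX(6). Adding FX_HALF before FX_TO_INT() rounds to nearest. */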
65int FXMUL(int x, int y)
66{
67 return ((long long) x * (long long) y) >> FXFRAC;
68}
69
70int FXDIV(int x, int y)
71{
72 /* long long div operation not supported, must shift manually. This
73 * would have been
74 *
75 * return (((long long) x) << FXFRAC) / (long long) y;
76 */
77 int pos, t;
78 if (x == 0)
79 return 0;
80
82 /* find largest allowable left shift of the numerator, limit to FXFRAC */
82 t = x < 0 ? -x : x;
83 pos = 31 - fls(t); /* fls can't be 32 if x != 0 */
84 if (pos > FXFRAC)
85 pos = FXFRAC;
86
87 y >>= FXFRAC - pos;
88 if (y == 0)
89 return 0x7FFFFFFF; /* overflow, return MAX_FIXED */
90
91 return (x << pos) / y;
92}
93
94static int gk20a_tegra_channel_busy(struct platform_device *dev)
95{
96 int ret = 0;
97
98 /* Explicitly turn on the host1x clocks
99 * - This is needed as host1x driver sets ignore_children = true
100 * to cater the use case of display clock ON but host1x clock OFF
101 * in OS-Idle-Display-ON case
102 * - This was easily done in ACM as it only checked the ref count
103 * of host1x (or any device for that matter) to be zero before
104 * turning off its clock
105 * - However, runtime PM checks to see if *ANY* child of device is
106 * in ACTIVE state and if yes, it doesn't suspend the parent. As a
107 * result of this, display && host1x clocks remains ON during
108 * OS-Idle-Display-ON case
109 * - The code below fixes this use-case
110 */
111 if (to_platform_device(dev->dev.parent))
112 ret = nvhost_module_busy_ext(
113 to_platform_device(dev->dev.parent));
114
115 return ret;
116}
117
118static void gk20a_tegra_channel_idle(struct platform_device *dev)
119{
120 /* Explicitly turn off the host1x clocks */
121 if (to_platform_device(dev->dev.parent))
122 nvhost_module_idle_ext(to_platform_device(dev->dev.parent));
123}
124
125static void gk20a_tegra_secure_destroy(struct platform_device *pdev,
126 struct gr_ctx_buffer_desc *desc)
127{
128 gk20a_free_sgtable(&desc->sgt);
129 dma_free_attrs(&tegra_vpr_dev, desc->size,
130 (void *)(uintptr_t)&desc->iova,
131 desc->iova, &desc->attrs);
132}
133
134static int gk20a_tegra_secure_alloc(struct platform_device *pdev,
135 struct gr_ctx_buffer_desc *desc,
136 size_t size)
137{
138 struct device *dev = &pdev->dev;
139 DEFINE_DMA_ATTRS(attrs);
140 dma_addr_t iova;
141 struct sg_table *sgt;
142 struct page *page;
143 int err = 0;
144
145 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
146
147 (void)dma_alloc_attrs(&tegra_vpr_dev, size, &iova,
148 GFP_KERNEL, &attrs);
149 if (dma_mapping_error(&tegra_vpr_dev, iova))
150 return -ENOMEM;
151
152 desc->iova = iova;
153 desc->size = size;
154 desc->attrs = attrs;
155 desc->destroy = gk20a_tegra_secure_destroy;
156
157 sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
158 if (!sgt) {
159 gk20a_err(dev, "failed to allocate memory\n");
160 goto fail;
161 }
162 err = sg_alloc_table(sgt, 1, GFP_KERNEL);
163 if (err) {
164 gk20a_err(dev, "failed to allocate sg_table\n");
165 goto fail_sgt;
166 }
167 page = phys_to_page(iova);
168 sg_set_page(sgt->sgl, page, size, 0);
169 sg_dma_address(sgt->sgl) = iova;
170
171 desc->sgt = sgt;
172
173 return err;
174
175fail_sgt:
176 kfree(sgt);
177fail:
178 dma_free_attrs(&tegra_vpr_dev, desc->size,
179 (void *)(uintptr_t)&desc->iova,
180 desc->iova, &desc->attrs);
181 return err;
182}
183
184/*
185 * gk20a_tegra_get_emc_rate()
186 *
187 * This function returns the minimum emc clock based on gpu frequency
188 */
189
190long gk20a_tegra_get_emc_rate(struct gk20a_emc_params *emc_params, long freq)
191{
192 long hz;
193
194 freq = INT_TO_FX(HZ_TO_MHZ(freq));
195 hz = FXMUL(freq, emc_params->emc_slope) + emc_params->emc_offset;
196
197 hz -= FXMUL(emc_params->emc_dip_slope,
198 FXMUL(freq - emc_params->emc_xmid,
199 freq - emc_params->emc_xmid)) +
200 emc_params->emc_dip_offset;
201
202 hz = MHZ_TO_HZ(FX_TO_INT(hz + FX_HALF)); /* round to nearest */
203 hz = (hz < 0) ? 0 : hz;
204
205 return hz;
206}
207
208/*
209 * gk20a_tegra_postscale(pdev, freq)
210 *
211 * This function sets emc frequency based on current gpu frequency
212 */
213
214static void gk20a_tegra_postscale(struct platform_device *pdev,
215 unsigned long freq)
216{
217 struct gk20a_platform *platform = platform_get_drvdata(pdev);
218 struct gk20a_scale_profile *profile = platform->g->scale_profile;
219 struct gk20a_emc_params *emc_params = profile->private_data;
220 struct gk20a *g = get_gk20a(pdev);
221
222 long after = gk20a_clk_get_rate(g);
223 long emc_target = gk20a_tegra_get_emc_rate(emc_params, after);
224
225 clk_set_rate(platform->clk[2], emc_target);
226}
227
228/*
229 * gk20a_tegra_prescale(pdev)
230 *
231 * This function informs EDP about changed constraints.
232 */
233
234static void gk20a_tegra_prescale(struct platform_device *pdev)
235{
236 struct gk20a *g = get_gk20a(pdev);
237 u32 avg = 0;
238
239 gk20a_pmu_load_norm(g, &avg);
240 tegra_edp_notify_gpu_load(avg);
241}
242
243/*
244 * gk20a_tegra_calibrate_emc()
245 *
246 * Compute emc scaling parameters
247 *
248 * Remc = S * R3d + O - (Sd * (R3d - Rm)^2 + Od)
249 *
250 * Remc - 3d.emc rate
251 * R3d - 3d.cbus rate
252 * Rm - 3d.cbus 'middle' rate = (max + min)/2
253 * S - emc_slope
254 * O - emc_offset
255 * Sd - emc_dip_slope
256 * Od - emc_dip_offset
257 *
258 * this superposes a quadratic dip centered around the middle 3d
259 * frequency over a linear correlation of 3d.emc to 3d clock
260 * rates.
261 *
262 * S, O are chosen so that the maximum 3d rate produces the
263 * maximum 3d.emc rate exactly, and the minimum 3d rate produces
264 * at least the minimum 3d.emc rate.
265 *
266 * Sd and Od are chosen to produce the largest dip that will
267 * keep 3d.emc frequencies monotonically increasing with 3d
268 * frequencies. To achieve this, the first derivative of Remc
269 * with respect to R3d should be zero for the minimal 3d rate:
270 *
271 * R'emc = S - 2 * Sd * (R3d - Rm)
272 * R'emc(R3d-min) = 0
273 * S = 2 * Sd * (R3d-min - Rm)
274 * = 2 * Sd * (R3d-min - R3d-max) / 2
275 *
276 * +------------------------------+
277 * | Sd = S / (R3d-min - R3d-max) |
278 * +------------------------------+
279 *
280 * dip = Sd * (R3d - Rm)^2 + Od
281 *
282 * requiring dip(R3d-min) = 0 and dip(R3d-max) = 0 gives
283 *
284 * Sd * (R3d-min - Rm)^2 + Od = 0
285 * Od = -Sd * ((R3d-min - R3d-max) / 2)^2
286 * = -Sd * ((R3d-min - R3d-max)^2) / 4
287 *
288 * +------------------------------+
289 * | Od = (emc-max - emc-min) / 4 |
290 * +------------------------------+
291 *
292 */
293
294void gk20a_tegra_calibrate_emc(struct gk20a_emc_params *emc_params,
295 struct clk *clk_3d, struct clk *clk_3d_emc)
296{
297 long correction;
298 unsigned long max_emc;
299 unsigned long min_emc;
300 unsigned long min_rate_3d;
301 unsigned long max_rate_3d;
302
303 max_emc = clk_round_rate(clk_3d_emc, UINT_MAX);
304 max_emc = INT_TO_FX(HZ_TO_MHZ(max_emc));
305
306 min_emc = clk_round_rate(clk_3d_emc, 0);
307 min_emc = INT_TO_FX(HZ_TO_MHZ(min_emc));
308
309 max_rate_3d = clk_round_rate(clk_3d, UINT_MAX);
310 max_rate_3d = INT_TO_FX(HZ_TO_MHZ(max_rate_3d));
311
312 min_rate_3d = clk_round_rate(clk_3d, 0);
313 min_rate_3d = INT_TO_FX(HZ_TO_MHZ(min_rate_3d));
314
315 emc_params->emc_slope =
316 FXDIV((max_emc - min_emc), (max_rate_3d - min_rate_3d));
317 emc_params->emc_offset = max_emc -
318 FXMUL(emc_params->emc_slope, max_rate_3d);
319 /* Guarantee max 3d rate maps to max emc rate */
320 emc_params->emc_offset += max_emc -
321 (FXMUL(emc_params->emc_slope, max_rate_3d) +
322 emc_params->emc_offset);
323
324 emc_params->emc_dip_offset = (max_emc - min_emc) / 4;
325 emc_params->emc_dip_slope =
326 -FXDIV(emc_params->emc_slope, max_rate_3d - min_rate_3d);
327 emc_params->emc_xmid = (max_rate_3d + min_rate_3d) / 2;
328 correction =
329 emc_params->emc_dip_offset +
330 FXMUL(emc_params->emc_dip_slope,
331 FXMUL(max_rate_3d - emc_params->emc_xmid,
332 max_rate_3d - emc_params->emc_xmid));
333 emc_params->emc_dip_offset -= correction;
334}
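
As a sanity check of the derivation in the comment above, the following standalone user-space sketch (not part of this patch) plugs clock limits into the same equations and confirms that, after the dip-offset correction, the maximum 3d rate maps back to the maximum emc rate. All MHz limits below are illustrative assumptions, not real Tegra values.

#include <stdio.h>

#define FXFRAC		12
#define INT_TO_FX(x)	((x) << FXFRAC)
#define FX_TO_INT(x)	((x) >> FXFRAC)

static long fxmul(long x, long y) { return ((long long)x * y) >> FXFRAC; }
static long fxdiv(long x, long y) { return (((long long)x) << FXFRAC) / y; }

int main(void)
{
	/* made-up MHz limits, already converted to 20.12 fixed point */
	long min_3d  = INT_TO_FX(72),  max_3d  = INT_TO_FX(852);
	long min_emc = INT_TO_FX(204), max_emc = INT_TO_FX(924);
	long slope, offset, dip_slope, dip_offset, xmid, dip, emc_at_max;

	slope      = fxdiv(max_emc - min_emc, max_3d - min_3d);
	offset     = max_emc - fxmul(slope, max_3d);
	dip_slope  = -fxdiv(slope, max_3d - min_3d);
	dip_offset = (max_emc - min_emc) / 4;
	xmid       = (max_3d + min_3d) / 2;

	/* shift the dip so that dip(max_3d) == 0, as the driver does */
	dip_offset -= dip_offset +
		fxmul(dip_slope, fxmul(max_3d - xmid, max_3d - xmid));

	dip = fxmul(dip_slope, fxmul(max_3d - xmid, max_3d - xmid)) + dip_offset;
	emc_at_max = fxmul(slope, max_3d) + offset - dip;

	printf("emc at max 3d rate: %ld MHz\n", FX_TO_INT(emc_at_max));
	return 0;
}
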
335
336/*
337 * gk20a_tegra_railgate()
338 *
339 * Gate (disable) gk20a power rail
340 */
341
342static int gk20a_tegra_railgate(struct platform_device *pdev)
343{
344 if (tegra_powergate_is_powered(TEGRA_POWERGATE_GPU))
345 tegra_powergate_partition(TEGRA_POWERGATE_GPU);
346 return 0;
347}
348
349/*
350 * gk20a_tegra_unrailgate()
351 *
352 * Ungate (enable) gk20a power rail
353 */
354
355static int gk20a_tegra_unrailgate(struct platform_device *pdev)
356{
357 tegra_unpowergate_partition(TEGRA_POWERGATE_GPU);
358 return 0;
359}
360
361struct {
362 char *name;
363 unsigned long default_rate;
364} tegra_gk20a_clocks[] = {
365 {"PLLG_ref", UINT_MAX},
366 {"pwr", 204000000},
367 {"emc", UINT_MAX} };
368
369/*
370 * gk20a_tegra_get_clocks()
371 *
372 * This function finds the clocks on the Tegra platform and populates
373 * the clock information into the gk20a platform data.
374 */
375
376static int gk20a_tegra_get_clocks(struct platform_device *pdev)
377{
378 struct gk20a_platform *platform = platform_get_drvdata(pdev);
379 char devname[16];
380 int i;
381 int ret = 0;
382
383 snprintf(devname, sizeof(devname),
384		(pdev->id <= 0) ? "tegra_%s" : "tegra_%s.%d",
385 pdev->name, pdev->id);
386
387 platform->num_clks = 0;
388 for (i = 0; i < ARRAY_SIZE(tegra_gk20a_clocks); i++) {
389 long rate = tegra_gk20a_clocks[i].default_rate;
390 struct clk *c;
391
392 c = clk_get_sys(devname, tegra_gk20a_clocks[i].name);
393 if (IS_ERR(c)) {
394 ret = PTR_ERR(c);
395 goto err_get_clock;
396 }
397 rate = clk_round_rate(c, rate);
398 clk_set_rate(c, rate);
399 platform->clk[i] = c;
400 }
401 platform->num_clks = i;
402
403 return 0;
404
405err_get_clock:
406
407 while (i--)
408 clk_put(platform->clk[i]);
409 return ret;
410}
411
412static void gk20a_tegra_scale_init(struct platform_device *pdev)
413{
414 struct gk20a_platform *platform = gk20a_get_platform(pdev);
415 struct gk20a_scale_profile *profile = platform->g->scale_profile;
416 struct gk20a_emc_params *emc_params;
417
418 if (!profile)
419 return;
420
421 emc_params = kzalloc(sizeof(*emc_params), GFP_KERNEL);
422 if (!emc_params)
423 return;
424
425 gk20a_tegra_calibrate_emc(emc_params, gk20a_clk_get(platform->g),
426 platform->clk[2]);
427
428 profile->private_data = emc_params;
429}
430
431static void gk20a_tegra_debug_dump(struct platform_device *pdev)
432{
433 struct gk20a_platform *platform = gk20a_get_platform(pdev);
434 struct gk20a *g = platform->g;
435 nvhost_debug_dump_device(g->dev);
436}
437
438static int gk20a_tegra_probe(struct platform_device *dev)
439{
440 struct gk20a_platform *platform = gk20a_get_platform(dev);
441
442 if (tegra_get_chipid() == TEGRA_CHIPID_TEGRA13) {
443 t132_gk20a_tegra_platform.g = platform->g;
444 *platform = t132_gk20a_tegra_platform;
445 }
446
447 gk20a_tegra_get_clocks(dev);
448
449 return 0;
450}
451
452static int gk20a_tegra_late_probe(struct platform_device *dev)
453{
454 struct gk20a_platform *platform = gk20a_get_platform(dev);
455
456 /* Make gk20a power domain a subdomain of mc */
457 tegra_pd_add_sd(&platform->g->pd);
458
459 /* Initialise tegra specific scaling quirks */
460 gk20a_tegra_scale_init(dev);
461
462 return 0;
463}
464
465static int gk20a_tegra_suspend(struct device *dev)
466{
467 tegra_edp_notify_gpu_load(0);
468 return 0;
469}
470
471static struct resource gk20a_tegra_resources[] = {
472 {
473 .start = TEGRA_GK20A_BAR0_BASE,
474 .end = TEGRA_GK20A_BAR0_BASE + TEGRA_GK20A_BAR0_SIZE - 1,
475 .flags = IORESOURCE_MEM,
476 },
477 {
478 .start = TEGRA_GK20A_BAR1_BASE,
479 .end = TEGRA_GK20A_BAR1_BASE + TEGRA_GK20A_BAR1_SIZE - 1,
480 .flags = IORESOURCE_MEM,
481 },
482 { /* Used on ASIM only */
483 .start = TEGRA_GK20A_SIM_BASE,
484 .end = TEGRA_GK20A_SIM_BASE + TEGRA_GK20A_SIM_SIZE - 1,
485 .flags = IORESOURCE_MEM,
486 },
487 {
488 .start = TEGRA_GK20A_INTR,
489 .end = TEGRA_GK20A_INTR,
490 .flags = IORESOURCE_IRQ,
491 },
492 {
493 .start = TEGRA_GK20A_INTR_NONSTALL,
494 .end = TEGRA_GK20A_INTR_NONSTALL,
495 .flags = IORESOURCE_IRQ,
496 },
497};
498
499struct gk20a_platform t132_gk20a_tegra_platform = {
500 .has_syncpoints = true,
501
502 /* power management configuration */
503 .railgate_delay = 500,
504 .clockgate_delay = 50,
505
506 .probe = gk20a_tegra_probe,
507 .late_probe = gk20a_tegra_late_probe,
508
509 /* power management callbacks */
510 .suspend = gk20a_tegra_suspend,
511 .railgate = gk20a_tegra_railgate,
512 .unrailgate = gk20a_tegra_unrailgate,
513
514 /* frequency scaling configuration */
515 .prescale = gk20a_tegra_prescale,
516 .postscale = gk20a_tegra_postscale,
517 .devfreq_governor = "nvhost_podgov",
518 .qos_id = PM_QOS_GPU_FREQ_MIN,
519
520 .channel_busy = gk20a_tegra_channel_busy,
521 .channel_idle = gk20a_tegra_channel_idle,
522 .secure_alloc = gk20a_tegra_secure_alloc,
523 .dump_platform_dependencies = gk20a_tegra_debug_dump,
524};
525
526struct gk20a_platform gk20a_tegra_platform = {
527 .has_syncpoints = true,
528
529 /* power management configuration */
530 .railgate_delay = 500,
531 .clockgate_delay = 50,
532 .can_railgate = true,
533
534 .probe = gk20a_tegra_probe,
535 .late_probe = gk20a_tegra_late_probe,
536
537 /* power management callbacks */
538 .suspend = gk20a_tegra_suspend,
539 .railgate = gk20a_tegra_railgate,
540 .unrailgate = gk20a_tegra_unrailgate,
541
542 /* frequency scaling configuration */
543 .prescale = gk20a_tegra_prescale,
544 .postscale = gk20a_tegra_postscale,
545 .devfreq_governor = "nvhost_podgov",
546 .qos_id = PM_QOS_GPU_FREQ_MIN,
547
548 .channel_busy = gk20a_tegra_channel_busy,
549 .channel_idle = gk20a_tegra_channel_idle,
550 .secure_alloc = gk20a_tegra_secure_alloc,
551 .dump_platform_dependencies = gk20a_tegra_debug_dump,
552};
553
554struct platform_device tegra_gk20a_device = {
555 .name = "gk20a",
556 .resource = gk20a_tegra_resources,
557 .num_resources = ARRAY_SIZE(gk20a_tegra_resources),
558 .dev = {
559 .platform_data = &gk20a_tegra_platform,
560 },
561};
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
new file mode 100644
index 00000000..a00499a9
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -0,0 +1,3796 @@
1/*
2 * drivers/video/tegra/host/gk20a/pmu_gk20a.c
3 *
4 * GK20A PMU (aka. gPMU outside gk20a context)
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21
22#include <linux/delay.h> /* for mdelay */
23#include <linux/firmware.h>
24#include <linux/clk.h>
25#include <linux/module.h>
26#include <linux/debugfs.h>
27#include <linux/dma-mapping.h>
28
29#include "gk20a.h"
30#include "hw_mc_gk20a.h"
31#include "hw_pwr_gk20a.h"
32#include "hw_top_gk20a.h"
33
34#define GK20A_PMU_UCODE_IMAGE "gpmu_ucode.bin"
35
36#define gk20a_dbg_pmu(fmt, arg...) \
37 gk20a_dbg(gpu_dbg_pmu, fmt, ##arg)
38
39static void pmu_dump_falcon_stats(struct pmu_gk20a *pmu);
40static int gk20a_pmu_get_elpg_residency_gating(struct gk20a *g,
41 u32 *ingating_time, u32 *ungating_time, u32 *gating_cnt);
42static void gk20a_init_pmu_setup_hw2_workqueue(struct work_struct *work);
43static void pmu_save_zbc(struct gk20a *g, u32 entries);
44static void ap_callback_init_and_enable_ctrl(
45 struct gk20a *g, struct pmu_msg *msg,
46 void *param, u32 seq_desc, u32 status);
47static int gk20a_pmu_ap_send_command(struct gk20a *g,
48 union pmu_ap_cmd *p_ap_cmd, bool b_block);
49
50static u32 pmu_cmdline_size_v0(struct pmu_gk20a *pmu)
51{
52 return sizeof(struct pmu_cmdline_args_v0);
53}
54
55static u32 pmu_cmdline_size_v1(struct pmu_gk20a *pmu)
56{
57 return sizeof(struct pmu_cmdline_args_v1);
58}
59
60static void set_pmu_cmdline_args_cpufreq_v1(struct pmu_gk20a *pmu, u32 freq)
61{
62 pmu->args_v1.cpu_freq_hz = freq;
63}
64
65static void set_pmu_cmdline_args_cpufreq_v0(struct pmu_gk20a *pmu, u32 freq)
66{
67 pmu->args_v0.cpu_freq_hz = freq;
68}
69
70static void *get_pmu_cmdline_args_ptr_v1(struct pmu_gk20a *pmu)
71{
72 return (void *)(&pmu->args_v1);
73}
74
75static void *get_pmu_cmdline_args_ptr_v0(struct pmu_gk20a *pmu)
76{
77 return (void *)(&pmu->args_v0);
78}
79
80static u32 get_pmu_allocation_size_v1(struct pmu_gk20a *pmu)
81{
82 return sizeof(struct pmu_allocation_v1);
83}
84
85static u32 get_pmu_allocation_size_v0(struct pmu_gk20a *pmu)
86{
87 return sizeof(struct pmu_allocation_v0);
88}
89
90static void set_pmu_allocation_ptr_v1(struct pmu_gk20a *pmu,
91 void **pmu_alloc_ptr, void *assign_ptr)
92{
93 struct pmu_allocation_v1 **pmu_a_ptr =
94 (struct pmu_allocation_v1 **)pmu_alloc_ptr;
95 *pmu_a_ptr = (struct pmu_allocation_v1 *)assign_ptr;
96}
97
98static void set_pmu_allocation_ptr_v0(struct pmu_gk20a *pmu,
99 void **pmu_alloc_ptr, void *assign_ptr)
100{
101 struct pmu_allocation_v0 **pmu_a_ptr =
102 (struct pmu_allocation_v0 **)pmu_alloc_ptr;
103 *pmu_a_ptr = (struct pmu_allocation_v0 *)assign_ptr;
104}
105
106static void pmu_allocation_set_dmem_size_v1(struct pmu_gk20a *pmu,
107 void *pmu_alloc_ptr, u16 size)
108{
109 struct pmu_allocation_v1 *pmu_a_ptr =
110 (struct pmu_allocation_v1 *)pmu_alloc_ptr;
111 pmu_a_ptr->alloc.dmem.size = size;
112}
113
114static void pmu_allocation_set_dmem_size_v0(struct pmu_gk20a *pmu,
115 void *pmu_alloc_ptr, u16 size)
116{
117 struct pmu_allocation_v0 *pmu_a_ptr =
118 (struct pmu_allocation_v0 *)pmu_alloc_ptr;
119 pmu_a_ptr->alloc.dmem.size = size;
120}
121
122static u16 pmu_allocation_get_dmem_size_v1(struct pmu_gk20a *pmu,
123 void *pmu_alloc_ptr)
124{
125 struct pmu_allocation_v1 *pmu_a_ptr =
126 (struct pmu_allocation_v1 *)pmu_alloc_ptr;
127 return pmu_a_ptr->alloc.dmem.size;
128}
129
130static u16 pmu_allocation_get_dmem_size_v0(struct pmu_gk20a *pmu,
131 void *pmu_alloc_ptr)
132{
133 struct pmu_allocation_v0 *pmu_a_ptr =
134 (struct pmu_allocation_v0 *)pmu_alloc_ptr;
135 return pmu_a_ptr->alloc.dmem.size;
136}
137
138static u32 pmu_allocation_get_dmem_offset_v1(struct pmu_gk20a *pmu,
139 void *pmu_alloc_ptr)
140{
141 struct pmu_allocation_v1 *pmu_a_ptr =
142 (struct pmu_allocation_v1 *)pmu_alloc_ptr;
143 return pmu_a_ptr->alloc.dmem.offset;
144}
145
146static u32 pmu_allocation_get_dmem_offset_v0(struct pmu_gk20a *pmu,
147 void *pmu_alloc_ptr)
148{
149 struct pmu_allocation_v0 *pmu_a_ptr =
150 (struct pmu_allocation_v0 *)pmu_alloc_ptr;
151 return pmu_a_ptr->alloc.dmem.offset;
152}
153
154static u32 *pmu_allocation_get_dmem_offset_addr_v1(struct pmu_gk20a *pmu,
155 void *pmu_alloc_ptr)
156{
157 struct pmu_allocation_v1 *pmu_a_ptr =
158 (struct pmu_allocation_v1 *)pmu_alloc_ptr;
159 return &pmu_a_ptr->alloc.dmem.offset;
160}
161
162static u32 *pmu_allocation_get_dmem_offset_addr_v0(struct pmu_gk20a *pmu,
163 void *pmu_alloc_ptr)
164{
165 struct pmu_allocation_v0 *pmu_a_ptr =
166 (struct pmu_allocation_v0 *)pmu_alloc_ptr;
167 return &pmu_a_ptr->alloc.dmem.offset;
168}
169
170static void pmu_allocation_set_dmem_offset_v1(struct pmu_gk20a *pmu,
171 void *pmu_alloc_ptr, u32 offset)
172{
173 struct pmu_allocation_v1 *pmu_a_ptr =
174 (struct pmu_allocation_v1 *)pmu_alloc_ptr;
175 pmu_a_ptr->alloc.dmem.offset = offset;
176}
177
178static void pmu_allocation_set_dmem_offset_v0(struct pmu_gk20a *pmu,
179 void *pmu_alloc_ptr, u32 offset)
180{
181 struct pmu_allocation_v0 *pmu_a_ptr =
182 (struct pmu_allocation_v0 *)pmu_alloc_ptr;
183 pmu_a_ptr->alloc.dmem.offset = offset;
184}
185
186static void *get_pmu_msg_pmu_init_msg_ptr_v1(struct pmu_init_msg *init)
187{
188 return (void *)(&(init->pmu_init_v1));
189}
190
191static u16 get_pmu_init_msg_pmu_sw_mg_off_v1(union pmu_init_msg_pmu *init_msg)
192{
193 struct pmu_init_msg_pmu_v1 *init =
194 (struct pmu_init_msg_pmu_v1 *)(&init_msg->v1);
195 return init->sw_managed_area_offset;
196}
197
198static u16 get_pmu_init_msg_pmu_sw_mg_size_v1(union pmu_init_msg_pmu *init_msg)
199{
200 struct pmu_init_msg_pmu_v1 *init =
201 (struct pmu_init_msg_pmu_v1 *)(&init_msg->v1);
202 return init->sw_managed_area_size;
203}
204
205static void *get_pmu_msg_pmu_init_msg_ptr_v0(struct pmu_init_msg *init)
206{
207 return (void *)(&(init->pmu_init_v0));
208}
209
210static u16 get_pmu_init_msg_pmu_sw_mg_off_v0(union pmu_init_msg_pmu *init_msg)
211{
212 struct pmu_init_msg_pmu_v0 *init =
213 (struct pmu_init_msg_pmu_v0 *)(&init_msg->v0);
214 return init->sw_managed_area_offset;
215}
216
217static u16 get_pmu_init_msg_pmu_sw_mg_size_v0(union pmu_init_msg_pmu *init_msg)
218{
219 struct pmu_init_msg_pmu_v0 *init =
220 (struct pmu_init_msg_pmu_v0 *)(&init_msg->v0);
221 return init->sw_managed_area_size;
222}
223
224static u32 get_pmu_perfmon_cmd_start_size_v1(void)
225{
226 return sizeof(struct pmu_perfmon_cmd_start_v1);
227}
228
229static u32 get_pmu_perfmon_cmd_start_size_v0(void)
230{
231 return sizeof(struct pmu_perfmon_cmd_start_v0);
232}
233
234static int get_perfmon_cmd_start_offsetofvar_v1(
235 enum pmu_perfmon_cmd_start_fields field)
236{
237 switch (field) {
238 case COUNTER_ALLOC:
239 return offsetof(struct pmu_perfmon_cmd_start_v1,
240 counter_alloc);
241 default:
242 return -EINVAL;
243 break;
244 }
245 return 0;
246}
247
248static int get_perfmon_cmd_start_offsetofvar_v0(
249 enum pmu_perfmon_cmd_start_fields field)
250{
251 switch (field) {
252 case COUNTER_ALLOC:
253 return offsetof(struct pmu_perfmon_cmd_start_v0,
254 counter_alloc);
255 default:
256 return -EINVAL;
257 break;
258 }
259 return 0;
260}
261
262static u32 get_pmu_perfmon_cmd_init_size_v1(void)
263{
264 return sizeof(struct pmu_perfmon_cmd_init_v1);
265}
266
267static u32 get_pmu_perfmon_cmd_init_size_v0(void)
268{
269 return sizeof(struct pmu_perfmon_cmd_init_v0);
270}
271
272static int get_perfmon_cmd_init_offsetofvar_v1(
273 enum pmu_perfmon_cmd_start_fields field)
274{
275 switch (field) {
276 case COUNTER_ALLOC:
277 return offsetof(struct pmu_perfmon_cmd_init_v1,
278 counter_alloc);
279 default:
280 return -EINVAL;
281 break;
282 }
283 return 0;
284}
285
286static int get_perfmon_cmd_init_offsetofvar_v0(
287 enum pmu_perfmon_cmd_start_fields field)
288{
289 switch (field) {
290 case COUNTER_ALLOC:
291 return offsetof(struct pmu_perfmon_cmd_init_v0,
292 counter_alloc);
293 default:
294 return -EINVAL;
295 break;
296 }
297 return 0;
298}
299
300static void perfmon_start_set_cmd_type_v1(struct pmu_perfmon_cmd *pc, u8 value)
301{
302 struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
303 start->cmd_type = value;
304}
305
306static void perfmon_start_set_cmd_type_v0(struct pmu_perfmon_cmd *pc, u8 value)
307{
308 struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
309 start->cmd_type = value;
310}
311
312static void perfmon_start_set_group_id_v1(struct pmu_perfmon_cmd *pc, u8 value)
313{
314 struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
315 start->group_id = value;
316}
317
318static void perfmon_start_set_group_id_v0(struct pmu_perfmon_cmd *pc, u8 value)
319{
320 struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
321 start->group_id = value;
322}
323
324static void perfmon_start_set_state_id_v1(struct pmu_perfmon_cmd *pc, u8 value)
325{
326 struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
327 start->state_id = value;
328}
329
330static void perfmon_start_set_state_id_v0(struct pmu_perfmon_cmd *pc, u8 value)
331{
332 struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
333 start->state_id = value;
334}
335
336static void perfmon_start_set_flags_v1(struct pmu_perfmon_cmd *pc, u8 value)
337{
338 struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
339 start->flags = value;
340}
341
342static void perfmon_start_set_flags_v0(struct pmu_perfmon_cmd *pc, u8 value)
343{
344 struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
345 start->flags = value;
346}
347
348static u8 perfmon_start_get_flags_v1(struct pmu_perfmon_cmd *pc)
349{
350 struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
351 return start->flags;
352}
353
354static u8 perfmon_start_get_flags_v0(struct pmu_perfmon_cmd *pc)
355{
356 struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
357 return start->flags;
358}
359
360static void perfmon_cmd_init_set_sample_buffer_v1(struct pmu_perfmon_cmd *pc,
361 u16 value)
362{
363 struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
364 init->sample_buffer = value;
365}
366
367static void perfmon_cmd_init_set_sample_buffer_v0(struct pmu_perfmon_cmd *pc,
368 u16 value)
369{
370 struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
371 init->sample_buffer = value;
372}
373
374static void perfmon_cmd_init_set_dec_cnt_v1(struct pmu_perfmon_cmd *pc,
375 u8 value)
376{
377 struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
378 init->to_decrease_count = value;
379}
380
381static void perfmon_cmd_init_set_dec_cnt_v0(struct pmu_perfmon_cmd *pc,
382 u8 value)
383{
384 struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
385 init->to_decrease_count = value;
386}
387
388static void perfmon_cmd_init_set_base_cnt_id_v1(struct pmu_perfmon_cmd *pc,
389 u8 value)
390{
391 struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
392 init->base_counter_id = value;
393}
394
395static void perfmon_cmd_init_set_base_cnt_id_v0(struct pmu_perfmon_cmd *pc,
396 u8 value)
397{
398 struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
399 init->base_counter_id = value;
400}
401
402static void perfmon_cmd_init_set_samp_period_us_v1(struct pmu_perfmon_cmd *pc,
403 u32 value)
404{
405 struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
406 init->sample_period_us = value;
407}
408
409static void perfmon_cmd_init_set_samp_period_us_v0(struct pmu_perfmon_cmd *pc,
410 u32 value)
411{
412 struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
413 init->sample_period_us = value;
414}
415
416static void perfmon_cmd_init_set_num_cnt_v1(struct pmu_perfmon_cmd *pc,
417 u8 value)
418{
419 struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
420 init->num_counters = value;
421}
422
423static void perfmon_cmd_init_set_num_cnt_v0(struct pmu_perfmon_cmd *pc,
424 u8 value)
425{
426 struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
427 init->num_counters = value;
428}
429
430static void perfmon_cmd_init_set_mov_avg_v1(struct pmu_perfmon_cmd *pc,
431 u8 value)
432{
433 struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
434 init->samples_in_moving_avg = value;
435}
436
437static void perfmon_cmd_init_set_mov_avg_v0(struct pmu_perfmon_cmd *pc,
438 u8 value)
439{
440 struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
441 init->samples_in_moving_avg = value;
442}
443
444static void get_pmu_init_msg_pmu_queue_params_v0(struct pmu_queue *queue,
445 u32 id, void *pmu_init_msg)
446{
447 struct pmu_init_msg_pmu_v0 *init =
448 (struct pmu_init_msg_pmu_v0 *)pmu_init_msg;
449 queue->index = init->queue_info[id].index;
450 queue->offset = init->queue_info[id].offset;
451 queue->size = init->queue_info[id].size;
452}
453
454static void get_pmu_init_msg_pmu_queue_params_v1(struct pmu_queue *queue,
455 u32 id, void *pmu_init_msg)
456{
457 struct pmu_init_msg_pmu_v1 *init =
458 (struct pmu_init_msg_pmu_v1 *)pmu_init_msg;
459 queue->index = init->queue_info[id].index;
460 queue->offset = init->queue_info[id].offset;
461 queue->size = init->queue_info[id].size;
462}
463
464static void *get_pmu_sequence_in_alloc_ptr_v1(struct pmu_sequence *seq)
465{
466 return (void *)(&seq->in_v1);
467}
468
469static void *get_pmu_sequence_in_alloc_ptr_v0(struct pmu_sequence *seq)
470{
471 return (void *)(&seq->in_v0);
472}
473
474static void *get_pmu_sequence_out_alloc_ptr_v1(struct pmu_sequence *seq)
475{
476 return (void *)(&seq->out_v1);
477}
478
479static void *get_pmu_sequence_out_alloc_ptr_v0(struct pmu_sequence *seq)
480{
481 return (void *)(&seq->out_v0);
482}
483
484static int gk20a_init_pmu(struct pmu_gk20a *pmu)
485{
486 struct gk20a *g = pmu->g;
487 switch (pmu->desc->app_version) {
488 case APP_VERSION_1:
489 g->ops.pmu_ver.cmd_id_zbc_table_update = 16;
490 g->ops.pmu_ver.get_pmu_cmdline_args_size =
491 pmu_cmdline_size_v1;
492 g->ops.pmu_ver.set_pmu_cmdline_args_cpu_freq =
493 set_pmu_cmdline_args_cpufreq_v1;
494 g->ops.pmu_ver.get_pmu_cmdline_args_ptr =
495 get_pmu_cmdline_args_ptr_v1;
496 g->ops.pmu_ver.get_pmu_allocation_struct_size =
497 get_pmu_allocation_size_v1;
498 g->ops.pmu_ver.set_pmu_allocation_ptr =
499 set_pmu_allocation_ptr_v1;
500 g->ops.pmu_ver.pmu_allocation_set_dmem_size =
501 pmu_allocation_set_dmem_size_v1;
502 g->ops.pmu_ver.pmu_allocation_get_dmem_size =
503 pmu_allocation_get_dmem_size_v1;
504 g->ops.pmu_ver.pmu_allocation_get_dmem_offset =
505 pmu_allocation_get_dmem_offset_v1;
506 g->ops.pmu_ver.pmu_allocation_get_dmem_offset_addr =
507 pmu_allocation_get_dmem_offset_addr_v1;
508 g->ops.pmu_ver.pmu_allocation_set_dmem_offset =
509 pmu_allocation_set_dmem_offset_v1;
510 g->ops.pmu_ver.get_pmu_init_msg_pmu_queue_params =
511 get_pmu_init_msg_pmu_queue_params_v1;
512 g->ops.pmu_ver.get_pmu_msg_pmu_init_msg_ptr =
513 get_pmu_msg_pmu_init_msg_ptr_v1;
514 g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_off =
515 get_pmu_init_msg_pmu_sw_mg_off_v1;
516 g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_size =
517 get_pmu_init_msg_pmu_sw_mg_size_v1;
518 g->ops.pmu_ver.get_pmu_perfmon_cmd_start_size =
519 get_pmu_perfmon_cmd_start_size_v1;
520 g->ops.pmu_ver.get_perfmon_cmd_start_offsetofvar =
521 get_perfmon_cmd_start_offsetofvar_v1;
522 g->ops.pmu_ver.perfmon_start_set_cmd_type =
523 perfmon_start_set_cmd_type_v1;
524 g->ops.pmu_ver.perfmon_start_set_group_id =
525 perfmon_start_set_group_id_v1;
526 g->ops.pmu_ver.perfmon_start_set_state_id =
527 perfmon_start_set_state_id_v1;
528 g->ops.pmu_ver.perfmon_start_set_flags =
529 perfmon_start_set_flags_v1;
530 g->ops.pmu_ver.perfmon_start_get_flags =
531 perfmon_start_get_flags_v1;
532 g->ops.pmu_ver.get_pmu_perfmon_cmd_init_size =
533 get_pmu_perfmon_cmd_init_size_v1;
534 g->ops.pmu_ver.get_perfmon_cmd_init_offsetofvar =
535 get_perfmon_cmd_init_offsetofvar_v1;
536 g->ops.pmu_ver.perfmon_cmd_init_set_sample_buffer =
537 perfmon_cmd_init_set_sample_buffer_v1;
538 g->ops.pmu_ver.perfmon_cmd_init_set_dec_cnt =
539 perfmon_cmd_init_set_dec_cnt_v1;
540 g->ops.pmu_ver.perfmon_cmd_init_set_base_cnt_id =
541 perfmon_cmd_init_set_base_cnt_id_v1;
542 g->ops.pmu_ver.perfmon_cmd_init_set_samp_period_us =
543 perfmon_cmd_init_set_samp_period_us_v1;
544 g->ops.pmu_ver.perfmon_cmd_init_set_num_cnt =
545 perfmon_cmd_init_set_num_cnt_v1;
546 g->ops.pmu_ver.perfmon_cmd_init_set_mov_avg =
547 perfmon_cmd_init_set_mov_avg_v1;
548 g->ops.pmu_ver.get_pmu_seq_in_a_ptr =
549 get_pmu_sequence_in_alloc_ptr_v1;
550 g->ops.pmu_ver.get_pmu_seq_out_a_ptr =
551 get_pmu_sequence_out_alloc_ptr_v1;
552 break;
553 case APP_VERSION_0:
554 g->ops.pmu_ver.cmd_id_zbc_table_update = 14;
555 g->ops.pmu_ver.get_pmu_cmdline_args_size =
556 pmu_cmdline_size_v0;
557 g->ops.pmu_ver.set_pmu_cmdline_args_cpu_freq =
558 set_pmu_cmdline_args_cpufreq_v0;
559 g->ops.pmu_ver.get_pmu_cmdline_args_ptr =
560 get_pmu_cmdline_args_ptr_v0;
561 g->ops.pmu_ver.get_pmu_allocation_struct_size =
562 get_pmu_allocation_size_v0;
563 g->ops.pmu_ver.set_pmu_allocation_ptr =
564 set_pmu_allocation_ptr_v0;
565 g->ops.pmu_ver.pmu_allocation_set_dmem_size =
566 pmu_allocation_set_dmem_size_v0;
567 g->ops.pmu_ver.pmu_allocation_get_dmem_size =
568 pmu_allocation_get_dmem_size_v0;
569 g->ops.pmu_ver.pmu_allocation_get_dmem_offset =
570 pmu_allocation_get_dmem_offset_v0;
571 g->ops.pmu_ver.pmu_allocation_get_dmem_offset_addr =
572 pmu_allocation_get_dmem_offset_addr_v0;
573 g->ops.pmu_ver.pmu_allocation_set_dmem_offset =
574 pmu_allocation_set_dmem_offset_v0;
575 g->ops.pmu_ver.get_pmu_init_msg_pmu_queue_params =
576 get_pmu_init_msg_pmu_queue_params_v0;
577 g->ops.pmu_ver.get_pmu_msg_pmu_init_msg_ptr =
578 get_pmu_msg_pmu_init_msg_ptr_v0;
579 g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_off =
580 get_pmu_init_msg_pmu_sw_mg_off_v0;
581 g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_size =
582 get_pmu_init_msg_pmu_sw_mg_size_v0;
583 g->ops.pmu_ver.get_pmu_perfmon_cmd_start_size =
584 get_pmu_perfmon_cmd_start_size_v0;
585 g->ops.pmu_ver.get_perfmon_cmd_start_offsetofvar =
586 get_perfmon_cmd_start_offsetofvar_v0;
587 g->ops.pmu_ver.perfmon_start_set_cmd_type =
588 perfmon_start_set_cmd_type_v0;
589 g->ops.pmu_ver.perfmon_start_set_group_id =
590 perfmon_start_set_group_id_v0;
591 g->ops.pmu_ver.perfmon_start_set_state_id =
592 perfmon_start_set_state_id_v0;
593 g->ops.pmu_ver.perfmon_start_set_flags =
594 perfmon_start_set_flags_v0;
595 g->ops.pmu_ver.perfmon_start_get_flags =
596 perfmon_start_get_flags_v0;
597 g->ops.pmu_ver.get_pmu_perfmon_cmd_init_size =
598 get_pmu_perfmon_cmd_init_size_v0;
599 g->ops.pmu_ver.get_perfmon_cmd_init_offsetofvar =
600 get_perfmon_cmd_init_offsetofvar_v0;
601 g->ops.pmu_ver.perfmon_cmd_init_set_sample_buffer =
602 perfmon_cmd_init_set_sample_buffer_v0;
603 g->ops.pmu_ver.perfmon_cmd_init_set_dec_cnt =
604 perfmon_cmd_init_set_dec_cnt_v0;
605 g->ops.pmu_ver.perfmon_cmd_init_set_base_cnt_id =
606 perfmon_cmd_init_set_base_cnt_id_v0;
607 g->ops.pmu_ver.perfmon_cmd_init_set_samp_period_us =
608 perfmon_cmd_init_set_samp_period_us_v0;
609 g->ops.pmu_ver.perfmon_cmd_init_set_num_cnt =
610 perfmon_cmd_init_set_num_cnt_v0;
611 g->ops.pmu_ver.perfmon_cmd_init_set_mov_avg =
612 perfmon_cmd_init_set_mov_avg_v0;
613 g->ops.pmu_ver.get_pmu_seq_in_a_ptr =
614 get_pmu_sequence_in_alloc_ptr_v0;
615 g->ops.pmu_ver.get_pmu_seq_out_a_ptr =
616 get_pmu_sequence_out_alloc_ptr_v0;
617 break;
618 default:
619 gk20a_err(dev_from_gk20a(pmu->g),
620 "PMU code version not supported\n");
621 return -EINVAL;
622 break;
623 }
624 return 0;
625}
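
gk20a_init_pmu() above resolves the PMU ucode version once and fills g->ops.pmu_ver with version-specific accessors, so the rest of the driver never branches on the version again. A minimal standalone sketch of the same dispatch pattern (not part of this patch; the structure layouts below are invented for illustration):

#include <stddef.h>
#include <stdio.h>

/* invented stand-ins for the v0/v1 command line argument layouts */
struct cmdline_args_v0 { unsigned int cpu_freq_hz; };
struct cmdline_args_v1 { unsigned int cpu_freq_hz; unsigned int secure_mode; };

struct pmu_ver_ops {
	size_t (*cmdline_args_size)(void);
};

static size_t size_v0(void) { return sizeof(struct cmdline_args_v0); }
static size_t size_v1(void) { return sizeof(struct cmdline_args_v1); }

static int init_ops(struct pmu_ver_ops *ops, int app_version)
{
	switch (app_version) {
	case 0:
		ops->cmdline_args_size = size_v0;
		break;
	case 1:
		ops->cmdline_args_size = size_v1;
		break;
	default:
		return -1;	/* unsupported ucode version */
	}
	return 0;
}

int main(void)
{
	struct pmu_ver_ops ops;

	if (init_ops(&ops, 1))
		return 1;
	printf("cmdline args size: %zu\n", ops.cmdline_args_size());
	return 0;
}
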
626
627static void pmu_copy_from_dmem(struct pmu_gk20a *pmu,
628 u32 src, u8 *dst, u32 size, u8 port)
629{
630 struct gk20a *g = pmu->g;
631 u32 i, words, bytes;
632 u32 data, addr_mask;
633 u32 *dst_u32 = (u32*)dst;
634
635 if (size == 0) {
636 gk20a_err(dev_from_gk20a(g),
637 "size is zero");
638 return;
639 }
640
641 if (src & 0x3) {
642 gk20a_err(dev_from_gk20a(g),
643 "src (0x%08x) not 4-byte aligned", src);
644 return;
645 }
646
647 mutex_lock(&pmu->pmu_copy_lock);
648
649 words = size >> 2;
650 bytes = size & 0x3;
651
652 addr_mask = pwr_falcon_dmemc_offs_m() |
653 pwr_falcon_dmemc_blk_m();
654
655 src &= addr_mask;
656
657 gk20a_writel(g, pwr_falcon_dmemc_r(port),
658 src | pwr_falcon_dmemc_aincr_f(1));
659
660 for (i = 0; i < words; i++)
661 dst_u32[i] = gk20a_readl(g, pwr_falcon_dmemd_r(port));
662
663 if (bytes > 0) {
664 data = gk20a_readl(g, pwr_falcon_dmemd_r(port));
665 for (i = 0; i < bytes; i++) {
666 dst[(words << 2) + i] = ((u8 *)&data)[i];
667 gk20a_dbg_pmu("read: dst_u8[%d]=0x%08x",
668 i, dst[(words << 2) + i]);
669 }
670 }
671 mutex_unlock(&pmu->pmu_copy_lock);
672 return;
673}
674
675static void pmu_copy_to_dmem(struct pmu_gk20a *pmu,
676 u32 dst, u8 *src, u32 size, u8 port)
677{
678 struct gk20a *g = pmu->g;
679 u32 i, words, bytes;
680 u32 data, addr_mask;
681 u32 *src_u32 = (u32*)src;
682
683 if (size == 0) {
684 gk20a_err(dev_from_gk20a(g),
685 "size is zero");
686 return;
687 }
688
689 if (dst & 0x3) {
690 gk20a_err(dev_from_gk20a(g),
691 "dst (0x%08x) not 4-byte aligned", dst);
692 return;
693 }
694
695 mutex_lock(&pmu->pmu_copy_lock);
696
697 words = size >> 2;
698 bytes = size & 0x3;
699
700 addr_mask = pwr_falcon_dmemc_offs_m() |
701 pwr_falcon_dmemc_blk_m();
702
703 dst &= addr_mask;
704
705 gk20a_writel(g, pwr_falcon_dmemc_r(port),
706 dst | pwr_falcon_dmemc_aincw_f(1));
707
708 for (i = 0; i < words; i++)
709 gk20a_writel(g, pwr_falcon_dmemd_r(port), src_u32[i]);
710
711 if (bytes > 0) {
712 data = 0;
713 for (i = 0; i < bytes; i++)
714 ((u8 *)&data)[i] = src[(words << 2) + i];
715 gk20a_writel(g, pwr_falcon_dmemd_r(port), data);
716 }
717
718 data = gk20a_readl(g, pwr_falcon_dmemc_r(port)) & addr_mask;
719 size = ALIGN(size, 4);
720 if (data != dst + size) {
721 gk20a_err(dev_from_gk20a(g),
722 "copy failed. bytes written %d, expected %d",
723 data - dst, size);
724 }
725 mutex_unlock(&pmu->pmu_copy_lock);
726 return;
727}
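
Both DMEM copy helpers above split the transfer into whole 32-bit words plus a 0-3 byte tail (words = size >> 2, bytes = size & 0x3), since the falcon DMEM port moves one 32-bit word per access. A trivial standalone sketch of that split (not part of this patch; the byte count is a made-up example):

#include <stdio.h>

int main(void)
{
	unsigned int size  = 23;		/* made-up transfer length */
	unsigned int words = size >> 2;		/* whole 32-bit words */
	unsigned int bytes = size & 0x3;	/* trailing bytes */

	printf("%u bytes -> %u words + %u tail bytes\n", size, words, bytes);
	return 0;
}
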
728
729static int pmu_idle(struct pmu_gk20a *pmu)
730{
731 struct gk20a *g = pmu->g;
732 unsigned long end_jiffies = jiffies +
733 msecs_to_jiffies(2000);
734 u32 idle_stat;
735
736 /* wait for pmu idle */
737 do {
738 idle_stat = gk20a_readl(g, pwr_falcon_idlestate_r());
739
740 if (pwr_falcon_idlestate_falcon_busy_v(idle_stat) == 0 &&
741 pwr_falcon_idlestate_ext_busy_v(idle_stat) == 0) {
742 break;
743 }
744
745 if (time_after_eq(jiffies, end_jiffies)) {
746 gk20a_err(dev_from_gk20a(g),
747 "timeout waiting pmu idle : 0x%08x",
748 idle_stat);
749 return -EBUSY;
750 }
751 usleep_range(100, 200);
752 } while (1);
753
754 gk20a_dbg_fn("done");
755 return 0;
756}
757
758static void pmu_enable_irq(struct pmu_gk20a *pmu, bool enable)
759{
760 struct gk20a *g = pmu->g;
761
762 gk20a_dbg_fn("");
763
764 gk20a_writel(g, mc_intr_mask_0_r(),
765 gk20a_readl(g, mc_intr_mask_0_r()) &
766 ~mc_intr_mask_0_pmu_enabled_f());
767 gk20a_writel(g, mc_intr_mask_1_r(),
768 gk20a_readl(g, mc_intr_mask_1_r()) &
769 ~mc_intr_mask_1_pmu_enabled_f());
770
771 gk20a_writel(g, pwr_falcon_irqmclr_r(),
772 pwr_falcon_irqmclr_gptmr_f(1) |
773 pwr_falcon_irqmclr_wdtmr_f(1) |
774 pwr_falcon_irqmclr_mthd_f(1) |
775 pwr_falcon_irqmclr_ctxsw_f(1) |
776 pwr_falcon_irqmclr_halt_f(1) |
777 pwr_falcon_irqmclr_exterr_f(1) |
778 pwr_falcon_irqmclr_swgen0_f(1) |
779 pwr_falcon_irqmclr_swgen1_f(1) |
780 pwr_falcon_irqmclr_ext_f(0xff));
781
782 if (enable) {
783 /* dest 0=falcon, 1=host; level 0=irq0, 1=irq1 */
784 gk20a_writel(g, pwr_falcon_irqdest_r(),
785 pwr_falcon_irqdest_host_gptmr_f(0) |
786 pwr_falcon_irqdest_host_wdtmr_f(1) |
787 pwr_falcon_irqdest_host_mthd_f(0) |
788 pwr_falcon_irqdest_host_ctxsw_f(0) |
789 pwr_falcon_irqdest_host_halt_f(1) |
790 pwr_falcon_irqdest_host_exterr_f(0) |
791 pwr_falcon_irqdest_host_swgen0_f(1) |
792 pwr_falcon_irqdest_host_swgen1_f(0) |
793 pwr_falcon_irqdest_host_ext_f(0xff) |
794 pwr_falcon_irqdest_target_gptmr_f(1) |
795 pwr_falcon_irqdest_target_wdtmr_f(0) |
796 pwr_falcon_irqdest_target_mthd_f(0) |
797 pwr_falcon_irqdest_target_ctxsw_f(0) |
798 pwr_falcon_irqdest_target_halt_f(0) |
799 pwr_falcon_irqdest_target_exterr_f(0) |
800 pwr_falcon_irqdest_target_swgen0_f(0) |
801 pwr_falcon_irqdest_target_swgen1_f(0) |
802 pwr_falcon_irqdest_target_ext_f(0xff));
803
804 /* 0=disable, 1=enable */
805 gk20a_writel(g, pwr_falcon_irqmset_r(),
806 pwr_falcon_irqmset_gptmr_f(1) |
807 pwr_falcon_irqmset_wdtmr_f(1) |
808 pwr_falcon_irqmset_mthd_f(0) |
809 pwr_falcon_irqmset_ctxsw_f(0) |
810 pwr_falcon_irqmset_halt_f(1) |
811 pwr_falcon_irqmset_exterr_f(1) |
812 pwr_falcon_irqmset_swgen0_f(1) |
813 pwr_falcon_irqmset_swgen1_f(1));
814
815 gk20a_writel(g, mc_intr_mask_0_r(),
816 gk20a_readl(g, mc_intr_mask_0_r()) |
817 mc_intr_mask_0_pmu_enabled_f());
818 }
819
820 gk20a_dbg_fn("done");
821}
822
823static int pmu_enable_hw(struct pmu_gk20a *pmu, bool enable)
824{
825 struct gk20a *g = pmu->g;
826
827 gk20a_dbg_fn("");
828
829 if (enable) {
830 int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
831 gk20a_enable(g, mc_enable_pwr_enabled_f());
832
833 do {
834 u32 w = gk20a_readl(g, pwr_falcon_dmactl_r()) &
835 (pwr_falcon_dmactl_dmem_scrubbing_m() |
836 pwr_falcon_dmactl_imem_scrubbing_m());
837
838 if (!w) {
839 gk20a_dbg_fn("done");
840 return 0;
841 }
842 udelay(GR_IDLE_CHECK_DEFAULT);
843 } while (--retries || !tegra_platform_is_silicon());
844
845 gk20a_disable(g, mc_enable_pwr_enabled_f());
846 gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
847
848 return -ETIMEDOUT;
849 } else {
850 gk20a_disable(g, mc_enable_pwr_enabled_f());
851 return 0;
852 }
853}
854
855static int pmu_enable(struct pmu_gk20a *pmu, bool enable)
856{
857 struct gk20a *g = pmu->g;
858 u32 pmc_enable;
859 int err;
860
861 gk20a_dbg_fn("");
862
863 if (!enable) {
864 pmc_enable = gk20a_readl(g, mc_enable_r());
865 if (mc_enable_pwr_v(pmc_enable) !=
866 mc_enable_pwr_disabled_v()) {
867
868 pmu_enable_irq(pmu, false);
869 pmu_enable_hw(pmu, false);
870 }
871 } else {
872 err = pmu_enable_hw(pmu, true);
873 if (err)
874 return err;
875
876 /* TBD: post reset */
877
878 err = pmu_idle(pmu);
879 if (err)
880 return err;
881
882 pmu_enable_irq(pmu, true);
883 }
884
885 gk20a_dbg_fn("done");
886 return 0;
887}
888
889static int pmu_reset(struct pmu_gk20a *pmu)
890{
891 int err;
892
893 err = pmu_idle(pmu);
894 if (err)
895 return err;
896
897 /* TBD: release pmu hw mutex */
898
899 err = pmu_enable(pmu, false);
900 if (err)
901 return err;
902
903 /* TBD: cancel all sequences */
904 /* TBD: init all sequences and state tables */
905 /* TBD: restore pre-init message handler */
906
907 err = pmu_enable(pmu, true);
908 if (err)
909 return err;
910
911 return 0;
912}
913
914static int pmu_bootstrap(struct pmu_gk20a *pmu)
915{
916 struct gk20a *g = pmu->g;
917 struct gk20a_platform *platform = platform_get_drvdata(g->dev);
918 struct mm_gk20a *mm = &g->mm;
919 struct pmu_ucode_desc *desc = pmu->desc;
920 u64 addr_code, addr_data, addr_load;
921 u32 i, blocks, addr_args;
922
923 gk20a_dbg_fn("");
924
925 gk20a_writel(g, pwr_falcon_itfen_r(),
926 gk20a_readl(g, pwr_falcon_itfen_r()) |
927 pwr_falcon_itfen_ctxen_enable_f());
928 gk20a_writel(g, pwr_pmu_new_instblk_r(),
929 pwr_pmu_new_instblk_ptr_f(
930 mm->pmu.inst_block.cpu_pa >> 12) |
931 pwr_pmu_new_instblk_valid_f(1) |
932 pwr_pmu_new_instblk_target_sys_coh_f());
933
934 /* TBD: load all other surfaces */
935
936 g->ops.pmu_ver.set_pmu_cmdline_args_cpu_freq(pmu,
937 clk_get_rate(platform->clk[1]));
938
939 addr_args = (pwr_falcon_hwcfg_dmem_size_v(
940 gk20a_readl(g, pwr_falcon_hwcfg_r()))
941 << GK20A_PMU_DMEM_BLKSIZE2) -
942 g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu);
943
944 pmu_copy_to_dmem(pmu, addr_args,
945 (u8 *)(g->ops.pmu_ver.get_pmu_cmdline_args_ptr(pmu)),
946 g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu), 0);
947
948 gk20a_writel(g, pwr_falcon_dmemc_r(0),
949 pwr_falcon_dmemc_offs_f(0) |
950 pwr_falcon_dmemc_blk_f(0) |
951 pwr_falcon_dmemc_aincw_f(1));
952
953 addr_code = u64_lo32((pmu->ucode.pmu_va +
954 desc->app_start_offset +
955 desc->app_resident_code_offset) >> 8) ;
956 addr_data = u64_lo32((pmu->ucode.pmu_va +
957 desc->app_start_offset +
958 desc->app_resident_data_offset) >> 8);
959 addr_load = u64_lo32((pmu->ucode.pmu_va +
960 desc->bootloader_start_offset) >> 8);
961
962 gk20a_writel(g, pwr_falcon_dmemd_r(0), GK20A_PMU_DMAIDX_UCODE);
963 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_code);
964 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_size);
965 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_resident_code_size);
966 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_imem_entry);
967 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_data);
968 gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_resident_data_size);
969 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_code);
970 gk20a_writel(g, pwr_falcon_dmemd_r(0), 0x1);
971 gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_args);
972
973 gk20a_writel(g, pwr_falcon_dmatrfbase_r(),
974 addr_load - (desc->bootloader_imem_offset >> 8));
975
976 blocks = ((desc->bootloader_size + 0xFF) & ~0xFF) >> 8;
977
978 for (i = 0; i < blocks; i++) {
979 gk20a_writel(g, pwr_falcon_dmatrfmoffs_r(),
980 desc->bootloader_imem_offset + (i << 8));
981 gk20a_writel(g, pwr_falcon_dmatrffboffs_r(),
982 desc->bootloader_imem_offset + (i << 8));
983 gk20a_writel(g, pwr_falcon_dmatrfcmd_r(),
984 pwr_falcon_dmatrfcmd_imem_f(1) |
985 pwr_falcon_dmatrfcmd_write_f(0) |
986 pwr_falcon_dmatrfcmd_size_f(6) |
987 pwr_falcon_dmatrfcmd_ctxdma_f(GK20A_PMU_DMAIDX_UCODE));
988 }
989
990 gk20a_writel(g, pwr_falcon_bootvec_r(),
991 pwr_falcon_bootvec_vec_f(desc->bootloader_entry_point));
992
993 gk20a_writel(g, pwr_falcon_cpuctl_r(),
994 pwr_falcon_cpuctl_startcpu_f(1));
995
996 gk20a_writel(g, pwr_falcon_os_r(), desc->app_version);
997
998 return 0;
999}
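
pmu_bootstrap() above rounds the bootloader size up to whole 256-byte IMEM blocks before issuing one DMA transfer per block. A standalone sketch of that rounding (not part of this patch; the byte count is a made-up example):

#include <stdio.h>

int main(void)
{
	unsigned int size   = 0x1234;	/* made-up bootloader size */
	unsigned int blocks = ((size + 0xFF) & ~0xFFu) >> 8;

	printf("%u bytes -> %u blocks of 256 bytes\n", size, blocks);
	return 0;
}
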
1000
1001static void pmu_seq_init(struct pmu_gk20a *pmu)
1002{
1003 u32 i;
1004
1005 memset(pmu->seq, 0,
1006 sizeof(struct pmu_sequence) * PMU_MAX_NUM_SEQUENCES);
1007 memset(pmu->pmu_seq_tbl, 0,
1008 sizeof(pmu->pmu_seq_tbl));
1009
1010 for (i = 0; i < PMU_MAX_NUM_SEQUENCES; i++)
1011 pmu->seq[i].id = i;
1012}
1013
1014static int pmu_seq_acquire(struct pmu_gk20a *pmu,
1015 struct pmu_sequence **pseq)
1016{
1017 struct gk20a *g = pmu->g;
1018 struct pmu_sequence *seq;
1019 u32 index;
1020
1021 mutex_lock(&pmu->pmu_seq_lock);
1022 index = find_first_zero_bit(pmu->pmu_seq_tbl,
1023 sizeof(pmu->pmu_seq_tbl));
1024 if (index >= sizeof(pmu->pmu_seq_tbl)) {
1025 gk20a_err(dev_from_gk20a(g),
1026 "no free sequence available");
1027 mutex_unlock(&pmu->pmu_seq_lock);
1028 return -EAGAIN;
1029 }
1030 set_bit(index, pmu->pmu_seq_tbl);
1031 mutex_unlock(&pmu->pmu_seq_lock);
1032
1033 seq = &pmu->seq[index];
1034 seq->state = PMU_SEQ_STATE_PENDING;
1035
1036 *pseq = seq;
1037 return 0;
1038}
1039
1040static void pmu_seq_release(struct pmu_gk20a *pmu,
1041 struct pmu_sequence *seq)
1042{
1043 struct gk20a *g = pmu->g;
1044 seq->state = PMU_SEQ_STATE_FREE;
1045 seq->desc = PMU_INVALID_SEQ_DESC;
1046 seq->callback = NULL;
1047 seq->cb_params = NULL;
1048 seq->msg = NULL;
1049 seq->out_payload = NULL;
1050 g->ops.pmu_ver.pmu_allocation_set_dmem_size(pmu,
1051 g->ops.pmu_ver.get_pmu_seq_in_a_ptr(seq), 0);
1052 g->ops.pmu_ver.pmu_allocation_set_dmem_size(pmu,
1053 g->ops.pmu_ver.get_pmu_seq_out_a_ptr(seq), 0);
1054
1055 clear_bit(seq->id, pmu->pmu_seq_tbl);
1056}
1057
1058static int pmu_queue_init(struct pmu_gk20a *pmu,
1059 u32 id, union pmu_init_msg_pmu *init)
1060{
1061 struct gk20a *g = pmu->g;
1062 struct pmu_queue *queue = &pmu->queue[id];
1063 queue->id = id;
1064 g->ops.pmu_ver.get_pmu_init_msg_pmu_queue_params(queue, id, init);
1065
1066 queue->mutex_id = id;
1067 mutex_init(&queue->mutex);
1068
1069 gk20a_dbg_pmu("queue %d: index %d, offset 0x%08x, size 0x%08x",
1070 id, queue->index, queue->offset, queue->size);
1071
1072 return 0;
1073}
1074
1075static int pmu_queue_head(struct pmu_gk20a *pmu, struct pmu_queue *queue,
1076 u32 *head, bool set)
1077{
1078 struct gk20a *g = pmu->g;
1079
1080 BUG_ON(!head);
1081
1082 if (PMU_IS_COMMAND_QUEUE(queue->id)) {
1083
1084 if (queue->index >= pwr_pmu_queue_head__size_1_v())
1085 return -EINVAL;
1086
1087 if (!set)
1088 *head = pwr_pmu_queue_head_address_v(
1089 gk20a_readl(g,
1090 pwr_pmu_queue_head_r(queue->index)));
1091 else
1092 gk20a_writel(g,
1093 pwr_pmu_queue_head_r(queue->index),
1094 pwr_pmu_queue_head_address_f(*head));
1095 } else {
1096 if (!set)
1097 *head = pwr_pmu_msgq_head_val_v(
1098 gk20a_readl(g, pwr_pmu_msgq_head_r()));
1099 else
1100 gk20a_writel(g,
1101 pwr_pmu_msgq_head_r(),
1102 pwr_pmu_msgq_head_val_f(*head));
1103 }
1104
1105 return 0;
1106}
1107
1108static int pmu_queue_tail(struct pmu_gk20a *pmu, struct pmu_queue *queue,
1109 u32 *tail, bool set)
1110{
1111 struct gk20a *g = pmu->g;
1112
1113 BUG_ON(!tail);
1114
1115 if (PMU_IS_COMMAND_QUEUE(queue->id)) {
1116
1117 if (queue->index >= pwr_pmu_queue_tail__size_1_v())
1118 return -EINVAL;
1119
1120 if (!set)
1121 *tail = pwr_pmu_queue_tail_address_v(
1122 gk20a_readl(g,
1123 pwr_pmu_queue_tail_r(queue->index)));
1124 else
1125 gk20a_writel(g,
1126 pwr_pmu_queue_tail_r(queue->index),
1127 pwr_pmu_queue_tail_address_f(*tail));
1128 } else {
1129 if (!set)
1130 *tail = pwr_pmu_msgq_tail_val_v(
1131 gk20a_readl(g, pwr_pmu_msgq_tail_r()));
1132 else
1133 gk20a_writel(g,
1134 pwr_pmu_msgq_tail_r(),
1135 pwr_pmu_msgq_tail_val_f(*tail));
1136 }
1137
1138 return 0;
1139}
1140
1141static inline void pmu_queue_read(struct pmu_gk20a *pmu,
1142 u32 offset, u8 *dst, u32 size)
1143{
1144 pmu_copy_from_dmem(pmu, offset, dst, size, 0);
1145}
1146
1147static inline void pmu_queue_write(struct pmu_gk20a *pmu,
1148 u32 offset, u8 *src, u32 size)
1149{
1150 pmu_copy_to_dmem(pmu, offset, src, size, 0);
1151}
1152
1153int pmu_mutex_acquire(struct pmu_gk20a *pmu, u32 id, u32 *token)
1154{
1155 struct gk20a *g = pmu->g;
1156 struct pmu_mutex *mutex;
1157 u32 data, owner, max_retry;
1158
1159 if (!pmu->initialized)
1160 return 0;
1161
1162 BUG_ON(!token);
1163 BUG_ON(!PMU_MUTEX_ID_IS_VALID(id));
1164 BUG_ON(id > pmu->mutex_cnt);
1165
1166 mutex = &pmu->mutex[id];
1167
1168 owner = pwr_pmu_mutex_value_v(
1169 gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
1170
1171 if (*token != PMU_INVALID_MUTEX_OWNER_ID && *token == owner) {
1172 BUG_ON(mutex->ref_cnt == 0);
1173 gk20a_dbg_pmu("already acquired by owner : 0x%08x", *token);
1174 mutex->ref_cnt++;
1175 return 0;
1176 }
1177
1178 max_retry = 40;
1179 do {
1180 data = pwr_pmu_mutex_id_value_v(
1181 gk20a_readl(g, pwr_pmu_mutex_id_r()));
1182 if (data == pwr_pmu_mutex_id_value_init_v() ||
1183 data == pwr_pmu_mutex_id_value_not_avail_v()) {
1184 gk20a_warn(dev_from_gk20a(g),
1185 "fail to generate mutex token: val 0x%08x",
1186 owner);
1187 usleep_range(20, 40);
1188 continue;
1189 }
1190
1191 owner = data;
1192 gk20a_writel(g, pwr_pmu_mutex_r(mutex->index),
1193 pwr_pmu_mutex_value_f(owner));
1194
1195 data = pwr_pmu_mutex_value_v(
1196 gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
1197
1198 if (owner == data) {
1199 mutex->ref_cnt = 1;
1200 gk20a_dbg_pmu("mutex acquired: id=%d, token=0x%x",
1201 mutex->index, *token);
1202 *token = owner;
1203 return 0;
1204 } else {
1205 gk20a_dbg_info("fail to acquire mutex idx=0x%08x",
1206 mutex->index);
1207
1208 data = gk20a_readl(g, pwr_pmu_mutex_id_release_r());
1209 data = set_field(data,
1210 pwr_pmu_mutex_id_release_value_m(),
1211 pwr_pmu_mutex_id_release_value_f(owner));
1212 gk20a_writel(g, pwr_pmu_mutex_id_release_r(), data);
1213
1214 usleep_range(20, 40);
1215 continue;
1216 }
1217 } while (max_retry-- > 0);
1218
1219 return -EBUSY;
1220}
1221
1222int pmu_mutex_release(struct pmu_gk20a *pmu, u32 id, u32 *token)
1223{
1224 struct gk20a *g = pmu->g;
1225 struct pmu_mutex *mutex;
1226 u32 owner, data;
1227
1228 if (!pmu->initialized)
1229 return 0;
1230
1231 BUG_ON(!token);
1232 BUG_ON(!PMU_MUTEX_ID_IS_VALID(id));
1233 BUG_ON(id > pmu->mutex_cnt);
1234
1235 mutex = &pmu->mutex[id];
1236
1237 owner = pwr_pmu_mutex_value_v(
1238 gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
1239
1240 if (*token != owner) {
1241 gk20a_err(dev_from_gk20a(g),
1242			"requester 0x%08x does NOT match owner 0x%08x",
1243 *token, owner);
1244 return -EINVAL;
1245 }
1246
1247 if (--mutex->ref_cnt == 0) {
1248 gk20a_writel(g, pwr_pmu_mutex_r(mutex->index),
1249 pwr_pmu_mutex_value_initial_lock_f());
1250
1251 data = gk20a_readl(g, pwr_pmu_mutex_id_release_r());
1252 data = set_field(data, pwr_pmu_mutex_id_release_value_m(),
1253 pwr_pmu_mutex_id_release_value_f(owner));
1254 gk20a_writel(g, pwr_pmu_mutex_id_release_r(), data);
1255
1256 gk20a_dbg_pmu("mutex released: id=%d, token=0x%x",
1257 mutex->index, *token);
1258 }
1259
1260 return 0;
1261}
1262
1263static int pmu_queue_lock(struct pmu_gk20a *pmu,
1264 struct pmu_queue *queue)
1265{
1266 int err;
1267
1268 if (PMU_IS_MESSAGE_QUEUE(queue->id))
1269 return 0;
1270
1271 if (PMU_IS_SW_COMMAND_QUEUE(queue->id)) {
1272 mutex_lock(&queue->mutex);
1273 queue->locked = true;
1274 return 0;
1275 }
1276
1277 err = pmu_mutex_acquire(pmu, queue->mutex_id,
1278 &queue->mutex_lock);
1279 if (err == 0)
1280 queue->locked = true;
1281
1282 return err;
1283}
1284
1285static int pmu_queue_unlock(struct pmu_gk20a *pmu,
1286 struct pmu_queue *queue)
1287{
1288 int err;
1289
1290 if (PMU_IS_MESSAGE_QUEUE(queue->id))
1291 return 0;
1292
1293 if (PMU_IS_SW_COMMAND_QUEUE(queue->id)) {
1294 mutex_unlock(&queue->mutex);
1295 queue->locked = false;
1296 return 0;
1297 }
1298
1299 if (queue->locked) {
1300 err = pmu_mutex_release(pmu, queue->mutex_id,
1301 &queue->mutex_lock);
1302 if (err == 0)
1303 queue->locked = false;
1304 }
1305
1306 return 0;
1307}
1308
1309/* called by pmu_read_message, no lock */
1310static bool pmu_queue_is_empty(struct pmu_gk20a *pmu,
1311 struct pmu_queue *queue)
1312{
1313 u32 head, tail;
1314
1315 pmu_queue_head(pmu, queue, &head, QUEUE_GET);
1316 if (queue->opened && queue->oflag == OFLAG_READ)
1317 tail = queue->position;
1318 else
1319 pmu_queue_tail(pmu, queue, &tail, QUEUE_GET);
1320
1321 return head == tail;
1322}
1323
1324static bool pmu_queue_has_room(struct pmu_gk20a *pmu,
1325 struct pmu_queue *queue, u32 size, bool *need_rewind)
1326{
1327 u32 head, tail, free;
1328 bool rewind = false;
1329
1330 BUG_ON(!queue->locked);
1331
1332 size = ALIGN(size, QUEUE_ALIGNMENT);
1333
1334 pmu_queue_head(pmu, queue, &head, QUEUE_GET);
1335 pmu_queue_tail(pmu, queue, &tail, QUEUE_GET);
1336
1337 if (head >= tail) {
1338 free = queue->offset + queue->size - head;
1339 free -= PMU_CMD_HDR_SIZE;
1340
1341 if (size > free) {
1342 rewind = true;
1343 head = queue->offset;
1344 }
1345 }
1346
1347 if (head < tail)
1348 free = tail - head - 1;
1349
1350 if (need_rewind)
1351 *need_rewind = rewind;
1352
1353 return size <= free;
1354}
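
pmu_queue_has_room() above computes free space in the circular command queue: when the write head is at or past the tail, the space runs to the end of the queue minus one command header (reserved for the rewind marker), and if that is not enough the head is notionally rewound to the queue start; when the head is below the tail, the space is the gap up to one byte before the tail. A standalone sketch of the same rule (not part of this patch; the hex offsets are made-up values):

#include <stdbool.h>
#include <stdio.h>

#define HDR_SIZE 4	/* stand-in for PMU_CMD_HDR_SIZE */

static unsigned int ring_free(unsigned int offset, unsigned int size,
			      unsigned int head, unsigned int tail,
			      unsigned int request, bool *need_rewind)
{
	unsigned int space = 0;

	*need_rewind = false;
	if (head >= tail) {
		space = offset + size - head - HDR_SIZE;
		if (request > space) {
			*need_rewind = true;
			head = offset;	/* pretend the queue was rewound */
		}
	}
	if (head < tail)
		space = tail - head - 1;

	return space;
}

int main(void)
{
	bool rewind;
	unsigned int space = ring_free(0x100, 0x80, 0x170, 0x130, 0x20, &rewind);

	printf("free=0x%x rewind=%d\n", space, rewind);
	return 0;
}
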
1355
1356static int pmu_queue_push(struct pmu_gk20a *pmu,
1357 struct pmu_queue *queue, void *data, u32 size)
1358{
1359 gk20a_dbg_fn("");
1360
1361	if (!(queue->opened && queue->oflag == OFLAG_WRITE)) {
1362 gk20a_err(dev_from_gk20a(pmu->g),
1363 "queue not opened for write");
1364 return -EINVAL;
1365 }
1366
1367 pmu_queue_write(pmu, queue->position, data, size);
1368 queue->position += ALIGN(size, QUEUE_ALIGNMENT);
1369 return 0;
1370}
1371
1372static int pmu_queue_pop(struct pmu_gk20a *pmu,
1373 struct pmu_queue *queue, void *data, u32 size,
1374 u32 *bytes_read)
1375{
1376 u32 head, tail, used;
1377
1378 *bytes_read = 0;
1379
1380	if (!(queue->opened && queue->oflag == OFLAG_READ)) {
1381 gk20a_err(dev_from_gk20a(pmu->g),
1382 "queue not opened for read");
1383 return -EINVAL;
1384 }
1385
1386 pmu_queue_head(pmu, queue, &head, QUEUE_GET);
1387 tail = queue->position;
1388
1389 if (head == tail)
1390 return 0;
1391
1392 if (head > tail)
1393 used = head - tail;
1394 else
1395 used = queue->offset + queue->size - tail;
1396
1397 if (size > used) {
1398 gk20a_warn(dev_from_gk20a(pmu->g),
1399			"queue size smaller than requested read");
1400 size = used;
1401 }
1402
1403 pmu_queue_read(pmu, tail, data, size);
1404 queue->position += ALIGN(size, QUEUE_ALIGNMENT);
1405 *bytes_read = size;
1406 return 0;
1407}
1408
1409static void pmu_queue_rewind(struct pmu_gk20a *pmu,
1410 struct pmu_queue *queue)
1411{
1412 struct pmu_cmd cmd;
1413
1414 gk20a_dbg_fn("");
1415
1416 if (!queue->opened) {
1417 gk20a_err(dev_from_gk20a(pmu->g),
1418 "queue not opened");
1419 return;
1420 }
1421
1422 if (queue->oflag == OFLAG_WRITE) {
1423 cmd.hdr.unit_id = PMU_UNIT_REWIND;
1424 cmd.hdr.size = PMU_CMD_HDR_SIZE;
1425 pmu_queue_push(pmu, queue, &cmd, cmd.hdr.size);
1426		gk20a_dbg_pmu("queue %d rewound", queue->id);
1427 }
1428
1429 queue->position = queue->offset;
1430 return;
1431}
1432
1433/* open for read and lock the queue */
1434static int pmu_queue_open_read(struct pmu_gk20a *pmu,
1435 struct pmu_queue *queue)
1436{
1437 int err;
1438
1439 err = pmu_queue_lock(pmu, queue);
1440 if (err)
1441 return err;
1442
1443 if (queue->opened)
1444 BUG();
1445
1446 pmu_queue_tail(pmu, queue, &queue->position, QUEUE_GET);
1447 queue->oflag = OFLAG_READ;
1448 queue->opened = true;
1449
1450 return 0;
1451}
1452
1453/* open for write and lock the queue
1454 make sure there's enough free space for the write */
1455static int pmu_queue_open_write(struct pmu_gk20a *pmu,
1456 struct pmu_queue *queue, u32 size)
1457{
1458 bool rewind = false;
1459 int err;
1460
1461 err = pmu_queue_lock(pmu, queue);
1462 if (err)
1463 return err;
1464
1465 if (queue->opened)
1466 BUG();
1467
1468 if (!pmu_queue_has_room(pmu, queue, size, &rewind)) {
1469 gk20a_err(dev_from_gk20a(pmu->g), "queue full");
1470 return -EAGAIN;
1471 }
1472
1473 pmu_queue_head(pmu, queue, &queue->position, QUEUE_GET);
1474 queue->oflag = OFLAG_WRITE;
1475 queue->opened = true;
1476
1477 if (rewind)
1478 pmu_queue_rewind(pmu, queue);
1479
1480 return 0;
1481}
1482
1483/* close and unlock the queue */
1484static int pmu_queue_close(struct pmu_gk20a *pmu,
1485 struct pmu_queue *queue, bool commit)
1486{
1487 if (!queue->opened)
1488 return 0;
1489
1490 if (commit) {
1491 if (queue->oflag == OFLAG_READ) {
1492 pmu_queue_tail(pmu, queue,
1493 &queue->position, QUEUE_SET);
1494 }
1495 else {
1496 pmu_queue_head(pmu, queue,
1497 &queue->position, QUEUE_SET);
1498 }
1499 }
1500
1501 queue->opened = false;
1502
1503 pmu_queue_unlock(pmu, queue);
1504
1505 return 0;
1506}
1507
1508static void gk20a_save_pmu_sw_state(struct pmu_gk20a *pmu,
1509 struct gk20a_pmu_save_state *save)
1510{
1511 save->seq = pmu->seq;
1512 save->next_seq_desc = pmu->next_seq_desc;
1513 save->mutex = pmu->mutex;
1514 save->mutex_cnt = pmu->mutex_cnt;
1515 save->desc = pmu->desc;
1516 save->ucode = pmu->ucode;
1517 save->elpg_enable = pmu->elpg_enable;
1518 save->pg_wq = pmu->pg_wq;
1519 save->seq_buf = pmu->seq_buf;
1520 save->pg_buf = pmu->pg_buf;
1521 save->sw_ready = pmu->sw_ready;
1522 save->pg_init = pmu->pg_init;
1523}
1524
1525static void gk20a_restore_pmu_sw_state(struct pmu_gk20a *pmu,
1526 struct gk20a_pmu_save_state *save)
1527{
1528 pmu->seq = save->seq;
1529 pmu->next_seq_desc = save->next_seq_desc;
1530 pmu->mutex = save->mutex;
1531 pmu->mutex_cnt = save->mutex_cnt;
1532 pmu->desc = save->desc;
1533 pmu->ucode = save->ucode;
1534 pmu->elpg_enable = save->elpg_enable;
1535 pmu->pg_wq = save->pg_wq;
1536 pmu->seq_buf = save->seq_buf;
1537 pmu->pg_buf = save->pg_buf;
1538 pmu->sw_ready = save->sw_ready;
1539 pmu->pg_init = save->pg_init;
1540}
1541
1542void gk20a_remove_pmu_support(struct pmu_gk20a *pmu)
1543{
1544 struct gk20a_pmu_save_state save;
1545
1546 gk20a_dbg_fn("");
1547
1548 gk20a_allocator_destroy(&pmu->dmem);
1549
1550 /* Save the stuff you don't want to lose */
1551 gk20a_save_pmu_sw_state(pmu, &save);
1552
1553	/* this function is also called by pmu_destroy outside gk20a deinit, which
1554	   releases the gk20a struct, so fill it up with zeros here. */
1555 memset(pmu, 0, sizeof(struct pmu_gk20a));
1556
1557 /* Restore stuff you want to keep */
1558 gk20a_restore_pmu_sw_state(pmu, &save);
1559}
1560
1561int gk20a_init_pmu_reset_enable_hw(struct gk20a *g)
1562{
1563 struct pmu_gk20a *pmu = &g->pmu;
1564
1565 gk20a_dbg_fn("");
1566
1567 pmu_enable_hw(pmu, true);
1568
1569 return 0;
1570}
1571
1572static void pmu_elpg_enable_allow(struct work_struct *work);
1573
1574int gk20a_init_pmu_setup_sw(struct gk20a *g)
1575{
1576 struct pmu_gk20a *pmu = &g->pmu;
1577 struct mm_gk20a *mm = &g->mm;
1578 struct vm_gk20a *vm = &mm->pmu.vm;
1579 struct device *d = dev_from_gk20a(g);
1580 int i, err = 0;
1581 u8 *ptr;
1582 void *ucode_ptr;
1583 struct sg_table *sgt_pmu_ucode;
1584 struct sg_table *sgt_seq_buf;
1585 DEFINE_DMA_ATTRS(attrs);
1586 dma_addr_t iova;
1587
1588 gk20a_dbg_fn("");
1589
1590 if (pmu->sw_ready) {
1591 for (i = 0; i < pmu->mutex_cnt; i++) {
1592 pmu->mutex[i].id = i;
1593 pmu->mutex[i].index = i;
1594 }
1595 pmu_seq_init(pmu);
1596
1597 gk20a_dbg_fn("skip init");
1598 goto skip_init;
1599 }
1600
1601 /* no infoRom script from vbios? */
1602
1603 /* TBD: sysmon subtask */
1604
1605 pmu->mutex_cnt = pwr_pmu_mutex__size_1_v();
1606 pmu->mutex = kzalloc(pmu->mutex_cnt *
1607 sizeof(struct pmu_mutex), GFP_KERNEL);
1608 if (!pmu->mutex) {
1609 err = -ENOMEM;
1610 goto err;
1611 }
1612
1613 for (i = 0; i < pmu->mutex_cnt; i++) {
1614 pmu->mutex[i].id = i;
1615 pmu->mutex[i].index = i;
1616 }
1617
1618 pmu->seq = kzalloc(PMU_MAX_NUM_SEQUENCES *
1619 sizeof(struct pmu_sequence), GFP_KERNEL);
1620 if (!pmu->seq) {
1621 err = -ENOMEM;
1622 goto err_free_mutex;
1623 }
1624
1625 pmu_seq_init(pmu);
1626
1627 if (!g->pmu_fw) {
1628 g->pmu_fw = gk20a_request_firmware(g, GK20A_PMU_UCODE_IMAGE);
1629 if (!g->pmu_fw) {
1630 gk20a_err(d, "failed to load pmu ucode!!");
1631 err = -ENOENT;
1632 goto err_free_seq;
1633 }
1634 }
1635
1636 gk20a_dbg_fn("firmware loaded");
1637
1638 pmu->desc = (struct pmu_ucode_desc *)g->pmu_fw->data;
1639 pmu->ucode_image = (u32 *)((u8 *)pmu->desc +
1640 pmu->desc->descriptor_size);
1641
1642
1643 INIT_DELAYED_WORK(&pmu->elpg_enable, pmu_elpg_enable_allow);
1644 INIT_WORK(&pmu->pg_init, gk20a_init_pmu_setup_hw2_workqueue);
1645
1646 gk20a_init_pmu_vm(mm);
1647
1648 dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
1649 pmu->ucode.cpuva = dma_alloc_attrs(d, GK20A_PMU_UCODE_SIZE_MAX,
1650 &iova,
1651 GFP_KERNEL,
1652 &attrs);
1653 if (!pmu->ucode.cpuva) {
1654 gk20a_err(d, "failed to allocate memory\n");
1655 err = -ENOMEM;
1656 goto err_release_fw;
1657 }
1658
1659 pmu->ucode.iova = iova;
1660 pmu->seq_buf.cpuva = dma_alloc_coherent(d, GK20A_PMU_SEQ_BUF_SIZE,
1661 &iova,
1662 GFP_KERNEL);
1663 if (!pmu->seq_buf.cpuva) {
1664 gk20a_err(d, "failed to allocate memory\n");
1665 err = -ENOMEM;
1666 goto err_free_pmu_ucode;
1667 }
1668
1669 pmu->seq_buf.iova = iova;
1670 init_waitqueue_head(&pmu->pg_wq);
1671
1672 err = gk20a_get_sgtable(d, &sgt_pmu_ucode,
1673 pmu->ucode.cpuva,
1674 pmu->ucode.iova,
1675 GK20A_PMU_UCODE_SIZE_MAX);
1676 if (err) {
1677 gk20a_err(d, "failed to allocate sg table\n");
1678 goto err_free_seq_buf;
1679 }
1680
1681 pmu->ucode.pmu_va = gk20a_gmmu_map(vm, &sgt_pmu_ucode,
1682 GK20A_PMU_UCODE_SIZE_MAX,
1683 0, /* flags */
1684 gk20a_mem_flag_read_only);
1685 if (!pmu->ucode.pmu_va) {
1686 gk20a_err(d, "failed to map pmu ucode memory!!");
1687 goto err_free_ucode_sgt;
1688 }
1689
1690 err = gk20a_get_sgtable(d, &sgt_seq_buf,
1691 pmu->seq_buf.cpuva,
1692 pmu->seq_buf.iova,
1693 GK20A_PMU_SEQ_BUF_SIZE);
1694 if (err) {
1695 gk20a_err(d, "failed to allocate sg table\n");
1696 goto err_unmap_ucode;
1697 }
1698
1699 pmu->seq_buf.pmu_va = gk20a_gmmu_map(vm, &sgt_seq_buf,
1700 GK20A_PMU_SEQ_BUF_SIZE,
1701 0, /* flags */
1702 gk20a_mem_flag_none);
1703 if (!pmu->seq_buf.pmu_va) {
1704 gk20a_err(d, "failed to map pmu ucode memory!!");
1705 goto err_free_seq_buf_sgt;
1706 }
1707
1708 ptr = (u8 *)pmu->seq_buf.cpuva;
1709 if (!ptr) {
1710 gk20a_err(d, "failed to map cpu ptr for zbc buffer");
1711 goto err_unmap_seq_buf;
1712 }
1713
1714 /* TBD: remove this if ZBC save/restore is handled by PMU
1715 	 * send an empty ZBC sequence for now */
1716 ptr[0] = 0x16; /* opcode EXIT */
1717 ptr[1] = 0; ptr[2] = 1; ptr[3] = 0;
1718 ptr[4] = 0; ptr[5] = 0; ptr[6] = 0; ptr[7] = 0;
1719
1720 pmu->seq_buf.size = GK20A_PMU_SEQ_BUF_SIZE;
1721
1722 ucode_ptr = pmu->ucode.cpuva;
1723
1724 for (i = 0; i < (pmu->desc->app_start_offset +
1725 pmu->desc->app_size) >> 2; i++)
1726 gk20a_mem_wr32(ucode_ptr, i, pmu->ucode_image[i]);
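	/*
	 * Illustrative arithmetic (example values only): the loop above copies
	 * the ucode as 32-bit words, so an app_start_offset of 0x100 plus an
	 * app_size of 0x200 gives (0x100 + 0x200) >> 2 = 0xc0 word writes,
	 * i.e. 0x300 bytes.
	 */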
1727
1728 gk20a_free_sgtable(&sgt_pmu_ucode);
1729 gk20a_free_sgtable(&sgt_seq_buf);
1730
1731skip_init:
1732 mutex_init(&pmu->elpg_mutex);
1733 mutex_init(&pmu->isr_mutex);
1734 mutex_init(&pmu->pmu_copy_lock);
1735 mutex_init(&pmu->pmu_seq_lock);
1736
1737 pmu->perfmon_counter.index = 3; /* GR & CE2 */
1738 pmu->perfmon_counter.group_id = PMU_DOMAIN_GROUP_PSTATE;
1739
1740 pmu->remove_support = gk20a_remove_pmu_support;
1741 err = gk20a_init_pmu(pmu);
1742 if (err) {
1743 gk20a_err(d, "failed to set function pointers\n");
1744 return err;
1745 }
1746
1747 gk20a_dbg_fn("done");
1748 return 0;
1749
1750 err_unmap_seq_buf:
1751 gk20a_gmmu_unmap(vm, pmu->seq_buf.pmu_va,
1752 GK20A_PMU_SEQ_BUF_SIZE, gk20a_mem_flag_none);
1753 err_free_seq_buf_sgt:
1754 gk20a_free_sgtable(&sgt_seq_buf);
1755 err_unmap_ucode:
1756 gk20a_gmmu_unmap(vm, pmu->ucode.pmu_va,
1757 GK20A_PMU_UCODE_SIZE_MAX, gk20a_mem_flag_none);
1758 err_free_ucode_sgt:
1759 gk20a_free_sgtable(&sgt_pmu_ucode);
1760 err_free_seq_buf:
1761 dma_free_coherent(d, GK20A_PMU_SEQ_BUF_SIZE,
1762 pmu->seq_buf.cpuva, pmu->seq_buf.iova);
1763 pmu->seq_buf.cpuva = NULL;
1764 pmu->seq_buf.iova = 0;
1765 err_free_pmu_ucode:
1766 dma_free_attrs(d, GK20A_PMU_UCODE_SIZE_MAX,
1767 pmu->ucode.cpuva, pmu->ucode.iova, &attrs);
1768 pmu->ucode.cpuva = NULL;
1769 pmu->ucode.iova = 0;
1770 err_release_fw:
1771 release_firmware(g->pmu_fw);
1772 err_free_seq:
1773 kfree(pmu->seq);
1774 err_free_mutex:
1775 kfree(pmu->mutex);
1776 err:
1777 gk20a_dbg_fn("fail");
1778 return err;
1779}
1780
1781static void pmu_handle_pg_elpg_msg(struct gk20a *g, struct pmu_msg *msg,
1782 void *param, u32 handle, u32 status);
1783
1784static void pmu_handle_pg_buf_config_msg(struct gk20a *g, struct pmu_msg *msg,
1785 void *param, u32 handle, u32 status)
1786{
1787 struct pmu_gk20a *pmu = param;
1788 struct pmu_pg_msg_eng_buf_stat *eng_buf_stat = &msg->msg.pg.eng_buf_stat;
1789
1790 gk20a_dbg_fn("");
1791
1792 if (status != 0) {
1793 gk20a_err(dev_from_gk20a(g), "PGENG cmd aborted");
1794 /* TBD: disable ELPG */
1795 return;
1796 }
1797
1798 if (eng_buf_stat->status == PMU_PG_MSG_ENG_BUF_FAILED) {
1799 gk20a_err(dev_from_gk20a(g), "failed to load PGENG buffer");
1800 }
1801
1802 pmu->buf_loaded = (eng_buf_stat->status == PMU_PG_MSG_ENG_BUF_LOADED);
1803 wake_up(&pmu->pg_wq);
1804}
1805
1806int gk20a_init_pmu_setup_hw1(struct gk20a *g)
1807{
1808 struct pmu_gk20a *pmu = &g->pmu;
1809 int err;
1810
1811 gk20a_dbg_fn("");
1812
1813 pmu_reset(pmu);
1814
1815 /* setup apertures - virtual */
1816 gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_UCODE),
1817 pwr_fbif_transcfg_mem_type_virtual_f());
1818 gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_VIRT),
1819 pwr_fbif_transcfg_mem_type_virtual_f());
1820 /* setup apertures - physical */
1821 gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_PHYS_VID),
1822 pwr_fbif_transcfg_mem_type_physical_f() |
1823 pwr_fbif_transcfg_target_local_fb_f());
1824 gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_PHYS_SYS_COH),
1825 pwr_fbif_transcfg_mem_type_physical_f() |
1826 pwr_fbif_transcfg_target_coherent_sysmem_f());
1827 gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_PHYS_SYS_NCOH),
1828 pwr_fbif_transcfg_mem_type_physical_f() |
1829 pwr_fbif_transcfg_target_noncoherent_sysmem_f());
1830
1831 /* TBD: load pmu ucode */
1832 err = pmu_bootstrap(pmu);
1833 if (err)
1834 return err;
1835
1836 return 0;
1837
1838}
1839
1840static int gk20a_aelpg_init(struct gk20a *g);
1841static int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id);
1842
1843
1844static void gk20a_init_pmu_setup_hw2_workqueue(struct work_struct *work)
1845{
1846 struct pmu_gk20a *pmu = container_of(work, struct pmu_gk20a, pg_init);
1847 struct gk20a *g = pmu->g;
1848 gk20a_init_pmu_setup_hw2(g);
1849}
1850
1851int gk20a_init_pmu_setup_hw2(struct gk20a *g)
1852{
1853 struct pmu_gk20a *pmu = &g->pmu;
1854 struct mm_gk20a *mm = &g->mm;
1855 struct vm_gk20a *vm = &mm->pmu.vm;
1856 struct device *d = dev_from_gk20a(g);
1857 struct pmu_cmd cmd;
1858 u32 desc;
1859 long remain;
1860 int err;
1861 bool status;
1862 u32 size;
1863 struct sg_table *sgt_pg_buf;
1864 dma_addr_t iova;
1865
1866 gk20a_dbg_fn("");
1867
1868 if (!support_gk20a_pmu())
1869 return 0;
1870
1871 size = 0;
1872 err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
1873 if (err) {
1874 gk20a_err(dev_from_gk20a(g),
1875 "fail to query fecs pg buffer size");
1876 return err;
1877 }
1878
1879 if (!pmu->sw_ready) {
1880 pmu->pg_buf.cpuva = dma_alloc_coherent(d, size,
1881 &iova,
1882 GFP_KERNEL);
1883 if (!pmu->pg_buf.cpuva) {
1884 gk20a_err(d, "failed to allocate memory\n");
1885 err = -ENOMEM;
1886 goto err;
1887 }
1888
1889 pmu->pg_buf.iova = iova;
1890 pmu->pg_buf.size = size;
1891
1892 err = gk20a_get_sgtable(d, &sgt_pg_buf,
1893 pmu->pg_buf.cpuva,
1894 pmu->pg_buf.iova,
1895 size);
1896 if (err) {
1897 gk20a_err(d, "failed to create sg table\n");
1898 goto err_free_pg_buf;
1899 }
1900
1901 pmu->pg_buf.pmu_va = gk20a_gmmu_map(vm,
1902 &sgt_pg_buf,
1903 size,
1904 0, /* flags */
1905 gk20a_mem_flag_none);
1906 if (!pmu->pg_buf.pmu_va) {
1907 gk20a_err(d, "failed to map fecs pg buffer");
1908 err = -ENOMEM;
1909 goto err_free_sgtable;
1910 }
1911
1912 gk20a_free_sgtable(&sgt_pg_buf);
1913 }
1914
1915 /*
1916 * This is the actual point at which sw setup is complete, so set the
1917 * sw_ready flag here.
1918 */
1919 pmu->sw_ready = true;
1920
1921 /* TBD: acquire pmu hw mutex */
1922
1923 /* TBD: post reset again? */
1924
1925 /* PMU_INIT message handler will send PG_INIT */
1926 remain = wait_event_timeout(
1927 pmu->pg_wq,
1928 (status = (pmu->elpg_ready &&
1929 pmu->stat_dmem_offset != 0 &&
1930 pmu->elpg_stat == PMU_ELPG_STAT_OFF)),
1931 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
1932 if (status == 0) {
1933 gk20a_err(dev_from_gk20a(g),
1934 "PG_INIT_ACK failed, remaining timeout : 0x%lx", remain);
1935 pmu_dump_falcon_stats(pmu);
1936 return -EBUSY;
1937 }
1938
1939 err = gr_gk20a_fecs_set_reglist_bind_inst(g, mm->pmu.inst_block.cpu_pa);
1940 if (err) {
1941 gk20a_err(dev_from_gk20a(g),
1942 "fail to bind pmu inst to gr");
1943 return err;
1944 }
1945
1946 err = gr_gk20a_fecs_set_reglist_virual_addr(g, pmu->pg_buf.pmu_va);
1947 if (err) {
1948 gk20a_err(dev_from_gk20a(g),
1949 "fail to set pg buffer pmu va");
1950 return err;
1951 }
1952
1953 memset(&cmd, 0, sizeof(struct pmu_cmd));
1954 cmd.hdr.unit_id = PMU_UNIT_PG;
1955 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_eng_buf_load);
1956 cmd.cmd.pg.eng_buf_load.cmd_type = PMU_PG_CMD_ID_ENG_BUF_LOAD;
1957 cmd.cmd.pg.eng_buf_load.engine_id = ENGINE_GR_GK20A;
1958 cmd.cmd.pg.eng_buf_load.buf_idx = PMU_PGENG_GR_BUFFER_IDX_FECS;
1959 cmd.cmd.pg.eng_buf_load.buf_size = pmu->pg_buf.size;
1960 cmd.cmd.pg.eng_buf_load.dma_base = u64_lo32(pmu->pg_buf.pmu_va >> 8);
1961 cmd.cmd.pg.eng_buf_load.dma_offset = (u8)(pmu->pg_buf.pmu_va & 0xFF);
1962 cmd.cmd.pg.eng_buf_load.dma_idx = PMU_DMAIDX_VIRT;
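	/*
	 * Illustrative only: the descriptor splits the GPU virtual address
	 * into a base in 256-byte units plus the low byte. With a made-up
	 * pmu_va of 0x12345678, dma_base would be 0x00123456 and dma_offset
	 * would be 0x78.
	 */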
1963
1964 pmu->buf_loaded = false;
1965 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
1966 pmu_handle_pg_buf_config_msg, pmu, &desc, ~0);
1967
1968 remain = wait_event_timeout(
1969 pmu->pg_wq,
1970 pmu->buf_loaded,
1971 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
1972 if (!pmu->buf_loaded) {
1973 gk20a_err(dev_from_gk20a(g),
1974 "PGENG FECS buffer load failed, remaining timeout : 0x%lx",
1975 remain);
1976 return -EBUSY;
1977 }
1978
1979 memset(&cmd, 0, sizeof(struct pmu_cmd));
1980 cmd.hdr.unit_id = PMU_UNIT_PG;
1981 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_eng_buf_load);
1982 cmd.cmd.pg.eng_buf_load.cmd_type = PMU_PG_CMD_ID_ENG_BUF_LOAD;
1983 cmd.cmd.pg.eng_buf_load.engine_id = ENGINE_GR_GK20A;
1984 cmd.cmd.pg.eng_buf_load.buf_idx = PMU_PGENG_GR_BUFFER_IDX_ZBC;
1985 cmd.cmd.pg.eng_buf_load.buf_size = pmu->seq_buf.size;
1986 cmd.cmd.pg.eng_buf_load.dma_base = u64_lo32(pmu->seq_buf.pmu_va >> 8);
1987 cmd.cmd.pg.eng_buf_load.dma_offset = (u8)(pmu->seq_buf.pmu_va & 0xFF);
1988 cmd.cmd.pg.eng_buf_load.dma_idx = PMU_DMAIDX_VIRT;
1989
1990 pmu->buf_loaded = false;
1991 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
1992 pmu_handle_pg_buf_config_msg, pmu, &desc, ~0);
1993
1994 remain = wait_event_timeout(
1995 pmu->pg_wq,
1996 pmu->buf_loaded,
1997 msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
1998 if (!pmu->buf_loaded) {
1999 gk20a_err(dev_from_gk20a(g),
2000 "PGENG ZBC buffer load failed, remaining timeout 0x%lx",
2001 remain);
2002 return -EBUSY;
2003 }
2004
2005 /*
2006 * FIXME: To enable ELPG, we increase the PMU ext2priv timeout unit to
2007 * 7. This prevents PMU stalling on Host register accesses. Once the
2008 * cause for this hang is discovered and fixed, this WAR should be
2009 * removed.
2010 */
2011 gk20a_writel(g, 0x10a164, 0x109ff);
2012
2013 pmu->initialized = true;
2014 pmu->zbc_ready = true;
2015
2016 /* Save zbc table after PMU is initialized. */
2017 pmu_save_zbc(g, 0xf);
2018
2019 /*
2020 * We can't guarantee that gr code to enable ELPG will be
2021 * invoked, so we explicitly call disable-enable here
2022 * to enable elpg.
2023 */
2024 gk20a_pmu_disable_elpg(g);
2025
2026 if (g->elpg_enabled)
2027 gk20a_pmu_enable_elpg(g);
2028
2029 udelay(50);
2030
2031 /* Enable AELPG */
2032 if (g->aelpg_enabled) {
2033 gk20a_aelpg_init(g);
2034 gk20a_aelpg_init_and_enable(g, PMU_AP_CTRL_ID_GRAPHICS);
2035 }
2036
2037 return 0;
2038
2039 err_free_sgtable:
2040 gk20a_free_sgtable(&sgt_pg_buf);
2041 err_free_pg_buf:
2042 dma_free_coherent(d, size,
2043 pmu->pg_buf.cpuva, pmu->pg_buf.iova);
2044 pmu->pg_buf.cpuva = NULL;
2045 pmu->pg_buf.iova = 0;
2046 err:
2047 return err;
2048}
2049
2050int gk20a_init_pmu_support(struct gk20a *g)
2051{
2052 struct pmu_gk20a *pmu = &g->pmu;
2053 	int err;
2054
2055 gk20a_dbg_fn("");
2056
2057 if (pmu->initialized)
2058 return 0;
2059
2060 pmu->g = g;
2061
2062 err = gk20a_init_pmu_reset_enable_hw(g);
2063 if (err)
2064 return err;
2065
2066 if (support_gk20a_pmu()) {
2067 err = gk20a_init_pmu_setup_sw(g);
2068 if (err)
2069 return err;
2070
2071 err = gk20a_init_pmu_setup_hw1(g);
2072 if (err)
2073 return err;
2074 }
2075
2076 return err;
2077}
2078
2079static void pmu_handle_pg_elpg_msg(struct gk20a *g, struct pmu_msg *msg,
2080 void *param, u32 handle, u32 status)
2081{
2082 struct pmu_gk20a *pmu = param;
2083 struct pmu_pg_msg_elpg_msg *elpg_msg = &msg->msg.pg.elpg_msg;
2084
2085 gk20a_dbg_fn("");
2086
2087 if (status != 0) {
2088 gk20a_err(dev_from_gk20a(g), "ELPG cmd aborted");
2089 /* TBD: disable ELPG */
2090 return;
2091 }
2092
2093 switch (elpg_msg->msg) {
2094 case PMU_PG_ELPG_MSG_INIT_ACK:
2095 gk20a_dbg_pmu("INIT_PG is acknowledged from PMU");
2096 pmu->elpg_ready = true;
2097 wake_up(&pmu->pg_wq);
2098 break;
2099 case PMU_PG_ELPG_MSG_ALLOW_ACK:
2100 gk20a_dbg_pmu("ALLOW is acknowledged from PMU");
2101 pmu->elpg_stat = PMU_ELPG_STAT_ON;
2102 wake_up(&pmu->pg_wq);
2103 break;
2104 case PMU_PG_ELPG_MSG_DISALLOW_ACK:
2105 gk20a_dbg_pmu("DISALLOW is acknowledged from PMU");
2106 pmu->elpg_stat = PMU_ELPG_STAT_OFF;
2107 wake_up(&pmu->pg_wq);
2108 break;
2109 default:
2110 gk20a_err(dev_from_gk20a(g),
2111 "unsupported ELPG message : 0x%04x", elpg_msg->msg);
2112 }
2113
2114 return;
2115}
2116
2117static void pmu_handle_pg_stat_msg(struct gk20a *g, struct pmu_msg *msg,
2118 void *param, u32 handle, u32 status)
2119{
2120 struct pmu_gk20a *pmu = param;
2121
2122 gk20a_dbg_fn("");
2123
2124 if (status != 0) {
2125 gk20a_err(dev_from_gk20a(g), "ELPG cmd aborted");
2126 /* TBD: disable ELPG */
2127 return;
2128 }
2129
2130 switch (msg->msg.pg.stat.sub_msg_id) {
2131 case PMU_PG_STAT_MSG_RESP_DMEM_OFFSET:
2132 gk20a_dbg_pmu("ALLOC_DMEM_OFFSET is acknowledged from PMU");
2133 pmu->stat_dmem_offset = msg->msg.pg.stat.data;
2134 wake_up(&pmu->pg_wq);
2135 break;
2136 default:
2137 break;
2138 }
2139}
2140
2141static int pmu_init_powergating(struct pmu_gk20a *pmu)
2142{
2143 struct gk20a *g = pmu->g;
2144 struct pmu_cmd cmd;
2145 u32 seq;
2146
2147 gk20a_dbg_fn("");
2148
2149 if (tegra_cpu_is_asim()) {
2150 /* TBD: calculate threshold for silicon */
2151 gk20a_writel(g, pwr_pmu_pg_idlefilth_r(ENGINE_GR_GK20A),
2152 PMU_PG_IDLE_THRESHOLD_SIM);
2153 gk20a_writel(g, pwr_pmu_pg_ppuidlefilth_r(ENGINE_GR_GK20A),
2154 PMU_PG_POST_POWERUP_IDLE_THRESHOLD_SIM);
2155 } else {
2156 /* TBD: calculate threshold for silicon */
2157 gk20a_writel(g, pwr_pmu_pg_idlefilth_r(ENGINE_GR_GK20A),
2158 PMU_PG_IDLE_THRESHOLD);
2159 gk20a_writel(g, pwr_pmu_pg_ppuidlefilth_r(ENGINE_GR_GK20A),
2160 PMU_PG_POST_POWERUP_IDLE_THRESHOLD);
2161 }
2162
2163 /* init ELPG */
2164 memset(&cmd, 0, sizeof(struct pmu_cmd));
2165 cmd.hdr.unit_id = PMU_UNIT_PG;
2166 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
2167 cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
2168 cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
2169 cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_INIT;
2170
2171 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
2172 pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
2173
2174 /* alloc dmem for powergating state log */
2175 pmu->stat_dmem_offset = 0;
2176 memset(&cmd, 0, sizeof(struct pmu_cmd));
2177 cmd.hdr.unit_id = PMU_UNIT_PG;
2178 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_stat);
2179 cmd.cmd.pg.stat.cmd_type = PMU_PG_CMD_ID_PG_STAT;
2180 cmd.cmd.pg.stat.engine_id = ENGINE_GR_GK20A;
2181 cmd.cmd.pg.stat.sub_cmd_id = PMU_PG_STAT_CMD_ALLOC_DMEM;
2182 cmd.cmd.pg.stat.data = 0;
2183
2184 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
2185 pmu_handle_pg_stat_msg, pmu, &seq, ~0);
2186
2187 /* disallow ELPG initially
2188 PMU ucode requires a disallow cmd before allow cmd */
2189 pmu->elpg_stat = PMU_ELPG_STAT_ON; /* set for wait_event PMU_ELPG_STAT_OFF */
2190 memset(&cmd, 0, sizeof(struct pmu_cmd));
2191 cmd.hdr.unit_id = PMU_UNIT_PG;
2192 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
2193 cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
2194 cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
2195 cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_DISALLOW;
2196
2197 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
2198 pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
2199
2200 /* start with elpg disabled until first enable call */
2201 pmu->elpg_refcnt = 1;
2202
2203 return 0;
2204}
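/*
 * The three commands posted above (ELPG_INIT, PG_STAT ALLOC_DMEM and
 * ELPG_DISALLOW) are exactly what gk20a_init_pmu_setup_hw2() waits on:
 * their acks set pmu->elpg_ready, pmu->stat_dmem_offset and
 * pmu->elpg_stat == PMU_ELPG_STAT_OFF respectively.
 */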
2205
2206static int pmu_init_perfmon(struct pmu_gk20a *pmu)
2207{
2208 struct gk20a *g = pmu->g;
2209 struct pmu_v *pv = &g->ops.pmu_ver;
2210 struct pmu_cmd cmd;
2211 struct pmu_payload payload;
2212 u32 seq;
2213 u32 data;
2214 int err;
2215
2216 gk20a_dbg_fn("");
2217
2218 pmu->perfmon_ready = 0;
2219
2220 /* use counter #3 for GR && CE2 busy cycles */
2221 gk20a_writel(g, pwr_pmu_idle_mask_r(3),
2222 pwr_pmu_idle_mask_gr_enabled_f() |
2223 pwr_pmu_idle_mask_ce_2_enabled_f());
2224
2225 /* disable idle filtering for counters 3 and 6 */
2226 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(3));
2227 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
2228 pwr_pmu_idle_ctrl_filter_m(),
2229 pwr_pmu_idle_ctrl_value_busy_f() |
2230 pwr_pmu_idle_ctrl_filter_disabled_f());
2231 gk20a_writel(g, pwr_pmu_idle_ctrl_r(3), data);
2232
2233 /* use counter #6 for total cycles */
2234 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(6));
2235 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
2236 pwr_pmu_idle_ctrl_filter_m(),
2237 pwr_pmu_idle_ctrl_value_always_f() |
2238 pwr_pmu_idle_ctrl_filter_disabled_f());
2239 gk20a_writel(g, pwr_pmu_idle_ctrl_r(6), data);
2240
2241 /*
2242 * We don't want to disturb counters #3 and #6, which are used by
2243 * perfmon, so we add wiring also to counters #1 and #2 for
2244 * exposing raw counter readings.
2245 */
2246 gk20a_writel(g, pwr_pmu_idle_mask_r(1),
2247 pwr_pmu_idle_mask_gr_enabled_f() |
2248 pwr_pmu_idle_mask_ce_2_enabled_f());
2249
2250 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(1));
2251 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
2252 pwr_pmu_idle_ctrl_filter_m(),
2253 pwr_pmu_idle_ctrl_value_busy_f() |
2254 pwr_pmu_idle_ctrl_filter_disabled_f());
2255 gk20a_writel(g, pwr_pmu_idle_ctrl_r(1), data);
2256
2257 data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(2));
2258 data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
2259 pwr_pmu_idle_ctrl_filter_m(),
2260 pwr_pmu_idle_ctrl_value_always_f() |
2261 pwr_pmu_idle_ctrl_filter_disabled_f());
2262 gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
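	/*
	 * Counters #1 (busy) and #2 (always ticking) configured above are the
	 * ones read back raw by gk20a_pmu_get_load_counters() and cleared by
	 * gk20a_pmu_reset_load_counters() further below.
	 */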
2263
2264 pmu->sample_buffer = 0;
2265 err = pmu->dmem.alloc(&pmu->dmem, &pmu->sample_buffer, 2 * sizeof(u16));
2266 if (err) {
2267 gk20a_err(dev_from_gk20a(g),
2268 "failed to allocate perfmon sample buffer");
2269 return -ENOMEM;
2270 }
2271
2272 /* init PERFMON */
2273 memset(&cmd, 0, sizeof(struct pmu_cmd));
2274 cmd.hdr.unit_id = PMU_UNIT_PERFMON;
2275 cmd.hdr.size = PMU_CMD_HDR_SIZE + pv->get_pmu_perfmon_cmd_init_size();
2276 cmd.cmd.perfmon.cmd_type = PMU_PERFMON_CMD_ID_INIT;
2277 /* buffer to save counter values for pmu perfmon */
2278 pv->perfmon_cmd_init_set_sample_buffer(&cmd.cmd.perfmon,
2279 (u16)pmu->sample_buffer);
2280 /* number of sample periods below lower threshold
2281 before pmu triggers perfmon decrease event
2282 TBD: = 15 */
2283 pv->perfmon_cmd_init_set_dec_cnt(&cmd.cmd.perfmon, 15);
2284 /* index of base counter, aka. always ticking counter */
2285 pv->perfmon_cmd_init_set_base_cnt_id(&cmd.cmd.perfmon, 6);
2286 /* microseconds interval between pmu polls perf counters */
2287 pv->perfmon_cmd_init_set_samp_period_us(&cmd.cmd.perfmon, 16700);
2288 /* number of perfmon counters
2289 counter #3 (GR and CE2) for gk20a */
2290 pv->perfmon_cmd_init_set_num_cnt(&cmd.cmd.perfmon, 1);
2291 /* moving average window for sample periods
2292 TBD: = 3000000 / sample_period_us = 17 */
2293 pv->perfmon_cmd_init_set_mov_avg(&cmd.cmd.perfmon, 17);
2294
2295 memset(&payload, 0, sizeof(struct pmu_payload));
2296 payload.in.buf = &pmu->perfmon_counter;
2297 payload.in.size = sizeof(struct pmu_perfmon_counter);
2298 payload.in.offset = pv->get_perfmon_cmd_init_offsetofvar(COUNTER_ALLOC);
2299
2300 gk20a_pmu_cmd_post(g, &cmd, NULL, &payload, PMU_COMMAND_QUEUE_LPQ,
2301 NULL, NULL, &seq, ~0);
2302
2303 return 0;
2304}
2305
2306static int pmu_process_init_msg(struct pmu_gk20a *pmu,
2307 struct pmu_msg *msg)
2308{
2309 struct gk20a *g = pmu->g;
2310 struct pmu_v *pv = &g->ops.pmu_ver;
2311 union pmu_init_msg_pmu *init;
2312 struct pmu_sha1_gid_data gid_data;
2313 u32 i, tail = 0;
2314
2315 tail = pwr_pmu_msgq_tail_val_v(
2316 gk20a_readl(g, pwr_pmu_msgq_tail_r()));
2317
2318 pmu_copy_from_dmem(pmu, tail,
2319 (u8 *)&msg->hdr, PMU_MSG_HDR_SIZE, 0);
2320
2321 if (msg->hdr.unit_id != PMU_UNIT_INIT) {
2322 gk20a_err(dev_from_gk20a(g),
2323 "expecting init msg");
2324 return -EINVAL;
2325 }
2326
2327 pmu_copy_from_dmem(pmu, tail + PMU_MSG_HDR_SIZE,
2328 (u8 *)&msg->msg, msg->hdr.size - PMU_MSG_HDR_SIZE, 0);
2329
2330 if (msg->msg.init.msg_type != PMU_INIT_MSG_TYPE_PMU_INIT) {
2331 gk20a_err(dev_from_gk20a(g),
2332 "expecting init msg");
2333 return -EINVAL;
2334 }
2335
2336 tail += ALIGN(msg->hdr.size, PMU_DMEM_ALIGNMENT);
2337 gk20a_writel(g, pwr_pmu_msgq_tail_r(),
2338 pwr_pmu_msgq_tail_val_f(tail));
2339
2340 init = pv->get_pmu_msg_pmu_init_msg_ptr(&(msg->msg.init));
2341 if (!pmu->gid_info.valid) {
2342
2343 pmu_copy_from_dmem(pmu,
2344 pv->get_pmu_init_msg_pmu_sw_mg_off(init),
2345 (u8 *)&gid_data,
2346 sizeof(struct pmu_sha1_gid_data), 0);
2347
2348 pmu->gid_info.valid =
2349 (*(u32 *)gid_data.signature == PMU_SHA1_GID_SIGNATURE);
2350
2351 if (pmu->gid_info.valid) {
2352
2353 BUG_ON(sizeof(pmu->gid_info.gid) !=
2354 sizeof(gid_data.gid));
2355
2356 memcpy(pmu->gid_info.gid, gid_data.gid,
2357 sizeof(pmu->gid_info.gid));
2358 }
2359 }
2360
2361 for (i = 0; i < PMU_QUEUE_COUNT; i++)
2362 pmu_queue_init(pmu, i, init);
2363
2364 gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem",
2365 pv->get_pmu_init_msg_pmu_sw_mg_off(init),
2366 pv->get_pmu_init_msg_pmu_sw_mg_size(init),
2367 PMU_DMEM_ALLOC_ALIGNMENT);
2368
2369 pmu->pmu_ready = true;
2370
2371 return 0;
2372}
2373
2374static bool pmu_read_message(struct pmu_gk20a *pmu, struct pmu_queue *queue,
2375 struct pmu_msg *msg, int *status)
2376{
2377 struct gk20a *g = pmu->g;
2378 u32 read_size, bytes_read;
2379 int err;
2380
2381 *status = 0;
2382
2383 if (pmu_queue_is_empty(pmu, queue))
2384 return false;
2385
2386 err = pmu_queue_open_read(pmu, queue);
2387 if (err) {
2388 gk20a_err(dev_from_gk20a(g),
2389 "fail to open queue %d for read", queue->id);
2390 *status = err;
2391 return false;
2392 }
2393
2394 err = pmu_queue_pop(pmu, queue, &msg->hdr,
2395 PMU_MSG_HDR_SIZE, &bytes_read);
2396 if (err || bytes_read != PMU_MSG_HDR_SIZE) {
2397 gk20a_err(dev_from_gk20a(g),
2398 "fail to read msg from queue %d", queue->id);
2399 *status = err | -EINVAL;
2400 goto clean_up;
2401 }
2402
2403 if (msg->hdr.unit_id == PMU_UNIT_REWIND) {
2404 pmu_queue_rewind(pmu, queue);
2405 /* read again after rewind */
2406 err = pmu_queue_pop(pmu, queue, &msg->hdr,
2407 PMU_MSG_HDR_SIZE, &bytes_read);
2408 if (err || bytes_read != PMU_MSG_HDR_SIZE) {
2409 gk20a_err(dev_from_gk20a(g),
2410 "fail to read msg from queue %d", queue->id);
2411 *status = err | -EINVAL;
2412 goto clean_up;
2413 }
2414 }
2415
2416 if (!PMU_UNIT_ID_IS_VALID(msg->hdr.unit_id)) {
2417 gk20a_err(dev_from_gk20a(g),
2418 "read invalid unit_id %d from queue %d",
2419 msg->hdr.unit_id, queue->id);
2420 *status = -EINVAL;
2421 goto clean_up;
2422 }
2423
2424 if (msg->hdr.size > PMU_MSG_HDR_SIZE) {
2425 read_size = msg->hdr.size - PMU_MSG_HDR_SIZE;
2426 err = pmu_queue_pop(pmu, queue, &msg->msg,
2427 read_size, &bytes_read);
2428 if (err || bytes_read != read_size) {
2429 gk20a_err(dev_from_gk20a(g),
2430 "fail to read msg from queue %d", queue->id);
2431 *status = err;
2432 goto clean_up;
2433 }
2434 }
2435
2436 err = pmu_queue_close(pmu, queue, true);
2437 if (err) {
2438 gk20a_err(dev_from_gk20a(g),
2439 "fail to close queue %d", queue->id);
2440 *status = err;
2441 return false;
2442 }
2443
2444 return true;
2445
2446clean_up:
2447 err = pmu_queue_close(pmu, queue, false);
2448 if (err)
2449 gk20a_err(dev_from_gk20a(g),
2450 "fail to close queue %d", queue->id);
2451 return false;
2452}
2453
2454static int pmu_response_handle(struct pmu_gk20a *pmu,
2455 struct pmu_msg *msg)
2456{
2457 struct gk20a *g = pmu->g;
2458 struct pmu_sequence *seq;
2459 struct pmu_v *pv = &g->ops.pmu_ver;
2460 int ret = 0;
2461
2462 gk20a_dbg_fn("");
2463
2464 seq = &pmu->seq[msg->hdr.seq_id];
2465 if (seq->state != PMU_SEQ_STATE_USED &&
2466 seq->state != PMU_SEQ_STATE_CANCELLED) {
2467 gk20a_err(dev_from_gk20a(g),
2468 "msg for an unknown sequence %d", seq->id);
2469 return -EINVAL;
2470 }
2471
2472 if (msg->hdr.unit_id == PMU_UNIT_RC &&
2473 msg->msg.rc.msg_type == PMU_RC_MSG_TYPE_UNHANDLED_CMD) {
2474 gk20a_err(dev_from_gk20a(g),
2475 "unhandled cmd: seq %d", seq->id);
2476 }
2477 else if (seq->state != PMU_SEQ_STATE_CANCELLED) {
2478 if (seq->msg) {
2479 if (seq->msg->hdr.size >= msg->hdr.size) {
2480 memcpy(seq->msg, msg, msg->hdr.size);
2481 if (pv->pmu_allocation_get_dmem_size(pmu,
2482 pv->get_pmu_seq_out_a_ptr(seq)) != 0) {
2483 pmu_copy_from_dmem(pmu,
2484 pv->pmu_allocation_get_dmem_offset(pmu,
2485 pv->get_pmu_seq_out_a_ptr(seq)),
2486 seq->out_payload,
2487 pv->pmu_allocation_get_dmem_size(pmu,
2488 pv->get_pmu_seq_out_a_ptr(seq)), 0);
2489 }
2490 } else {
2491 gk20a_err(dev_from_gk20a(g),
2492 "sequence %d msg buffer too small",
2493 seq->id);
2494 }
2495 }
2496 } else
2497 seq->callback = NULL;
2498 if (pv->pmu_allocation_get_dmem_size(pmu,
2499 pv->get_pmu_seq_in_a_ptr(seq)) != 0)
2500 pmu->dmem.free(&pmu->dmem,
2501 pv->pmu_allocation_get_dmem_offset(pmu,
2502 pv->get_pmu_seq_in_a_ptr(seq)),
2503 pv->pmu_allocation_get_dmem_size(pmu,
2504 pv->get_pmu_seq_in_a_ptr(seq)));
2505 if (pv->pmu_allocation_get_dmem_size(pmu,
2506 pv->get_pmu_seq_out_a_ptr(seq)) != 0)
2507 pmu->dmem.free(&pmu->dmem,
2508 pv->pmu_allocation_get_dmem_offset(pmu,
2509 pv->get_pmu_seq_out_a_ptr(seq)),
2510 pv->pmu_allocation_get_dmem_size(pmu,
2511 pv->get_pmu_seq_out_a_ptr(seq)));
2512
2513 if (seq->callback)
2514 seq->callback(g, msg, seq->cb_params, seq->desc, ret);
2515
2516 pmu_seq_release(pmu, seq);
2517
2518 /* TBD: notify client waiting for available dmem */
2519
2520 gk20a_dbg_fn("done");
2521
2522 return 0;
2523}
2524
2525static int pmu_wait_message_cond(struct pmu_gk20a *pmu, u32 timeout,
2526 u32 *var, u32 val);
2527
2528static void pmu_handle_zbc_msg(struct gk20a *g, struct pmu_msg *msg,
2529 void *param, u32 handle, u32 status)
2530{
2531 struct pmu_gk20a *pmu = param;
2532 pmu->zbc_save_done = 1;
2533}
2534
2535static void pmu_save_zbc(struct gk20a *g, u32 entries)
2536{
2537 struct pmu_gk20a *pmu = &g->pmu;
2538 struct pmu_cmd cmd;
2539 u32 seq;
2540
2541 if (!pmu->pmu_ready || !entries || !pmu->zbc_ready)
2542 return;
2543
2544 memset(&cmd, 0, sizeof(struct pmu_cmd));
2545 cmd.hdr.unit_id = PMU_UNIT_PG;
2546 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_zbc_cmd);
2547 cmd.cmd.zbc.cmd_type = g->ops.pmu_ver.cmd_id_zbc_table_update;
2548 cmd.cmd.zbc.entry_mask = ZBC_MASK(entries);
2549
2550 pmu->zbc_save_done = 0;
2551
2552 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
2553 pmu_handle_zbc_msg, pmu, &seq, ~0);
2554 pmu_wait_message_cond(pmu, gk20a_get_gr_idle_timeout(g),
2555 &pmu->zbc_save_done, 1);
2556 if (!pmu->zbc_save_done)
2557 gk20a_err(dev_from_gk20a(g), "ZBC save timeout");
2558}
2559
2560void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
2561{
2562 if (g->pmu.zbc_ready)
2563 pmu_save_zbc(g, entries);
2564}
2565
2566static int pmu_perfmon_start_sampling(struct pmu_gk20a *pmu)
2567{
2568 struct gk20a *g = pmu->g;
2569 struct pmu_v *pv = &g->ops.pmu_ver;
2570 struct pmu_cmd cmd;
2571 struct pmu_payload payload;
2572 u32 current_rate = 0;
2573 u32 seq;
2574
2575 /* PERFMON Start */
2576 memset(&cmd, 0, sizeof(struct pmu_cmd));
2577 cmd.hdr.unit_id = PMU_UNIT_PERFMON;
2578 cmd.hdr.size = PMU_CMD_HDR_SIZE + pv->get_pmu_perfmon_cmd_start_size();
2579 pv->perfmon_start_set_cmd_type(&cmd.cmd.perfmon,
2580 PMU_PERFMON_CMD_ID_START);
2581 pv->perfmon_start_set_group_id(&cmd.cmd.perfmon,
2582 PMU_DOMAIN_GROUP_PSTATE);
2583 pv->perfmon_start_set_state_id(&cmd.cmd.perfmon,
2584 pmu->perfmon_state_id[PMU_DOMAIN_GROUP_PSTATE]);
2585
2586 current_rate = rate_gpu_to_gpc2clk(gk20a_clk_get_rate(g));
2587 if (current_rate >= gpc_pll_params.max_freq)
2588 pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
2589 PMU_PERFMON_FLAG_ENABLE_DECREASE);
2590 else if (current_rate <= gpc_pll_params.min_freq)
2591 pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
2592 PMU_PERFMON_FLAG_ENABLE_INCREASE);
2593 else
2594 pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
2595 PMU_PERFMON_FLAG_ENABLE_INCREASE |
2596 PMU_PERFMON_FLAG_ENABLE_DECREASE);
2597
2598 pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
2599 pv->perfmon_start_get_flags(&cmd.cmd.perfmon) |
2600 PMU_PERFMON_FLAG_CLEAR_PREV);
2601
2602 memset(&payload, 0, sizeof(struct pmu_payload));
2603
2604 /* TBD: PMU_PERFMON_PCT_TO_INC * 100 */
2605 pmu->perfmon_counter.upper_threshold = 3000; /* 30% */
2606 /* TBD: PMU_PERFMON_PCT_TO_DEC * 100 */
2607 pmu->perfmon_counter.lower_threshold = 1000; /* 10% */
2608 pmu->perfmon_counter.valid = true;
2609
2610 payload.in.buf = &pmu->perfmon_counter;
2611 payload.in.size = sizeof(pmu->perfmon_counter);
2612 payload.in.offset =
2613 pv->get_perfmon_cmd_start_offsetofvar(COUNTER_ALLOC);
2614
2615 gk20a_pmu_cmd_post(g, &cmd, NULL, &payload, PMU_COMMAND_QUEUE_LPQ,
2616 NULL, NULL, &seq, ~0);
2617
2618 return 0;
2619}
2620
2621static int pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu)
2622{
2623 struct gk20a *g = pmu->g;
2624 struct pmu_cmd cmd;
2625 u32 seq;
2626
2627 /* PERFMON Stop */
2628 memset(&cmd, 0, sizeof(struct pmu_cmd));
2629 cmd.hdr.unit_id = PMU_UNIT_PERFMON;
2630 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_perfmon_cmd_stop);
2631 cmd.cmd.perfmon.stop.cmd_type = PMU_PERFMON_CMD_ID_STOP;
2632
2633 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
2634 NULL, NULL, &seq, ~0);
2635 return 0;
2636}
2637
2638static int pmu_handle_perfmon_event(struct pmu_gk20a *pmu,
2639 struct pmu_perfmon_msg *msg)
2640{
2641 struct gk20a *g = pmu->g;
2642 u32 rate;
2643
2644 gk20a_dbg_fn("");
2645
2646 switch (msg->msg_type) {
2647 case PMU_PERFMON_MSG_ID_INCREASE_EVENT:
2648 gk20a_dbg_pmu("perfmon increase event: "
2649 			"state_id %d, group_id %d, pct %d",
2650 msg->gen.state_id, msg->gen.group_id, msg->gen.data);
2651 /* increase gk20a clock freq by 20% */
2652 rate = gk20a_clk_get_rate(g);
2653 gk20a_clk_set_rate(g, rate * 6 / 5);
2654 break;
2655 case PMU_PERFMON_MSG_ID_DECREASE_EVENT:
2656 gk20a_dbg_pmu("perfmon decrease event: "
2657 			"state_id %d, group_id %d, pct %d",
2658 msg->gen.state_id, msg->gen.group_id, msg->gen.data);
2659 		/* decrease gk20a clock freq to 70% of current rate */
2660 rate = gk20a_clk_get_rate(g);
2661 gk20a_clk_set_rate(g, (rate / 10) * 7);
2662 break;
2663 case PMU_PERFMON_MSG_ID_INIT_EVENT:
2664 pmu->perfmon_ready = 1;
2665 gk20a_dbg_pmu("perfmon init event");
2666 break;
2667 default:
2668 break;
2669 }
2670
2671 /* restart sampling */
2672 if (IS_ENABLED(CONFIG_GK20A_PERFMON))
2673 return pmu_perfmon_start_sampling(pmu);
2674 return 0;
2675}
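/*
 * Illustrative arithmetic for the handler above (example rate only): at
 * 500 MHz, an INCREASE event requests 500 * 6 / 5 = 600 MHz, while a
 * DECREASE event requests (500 / 10) * 7 = 350 MHz, i.e. 70% of the
 * current rate.
 */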
2676
2677
2678static int pmu_handle_event(struct pmu_gk20a *pmu, struct pmu_msg *msg)
2679{
2680 	int err = 0;
2681
2682 gk20a_dbg_fn("");
2683
2684 switch (msg->hdr.unit_id) {
2685 case PMU_UNIT_PERFMON:
2686 err = pmu_handle_perfmon_event(pmu, &msg->msg.perfmon);
2687 break;
2688 default:
2689 break;
2690 }
2691
2692 return err;
2693}
2694
2695static int pmu_process_message(struct pmu_gk20a *pmu)
2696{
2697 struct pmu_msg msg;
2698 int status;
2699
2700 if (unlikely(!pmu->pmu_ready)) {
2701 pmu_process_init_msg(pmu, &msg);
2702 pmu_init_powergating(pmu);
2703 pmu_init_perfmon(pmu);
2704 return 0;
2705 }
2706
2707 while (pmu_read_message(pmu,
2708 &pmu->queue[PMU_MESSAGE_QUEUE], &msg, &status)) {
2709
2710 gk20a_dbg_pmu("read msg hdr: "
2711 "unit_id = 0x%08x, size = 0x%08x, "
2712 "ctrl_flags = 0x%08x, seq_id = 0x%08x",
2713 msg.hdr.unit_id, msg.hdr.size,
2714 msg.hdr.ctrl_flags, msg.hdr.seq_id);
2715
2716 msg.hdr.ctrl_flags &= ~PMU_CMD_FLAGS_PMU_MASK;
2717
2718 if (msg.hdr.ctrl_flags == PMU_CMD_FLAGS_EVENT) {
2719 pmu_handle_event(pmu, &msg);
2720 } else {
2721 pmu_response_handle(pmu, &msg);
2722 }
2723 }
2724
2725 return 0;
2726}
2727
2728static int pmu_wait_message_cond(struct pmu_gk20a *pmu, u32 timeout,
2729 u32 *var, u32 val)
2730{
2731 struct gk20a *g = pmu->g;
2732 unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout);
2733 unsigned long delay = GR_IDLE_CHECK_DEFAULT;
2734
2735 do {
2736 if (*var == val)
2737 return 0;
2738
2739 if (gk20a_readl(g, pwr_falcon_irqstat_r()))
2740 gk20a_pmu_isr(g);
2741
2742 usleep_range(delay, delay * 2);
2743 delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
2744 } while (time_before(jiffies, end_jiffies) ||
2745 !tegra_platform_is_silicon());
2746
2747 return -ETIMEDOUT;
2748}
2749
2750static void pmu_dump_elpg_stats(struct pmu_gk20a *pmu)
2751{
2752 struct gk20a *g = pmu->g;
2753 struct pmu_pg_stats stats;
2754
2755 pmu_copy_from_dmem(pmu, pmu->stat_dmem_offset,
2756 (u8 *)&stats, sizeof(struct pmu_pg_stats), 0);
2757
2758 gk20a_dbg_pmu("pg_entry_start_timestamp : 0x%016llx",
2759 stats.pg_entry_start_timestamp);
2760 gk20a_dbg_pmu("pg_exit_start_timestamp : 0x%016llx",
2761 stats.pg_exit_start_timestamp);
2762 gk20a_dbg_pmu("pg_ingating_start_timestamp : 0x%016llx",
2763 stats.pg_ingating_start_timestamp);
2764 gk20a_dbg_pmu("pg_ungating_start_timestamp : 0x%016llx",
2765 stats.pg_ungating_start_timestamp);
2766 gk20a_dbg_pmu("pg_avg_entry_time_us : 0x%08x",
2767 stats.pg_avg_entry_time_us);
2768 gk20a_dbg_pmu("pg_avg_exit_time_us : 0x%08x",
2769 stats.pg_avg_exit_time_us);
2770 gk20a_dbg_pmu("pg_ingating_cnt : 0x%08x",
2771 stats.pg_ingating_cnt);
2772 gk20a_dbg_pmu("pg_ingating_time_us : 0x%08x",
2773 stats.pg_ingating_time_us);
2774 gk20a_dbg_pmu("pg_ungating_count : 0x%08x",
2775 stats.pg_ungating_count);
2776 	gk20a_dbg_pmu("pg_ungating_time_us : 0x%08x",
2777 stats.pg_ungating_time_us);
2778 gk20a_dbg_pmu("pg_gating_cnt : 0x%08x",
2779 stats.pg_gating_cnt);
2780 gk20a_dbg_pmu("pg_gating_deny_cnt : 0x%08x",
2781 stats.pg_gating_deny_cnt);
2782
2783 /*
2784 Turn on PG_DEBUG in ucode and locate symbol "ElpgLog" offset
2785 	 in .nm file, e.g. 0x1000066c. Use 0x66c.
2786 u32 i, val[20];
2787 pmu_copy_from_dmem(pmu, 0x66c,
2788 (u8 *)val, sizeof(val), 0);
2789 gk20a_dbg_pmu("elpg log begin");
2790 for (i = 0; i < 20; i++)
2791 gk20a_dbg_pmu("0x%08x", val[i]);
2792 gk20a_dbg_pmu("elpg log end");
2793 */
2794
2795 gk20a_dbg_pmu("pwr_pmu_idle_mask_supp_r(3): 0x%08x",
2796 gk20a_readl(g, pwr_pmu_idle_mask_supp_r(3)));
2797 gk20a_dbg_pmu("pwr_pmu_idle_mask_1_supp_r(3): 0x%08x",
2798 gk20a_readl(g, pwr_pmu_idle_mask_1_supp_r(3)));
2799 gk20a_dbg_pmu("pwr_pmu_idle_ctrl_supp_r(3): 0x%08x",
2800 gk20a_readl(g, pwr_pmu_idle_ctrl_supp_r(3)));
2801 gk20a_dbg_pmu("pwr_pmu_pg_idle_cnt_r(0): 0x%08x",
2802 gk20a_readl(g, pwr_pmu_pg_idle_cnt_r(0)));
2803 gk20a_dbg_pmu("pwr_pmu_pg_intren_r(0): 0x%08x",
2804 gk20a_readl(g, pwr_pmu_pg_intren_r(0)));
2805
2806 gk20a_dbg_pmu("pwr_pmu_idle_count_r(3): 0x%08x",
2807 gk20a_readl(g, pwr_pmu_idle_count_r(3)));
2808 gk20a_dbg_pmu("pwr_pmu_idle_count_r(4): 0x%08x",
2809 gk20a_readl(g, pwr_pmu_idle_count_r(4)));
2810 gk20a_dbg_pmu("pwr_pmu_idle_count_r(7): 0x%08x",
2811 gk20a_readl(g, pwr_pmu_idle_count_r(7)));
2812
2813 /*
2814 TBD: script can't generate those registers correctly
2815 gk20a_dbg_pmu("pwr_pmu_idle_status_r(): 0x%08x",
2816 gk20a_readl(g, pwr_pmu_idle_status_r()));
2817 gk20a_dbg_pmu("pwr_pmu_pg_ctrl_r(): 0x%08x",
2818 gk20a_readl(g, pwr_pmu_pg_ctrl_r()));
2819 */
2820}
2821
2822static void pmu_dump_falcon_stats(struct pmu_gk20a *pmu)
2823{
2824 struct gk20a *g = pmu->g;
2825 int i;
2826
2827 gk20a_err(dev_from_gk20a(g), "pwr_falcon_os_r : %d",
2828 gk20a_readl(g, pwr_falcon_os_r()));
2829 gk20a_err(dev_from_gk20a(g), "pwr_falcon_cpuctl_r : 0x%x",
2830 gk20a_readl(g, pwr_falcon_cpuctl_r()));
2831 gk20a_err(dev_from_gk20a(g), "pwr_falcon_idlestate_r : 0x%x",
2832 gk20a_readl(g, pwr_falcon_idlestate_r()));
2833 gk20a_err(dev_from_gk20a(g), "pwr_falcon_mailbox0_r : 0x%x",
2834 gk20a_readl(g, pwr_falcon_mailbox0_r()));
2835 gk20a_err(dev_from_gk20a(g), "pwr_falcon_mailbox1_r : 0x%x",
2836 gk20a_readl(g, pwr_falcon_mailbox1_r()));
2837 gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqstat_r : 0x%x",
2838 gk20a_readl(g, pwr_falcon_irqstat_r()));
2839 gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqmode_r : 0x%x",
2840 gk20a_readl(g, pwr_falcon_irqmode_r()));
2841 gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqmask_r : 0x%x",
2842 gk20a_readl(g, pwr_falcon_irqmask_r()));
2843 gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqdest_r : 0x%x",
2844 gk20a_readl(g, pwr_falcon_irqdest_r()));
2845
2846 for (i = 0; i < pwr_pmu_mailbox__size_1_v(); i++)
2847 gk20a_err(dev_from_gk20a(g), "pwr_pmu_mailbox_r(%d) : 0x%x",
2848 i, gk20a_readl(g, pwr_pmu_mailbox_r(i)));
2849
2850 for (i = 0; i < pwr_pmu_debug__size_1_v(); i++)
2851 gk20a_err(dev_from_gk20a(g), "pwr_pmu_debug_r(%d) : 0x%x",
2852 i, gk20a_readl(g, pwr_pmu_debug_r(i)));
2853
2854 for (i = 0; i < 6/*NV_PPWR_FALCON_ICD_IDX_RSTAT__SIZE_1*/; i++) {
2855 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2856 pwr_pmu_falcon_icd_cmd_opc_rstat_f() |
2857 pwr_pmu_falcon_icd_cmd_idx_f(i));
2858 gk20a_err(dev_from_gk20a(g), "pmu_rstat (%d) : 0x%x",
2859 i, gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2860 }
2861
2862 i = gk20a_readl(g, pwr_pmu_bar0_error_status_r());
2863 gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_error_status_r : 0x%x", i);
2864 if (i != 0) {
2865 gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_addr_r : 0x%x",
2866 gk20a_readl(g, pwr_pmu_bar0_addr_r()));
2867 gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_data_r : 0x%x",
2868 gk20a_readl(g, pwr_pmu_bar0_data_r()));
2869 gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_timeout_r : 0x%x",
2870 gk20a_readl(g, pwr_pmu_bar0_timeout_r()));
2871 gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_ctl_r : 0x%x",
2872 gk20a_readl(g, pwr_pmu_bar0_ctl_r()));
2873 }
2874
2875 i = gk20a_readl(g, pwr_pmu_bar0_fecs_error_r());
2876 gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_fecs_error_r : 0x%x", i);
2877
2878 i = gk20a_readl(g, pwr_falcon_exterrstat_r());
2879 gk20a_err(dev_from_gk20a(g), "pwr_falcon_exterrstat_r : 0x%x", i);
2880 if (pwr_falcon_exterrstat_valid_v(i) ==
2881 pwr_falcon_exterrstat_valid_true_v()) {
2882 gk20a_err(dev_from_gk20a(g), "pwr_falcon_exterraddr_r : 0x%x",
2883 gk20a_readl(g, pwr_falcon_exterraddr_r()));
2884 gk20a_err(dev_from_gk20a(g), "top_fs_status_r : 0x%x",
2885 gk20a_readl(g, top_fs_status_r()));
2886 gk20a_err(dev_from_gk20a(g), "pmc_enable : 0x%x",
2887 gk20a_readl(g, mc_enable_r()));
2888 }
2889
2890 gk20a_err(dev_from_gk20a(g), "pwr_falcon_engctl_r : 0x%x",
2891 gk20a_readl(g, pwr_falcon_engctl_r()));
2892 gk20a_err(dev_from_gk20a(g), "pwr_falcon_curctx_r : 0x%x",
2893 gk20a_readl(g, pwr_falcon_curctx_r()));
2894 gk20a_err(dev_from_gk20a(g), "pwr_falcon_nxtctx_r : 0x%x",
2895 gk20a_readl(g, pwr_falcon_nxtctx_r()));
2896
2897 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2898 pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
2899 pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
2900 gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_IMB : 0x%x",
2901 gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2902
2903 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2904 pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
2905 pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
2906 gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_DMB : 0x%x",
2907 gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2908
2909 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2910 pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
2911 pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
2912 gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_CSW : 0x%x",
2913 gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2914
2915 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2916 pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
2917 pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
2918 gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_CTX : 0x%x",
2919 gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2920
2921 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2922 pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
2923 pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
2924 gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_EXCI : 0x%x",
2925 gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2926
2927 for (i = 0; i < 4; i++) {
2928 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2929 pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
2930 pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_PC));
2931 gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_PC : 0x%x",
2932 gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2933
2934 gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
2935 pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
2936 pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_SP));
2937 gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_SP : 0x%x",
2938 gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
2939 }
2940
2941 /* PMU may crash due to FECS crash. Dump FECS status */
2942 gk20a_fecs_dump_falcon_stats(g);
2943}
2944
2945void gk20a_pmu_isr(struct gk20a *g)
2946{
2947 struct pmu_gk20a *pmu = &g->pmu;
2948 struct pmu_queue *queue;
2949 u32 intr, mask;
2950 bool recheck = false;
2951
2952 gk20a_dbg_fn("");
2953
2954 mutex_lock(&pmu->isr_mutex);
2955
2956 mask = gk20a_readl(g, pwr_falcon_irqmask_r()) &
2957 gk20a_readl(g, pwr_falcon_irqdest_r());
2958
2959 intr = gk20a_readl(g, pwr_falcon_irqstat_r()) & mask;
2960
2961 gk20a_dbg_pmu("received falcon interrupt: 0x%08x", intr);
2962
2963 if (!intr) {
2964 mutex_unlock(&pmu->isr_mutex);
2965 return;
2966 }
2967
2968 if (intr & pwr_falcon_irqstat_halt_true_f()) {
2969 gk20a_err(dev_from_gk20a(g),
2970 "pmu halt intr not implemented");
2971 pmu_dump_falcon_stats(pmu);
2972 }
2973 if (intr & pwr_falcon_irqstat_exterr_true_f()) {
2974 gk20a_err(dev_from_gk20a(g),
2975 "pmu exterr intr not implemented. Clearing interrupt.");
2976 pmu_dump_falcon_stats(pmu);
2977
2978 gk20a_writel(g, pwr_falcon_exterrstat_r(),
2979 gk20a_readl(g, pwr_falcon_exterrstat_r()) &
2980 ~pwr_falcon_exterrstat_valid_m());
2981 }
2982 if (intr & pwr_falcon_irqstat_swgen0_true_f()) {
2983 pmu_process_message(pmu);
2984 recheck = true;
2985 }
2986
2987 gk20a_writel(g, pwr_falcon_irqsclr_r(), intr);
2988
2989 if (recheck) {
2990 queue = &pmu->queue[PMU_MESSAGE_QUEUE];
2991 if (!pmu_queue_is_empty(pmu, queue))
2992 gk20a_writel(g, pwr_falcon_irqsset_r(),
2993 pwr_falcon_irqsset_swgen0_set_f());
2994 }
2995
2996 mutex_unlock(&pmu->isr_mutex);
2997}
2998
2999static bool pmu_validate_cmd(struct pmu_gk20a *pmu, struct pmu_cmd *cmd,
3000 struct pmu_msg *msg, struct pmu_payload *payload,
3001 u32 queue_id)
3002{
3003 struct gk20a *g = pmu->g;
3004 struct pmu_queue *queue;
3005 u32 in_size, out_size;
3006
3007 if (!PMU_IS_SW_COMMAND_QUEUE(queue_id))
3008 goto invalid_cmd;
3009
3010 queue = &pmu->queue[queue_id];
3011 if (cmd->hdr.size < PMU_CMD_HDR_SIZE)
3012 goto invalid_cmd;
3013
3014 if (cmd->hdr.size > (queue->size >> 1))
3015 goto invalid_cmd;
3016
3017 if (msg != NULL && msg->hdr.size < PMU_MSG_HDR_SIZE)
3018 goto invalid_cmd;
3019
3020 if (!PMU_UNIT_ID_IS_VALID(cmd->hdr.unit_id))
3021 goto invalid_cmd;
3022
3023 if (payload == NULL)
3024 return true;
3025
3026 if (payload->in.buf == NULL && payload->out.buf == NULL)
3027 goto invalid_cmd;
3028
3029 if ((payload->in.buf != NULL && payload->in.size == 0) ||
3030 (payload->out.buf != NULL && payload->out.size == 0))
3031 goto invalid_cmd;
3032
3033 in_size = PMU_CMD_HDR_SIZE;
3034 if (payload->in.buf) {
3035 in_size += payload->in.offset;
3036 in_size += g->ops.pmu_ver.get_pmu_allocation_struct_size(pmu);
3037 }
3038
3039 out_size = PMU_CMD_HDR_SIZE;
3040 if (payload->out.buf) {
3041 out_size += payload->out.offset;
3042 out_size += g->ops.pmu_ver.get_pmu_allocation_struct_size(pmu);
3043 }
3044
3045 if (in_size > cmd->hdr.size || out_size > cmd->hdr.size)
3046 goto invalid_cmd;
3047
3048
3049 if ((payload->in.offset != 0 && payload->in.buf == NULL) ||
3050 (payload->out.offset != 0 && payload->out.buf == NULL))
3051 goto invalid_cmd;
3052
3053 return true;
3054
3055invalid_cmd:
3056 gk20a_err(dev_from_gk20a(g), "invalid pmu cmd :\n"
3057 "queue_id=%d,\n"
3058 "cmd_size=%d, cmd_unit_id=%d, msg=%p, msg_size=%d,\n"
3059 "payload in=%p, in_size=%d, in_offset=%d,\n"
3060 "payload out=%p, out_size=%d, out_offset=%d",
3061 queue_id, cmd->hdr.size, cmd->hdr.unit_id,
3062 		msg, msg ? msg->hdr.size : ~0,
3063 &payload->in, payload->in.size, payload->in.offset,
3064 &payload->out, payload->out.size, payload->out.offset);
3065
3066 return false;
3067}
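/*
 * Illustrative sizing check (constants made up for the example): with a
 * 4-byte PMU_CMD_HDR_SIZE, payload->in.offset == 8 and a 4-byte
 * pmu_allocation struct, in_size works out to 4 + 8 + 4 = 16, so the
 * command must declare cmd->hdr.size >= 16 to pass validation.
 */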
3068
3069static int pmu_write_cmd(struct pmu_gk20a *pmu, struct pmu_cmd *cmd,
3070 u32 queue_id, unsigned long timeout)
3071{
3072 struct gk20a *g = pmu->g;
3073 struct pmu_queue *queue;
3074 unsigned long end_jiffies = jiffies +
3075 msecs_to_jiffies(timeout);
3076 int err;
3077
3078 gk20a_dbg_fn("");
3079
3080 queue = &pmu->queue[queue_id];
3081
3082 do {
3083 err = pmu_queue_open_write(pmu, queue, cmd->hdr.size);
3084 if (err == -EAGAIN && time_before(jiffies, end_jiffies))
3085 usleep_range(1000, 2000);
3086 else
3087 break;
3088 } while (1);
3089
3090 if (err)
3091 goto clean_up;
3092
3093 pmu_queue_push(pmu, queue, cmd, cmd->hdr.size);
3094
3095 err = pmu_queue_close(pmu, queue, true);
3096
3097clean_up:
3098 if (err)
3099 gk20a_err(dev_from_gk20a(g),
3100 "fail to write cmd to queue %d", queue_id);
3101 else
3102 gk20a_dbg_fn("done");
3103
3104 return err;
3105}
3106
3107int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
3108 struct pmu_msg *msg, struct pmu_payload *payload,
3109 u32 queue_id, pmu_callback callback, void* cb_param,
3110 u32 *seq_desc, unsigned long timeout)
3111{
3112 struct pmu_gk20a *pmu = &g->pmu;
3113 struct pmu_v *pv = &g->ops.pmu_ver;
3114 struct pmu_sequence *seq;
3115 void *in = NULL, *out = NULL;
3116 int err;
3117
3118 gk20a_dbg_fn("");
3119
3120 BUG_ON(!cmd);
3121 BUG_ON(!seq_desc);
3122 BUG_ON(!pmu->pmu_ready);
3123
3124 if (!pmu_validate_cmd(pmu, cmd, msg, payload, queue_id))
3125 return -EINVAL;
3126
3127 err = pmu_seq_acquire(pmu, &seq);
3128 if (err)
3129 return err;
3130
3131 cmd->hdr.seq_id = seq->id;
3132
3133 cmd->hdr.ctrl_flags = 0;
3134 cmd->hdr.ctrl_flags |= PMU_CMD_FLAGS_STATUS;
3135 cmd->hdr.ctrl_flags |= PMU_CMD_FLAGS_INTR;
3136
3137 seq->callback = callback;
3138 seq->cb_params = cb_param;
3139 seq->msg = msg;
3140 seq->out_payload = NULL;
3141 seq->desc = pmu->next_seq_desc++;
3142
3143 if (payload)
3144 seq->out_payload = payload->out.buf;
3145
3146 *seq_desc = seq->desc;
3147
3148 if (payload && payload->in.offset != 0) {
3149 pv->set_pmu_allocation_ptr(pmu, &in,
3150 ((u8 *)&cmd->cmd + payload->in.offset));
3151
3152 if (payload->in.buf != payload->out.buf)
3153 pv->pmu_allocation_set_dmem_size(pmu, in,
3154 (u16)payload->in.size);
3155 else
3156 pv->pmu_allocation_set_dmem_size(pmu, in,
3157 (u16)max(payload->in.size, payload->out.size));
3158
3159 err = pmu->dmem.alloc(&pmu->dmem,
3160 pv->pmu_allocation_get_dmem_offset_addr(pmu, in),
3161 pv->pmu_allocation_get_dmem_size(pmu, in));
3162 if (err)
3163 goto clean_up;
3164
3165 pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu,
3166 in)),
3167 payload->in.buf, payload->in.size, 0);
3168 pv->pmu_allocation_set_dmem_size(pmu,
3169 pv->get_pmu_seq_in_a_ptr(seq),
3170 pv->pmu_allocation_get_dmem_size(pmu, in));
3171 pv->pmu_allocation_set_dmem_offset(pmu,
3172 pv->get_pmu_seq_in_a_ptr(seq),
3173 pv->pmu_allocation_get_dmem_offset(pmu, in));
3174 }
3175
3176 if (payload && payload->out.offset != 0) {
3177 pv->set_pmu_allocation_ptr(pmu, &out,
3178 ((u8 *)&cmd->cmd + payload->out.offset));
3179 pv->pmu_allocation_set_dmem_size(pmu, out,
3180 (u16)payload->out.size);
3181
3182 if (payload->out.buf != payload->in.buf) {
3183 err = pmu->dmem.alloc(&pmu->dmem,
3184 pv->pmu_allocation_get_dmem_offset_addr(pmu, out),
3185 pv->pmu_allocation_get_dmem_size(pmu, out));
3186 if (err)
3187 goto clean_up;
3188 } else {
3189 BUG_ON(in == NULL);
3190 pv->pmu_allocation_set_dmem_offset(pmu, out,
3191 pv->pmu_allocation_get_dmem_offset(pmu, in));
3192 }
3193
3194 pv->pmu_allocation_set_dmem_size(pmu,
3195 pv->get_pmu_seq_out_a_ptr(seq),
3196 pv->pmu_allocation_get_dmem_size(pmu, out));
3197 pv->pmu_allocation_set_dmem_offset(pmu,
3198 pv->get_pmu_seq_out_a_ptr(seq),
3199 pv->pmu_allocation_get_dmem_offset(pmu, out));
3200 }
3201
3202 seq->state = PMU_SEQ_STATE_USED;
3203 err = pmu_write_cmd(pmu, cmd, queue_id, timeout);
3204 if (err)
3205 seq->state = PMU_SEQ_STATE_PENDING;
3206
3207 gk20a_dbg_fn("done");
3208
3209 return 0;
3210
3211clean_up:
3212 gk20a_dbg_fn("fail");
3213 if (in)
3214 pmu->dmem.free(&pmu->dmem,
3215 pv->pmu_allocation_get_dmem_offset(pmu, in),
3216 pv->pmu_allocation_get_dmem_size(pmu, in));
3217 if (out)
3218 pmu->dmem.free(&pmu->dmem,
3219 pv->pmu_allocation_get_dmem_offset(pmu, out),
3220 pv->pmu_allocation_get_dmem_size(pmu, out));
3221
3222 pmu_seq_release(pmu, seq);
3223 return err;
3224}
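/*
 * Typical caller pattern (mirrors pmu_save_zbc() above): zero a struct
 * pmu_cmd, fill in the header and unit-specific body, then post it with
 * an optional completion callback.
 *
 *	struct pmu_cmd cmd;
 *	u32 seq;
 *
 *	memset(&cmd, 0, sizeof(struct pmu_cmd));
 *	cmd.hdr.unit_id = PMU_UNIT_PG;
 *	cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_zbc_cmd);
 *	cmd.cmd.zbc.cmd_type = g->ops.pmu_ver.cmd_id_zbc_table_update;
 *	cmd.cmd.zbc.entry_mask = ZBC_MASK(entries);
 *
 *	gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
 *			pmu_handle_zbc_msg, pmu, &seq, ~0);
 */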
3225
3226static int gk20a_pmu_enable_elpg_locked(struct gk20a *g)
3227{
3228 struct pmu_gk20a *pmu = &g->pmu;
3229 struct pmu_cmd cmd;
3230 u32 seq, status;
3231
3232 gk20a_dbg_fn("");
3233
3234 memset(&cmd, 0, sizeof(struct pmu_cmd));
3235 cmd.hdr.unit_id = PMU_UNIT_PG;
3236 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
3237 cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
3238 cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
3239 cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_ALLOW;
3240
3241 	/* no need to wait for the ack on ELPG enable, but set pending to sync
3242 	   with a follow-up ELPG disable */
3243 pmu->elpg_stat = PMU_ELPG_STAT_ON_PENDING;
3244
3245 status = gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
3246 pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
3247
3248 BUG_ON(status != 0);
3249
3250 gk20a_dbg_fn("done");
3251 return 0;
3252}
3253
3254int gk20a_pmu_enable_elpg(struct gk20a *g)
3255{
3256 struct pmu_gk20a *pmu = &g->pmu;
3257 struct gr_gk20a *gr = &g->gr;
3258
3259 int ret = 0;
3260
3261 gk20a_dbg_fn("");
3262
3263 if (!pmu->elpg_ready || !pmu->initialized)
3264 goto exit;
3265
3266 mutex_lock(&pmu->elpg_mutex);
3267
3268 pmu->elpg_refcnt++;
3269 if (pmu->elpg_refcnt <= 0)
3270 goto exit_unlock;
3271
3272 	/* something is not right if we end up in the following code path */
3273 if (unlikely(pmu->elpg_refcnt > 1)) {
3274 gk20a_warn(dev_from_gk20a(g),
3275 "%s(): possible elpg refcnt mismatch. elpg refcnt=%d",
3276 __func__, pmu->elpg_refcnt);
3277 WARN_ON(1);
3278 }
3279
3280 	/* do NOT enable elpg until the golden ctx is created, since that is
3281 	   the context that ELPG saves and restores. */
3282 if (unlikely(!gr->ctx_vars.golden_image_initialized))
3283 goto exit_unlock;
3284
3285 /* return if ELPG is already on or on_pending or off_on_pending */
3286 if (pmu->elpg_stat != PMU_ELPG_STAT_OFF)
3287 goto exit_unlock;
3288
3289 /* if ELPG is not allowed right now, mark that it should be enabled
3290 * immediately after it is allowed */
3291 if (!pmu->elpg_enable_allow) {
3292 pmu->elpg_stat = PMU_ELPG_STAT_OFF_ON_PENDING;
3293 goto exit_unlock;
3294 }
3295
3296 ret = gk20a_pmu_enable_elpg_locked(g);
3297
3298exit_unlock:
3299 mutex_unlock(&pmu->elpg_mutex);
3300exit:
3301 gk20a_dbg_fn("done");
3302 return ret;
3303}
3304
3305static void pmu_elpg_enable_allow(struct work_struct *work)
3306{
3307 struct pmu_gk20a *pmu = container_of(to_delayed_work(work),
3308 struct pmu_gk20a, elpg_enable);
3309
3310 gk20a_dbg_fn("");
3311
3312 mutex_lock(&pmu->elpg_mutex);
3313
3314 	/* It is ok to enable powergating now */
3315 pmu->elpg_enable_allow = true;
3316
3317 /* do we have pending requests? */
3318 if (pmu->elpg_stat == PMU_ELPG_STAT_OFF_ON_PENDING) {
3319 pmu->elpg_stat = PMU_ELPG_STAT_OFF;
3320 gk20a_pmu_enable_elpg_locked(pmu->g);
3321 }
3322
3323 mutex_unlock(&pmu->elpg_mutex);
3324
3325 gk20a_dbg_fn("done");
3326}
3327
3328static int gk20a_pmu_disable_elpg_defer_enable(struct gk20a *g, bool enable)
3329{
3330 struct pmu_gk20a *pmu = &g->pmu;
3331 struct pmu_cmd cmd;
3332 u32 seq;
3333 int ret = 0;
3334
3335 gk20a_dbg_fn("");
3336
3337 if (!pmu->elpg_ready || !pmu->initialized)
3338 return 0;
3339
3340 /* remove the work from queue */
3341 cancel_delayed_work_sync(&pmu->elpg_enable);
3342
3343 mutex_lock(&pmu->elpg_mutex);
3344
3345 pmu->elpg_refcnt--;
3346 if (pmu->elpg_refcnt > 0) {
3347 gk20a_warn(dev_from_gk20a(g),
3348 "%s(): possible elpg refcnt mismatch. elpg refcnt=%d",
3349 __func__, pmu->elpg_refcnt);
3350 WARN_ON(1);
3351 ret = 0;
3352 goto exit_unlock;
3353 }
3354
3355 /* cancel off_on_pending and return */
3356 if (pmu->elpg_stat == PMU_ELPG_STAT_OFF_ON_PENDING) {
3357 pmu->elpg_stat = PMU_ELPG_STAT_OFF;
3358 ret = 0;
3359 goto exit_reschedule;
3360 }
3361 /* wait if on_pending */
3362 else if (pmu->elpg_stat == PMU_ELPG_STAT_ON_PENDING) {
3363
3364 pmu_wait_message_cond(pmu, gk20a_get_gr_idle_timeout(g),
3365 &pmu->elpg_stat, PMU_ELPG_STAT_ON);
3366
3367 if (pmu->elpg_stat != PMU_ELPG_STAT_ON) {
3368 gk20a_err(dev_from_gk20a(g),
3369 "ELPG_ALLOW_ACK failed, elpg_stat=%d",
3370 pmu->elpg_stat);
3371 pmu_dump_elpg_stats(pmu);
3372 pmu_dump_falcon_stats(pmu);
3373 ret = -EBUSY;
3374 goto exit_unlock;
3375 }
3376 }
3377 /* return if ELPG is already off */
3378 else if (pmu->elpg_stat != PMU_ELPG_STAT_ON) {
3379 ret = 0;
3380 goto exit_reschedule;
3381 }
3382
3383 memset(&cmd, 0, sizeof(struct pmu_cmd));
3384 cmd.hdr.unit_id = PMU_UNIT_PG;
3385 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
3386 cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
3387 cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
3388 cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_DISALLOW;
3389
3390 pmu->elpg_stat = PMU_ELPG_STAT_OFF_PENDING;
3391
3392 gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
3393 pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
3394
3395 pmu_wait_message_cond(pmu, gk20a_get_gr_idle_timeout(g),
3396 &pmu->elpg_stat, PMU_ELPG_STAT_OFF);
3397 if (pmu->elpg_stat != PMU_ELPG_STAT_OFF) {
3398 gk20a_err(dev_from_gk20a(g),
3399 "ELPG_DISALLOW_ACK failed");
3400 pmu_dump_elpg_stats(pmu);
3401 pmu_dump_falcon_stats(pmu);
3402 ret = -EBUSY;
3403 goto exit_unlock;
3404 }
3405
3406exit_reschedule:
3407 if (enable) {
3408 pmu->elpg_enable_allow = false;
3409 schedule_delayed_work(&pmu->elpg_enable,
3410 msecs_to_jiffies(PMU_ELPG_ENABLE_ALLOW_DELAY_MSEC));
3411 } else
3412 pmu->elpg_enable_allow = true;
3413
3414
3415exit_unlock:
3416 mutex_unlock(&pmu->elpg_mutex);
3417 gk20a_dbg_fn("done");
3418 return ret;
3419}
3420
3421int gk20a_pmu_disable_elpg(struct gk20a *g)
3422{
3423 return gk20a_pmu_disable_elpg_defer_enable(g, true);
3424}
3425
3426int gk20a_pmu_perfmon_enable(struct gk20a *g, bool enable)
3427{
3428 struct pmu_gk20a *pmu = &g->pmu;
3429 int err;
3430
3431 gk20a_dbg_fn("");
3432
3433 if (enable)
3434 err = pmu_perfmon_start_sampling(pmu);
3435 else
3436 err = pmu_perfmon_stop_sampling(pmu);
3437
3438 return err;
3439}
3440
3441int gk20a_pmu_destroy(struct gk20a *g)
3442{
3443 struct pmu_gk20a *pmu = &g->pmu;
3444 u32 elpg_ingating_time, elpg_ungating_time, gating_cnt;
3445
3446 gk20a_dbg_fn("");
3447
3448 if (!support_gk20a_pmu())
3449 return 0;
3450
3451 /* make sure the pending operations are finished before we continue */
3452 cancel_delayed_work_sync(&pmu->elpg_enable);
3453 cancel_work_sync(&pmu->pg_init);
3454
3455 gk20a_pmu_get_elpg_residency_gating(g, &elpg_ingating_time,
3456 &elpg_ungating_time, &gating_cnt);
3457
3458 gk20a_pmu_disable_elpg_defer_enable(g, false);
3459 pmu->initialized = false;
3460
3461 /* update the s/w ELPG residency counters */
3462 g->pg_ingating_time_us += (u64)elpg_ingating_time;
3463 g->pg_ungating_time_us += (u64)elpg_ungating_time;
3464 g->pg_gating_cnt += gating_cnt;
3465
3466 pmu_enable(pmu, false);
3467
3468 if (pmu->remove_support) {
3469 pmu->remove_support(pmu);
3470 pmu->remove_support = NULL;
3471 }
3472
3473 gk20a_dbg_fn("done");
3474 return 0;
3475}
3476
3477int gk20a_pmu_load_norm(struct gk20a *g, u32 *load)
3478{
3479 struct pmu_gk20a *pmu = &g->pmu;
3480 u16 _load = 0;
3481
3482 if (!pmu->perfmon_ready) {
3483 *load = 0;
3484 return 0;
3485 }
3486
3487 pmu_copy_from_dmem(pmu, pmu->sample_buffer, (u8 *)&_load, 2, 0);
3488 *load = _load / 10;
3489
3490 return 0;
3491}
3492
3493void gk20a_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
3494 u32 *total_cycles)
3495{
3496 if (!g->power_on) {
3497 *busy_cycles = 0;
3498 *total_cycles = 0;
3499 return;
3500 }
3501
3502 gk20a_busy(g->dev);
3503 *busy_cycles = pwr_pmu_idle_count_value_v(
3504 gk20a_readl(g, pwr_pmu_idle_count_r(1)));
3505 rmb();
3506 *total_cycles = pwr_pmu_idle_count_value_v(
3507 gk20a_readl(g, pwr_pmu_idle_count_r(2)));
3508 gk20a_idle(g->dev);
3509}
3510
3511void gk20a_pmu_reset_load_counters(struct gk20a *g)
3512{
3513 u32 reg_val = pwr_pmu_idle_count_reset_f(1);
3514
3515 if (!g->power_on)
3516 return;
3517
3518 gk20a_busy(g->dev);
3519 gk20a_writel(g, pwr_pmu_idle_count_r(2), reg_val);
3520 wmb();
3521 gk20a_writel(g, pwr_pmu_idle_count_r(1), reg_val);
3522 gk20a_idle(g->dev);
3523}
3524
3525static int gk20a_pmu_get_elpg_residency_gating(struct gk20a *g,
3526 u32 *ingating_time, u32 *ungating_time, u32 *gating_cnt)
3527{
3528 struct pmu_gk20a *pmu = &g->pmu;
3529 struct pmu_pg_stats stats;
3530
3531 if (!pmu->initialized) {
3532 *ingating_time = 0;
3533 *ungating_time = 0;
3534 *gating_cnt = 0;
3535 return 0;
3536 }
3537
3538 pmu_copy_from_dmem(pmu, pmu->stat_dmem_offset,
3539 (u8 *)&stats, sizeof(struct pmu_pg_stats), 0);
3540
3541 *ingating_time = stats.pg_ingating_time_us;
3542 *ungating_time = stats.pg_ungating_time_us;
3543 *gating_cnt = stats.pg_gating_cnt;
3544
3545 return 0;
3546}
3547
3548/* Send an Adaptive Power (AP) related command to PMU */
3549static int gk20a_pmu_ap_send_command(struct gk20a *g,
3550 union pmu_ap_cmd *p_ap_cmd, bool b_block)
3551{
3552 struct pmu_gk20a *pmu = &g->pmu;
3553 /* FIXME: where is the PG structure defined?? */
3554 u32 status = 0;
3555 struct pmu_cmd cmd;
3556 u32 seq;
3557 pmu_callback p_callback = NULL;
3558
3559 memset(&cmd, 0, sizeof(struct pmu_cmd));
3560
3561 /* Copy common members */
3562 cmd.hdr.unit_id = PMU_UNIT_PG;
3563 cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(union pmu_ap_cmd);
3564
3565 cmd.cmd.pg.ap_cmd.cmn.cmd_type = PMU_PG_CMD_ID_AP;
3566 cmd.cmd.pg.ap_cmd.cmn.cmd_id = p_ap_cmd->cmn.cmd_id;
3567
3568 /* Copy other members of command */
3569 switch (p_ap_cmd->cmn.cmd_id) {
3570 case PMU_AP_CMD_ID_INIT:
3571 cmd.cmd.pg.ap_cmd.init.pg_sampling_period_us =
3572 p_ap_cmd->init.pg_sampling_period_us;
3573 p_callback = ap_callback_init_and_enable_ctrl;
3574 break;
3575
3576 case PMU_AP_CMD_ID_INIT_AND_ENABLE_CTRL:
3577 cmd.cmd.pg.ap_cmd.init_and_enable_ctrl.ctrl_id =
3578 p_ap_cmd->init_and_enable_ctrl.ctrl_id;
3579 memcpy(
3580 (void *)&(cmd.cmd.pg.ap_cmd.init_and_enable_ctrl.params),
3581 (void *)&(p_ap_cmd->init_and_enable_ctrl.params),
3582 sizeof(struct pmu_ap_ctrl_init_params));
3583
3584 p_callback = ap_callback_init_and_enable_ctrl;
3585 break;
3586
3587 case PMU_AP_CMD_ID_ENABLE_CTRL:
3588 cmd.cmd.pg.ap_cmd.enable_ctrl.ctrl_id =
3589 p_ap_cmd->enable_ctrl.ctrl_id;
3590 break;
3591
3592 case PMU_AP_CMD_ID_DISABLE_CTRL:
3593 cmd.cmd.pg.ap_cmd.disable_ctrl.ctrl_id =
3594 p_ap_cmd->disable_ctrl.ctrl_id;
3595 break;
3596
3597 case PMU_AP_CMD_ID_KICK_CTRL:
3598 cmd.cmd.pg.ap_cmd.kick_ctrl.ctrl_id =
3599 p_ap_cmd->kick_ctrl.ctrl_id;
3600 cmd.cmd.pg.ap_cmd.kick_ctrl.skip_count =
3601 p_ap_cmd->kick_ctrl.skip_count;
3602 break;
3603
3604 default:
3605 gk20a_dbg_pmu("%s: Invalid Adaptive Power command %d\n",
3606 __func__, p_ap_cmd->cmn.cmd_id);
3607 return 0x2f;
3608 }
3609
3610 status = gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
3611 p_callback, pmu, &seq, ~0);
3612
3613	if (status) {
3614 gk20a_dbg_pmu(
3615 "%s: Unable to submit Adaptive Power Command %d\n",
3616 __func__, p_ap_cmd->cmn.cmd_id);
3617 goto err_return;
3618 }
3619
3620 /* TODO: Implement blocking calls (b_block) */
3621
3622err_return:
3623 return status;
3624}
3625
3626static void ap_callback_init_and_enable_ctrl(
3627 struct gk20a *g, struct pmu_msg *msg,
3628 void *param, u32 seq_desc, u32 status)
3629{
3630	/* Define p_ap (i.e. pointer to pmu_ap structure) */
3631 WARN_ON(!msg);
3632
3633 if (!status) {
3634 switch (msg->msg.pg.ap_msg.cmn.msg_id) {
3635 case PMU_AP_MSG_ID_INIT_ACK:
3636 break;
3637
3638 default:
3639 gk20a_dbg_pmu(
3640 "%s: Invalid Adaptive Power Message: %x\n",
3641 __func__, msg->msg.pg.ap_msg.cmn.msg_id);
3642 break;
3643 }
3644 }
3645}
3646
3647static int gk20a_aelpg_init(struct gk20a *g)
3648{
3649 int status = 0;
3650
3651 /* Remove reliance on app_ctrl field. */
3652 union pmu_ap_cmd ap_cmd;
3653
3654 /* TODO: Check for elpg being ready? */
3655 ap_cmd.init.cmd_id = PMU_AP_CMD_ID_INIT;
3656 ap_cmd.init.pg_sampling_period_us =
3657 APCTRL_SAMPLING_PERIOD_PG_DEFAULT_US;
3658
3659 status = gk20a_pmu_ap_send_command(g, &ap_cmd, false);
3660 return status;
3661}
3662
3663static int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id)
3664{
3665 int status = 0;
3666 union pmu_ap_cmd ap_cmd;
3667
3668 /* TODO: Probably check if ELPG is ready? */
3669
3670 ap_cmd.init_and_enable_ctrl.cmd_id = PMU_AP_CMD_ID_INIT_AND_ENABLE_CTRL;
3671 ap_cmd.init_and_enable_ctrl.ctrl_id = ctrl_id;
3672 ap_cmd.init_and_enable_ctrl.params.min_idle_filter_us =
3673 APCTRL_MINIMUM_IDLE_FILTER_DEFAULT_US;
3674 ap_cmd.init_and_enable_ctrl.params.min_target_saving_us =
3675 APCTRL_MINIMUM_TARGET_SAVING_DEFAULT_US;
3676 ap_cmd.init_and_enable_ctrl.params.power_break_even_us =
3677 APCTRL_POWER_BREAKEVEN_DEFAULT_US;
3678 ap_cmd.init_and_enable_ctrl.params.cycles_per_sample_max =
3679 APCTRL_CYCLES_PER_SAMPLE_MAX_DEFAULT;
3680
3681 switch (ctrl_id) {
3682 case PMU_AP_CTRL_ID_GRAPHICS:
3683 break;
3684 default:
3685 break;
3686 }
3687
3688 status = gk20a_pmu_ap_send_command(g, &ap_cmd, true);
3689 return status;
3690}
3691
3692#ifdef CONFIG_DEBUG_FS
3693static int elpg_residency_show(struct seq_file *s, void *data)
3694{
3695 struct gk20a *g = s->private;
3696 u32 ingating_time = 0;
3697 u32 ungating_time = 0;
3698 u32 gating_cnt;
3699 u64 total_ingating, total_ungating, residency, divisor, dividend;
3700
3701 /* Don't unnecessarily power on the device */
3702 if (g->power_on) {
3703 gk20a_busy(g->dev);
3704 gk20a_pmu_get_elpg_residency_gating(g, &ingating_time,
3705 &ungating_time, &gating_cnt);
3706 gk20a_idle(g->dev);
3707 }
3708 total_ingating = g->pg_ingating_time_us + (u64)ingating_time;
3709 total_ungating = g->pg_ungating_time_us + (u64)ungating_time;
3710 divisor = total_ingating + total_ungating;
3711
3712 /* We compute the residency on a scale of 1000 */
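	/* i.e. per-mille: a residency of 875 means ~87.5% of time spent in ELPG */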
3713 dividend = total_ingating * 1000;
3714
3715 if (divisor)
3716 residency = div64_u64(dividend, divisor);
3717 else
3718 residency = 0;
3719
3720 seq_printf(s, "Time in ELPG: %llu us\n"
3721 "Time out of ELPG: %llu us\n"
3722 "ELPG residency ratio: %llu\n",
3723 total_ingating, total_ungating, residency);
3724 return 0;
3725
3726}
3727
3728static int elpg_residency_open(struct inode *inode, struct file *file)
3729{
3730 return single_open(file, elpg_residency_show, inode->i_private);
3731}
3732
3733static const struct file_operations elpg_residency_fops = {
3734 .open = elpg_residency_open,
3735 .read = seq_read,
3736 .llseek = seq_lseek,
3737 .release = single_release,
3738};
3739
3740static int elpg_transitions_show(struct seq_file *s, void *data)
3741{
3742 struct gk20a *g = s->private;
3743 u32 ingating_time, ungating_time, total_gating_cnt;
3744 u32 gating_cnt = 0;
3745
3746 if (g->power_on) {
3747 gk20a_busy(g->dev);
3748 gk20a_pmu_get_elpg_residency_gating(g, &ingating_time,
3749 &ungating_time, &gating_cnt);
3750 gk20a_idle(g->dev);
3751 }
3752 total_gating_cnt = g->pg_gating_cnt + gating_cnt;
3753
3754 seq_printf(s, "%u\n", total_gating_cnt);
3755 return 0;
3756
3757}
3758
3759static int elpg_transitions_open(struct inode *inode, struct file *file)
3760{
3761 return single_open(file, elpg_transitions_show, inode->i_private);
3762}
3763
3764static const struct file_operations elpg_transitions_fops = {
3765 .open = elpg_transitions_open,
3766 .read = seq_read,
3767 .llseek = seq_lseek,
3768 .release = single_release,
3769};
3770
3771int gk20a_pmu_debugfs_init(struct platform_device *dev)
3772{
3773 struct dentry *d;
3774 struct gk20a_platform *platform = platform_get_drvdata(dev);
3775 struct gk20a *g = get_gk20a(dev);
3776
3777 d = debugfs_create_file(
3778 "elpg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
3779 &elpg_residency_fops);
3780 if (!d)
3781 goto err_out;
3782
3783 d = debugfs_create_file(
3784 "elpg_transitions", S_IRUGO, platform->debugfs, g,
3785 &elpg_transitions_fops);
3786 if (!d)
3787 goto err_out;
3788
3789 return 0;
3790
3791err_out:
3792 pr_err("%s: Failed to make debugfs node\n", __func__);
3793 debugfs_remove_recursive(platform->debugfs);
3794 return -ENOMEM;
3795}
3796#endif
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
new file mode 100644
index 00000000..c1b8ff1f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
@@ -0,0 +1,1097 @@
1/*
2 * drivers/video/tegra/host/gk20a/pmu_gk20a.h
3 *
4 * GK20A PMU (aka. gPMU outside gk20a context)
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef __PMU_GK20A_H__
22#define __PMU_GK20A_H__
23
24/* defined by pmu hw spec */
25#define GK20A_PMU_VA_START ((128 * 1024) << 10)
26#define GK20A_PMU_VA_SIZE (512 * 1024 * 1024)
27#define GK20A_PMU_INST_SIZE (4 * 1024)
28#define GK20A_PMU_UCODE_SIZE_MAX (256 * 1024)
29#define GK20A_PMU_SEQ_BUF_SIZE 4096
30
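/*
 * Builds a mask of ZBC table entries 1..i (bit 0 is always cleared by the
 * 0xfffe term), e.g. ZBC_MASK(3) == 0xe.
 */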
31#define ZBC_MASK(i) (~(~(0) << ((i)+1)) & 0xfffe)
32
33/* PMU Command/Message Interfaces for Adaptive Power */
34/* Macro to get Histogram index */
35#define PMU_AP_HISTOGRAM(idx) (idx)
36#define PMU_AP_HISTOGRAM_CONT (4)
37
38/* Total number of histogram bins */
39#define PMU_AP_CFG_HISTOGRAM_BIN_N (16)
40
41/* Mapping between Idle counters and histograms */
42#define PMU_AP_IDLE_MASK_HIST_IDX_0 (2)
43#define PMU_AP_IDLE_MASK_HIST_IDX_1 (3)
44#define PMU_AP_IDLE_MASK_HIST_IDX_2 (5)
45#define PMU_AP_IDLE_MASK_HIST_IDX_3 (6)
46
47
48/* Mapping between AP_CTRLs and Histograms */
49#define PMU_AP_HISTOGRAM_IDX_GRAPHICS (PMU_AP_HISTOGRAM(1))
50
51/* Mapping between AP_CTRLs and Idle counters */
52#define PMU_AP_IDLE_MASK_GRAPHICS (PMU_AP_IDLE_MASK_HIST_IDX_1)
53
54#define APP_VERSION_1 17997577
55#define APP_VERSION_0 16856675
56
57
58enum pmu_perfmon_cmd_start_fields {
59 COUNTER_ALLOC
60};
61
62/* Adaptive Power Controls (AP_CTRL) */
63enum {
64 PMU_AP_CTRL_ID_GRAPHICS = 0x0,
65 /* PMU_AP_CTRL_ID_MS ,*/
66 PMU_AP_CTRL_ID_MAX ,
67};
68
69/* AP_CTRL Statistics */
70struct pmu_ap_ctrl_stat {
71 /*
72 * Represents whether AP is active or not
73	 * TODO: This is NvBool in RM; is that 1 byte or 4 bytes?
74 */
75 u8 b_active;
76
77 /* Idle filter represented by histogram bin index */
78 u8 idle_filter_x;
79 u8 rsvd[2];
80
81 /* Total predicted power saving cycles. */
82 s32 power_saving_h_cycles;
83
84	/* Counts how many times AP gave a negative power benefit. */
85 u32 bad_decision_count;
86
87 /*
88 * Number of times ap structure needs to skip AP iterations
89 * KICK_CTRL from kernel updates this parameter.
90 */
91 u32 skip_count;
92 u8 bin[PMU_AP_CFG_HISTOGRAM_BIN_N];
93};
94
95/* Parameters initialized by INITn APCTRL command */
96struct pmu_ap_ctrl_init_params {
97 /* Minimum idle filter value in Us */
98 u32 min_idle_filter_us;
99
100 /*
101 * Minimum Targeted Saving in Us. AP will update idle thresholds only
102 * if power saving achieved by updating idle thresholds is greater than
103 * Minimum targeted saving.
104 */
105 u32 min_target_saving_us;
106
107 /* Minimum targeted residency of power feature in Us */
108 u32 power_break_even_us;
109
110 /*
111 * Maximum number of allowed power feature cycles per sample.
112 *
113	 * We allow at most this many power-feature cycles in one iteration of
114	 * AP (a.k.a. "pgPerSampleMax" in the original algorithm).
115 */
116 u32 cycles_per_sample_max;
117};
118
119/* AP Commands/Message structures */
120
121/*
122 * Structure for Generic AP Commands
123 */
124struct pmu_ap_cmd_common {
125 u8 cmd_type;
126 u16 cmd_id;
127};
128
129/*
130 * Structure for INIT AP command
131 */
132struct pmu_ap_cmd_init {
133 u8 cmd_type;
134 u16 cmd_id;
135 u8 rsvd;
136 u32 pg_sampling_period_us;
137};
138
139/*
140 * Structure for Enable/Disable ApCtrl Commands
141 */
142struct pmu_ap_cmd_enable_ctrl {
143 u8 cmd_type;
144 u16 cmd_id;
145
146 u8 ctrl_id;
147};
148
149struct pmu_ap_cmd_disable_ctrl {
150 u8 cmd_type;
151 u16 cmd_id;
152
153 u8 ctrl_id;
154};
155
156/*
157 * Structure for INIT command
158 */
159struct pmu_ap_cmd_init_ctrl {
160 u8 cmd_type;
161 u16 cmd_id;
162 u8 ctrl_id;
163 struct pmu_ap_ctrl_init_params params;
164};
165
166struct pmu_ap_cmd_init_and_enable_ctrl {
167 u8 cmd_type;
168 u16 cmd_id;
169 u8 ctrl_id;
170 struct pmu_ap_ctrl_init_params params;
171};
172
173/*
174 * Structure for KICK_CTRL command
175 */
176struct pmu_ap_cmd_kick_ctrl {
177 u8 cmd_type;
178 u16 cmd_id;
179 u8 ctrl_id;
180
181 u32 skip_count;
182};
183
184/*
185 * Structure for PARAM command
186 */
187struct pmu_ap_cmd_param {
188 u8 cmd_type;
189 u16 cmd_id;
190 u8 ctrl_id;
191
192 u32 data;
193};
194
195/*
196 * Defines for AP commands
197 */
198enum {
199 PMU_AP_CMD_ID_INIT = 0x0 ,
200 PMU_AP_CMD_ID_INIT_AND_ENABLE_CTRL,
201 PMU_AP_CMD_ID_ENABLE_CTRL ,
202 PMU_AP_CMD_ID_DISABLE_CTRL ,
203 PMU_AP_CMD_ID_KICK_CTRL ,
204};
205
206/*
207 * AP Command
208 */
209union pmu_ap_cmd {
210 u8 cmd_type;
211 struct pmu_ap_cmd_common cmn;
212 struct pmu_ap_cmd_init init;
213 struct pmu_ap_cmd_init_and_enable_ctrl init_and_enable_ctrl;
214 struct pmu_ap_cmd_enable_ctrl enable_ctrl;
215 struct pmu_ap_cmd_disable_ctrl disable_ctrl;
216 struct pmu_ap_cmd_kick_ctrl kick_ctrl;
217};
218
219/*
220 * Structure for generic AP Message
221 */
222struct pmu_ap_msg_common {
223 u8 msg_type;
224 u16 msg_id;
225};
226
227/*
228 * Structure for INIT_ACK Message
229 */
230struct pmu_ap_msg_init_ack {
231 u8 msg_type;
232 u16 msg_id;
233 u8 ctrl_id;
234 u32 stats_dmem_offset;
235};
236
237/*
238 * Defines for AP messages
239 */
240enum {
241 PMU_AP_MSG_ID_INIT_ACK = 0x0,
242};
243
244/*
245 * AP Message
246 */
247union pmu_ap_msg {
248 u8 msg_type;
249 struct pmu_ap_msg_common cmn;
250 struct pmu_ap_msg_init_ack init_ack;
251};
252
253/* Default Sampling Period of AELPG */
254#define APCTRL_SAMPLING_PERIOD_PG_DEFAULT_US (1000000)
255
256/* Default values of APCTRL parameters */
257#define APCTRL_MINIMUM_IDLE_FILTER_DEFAULT_US (100)
258#define APCTRL_MINIMUM_TARGET_SAVING_DEFAULT_US (10000)
259#define APCTRL_POWER_BREAKEVEN_DEFAULT_US (2000)
260#define APCTRL_CYCLES_PER_SAMPLE_MAX_DEFAULT (100)
261
262/*
263 * Disable reason for Adaptive Power Controller
264 */
265enum {
266 APCTRL_DISABLE_REASON_RM_UNLOAD,
267 APCTRL_DISABLE_REASON_RMCTRL,
268};
269
270/*
271 * Adaptive Power Controller
272 */
273struct ap_ctrl {
274 u32 stats_dmem_offset;
275 u32 disable_reason_mask;
276 struct pmu_ap_ctrl_stat stat_cache;
277 u8 b_ready;
278};
279
280/*
281 * Adaptive Power structure
282 *
283 * ap structure provides generic infrastructure to make any power feature
284 * adaptive.
285 */
286struct pmu_ap {
287 u32 supported_mask;
288 struct ap_ctrl ap_ctrl[PMU_AP_CTRL_ID_MAX];
289};
290
291
292enum {
293 GK20A_PMU_DMAIDX_UCODE = 0,
294 GK20A_PMU_DMAIDX_VIRT = 1,
295 GK20A_PMU_DMAIDX_PHYS_VID = 2,
296 GK20A_PMU_DMAIDX_PHYS_SYS_COH = 3,
297 GK20A_PMU_DMAIDX_PHYS_SYS_NCOH = 4,
298 GK20A_PMU_DMAIDX_RSVD = 5,
299 GK20A_PMU_DMAIDX_PELPG = 6,
300 GK20A_PMU_DMAIDX_END = 7
301};
302
303struct pmu_mem_v0 {
304 u32 dma_base;
305 u8 dma_offset;
306 u8 dma_idx;
307};
308
309struct pmu_mem_v1 {
310 u32 dma_base;
311 u8 dma_offset;
312 u8 dma_idx;
313 u16 fb_size;
314};
315
316struct pmu_dmem {
317 u16 size;
318 u32 offset;
319};
320
321/* Make sure size of this structure is a multiple of 4 bytes */
322struct pmu_cmdline_args_v0 {
323 u32 cpu_freq_hz; /* Frequency of the clock driving PMU */
324 u32 falc_trace_size; /* falctrace buffer size (bytes) */
325 u32 falc_trace_dma_base; /* 256-byte block address */
326 u32 falc_trace_dma_idx; /* dmaIdx for DMA operations */
327 struct pmu_mem_v0 gc6_ctx; /* dmem offset of gc6 context */
328};
329
330struct pmu_cmdline_args_v1 {
331 u32 cpu_freq_hz; /* Frequency of the clock driving PMU */
332 u32 falc_trace_size; /* falctrace buffer size (bytes) */
333 u32 falc_trace_dma_base; /* 256-byte block address */
334 u32 falc_trace_dma_idx; /* dmaIdx for DMA operations */
335 u8 secure_mode;
336 struct pmu_mem_v1 gc6_ctx; /* dmem offset of gc6 context */
337};
338
339#define GK20A_PMU_DMEM_BLKSIZE2 8
340
341#define GK20A_PMU_UCODE_NB_MAX_OVERLAY 32
342#define GK20A_PMU_UCODE_NB_MAX_DATE_LENGTH 64
343
344struct pmu_ucode_desc {
345 u32 descriptor_size;
346 u32 image_size;
347 u32 tools_version;
348 u32 app_version;
349 char date[GK20A_PMU_UCODE_NB_MAX_DATE_LENGTH];
350 u32 bootloader_start_offset;
351 u32 bootloader_size;
352 u32 bootloader_imem_offset;
353 u32 bootloader_entry_point;
354 u32 app_start_offset;
355 u32 app_size;
356 u32 app_imem_offset;
357 u32 app_imem_entry;
358 u32 app_dmem_offset;
359 u32 app_resident_code_offset; /* Offset from appStartOffset */
360 u32 app_resident_code_size; /* Exact size of the resident code ( potentially contains CRC inside at the end ) */
361 u32 app_resident_data_offset; /* Offset from appStartOffset */
362	u32 app_resident_data_size;   /* Exact size of the resident data ( potentially contains CRC inside at the end ) */
363 u32 nb_overlays;
364 struct {u32 start; u32 size;} load_ovl[GK20A_PMU_UCODE_NB_MAX_OVERLAY];
365 u32 compressed;
366};
367
368#define PMU_UNIT_REWIND (0x00)
369#define PMU_UNIT_I2C (0x01)
370#define PMU_UNIT_SEQ (0x02)
371#define PMU_UNIT_PG (0x03)
372#define PMU_UNIT_AVAILABLE1 (0x04)
373#define PMU_UNIT_AVAILABLE2 (0x05)
374#define PMU_UNIT_MEM (0x06)
375#define PMU_UNIT_INIT (0x07)
376#define PMU_UNIT_FBBA (0x08)
377#define PMU_UNIT_DIDLE (0x09)
378#define PMU_UNIT_AVAILABLE3 (0x0A)
379#define PMU_UNIT_AVAILABLE4 (0x0B)
380#define PMU_UNIT_HDCP_MAIN (0x0C)
381#define PMU_UNIT_HDCP_V (0x0D)
382#define PMU_UNIT_HDCP_SRM (0x0E)
383#define PMU_UNIT_NVDPS (0x0F)
384#define PMU_UNIT_DEINIT (0x10)
385#define PMU_UNIT_AVAILABLE5 (0x11)
386#define PMU_UNIT_PERFMON (0x12)
387#define PMU_UNIT_FAN (0x13)
388#define PMU_UNIT_PBI (0x14)
389#define PMU_UNIT_ISOBLIT (0x15)
390#define PMU_UNIT_DETACH (0x16)
391#define PMU_UNIT_DISP (0x17)
392#define PMU_UNIT_HDCP (0x18)
393#define PMU_UNIT_REGCACHE (0x19)
394#define PMU_UNIT_SYSMON (0x1A)
395#define PMU_UNIT_THERM (0x1B)
396#define PMU_UNIT_PMGR (0x1C)
397#define PMU_UNIT_PERF (0x1D)
398#define PMU_UNIT_PCM (0x1E)
399#define PMU_UNIT_RC (0x1F)
400#define PMU_UNIT_NULL (0x20)
401#define PMU_UNIT_LOGGER (0x21)
402#define PMU_UNIT_SMBPBI (0x22)
403#define PMU_UNIT_END (0x23)
404
405#define PMU_UNIT_TEST_START (0xFE)
406#define PMU_UNIT_END_SIM (0xFF)
407#define PMU_UNIT_TEST_END (0xFF)
408
409#define PMU_UNIT_ID_IS_VALID(id) \
410 (((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START))
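/* i.e. valid unit ids are 0x00..0x22 plus the 0xFE..0xFF test range */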
411
412#define PMU_DMEM_ALLOC_ALIGNMENT (32)
413#define PMU_DMEM_ALIGNMENT (4)
414
415#define PMU_CMD_FLAGS_PMU_MASK (0xF0)
416
417#define PMU_CMD_FLAGS_STATUS BIT(0)
418#define PMU_CMD_FLAGS_INTR BIT(1)
419#define PMU_CMD_FLAGS_EVENT BIT(2)
420#define PMU_CMD_FLAGS_WATERMARK BIT(3)
421
422struct pmu_hdr {
423 u8 unit_id;
424 u8 size;
425 u8 ctrl_flags;
426 u8 seq_id;
427};
428#define PMU_MSG_HDR_SIZE sizeof(struct pmu_hdr)
429#define PMU_CMD_HDR_SIZE sizeof(struct pmu_hdr)
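/*
 * hdr.size covers the header plus the unit-specific payload; e.g. an ELPG
 * command is sized as PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd)
 * (see gk20a_pmu_disable_elpg_defer_enable() in pmu_gk20a.c).
 */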
430
431#define PMU_QUEUE_COUNT 5
432
433struct pmu_allocation_v0 {
434 u8 pad[3];
435 u8 fb_mem_use;
436 struct {
437 struct pmu_dmem dmem;
438 struct pmu_mem_v0 fb;
439 } alloc;
440};
441
442struct pmu_allocation_v1 {
443 struct {
444 struct pmu_dmem dmem;
445 struct pmu_mem_v1 fb;
446 } alloc;
447};
448
449enum {
450 PMU_INIT_MSG_TYPE_PMU_INIT = 0,
451};
452
453struct pmu_init_msg_pmu_v0 {
454 u8 msg_type;
455 u8 pad;
456
457 struct {
458 u16 size;
459 u16 offset;
460 u8 index;
461 u8 pad;
462 } queue_info[PMU_QUEUE_COUNT];
463
464 u16 sw_managed_area_offset;
465 u16 sw_managed_area_size;
466};
467
468struct pmu_init_msg_pmu_v1 {
469 u8 msg_type;
470 u8 pad;
471 u16 os_debug_entry_point;
472
473 struct {
474 u16 size;
475 u16 offset;
476 u8 index;
477 u8 pad;
478 } queue_info[PMU_QUEUE_COUNT];
479
480 u16 sw_managed_area_offset;
481 u16 sw_managed_area_size;
482};
483
484union pmu_init_msg_pmu {
485 struct pmu_init_msg_pmu_v0 v0;
486 struct pmu_init_msg_pmu_v1 v1;
487};
488
489struct pmu_init_msg {
490 union {
491 u8 msg_type;
492 struct pmu_init_msg_pmu_v1 pmu_init_v1;
493 struct pmu_init_msg_pmu_v0 pmu_init_v0;
494 };
495};
496
497enum {
498 PMU_PG_ELPG_MSG_INIT_ACK,
499 PMU_PG_ELPG_MSG_DISALLOW_ACK,
500 PMU_PG_ELPG_MSG_ALLOW_ACK,
501 PMU_PG_ELPG_MSG_FREEZE_ACK,
502 PMU_PG_ELPG_MSG_FREEZE_ABORT,
503 PMU_PG_ELPG_MSG_UNFREEZE_ACK,
504};
505
506struct pmu_pg_msg_elpg_msg {
507 u8 msg_type;
508 u8 engine_id;
509 u16 msg;
510};
511
512enum {
513 PMU_PG_STAT_MSG_RESP_DMEM_OFFSET = 0,
514};
515
516struct pmu_pg_msg_stat {
517 u8 msg_type;
518 u8 engine_id;
519 u16 sub_msg_id;
520 u32 data;
521};
522
523enum {
524 PMU_PG_MSG_ENG_BUF_LOADED,
525 PMU_PG_MSG_ENG_BUF_UNLOADED,
526 PMU_PG_MSG_ENG_BUF_FAILED,
527};
528
529struct pmu_pg_msg_eng_buf_stat {
530 u8 msg_type;
531 u8 engine_id;
532 u8 buf_idx;
533 u8 status;
534};
535
536struct pmu_pg_msg {
537 union {
538 u8 msg_type;
539 struct pmu_pg_msg_elpg_msg elpg_msg;
540 struct pmu_pg_msg_stat stat;
541 struct pmu_pg_msg_eng_buf_stat eng_buf_stat;
542 /* TBD: other pg messages */
543 union pmu_ap_msg ap_msg;
544 };
545};
546
547enum {
548 PMU_RC_MSG_TYPE_UNHANDLED_CMD = 0,
549};
550
551struct pmu_rc_msg_unhandled_cmd {
552 u8 msg_type;
553 u8 unit_id;
554};
555
556struct pmu_rc_msg {
557 u8 msg_type;
558 struct pmu_rc_msg_unhandled_cmd unhandled_cmd;
559};
560
561enum {
562 PMU_PG_CMD_ID_ELPG_CMD = 0,
563 PMU_PG_CMD_ID_ENG_BUF_LOAD,
564 PMU_PG_CMD_ID_ENG_BUF_UNLOAD,
565 PMU_PG_CMD_ID_PG_STAT,
566 PMU_PG_CMD_ID_PG_LOG_INIT,
567 PMU_PG_CMD_ID_PG_LOG_FLUSH,
568 PMU_PG_CMD_ID_PG_PARAM,
569 PMU_PG_CMD_ID_ELPG_INIT,
570 PMU_PG_CMD_ID_ELPG_POLL_CTXSAVE,
571 PMU_PG_CMD_ID_ELPG_ABORT_POLL,
572 PMU_PG_CMD_ID_ELPG_PWR_UP,
573 PMU_PG_CMD_ID_ELPG_DISALLOW,
574 PMU_PG_CMD_ID_ELPG_ALLOW,
575 PMU_PG_CMD_ID_AP,
576 RM_PMU_PG_CMD_ID_PSI,
577 RM_PMU_PG_CMD_ID_CG,
578 PMU_PG_CMD_ID_ZBC_TABLE_UPDATE,
579 PMU_PG_CMD_ID_PWR_RAIL_GATE_DISABLE = 0x20,
580 PMU_PG_CMD_ID_PWR_RAIL_GATE_ENABLE,
581 PMU_PG_CMD_ID_PWR_RAIL_SMU_MSG_DISABLE
582};
583
584enum {
585 PMU_PG_ELPG_CMD_INIT,
586 PMU_PG_ELPG_CMD_DISALLOW,
587 PMU_PG_ELPG_CMD_ALLOW,
588 PMU_PG_ELPG_CMD_FREEZE,
589 PMU_PG_ELPG_CMD_UNFREEZE,
590};
591
592struct pmu_pg_cmd_elpg_cmd {
593 u8 cmd_type;
594 u8 engine_id;
595 u16 cmd;
596};
597
598struct pmu_pg_cmd_eng_buf_load {
599 u8 cmd_type;
600 u8 engine_id;
601 u8 buf_idx;
602 u8 pad;
603 u16 buf_size;
604 u32 dma_base;
605 u8 dma_offset;
606 u8 dma_idx;
607};
608
609enum {
610 PMU_PG_STAT_CMD_ALLOC_DMEM = 0,
611};
612
613struct pmu_pg_cmd_stat {
614 u8 cmd_type;
615 u8 engine_id;
616 u16 sub_cmd_id;
617 u32 data;
618};
619
620struct pmu_pg_cmd {
621 union {
622 u8 cmd_type;
623 struct pmu_pg_cmd_elpg_cmd elpg_cmd;
624 struct pmu_pg_cmd_eng_buf_load eng_buf_load;
625 struct pmu_pg_cmd_stat stat;
626 /* TBD: other pg commands */
627 union pmu_ap_cmd ap_cmd;
628 };
629};
630
631/* PERFMON */
632#define PMU_DOMAIN_GROUP_PSTATE 0
633#define PMU_DOMAIN_GROUP_GPC2CLK 1
634#define PMU_DOMAIN_GROUP_NUM 2
635
636/* TBD: smart strategy */
637#define PMU_PERFMON_PCT_TO_INC 58
638#define PMU_PERFMON_PCT_TO_DEC 23
639
640struct pmu_perfmon_counter {
641 u8 index;
642 u8 flags;
643 u8 group_id;
644 u8 valid;
645 u16 upper_threshold; /* units of 0.01% */
646 u16 lower_threshold; /* units of 0.01% */
647};
648
649#define PMU_PERFMON_FLAG_ENABLE_INCREASE (0x00000001)
650#define PMU_PERFMON_FLAG_ENABLE_DECREASE (0x00000002)
651#define PMU_PERFMON_FLAG_CLEAR_PREV (0x00000004)
652
653/* PERFMON CMD */
654enum {
655 PMU_PERFMON_CMD_ID_START = 0,
656 PMU_PERFMON_CMD_ID_STOP = 1,
657 PMU_PERFMON_CMD_ID_INIT = 2
658};
659
660struct pmu_perfmon_cmd_start_v1 {
661 u8 cmd_type;
662 u8 group_id;
663 u8 state_id;
664 u8 flags;
665 struct pmu_allocation_v1 counter_alloc;
666};
667
668struct pmu_perfmon_cmd_start_v0 {
669 u8 cmd_type;
670 u8 group_id;
671 u8 state_id;
672 u8 flags;
673 struct pmu_allocation_v0 counter_alloc;
674};
675
676struct pmu_perfmon_cmd_stop {
677 u8 cmd_type;
678};
679
680struct pmu_perfmon_cmd_init_v1 {
681 u8 cmd_type;
682 u8 to_decrease_count;
683 u8 base_counter_id;
684 u32 sample_period_us;
685 struct pmu_allocation_v1 counter_alloc;
686 u8 num_counters;
687 u8 samples_in_moving_avg;
688 u16 sample_buffer;
689};
690
691struct pmu_perfmon_cmd_init_v0 {
692 u8 cmd_type;
693 u8 to_decrease_count;
694 u8 base_counter_id;
695 u32 sample_period_us;
696 struct pmu_allocation_v0 counter_alloc;
697 u8 num_counters;
698 u8 samples_in_moving_avg;
699 u16 sample_buffer;
700};
701
702struct pmu_perfmon_cmd {
703 union {
704 u8 cmd_type;
705 struct pmu_perfmon_cmd_start_v0 start_v0;
706 struct pmu_perfmon_cmd_start_v1 start_v1;
707 struct pmu_perfmon_cmd_stop stop;
708 struct pmu_perfmon_cmd_init_v0 init_v0;
709 struct pmu_perfmon_cmd_init_v1 init_v1;
710 };
711};
712
713struct pmu_zbc_cmd {
714 u8 cmd_type;
715 u8 pad;
716 u16 entry_mask;
717};
718
719/* PERFMON MSG */
720enum {
721 PMU_PERFMON_MSG_ID_INCREASE_EVENT = 0,
722 PMU_PERFMON_MSG_ID_DECREASE_EVENT = 1,
723 PMU_PERFMON_MSG_ID_INIT_EVENT = 2,
724 PMU_PERFMON_MSG_ID_ACK = 3
725};
726
727struct pmu_perfmon_msg_generic {
728 u8 msg_type;
729 u8 state_id;
730 u8 group_id;
731 u8 data;
732};
733
734struct pmu_perfmon_msg {
735 union {
736 u8 msg_type;
737 struct pmu_perfmon_msg_generic gen;
738 };
739};
740
741
742struct pmu_cmd {
743 struct pmu_hdr hdr;
744 union {
745 struct pmu_perfmon_cmd perfmon;
746 struct pmu_pg_cmd pg;
747 struct pmu_zbc_cmd zbc;
748 } cmd;
749};
750
751struct pmu_msg {
752 struct pmu_hdr hdr;
753 union {
754 struct pmu_init_msg init;
755 struct pmu_perfmon_msg perfmon;
756 struct pmu_pg_msg pg;
757 struct pmu_rc_msg rc;
758 } msg;
759};
760
761#define PMU_SHA1_GID_SIGNATURE 0xA7C66AD2
762#define PMU_SHA1_GID_SIGNATURE_SIZE 4
763
764#define PMU_SHA1_GID_SIZE 16
765
766struct pmu_sha1_gid {
767 bool valid;
768 u8 gid[PMU_SHA1_GID_SIZE];
769};
770
771struct pmu_sha1_gid_data {
772 u8 signature[PMU_SHA1_GID_SIGNATURE_SIZE];
773 u8 gid[PMU_SHA1_GID_SIZE];
774};
775
776#define PMU_COMMAND_QUEUE_HPQ 0 /* write by sw, read by pmu, protected by sw mutex lock */
777#define PMU_COMMAND_QUEUE_LPQ 1 /* write by sw, read by pmu, protected by sw mutex lock */
778#define PMU_COMMAND_QUEUE_BIOS 2 /* read/write by sw/hw, protected by hw pmu mutex, id = 2 */
779#define PMU_COMMAND_QUEUE_SMI 3 /* read/write by sw/hw, protected by hw pmu mutex, id = 3 */
780#define PMU_MESSAGE_QUEUE 4 /* write by pmu, read by sw, accessed by interrupt handler, no lock */
781#define PMU_QUEUE_COUNT 5
782
783enum {
784 PMU_MUTEX_ID_RSVD1 = 0 ,
785 PMU_MUTEX_ID_GPUSER ,
786 PMU_MUTEX_ID_QUEUE_BIOS ,
787 PMU_MUTEX_ID_QUEUE_SMI ,
788 PMU_MUTEX_ID_GPMUTEX ,
789 PMU_MUTEX_ID_I2C ,
790 PMU_MUTEX_ID_RMLOCK ,
791 PMU_MUTEX_ID_MSGBOX ,
792 PMU_MUTEX_ID_FIFO ,
793 PMU_MUTEX_ID_PG ,
794 PMU_MUTEX_ID_GR ,
795 PMU_MUTEX_ID_CLK ,
796 PMU_MUTEX_ID_RSVD6 ,
797 PMU_MUTEX_ID_RSVD7 ,
798 PMU_MUTEX_ID_RSVD8 ,
799 PMU_MUTEX_ID_RSVD9 ,
800 PMU_MUTEX_ID_INVALID
801};
802
803#define PMU_IS_COMMAND_QUEUE(id) \
804 ((id) < PMU_MESSAGE_QUEUE)
805
806#define PMU_IS_SW_COMMAND_QUEUE(id) \
807 (((id) == PMU_COMMAND_QUEUE_HPQ) || \
808 ((id) == PMU_COMMAND_QUEUE_LPQ))
809
810#define PMU_IS_MESSAGE_QUEUE(id) \
811 ((id) == PMU_MESSAGE_QUEUE)
812
813enum
814{
815 OFLAG_READ = 0,
816 OFLAG_WRITE
817};
818
819#define QUEUE_SET (true)
820#define QUEUE_GET (false)
821
822#define QUEUE_ALIGNMENT (4)
823
824#define PMU_PGENG_GR_BUFFER_IDX_INIT (0)
825#define PMU_PGENG_GR_BUFFER_IDX_ZBC (1)
826#define PMU_PGENG_GR_BUFFER_IDX_FECS (2)
827
828enum
829{
830 PMU_DMAIDX_UCODE = 0,
831 PMU_DMAIDX_VIRT = 1,
832 PMU_DMAIDX_PHYS_VID = 2,
833 PMU_DMAIDX_PHYS_SYS_COH = 3,
834 PMU_DMAIDX_PHYS_SYS_NCOH = 4,
835 PMU_DMAIDX_RSVD = 5,
836 PMU_DMAIDX_PELPG = 6,
837 PMU_DMAIDX_END = 7
838};
839
840struct pmu_gk20a;
841struct pmu_queue;
842
843struct pmu_queue {
844
845 /* used by hw, for BIOS/SMI queue */
846 u32 mutex_id;
847 u32 mutex_lock;
848 /* used by sw, for LPQ/HPQ queue */
849 struct mutex mutex;
850
851 /* current write position */
852 u32 position;
853 /* physical dmem offset where this queue begins */
854 u32 offset;
855 /* logical queue identifier */
856 u32 id;
857 /* physical queue index */
858 u32 index;
859 /* in bytes */
860 u32 size;
861
862 /* open-flag */
863 u32 oflag;
864 bool opened; /* opened implies locked */
865 bool locked; /* check free space after setting locked but before setting opened */
866};
867
868
869#define PMU_MUTEX_ID_IS_VALID(id) \
870 ((id) < PMU_MUTEX_ID_INVALID)
871
872#define PMU_INVALID_MUTEX_OWNER_ID (0)
873
874struct pmu_mutex {
875 u32 id;
876 u32 index;
877 u32 ref_cnt;
878};
879
880#define PMU_MAX_NUM_SEQUENCES (256)
881#define PMU_SEQ_BIT_SHIFT (5)
882#define PMU_SEQ_TBL_SIZE \
883 (PMU_MAX_NUM_SEQUENCES >> PMU_SEQ_BIT_SHIFT)
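/*
 * 256 >> 5 == 8 table entries, i.e. one allocation bit per sequence when an
 * entry holds 32 bits (PMU_SEQ_BIT_SHIFT == 5).
 */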
884
885#define PMU_INVALID_SEQ_DESC (~0)
886
887enum
888{
889 PMU_SEQ_STATE_FREE = 0,
890 PMU_SEQ_STATE_PENDING,
891 PMU_SEQ_STATE_USED,
892 PMU_SEQ_STATE_CANCELLED
893};
894
895struct pmu_payload {
896 struct {
897 void *buf;
898 u32 offset;
899 u32 size;
900 } in, out;
901};
902
903typedef void (*pmu_callback)(struct gk20a *, struct pmu_msg *, void *, u32,
904 u32);
905
906struct pmu_sequence {
907 u8 id;
908 u32 state;
909 u32 desc;
910 struct pmu_msg *msg;
911 union {
912 struct pmu_allocation_v0 in_v0;
913 struct pmu_allocation_v1 in_v1;
914 };
915 union {
916 struct pmu_allocation_v0 out_v0;
917 struct pmu_allocation_v1 out_v1;
918 };
919 u8 *out_payload;
920 pmu_callback callback;
921 void* cb_params;
922};
923
924struct pmu_pg_stats {
925 u64 pg_entry_start_timestamp;
926 u64 pg_ingating_start_timestamp;
927 u64 pg_exit_start_timestamp;
928 u64 pg_ungating_start_timestamp;
929 u32 pg_avg_entry_time_us;
930 u32 pg_ingating_cnt;
931 u32 pg_ingating_time_us;
932 u32 pg_avg_exit_time_us;
933 u32 pg_ungating_count;
934 u32 pg_ungating_time_us;
935 u32 pg_gating_cnt;
936 u32 pg_gating_deny_cnt;
937};
938
939#define PMU_PG_IDLE_THRESHOLD_SIM 1000
940#define PMU_PG_POST_POWERUP_IDLE_THRESHOLD_SIM 4000000
941/* TBD: QT or else ? */
942#define PMU_PG_IDLE_THRESHOLD 15000
943#define PMU_PG_POST_POWERUP_IDLE_THRESHOLD 1000000
944
945/* state transition :
946 OFF => [OFF_ON_PENDING optional] => ON_PENDING => ON => OFF
947 ON => OFF is always synchronized */
948#define PMU_ELPG_STAT_OFF 0 /* elpg is off */
949#define PMU_ELPG_STAT_ON 1 /* elpg is on */
950#define PMU_ELPG_STAT_ON_PENDING 2 /* elpg is off, ALLOW cmd has been sent, wait for ack */
951#define PMU_ELPG_STAT_OFF_PENDING 3 /* elpg is on, DISALLOW cmd has been sent, wait for ack */
952#define PMU_ELPG_STAT_OFF_ON_PENDING 4 /* elpg is off, caller has requested on, but ALLOW
953 cmd hasn't been sent due to ENABLE_ALLOW delay */
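/*
 * These states back the flow in pmu_gk20a.c: gk20a_pmu_enable_elpg() sends
 * ALLOW (OFF -> ON_PENDING -> ON on ack), gk20a_pmu_disable_elpg() sends
 * DISALLOW (ON -> OFF_PENDING -> OFF on ack), and OFF_ON_PENDING marks an
 * enable request made while the deferred-enable delay is still running.
 */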
954
955/* Falcon Register index */
956#define PMU_FALCON_REG_R0 (0)
957#define PMU_FALCON_REG_R1 (1)
958#define PMU_FALCON_REG_R2 (2)
959#define PMU_FALCON_REG_R3 (3)
960#define PMU_FALCON_REG_R4 (4)
961#define PMU_FALCON_REG_R5 (5)
962#define PMU_FALCON_REG_R6 (6)
963#define PMU_FALCON_REG_R7 (7)
964#define PMU_FALCON_REG_R8 (8)
965#define PMU_FALCON_REG_R9 (9)
966#define PMU_FALCON_REG_R10 (10)
967#define PMU_FALCON_REG_R11 (11)
968#define PMU_FALCON_REG_R12 (12)
969#define PMU_FALCON_REG_R13 (13)
970#define PMU_FALCON_REG_R14 (14)
971#define PMU_FALCON_REG_R15 (15)
972#define PMU_FALCON_REG_IV0 (16)
973#define PMU_FALCON_REG_IV1 (17)
974#define PMU_FALCON_REG_UNDEFINED (18)
975#define PMU_FALCON_REG_EV (19)
976#define PMU_FALCON_REG_SP (20)
977#define PMU_FALCON_REG_PC (21)
978#define PMU_FALCON_REG_IMB (22)
979#define PMU_FALCON_REG_DMB (23)
980#define PMU_FALCON_REG_CSW (24)
981#define PMU_FALCON_REG_CCR (25)
982#define PMU_FALCON_REG_SEC (26)
983#define PMU_FALCON_REG_CTX (27)
984#define PMU_FALCON_REG_EXCI (28)
985#define PMU_FALCON_REG_RSVD0 (29)
986#define PMU_FALCON_REG_RSVD1 (30)
987#define PMU_FALCON_REG_RSVD2 (31)
988#define PMU_FALCON_REG_SIZE (32)
989
990struct pmu_gk20a {
991
992 struct gk20a *g;
993
994 struct pmu_ucode_desc *desc;
995 struct pmu_mem_desc ucode;
996
997 struct pmu_mem_desc pg_buf;
998 /* TBD: remove this if ZBC seq is fixed */
999 struct pmu_mem_desc seq_buf;
1000 bool buf_loaded;
1001
1002 struct pmu_sha1_gid gid_info;
1003
1004 struct pmu_queue queue[PMU_QUEUE_COUNT];
1005
1006 struct pmu_sequence *seq;
1007 unsigned long pmu_seq_tbl[PMU_SEQ_TBL_SIZE];
1008 u32 next_seq_desc;
1009
1010 struct pmu_mutex *mutex;
1011 u32 mutex_cnt;
1012
1013 struct mutex pmu_copy_lock;
1014 struct mutex pmu_seq_lock;
1015
1016 struct gk20a_allocator dmem;
1017
1018 u32 *ucode_image;
1019 bool pmu_ready;
1020
1021 u32 zbc_save_done;
1022
1023 u32 stat_dmem_offset;
1024
1025 bool elpg_ready;
1026 u32 elpg_stat;
1027 wait_queue_head_t pg_wq;
1028
1029#define PMU_ELPG_ENABLE_ALLOW_DELAY_MSEC 1 /* msec */
1030 struct delayed_work elpg_enable; /* deferred elpg enable */
1031 struct work_struct pg_init;
1032 bool elpg_enable_allow; /* true after init, false after disable, true after delay */
1033 struct mutex elpg_mutex; /* protect elpg enable/disable */
1034 int elpg_refcnt; /* disable -1, enable +1, <=0 elpg disabled, > 0 elpg enabled */
1035
1036 struct pmu_perfmon_counter perfmon_counter;
1037 u32 perfmon_state_id[PMU_DOMAIN_GROUP_NUM];
1038
1039 bool initialized;
1040
1041 void (*remove_support)(struct pmu_gk20a *pmu);
1042 bool sw_ready;
1043 bool perfmon_ready;
1044
1045 u32 sample_buffer;
1046
1047 struct mutex isr_mutex;
1048 bool zbc_ready;
1049 union {
1050 struct pmu_cmdline_args_v0 args_v0;
1051 struct pmu_cmdline_args_v1 args_v1;
1052 };
1053};
1054
1055struct gk20a_pmu_save_state {
1056 struct pmu_sequence *seq;
1057 u32 next_seq_desc;
1058 struct pmu_mutex *mutex;
1059 u32 mutex_cnt;
1060 struct pmu_ucode_desc *desc;
1061 struct pmu_mem_desc ucode;
1062 struct pmu_mem_desc seq_buf;
1063 struct pmu_mem_desc pg_buf;
1064 struct delayed_work elpg_enable;
1065 wait_queue_head_t pg_wq;
1066 bool sw_ready;
1067 struct work_struct pg_init;
1068};
1069
1070int gk20a_init_pmu_support(struct gk20a *g);
1071int gk20a_init_pmu_setup_hw2(struct gk20a *g);
1072
1073void gk20a_pmu_isr(struct gk20a *g);
1074
1075/* send a cmd to pmu */
1076int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, struct pmu_msg *msg,
1077 struct pmu_payload *payload, u32 queue_id,
1078 pmu_callback callback, void* cb_param,
1079 u32 *seq_desc, unsigned long timeout);
1080
1081int gk20a_pmu_enable_elpg(struct gk20a *g);
1082int gk20a_pmu_disable_elpg(struct gk20a *g);
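/*
 * Calls must be balanced (elpg_refcnt: disable -1 / enable +1). A
 * hypothetical caller that needs the graphics engine kept powered up would
 * bracket its work like this:
 *
 *	gk20a_pmu_disable_elpg(g);
 *	... touch state that must not be power-gated ...
 *	gk20a_pmu_enable_elpg(g);
 */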
1083
1084void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries);
1085
1086int gk20a_pmu_perfmon_enable(struct gk20a *g, bool enable);
1087
1088int pmu_mutex_acquire(struct pmu_gk20a *pmu, u32 id, u32 *token);
1089int pmu_mutex_release(struct pmu_gk20a *pmu, u32 id, u32 *token);
1090int gk20a_pmu_destroy(struct gk20a *g);
1091int gk20a_pmu_load_norm(struct gk20a *g, u32 *load);
1092int gk20a_pmu_debugfs_init(struct platform_device *dev);
1093void gk20a_pmu_reset_load_counters(struct gk20a *g);
1094void gk20a_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
1095 u32 *total_cycles);
1096
1097#endif /*__PMU_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c
new file mode 100644
index 00000000..aea1a80b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c
@@ -0,0 +1,91 @@
1/*
2 * GK20A priv ring
3 *
4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19#include <linux/delay.h> /* for mdelay */
20
21#include "gk20a.h"
22#include "hw_mc_gk20a.h"
23#include "hw_pri_ringmaster_gk20a.h"
24#include "hw_pri_ringstation_sys_gk20a.h"
25#include "hw_trim_gk20a.h"
26
27void gk20a_reset_priv_ring(struct gk20a *g)
28{
29 u32 data;
30
31 if (tegra_platform_is_linsim())
32 return;
33
34 data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
35 data = set_field(data,
36 trim_sys_gpc2clk_out_bypdiv_m(),
37 trim_sys_gpc2clk_out_bypdiv_f(0));
38 gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
39
40 gk20a_reset(g, mc_enable_priv_ring_enabled_f());
41
42	gk20a_writel(g, pri_ringmaster_command_r(),
43 0x4);
44
45 gk20a_writel(g, pri_ringstation_sys_decode_config_r(),
46 0x2);
47
48 gk20a_readl(g, pri_ringstation_sys_decode_config_r());
49}
50
51void gk20a_priv_ring_isr(struct gk20a *g)
52{
53 u32 status0, status1;
54 u32 cmd;
55 s32 retry = 100;
56
57 if (tegra_platform_is_linsim())
58 return;
59
60 status0 = gk20a_readl(g, pri_ringmaster_intr_status0_r());
61 status1 = gk20a_readl(g, pri_ringmaster_intr_status1_r());
62
63 gk20a_dbg_info("ringmaster intr status0: 0x%08x,"
64		" status1: 0x%08x", status0, status1);
65
66 if (status0 & (0x1 | 0x2 | 0x4)) {
67 gk20a_reset_priv_ring(g);
68 }
69
70 cmd = gk20a_readl(g, pri_ringmaster_command_r());
71 cmd = set_field(cmd, pri_ringmaster_command_cmd_m(),
72 pri_ringmaster_command_cmd_ack_interrupt_f());
73 gk20a_writel(g, pri_ringmaster_command_r(), cmd);
74
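	/* poll for the ack to be consumed: up to ~100 iterations of 20-40 us */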
75 do {
76 cmd = pri_ringmaster_command_cmd_v(
77 gk20a_readl(g, pri_ringmaster_command_r()));
78 usleep_range(20, 40);
79 } while (cmd != pri_ringmaster_command_cmd_no_cmd_v() && --retry);
80
81 if (retry <= 0)
82 gk20a_warn(dev_from_gk20a(g),
83 "priv ringmaster cmd ack too many retries");
84
85 status0 = gk20a_readl(g, pri_ringmaster_intr_status0_r());
86 status1 = gk20a_readl(g, pri_ringmaster_intr_status1_r());
87
88 gk20a_dbg_info("ringmaster intr status0: 0x%08x,"
89 " status1: 0x%08x", status0, status1);
90}
91
diff --git a/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h
new file mode 100644
index 00000000..cb9d49c7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h
@@ -0,0 +1,27 @@
1/*
2 * drivers/video/tegra/host/gk20a/priv_ring_gk20a.h
3 *
4 * GK20A PRIV ringmaster
5 *
6 * Copyright (c) 2011-2012, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef __PRIV_RING_GK20A_H__
22#define __PRIV_RING_GK20A_H__
23
24void gk20a_reset_priv_ring(struct gk20a *g);
25void gk20a_priv_ring_isr(struct gk20a *g);
26
27#endif /*__PRIV_RING_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/regops_gk20a.c b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c
new file mode 100644
index 00000000..4a115fb1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c
@@ -0,0 +1,704 @@
1/*
2 *
3 * Tegra GK20A GPU Debugger Driver Register Ops
4 *
5 * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include <linux/slab.h>
21#include <linux/err.h>
22#include <linux/bsearch.h>
23#include <linux/nvhost_dbg_gpu_ioctl.h>
24
25#include "gk20a.h"
26#include "gr_gk20a.h"
27#include "dbg_gpu_gk20a.h"
28#include "regops_gk20a.h"
29
30
31
32struct regop_offset_range {
33 u32 base:24;
34 u32 count:8;
35};
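/*
 * A range covers 'count' consecutive 32-bit registers starting at byte
 * offset 'base', i.e. offsets [base, base + count * 4).
 */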
36
37static int regop_bsearch_range_cmp(const void *pkey, const void *pelem)
38{
39 u32 key = *(u32 *)pkey;
40 struct regop_offset_range *prange = (struct regop_offset_range *)pelem;
41 if (key < prange->base)
42 return -1;
43 else if (prange->base <= key && key < (prange->base +
44 (prange->count * 4)))
45 return 0;
46 return 1;
47}
48
49static inline bool linear_search(u32 offset, const u32 *list, int size)
50{
51 int i;
52 for (i = 0; i < size; i++)
53 if (list[i] == offset)
54 return true;
55 return false;
56}
57
58static const struct regop_offset_range gk20a_global_whitelist_ranges[] = {
59 { 0x000004f0, 1 },
60 { 0x00001a00, 3 },
61 { 0x0000259c, 1 },
62 { 0x0000280c, 1 },
63 { 0x00009400, 1 },
64 { 0x00009410, 1 },
65 { 0x00020200, 1 },
66 { 0x00022430, 7 },
67 { 0x00022548, 1 },
68 { 0x00100c18, 3 },
69 { 0x00100c84, 1 },
70 { 0x00100cc4, 1 },
71 { 0x00106640, 1 },
72 { 0x0010a0a8, 1 },
73 { 0x0010a4f0, 1 },
74 { 0x0010e064, 1 },
75 { 0x0010e164, 1 },
76 { 0x0010e490, 1 },
77 { 0x00110100, 1 },
78 { 0x00140028, 1 },
79 { 0x001408dc, 1 },
80 { 0x00140a5c, 1 },
81 { 0x001410dc, 1 },
82 { 0x0014125c, 1 },
83 { 0x0017e028, 1 },
84 { 0x0017e8dc, 1 },
85 { 0x0017ea5c, 1 },
86 { 0x0017f0dc, 1 },
87 { 0x0017f25c, 1 },
88 { 0x00180000, 68 },
89 { 0x00180200, 68 },
90 { 0x001a0000, 68 },
91 { 0x001b0000, 68 },
92 { 0x001b0200, 68 },
93 { 0x001b0400, 68 },
94 { 0x001b0600, 68 },
95 { 0x001b4000, 3 },
96 { 0x001b4010, 3 },
97 { 0x001b4020, 3 },
98 { 0x001b4040, 3 },
99 { 0x001b4050, 3 },
100 { 0x001b4060, 16 },
101 { 0x001b40a4, 1 },
102 { 0x001b4100, 6 },
103 { 0x001b4124, 2 },
104 { 0x001b8000, 7 },
105 { 0x001bc000, 7 },
106 { 0x001be000, 7 },
107 { 0x00400500, 1 },
108 { 0x00400700, 1 },
109 { 0x0040415c, 1 },
110 { 0x00405850, 1 },
111 { 0x00405908, 1 },
112 { 0x00405b40, 1 },
113 { 0x00405b50, 1 },
114 { 0x00406024, 1 },
115 { 0x00407010, 1 },
116 { 0x00407808, 1 },
117 { 0x0040803c, 1 },
118 { 0x0040880c, 1 },
119 { 0x00408910, 1 },
120 { 0x00408984, 1 },
121 { 0x004090a8, 1 },
122 { 0x004098a0, 1 },
123 { 0x0041000c, 1 },
124 { 0x00410110, 1 },
125 { 0x00410184, 1 },
126 { 0x00418384, 1 },
127 { 0x004184a0, 1 },
128 { 0x00418604, 1 },
129 { 0x00418680, 1 },
130 { 0x00418714, 1 },
131 { 0x0041881c, 1 },
132 { 0x004188c8, 2 },
133 { 0x00418b04, 1 },
134 { 0x00418c04, 1 },
135 { 0x00418c64, 2 },
136 { 0x00418c88, 1 },
137 { 0x00418cb4, 2 },
138 { 0x00418d00, 1 },
139 { 0x00418d28, 2 },
140 { 0x00418e08, 1 },
141 { 0x00418e1c, 2 },
142 { 0x00418f08, 1 },
143 { 0x00418f20, 2 },
144 { 0x00419000, 1 },
145 { 0x0041900c, 1 },
146 { 0x00419018, 1 },
147 { 0x00419854, 1 },
148 { 0x00419ab0, 1 },
149 { 0x00419ab8, 3 },
150 { 0x00419ac8, 1 },
151 { 0x00419c0c, 1 },
152 { 0x00419c8c, 3 },
153 { 0x00419ca8, 1 },
154 { 0x00419d08, 2 },
155 { 0x00419e00, 1 },
156 { 0x00419e0c, 1 },
157 { 0x00419e14, 2 },
158 { 0x00419e24, 2 },
159 { 0x00419e34, 2 },
160 { 0x00419e44, 4 },
161 { 0x00419ea4, 1 },
162 { 0x00419eb0, 1 },
163 { 0x0041a0a0, 1 },
164 { 0x0041a0a8, 1 },
165 { 0x0041a17c, 1 },
166 { 0x0041a890, 2 },
167 { 0x0041a8a0, 3 },
168 { 0x0041a8b0, 2 },
169 { 0x0041b014, 1 },
170 { 0x0041b0a0, 1 },
171 { 0x0041b0cc, 1 },
172 { 0x0041b0e8, 2 },
173 { 0x0041b1dc, 1 },
174 { 0x0041b1f8, 2 },
175 { 0x0041be14, 1 },
176 { 0x0041bea0, 1 },
177 { 0x0041becc, 1 },
178 { 0x0041bee8, 2 },
179 { 0x0041bfdc, 1 },
180 { 0x0041bff8, 2 },
181 { 0x0041c054, 1 },
182 { 0x0041c2b0, 1 },
183 { 0x0041c2b8, 3 },
184 { 0x0041c2c8, 1 },
185 { 0x0041c40c, 1 },
186 { 0x0041c48c, 3 },
187 { 0x0041c4a8, 1 },
188 { 0x0041c508, 2 },
189 { 0x0041c600, 1 },
190 { 0x0041c60c, 1 },
191 { 0x0041c614, 2 },
192 { 0x0041c624, 2 },
193 { 0x0041c634, 2 },
194 { 0x0041c644, 4 },
195 { 0x0041c6a4, 1 },
196 { 0x0041c6b0, 1 },
197 { 0x00500384, 1 },
198 { 0x005004a0, 1 },
199 { 0x00500604, 1 },
200 { 0x00500680, 1 },
201 { 0x00500714, 1 },
202 { 0x0050081c, 1 },
203 { 0x005008c8, 2 },
204 { 0x00500b04, 1 },
205 { 0x00500c04, 1 },
206 { 0x00500c64, 2 },
207 { 0x00500c88, 1 },
208 { 0x00500cb4, 2 },
209 { 0x00500d00, 1 },
210 { 0x00500d28, 2 },
211 { 0x00500e08, 1 },
212 { 0x00500e1c, 2 },
213 { 0x00500f08, 1 },
214 { 0x00500f20, 2 },
215 { 0x00501000, 1 },
216 { 0x0050100c, 1 },
217 { 0x00501018, 1 },
218 { 0x00501854, 1 },
219 { 0x00501ab0, 1 },
220 { 0x00501ab8, 3 },
221 { 0x00501ac8, 1 },
222 { 0x00501c0c, 1 },
223 { 0x00501c8c, 3 },
224 { 0x00501ca8, 1 },
225 { 0x00501d08, 2 },
226 { 0x00501e00, 1 },
227 { 0x00501e0c, 1 },
228 { 0x00501e14, 2 },
229 { 0x00501e24, 2 },
230 { 0x00501e34, 2 },
231 { 0x00501e44, 4 },
232 { 0x00501ea4, 1 },
233 { 0x00501eb0, 1 },
234 { 0x005020a0, 1 },
235 { 0x005020a8, 1 },
236 { 0x0050217c, 1 },
237 { 0x00502890, 2 },
238 { 0x005028a0, 3 },
239 { 0x005028b0, 2 },
240 { 0x00503014, 1 },
241 { 0x005030a0, 1 },
242 { 0x005030cc, 1 },
243 { 0x005030e8, 2 },
244 { 0x005031dc, 1 },
245 { 0x005031f8, 2 },
246 { 0x00503e14, 1 },
247 { 0x00503ea0, 1 },
248 { 0x00503ecc, 1 },
249 { 0x00503ee8, 2 },
250 { 0x00503fdc, 1 },
251 { 0x00503ff8, 2 },
252 { 0x00504054, 1 },
253 { 0x005042b0, 1 },
254 { 0x005042b8, 3 },
255 { 0x005042c8, 1 },
256 { 0x0050440c, 1 },
257 { 0x0050448c, 3 },
258 { 0x005044a8, 1 },
259 { 0x00504508, 2 },
260 { 0x00504600, 1 },
261 { 0x0050460c, 1 },
262 { 0x00504614, 2 },
263 { 0x00504624, 2 },
264 { 0x00504634, 2 },
265 { 0x00504644, 4 },
266 { 0x005046a4, 1 },
267 { 0x005046b0, 1 },
268};
269static const u32 gk20a_global_whitelist_ranges_count =
270 ARRAY_SIZE(gk20a_global_whitelist_ranges);
271
272/* context */
273
274static const struct regop_offset_range gk20a_context_whitelist_ranges[] = {
275 { 0x0000280c, 1 },
276 { 0x00100cc4, 1 },
277 { 0x00400500, 1 },
278 { 0x00405b40, 1 },
279 { 0x00419000, 1 },
280 { 0x00419c8c, 3 },
281 { 0x00419d08, 2 },
282 { 0x00419e04, 3 },
283 { 0x00419e14, 2 },
284 { 0x00419e24, 2 },
285 { 0x00419e34, 2 },
286 { 0x00419e44, 4 },
287 { 0x00419e58, 6 },
288 { 0x00419e84, 5 },
289 { 0x00419ea4, 1 },
290 { 0x00419eac, 2 },
291 { 0x00419f30, 8 },
292 { 0x0041c48c, 3 },
293 { 0x0041c508, 2 },
294 { 0x0041c604, 3 },
295 { 0x0041c614, 2 },
296 { 0x0041c624, 2 },
297 { 0x0041c634, 2 },
298 { 0x0041c644, 4 },
299 { 0x0041c658, 6 },
300 { 0x0041c684, 5 },
301 { 0x0041c6a4, 1 },
302 { 0x0041c6ac, 2 },
303 { 0x0041c730, 8 },
304 { 0x00501000, 1 },
305 { 0x00501c8c, 3 },
306 { 0x00501d08, 2 },
307 { 0x00501e04, 3 },
308 { 0x00501e14, 2 },
309 { 0x00501e24, 2 },
310 { 0x00501e34, 2 },
311 { 0x00501e44, 4 },
312 { 0x00501e58, 6 },
313 { 0x00501e84, 5 },
314 { 0x00501ea4, 1 },
315 { 0x00501eac, 2 },
316 { 0x00501f30, 8 },
317 { 0x0050448c, 3 },
318 { 0x00504508, 2 },
319 { 0x00504604, 3 },
320 { 0x00504614, 2 },
321 { 0x00504624, 2 },
322 { 0x00504634, 2 },
323 { 0x00504644, 4 },
324 { 0x00504658, 6 },
325 { 0x00504684, 5 },
326 { 0x005046a4, 1 },
327 { 0x005046ac, 2 },
328 { 0x00504730, 8 },
329};
330static const u32 gk20a_context_whitelist_ranges_count =
331 ARRAY_SIZE(gk20a_context_whitelist_ranges);
332
333/* runcontrol */
334static const u32 gk20a_runcontrol_whitelist[] = {
335 0x00419e10,
336 0x0041c610,
337 0x00501e10,
338 0x00504610,
339};
340static const u32 gk20a_runcontrol_whitelist_count =
341 ARRAY_SIZE(gk20a_runcontrol_whitelist);
342
343static const struct regop_offset_range gk20a_runcontrol_whitelist_ranges[] = {
344 { 0x00419e10, 1 },
345 { 0x0041c610, 1 },
346 { 0x00501e10, 1 },
347 { 0x00504610, 1 },
348};
349static const u32 gk20a_runcontrol_whitelist_ranges_count =
350 ARRAY_SIZE(gk20a_runcontrol_whitelist_ranges);
351
352
353/* quad ctl */
354static const u32 gk20a_qctl_whitelist[] = {
355 0x00504670,
356 0x00504674,
357 0x00504678,
358 0x0050467c,
359 0x00504680,
360 0x00504730,
361 0x00504734,
362 0x00504738,
363 0x0050473c,
364};
365static const u32 gk20a_qctl_whitelist_count =
366 ARRAY_SIZE(gk20a_qctl_whitelist);
367
368static const struct regop_offset_range gk20a_qctl_whitelist_ranges[] = {
369 { 0x00504670, 1 },
370 { 0x00504730, 4 },
371};
372static const u32 gk20a_qctl_whitelist_ranges_count =
373 ARRAY_SIZE(gk20a_qctl_whitelist_ranges);
374
375
376
377
378static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
379 u32 *ctx_rd_count, u32 *ctx_wr_count,
380 struct nvhost_dbg_gpu_reg_op *ops,
381 u32 op_count);
382
383
384int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
385 struct nvhost_dbg_gpu_reg_op *ops,
386 u64 num_ops)
387{
388 int err = 0, i;
389 struct channel_gk20a *ch = NULL;
390 struct gk20a *g = dbg_s->g;
391 /*struct gr_gk20a *gr = &g->gr;*/
392 u32 data32_lo = 0, data32_hi = 0;
393 u32 ctx_rd_count = 0, ctx_wr_count = 0;
394 bool skip_read_lo, skip_read_hi;
395 bool ok;
396
397 gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
398
399 ch = dbg_s->ch;
400
401 ok = validate_reg_ops(dbg_s,
402 &ctx_rd_count, &ctx_wr_count,
403 ops, num_ops);
404 if (!ok) {
405 dev_err(dbg_s->dev, "invalid op(s)");
406 err = -EINVAL;
407 /* each op has its own err/status */
408 goto clean_up;
409 }
410
411 for (i = 0; i < num_ops; i++) {
412 /* if it isn't global then it is done in the ctx ops... */
413 if (ops[i].type != REGOP(TYPE_GLOBAL))
414 continue;
415
416 switch (ops[i].op) {
417
418 case REGOP(READ_32):
419 ops[i].value_hi = 0;
420 ops[i].value_lo = gk20a_readl(g, ops[i].offset);
421 gk20a_dbg(gpu_dbg_gpu_dbg, "read_32 0x%08x from 0x%08x",
422 ops[i].value_lo, ops[i].offset);
423
424 break;
425
426 case REGOP(READ_64):
427 ops[i].value_lo = gk20a_readl(g, ops[i].offset);
428 ops[i].value_hi =
429 gk20a_readl(g, ops[i].offset + 4);
430
431 gk20a_dbg(gpu_dbg_gpu_dbg, "read_64 0x%08x:%08x from 0x%08x",
432 ops[i].value_hi, ops[i].value_lo,
433 ops[i].offset);
434 break;
435
436 case REGOP(WRITE_32):
437 case REGOP(WRITE_64):
438 /* some of this appears wonky/unnecessary but
439 we've kept it for compat with existing
440 debugger code. just in case... */
441 skip_read_lo = skip_read_hi = false;
442 if (ops[i].and_n_mask_lo == ~(u32)0) {
443 data32_lo = ops[i].value_lo;
444 skip_read_lo = true;
445 }
446
447 if ((ops[i].op == REGOP(WRITE_64)) &&
448 (ops[i].and_n_mask_hi == ~(u32)0)) {
449 data32_hi = ops[i].value_hi;
450 skip_read_hi = true;
451 }
452
453 /* read first 32bits */
454 if (unlikely(skip_read_lo == false)) {
455 data32_lo = gk20a_readl(g, ops[i].offset);
456 data32_lo &= ~ops[i].and_n_mask_lo;
457 data32_lo |= ops[i].value_lo;
458 }
459
460 /* if desired, read second 32bits */
461 if ((ops[i].op == REGOP(WRITE_64)) &&
462 !skip_read_hi) {
463 data32_hi = gk20a_readl(g, ops[i].offset + 4);
464 data32_hi &= ~ops[i].and_n_mask_hi;
465 data32_hi |= ops[i].value_hi;
466 }
467
468 /* now update first 32bits */
469 gk20a_writel(g, ops[i].offset, data32_lo);
470 gk20a_dbg(gpu_dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
471 data32_lo, ops[i].offset);
472 /* if desired, update second 32bits */
473 if (ops[i].op == REGOP(WRITE_64)) {
474 gk20a_writel(g, ops[i].offset + 4, data32_hi);
475 gk20a_dbg(gpu_dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
476 data32_hi, ops[i].offset + 4);
477
478 }
479
480
481 break;
482
483 /* shouldn't happen as we've already screened */
484 default:
485 BUG();
486 err = -EINVAL;
487 goto clean_up;
488 break;
489 }
490 }
491
492 if (ctx_wr_count | ctx_rd_count) {
493 err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops,
494 ctx_wr_count, ctx_rd_count);
495 if (err) {
496 dev_warn(dbg_s->dev,
497 "failed to perform ctx ops\n");
498 goto clean_up;
499 }
500 }
501
502 clean_up:
503 gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
504 return err;
505
506}
507
508
509static int validate_reg_op_info(struct dbg_session_gk20a *dbg_s,
510 struct nvhost_dbg_gpu_reg_op *op)
511{
512 int err = 0;
513
514 op->status = REGOP(STATUS_SUCCESS);
515
516 switch (op->op) {
517 case REGOP(READ_32):
518 case REGOP(READ_64):
519 case REGOP(WRITE_32):
520 case REGOP(WRITE_64):
521 break;
522 default:
523 op->status |= REGOP(STATUS_UNSUPPORTED_OP);
524 /*gk20a_err(dbg_s->dev, "Invalid regops op %d!", op->op);*/
525 err = -EINVAL;
526 break;
527 }
528
529 switch (op->type) {
530 case REGOP(TYPE_GLOBAL):
531 case REGOP(TYPE_GR_CTX):
532 case REGOP(TYPE_GR_CTX_TPC):
533 case REGOP(TYPE_GR_CTX_SM):
534 case REGOP(TYPE_GR_CTX_CROP):
535 case REGOP(TYPE_GR_CTX_ZROP):
536 case REGOP(TYPE_GR_CTX_QUAD):
537 break;
538 /*
539 case NVHOST_DBG_GPU_REG_OP_TYPE_FB:
540 */
541 default:
542 op->status |= REGOP(STATUS_INVALID_TYPE);
543 /*gk20a_err(dbg_s->dev, "Invalid regops type %d!", op->type);*/
544 err = -EINVAL;
545 break;
546 }
547
548 return err;
549}
550
551static bool check_whitelists(struct dbg_session_gk20a *dbg_s,
552 struct nvhost_dbg_gpu_reg_op *op, u32 offset)
553{
554 bool valid = false;
555
556 if (op->type == REGOP(TYPE_GLOBAL)) {
557 /* search global list */
558 valid = !!bsearch(&offset,
559 gk20a_global_whitelist_ranges,
560 gk20a_global_whitelist_ranges_count,
561 sizeof(*gk20a_global_whitelist_ranges),
562 regop_bsearch_range_cmp);
563
564 /* if debug session and channel is bound search context list */
565 if ((!valid) && (!dbg_s->is_profiler && dbg_s->ch)) {
566 /* binary search context list */
567 valid = !!bsearch(&offset,
568 gk20a_context_whitelist_ranges,
569 gk20a_context_whitelist_ranges_count,
570 sizeof(*gk20a_context_whitelist_ranges),
571 regop_bsearch_range_cmp);
572 }
573
574 /* if debug session and channel is bound search runcontrol list */
575 if ((!valid) && (!dbg_s->is_profiler && dbg_s->ch)) {
576 valid = linear_search(offset,
577 gk20a_runcontrol_whitelist,
578 gk20a_runcontrol_whitelist_count);
579 }
580 } else if (op->type == REGOP(TYPE_GR_CTX)) {
581 /* it's a context-relative op */
582 if (!dbg_s->ch) {
583 gk20a_err(dbg_s->dev, "can't perform ctx regop unless bound");
584 op->status = REGOP(STATUS_UNSUPPORTED_OP);
585			return false;
586 }
587
588 /* binary search context list */
589 valid = !!bsearch(&offset,
590 gk20a_context_whitelist_ranges,
591 gk20a_context_whitelist_ranges_count,
592 sizeof(*gk20a_context_whitelist_ranges),
593 regop_bsearch_range_cmp);
594
595 /* if debug session and channel is bound search runcontrol list */
596 if ((!valid) && (!dbg_s->is_profiler && dbg_s->ch)) {
597 valid = linear_search(offset,
598 gk20a_runcontrol_whitelist,
599 gk20a_runcontrol_whitelist_count);
600 }
601
602 } else if (op->type == REGOP(TYPE_GR_CTX_QUAD)) {
603 valid = linear_search(offset,
604 gk20a_qctl_whitelist,
605 gk20a_qctl_whitelist_count);
606 }
607
608 return valid;
609}
610
611/* note: the op here has already been through validate_reg_op_info */
612static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s,
613 struct nvhost_dbg_gpu_reg_op *op)
614{
615 int err;
616 u32 buf_offset_lo, buf_offset_addr, num_offsets, offset;
617 bool valid = false;
618
619 op->status = 0;
620 offset = op->offset;
621
622 /* support only 24-bit 4-byte aligned offsets */
623 if (offset & 0xFF000003) {
624 gk20a_err(dbg_s->dev, "invalid regop offset: 0x%x\n", offset);
625 op->status |= REGOP(STATUS_INVALID_OFFSET);
626 return -EINVAL;
627 }
628
629 valid = check_whitelists(dbg_s, op, offset);
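	/* a 64-bit op must have both its low and high words whitelisted */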
630 if ((op->op == REGOP(READ_64) || op->op == REGOP(WRITE_64)) && valid)
631 valid = check_whitelists(dbg_s, op, offset + 4);
632
633 if (valid && (op->type != REGOP(TYPE_GLOBAL))) {
634 err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->g,
635 op->offset,
636 1,
637 &buf_offset_lo,
638 &buf_offset_addr,
639 &num_offsets,
640 op->type == REGOP(TYPE_GR_CTX_QUAD),
641 op->quad);
642 if (err) {
643 op->status |= REGOP(STATUS_INVALID_OFFSET);
644 return -EINVAL;
645 }
646 if (!buf_offset_lo) {
647 op->status |= REGOP(STATUS_INVALID_OFFSET);
648 return -EINVAL;
649 }
650 }
651
652 if (!valid) {
653 gk20a_err(dbg_s->dev, "invalid regop offset: 0x%x\n", offset);
654 op->status |= REGOP(STATUS_INVALID_OFFSET);
655 return -EINVAL;
656 }
657
658 return 0;
659}
660
661static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
662 u32 *ctx_rd_count, u32 *ctx_wr_count,
663 struct nvhost_dbg_gpu_reg_op *ops,
664 u32 op_count)
665{
666 u32 i;
667 int err;
668 bool ok = true;
669
670 /* keep going until the end so every op can get
671 * a separate error code if needed */
672 for (i = 0; i < op_count; i++) {
673
674 err = validate_reg_op_info(dbg_s, &ops[i]);
675 ok &= !err;
676
677 if (reg_op_is_gr_ctx(ops[i].type)) {
678 if (reg_op_is_read(ops[i].op))
679 (*ctx_rd_count)++;
680 else
681 (*ctx_wr_count)++;
682 }
683
684 err = validate_reg_op_offset(dbg_s, &ops[i]);
685 ok &= !err;
686 }
687
688 gk20a_dbg(gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d\n",
689 *ctx_wr_count, *ctx_rd_count);
690
691 return ok;
692}
693
694/* exported for tools like cyclestats, etc */
695bool is_bar0_global_offset_whitelisted_gk20a(u32 offset)
696{
697
698 bool valid = !!bsearch(&offset,
699 gk20a_global_whitelist_ranges,
700 gk20a_global_whitelist_ranges_count,
701 sizeof(*gk20a_global_whitelist_ranges),
702 regop_bsearch_range_cmp);
703 return valid;
704}
diff --git a/drivers/gpu/nvgpu/gk20a/regops_gk20a.h b/drivers/gpu/nvgpu/gk20a/regops_gk20a.h
new file mode 100644
index 00000000..23b4865b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/regops_gk20a.h
@@ -0,0 +1,47 @@
1/*
2 *
3 * Tegra GK20A GPU Debugger Driver Register Ops
4 *
5 * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19#ifndef __REGOPS_GK20A_H_
20#define __REGOPS_GK20A_H_
21
22int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
23 struct nvhost_dbg_gpu_reg_op *ops,
24 u64 num_ops);
25
26/* turn seriously unwieldy names -> something shorter */
27#define REGOP(x) NVHOST_DBG_GPU_REG_OP_##x
28
29
30static inline bool reg_op_is_gr_ctx(u8 type)
31{
32 return type == REGOP(TYPE_GR_CTX) ||
33 type == REGOP(TYPE_GR_CTX_TPC) ||
34 type == REGOP(TYPE_GR_CTX_SM) ||
35 type == REGOP(TYPE_GR_CTX_CROP) ||
36 type == REGOP(TYPE_GR_CTX_ZROP) ||
37 type == REGOP(TYPE_GR_CTX_QUAD);
38}
39static inline bool reg_op_is_read(u8 op)
40{
41 return op == REGOP(READ_32) ||
42	       op == REGOP(READ_64);
43}
44
45bool is_bar0_global_offset_whitelisted_gk20a(u32 offset);
46
47#endif /* __REGOPS_GK20A_H_ */
diff --git a/drivers/gpu/nvgpu/gk20a/sim_gk20a.h b/drivers/gpu/nvgpu/gk20a/sim_gk20a.h
new file mode 100644
index 00000000..5fc8006e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/sim_gk20a.h
@@ -0,0 +1,62 @@
1/*
2 * drivers/video/tegra/host/gk20a/sim_gk20a.h
3 *
4 * GK20A sim support
5 *
6 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef __SIM_GK20A_H__
22#define __SIM_GK20A_H__
23
24
25struct gk20a;
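/*
 * Per-GPU simulator state: the mapped escape registers plus the send,
 * receive and message pages used to exchange requests with the simulator.
 */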
26struct sim_gk20a {
27 struct gk20a *g;
28 struct resource *reg_mem;
29 void __iomem *regs;
30 struct {
31 struct page *page;
32 void *kvaddr;
33 phys_addr_t phys;
34 } send_bfr, recv_bfr, msg_bfr;
35 u32 send_ring_put;
36 u32 recv_ring_get;
37 u32 recv_ring_put;
38 u32 sequence_base;
39 void (*remove_support)(struct sim_gk20a *);
40};
41
42
43int gk20a_sim_esc_read(struct gk20a *g, char *path, u32 index,
44 u32 count, u32 *data);
45
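/* stub used when not running on the simulator: report failure and
 * return all-ones data */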
46static inline int gk20a_sim_esc_read_no_sim(struct gk20a *g, char *p,
47 u32 i, u32 c, u32 *d)
48{
49 *d = ~(u32)0;
50 return -1;
51}
52
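/* read one 32-bit value via the simulator escape interface on ASIM,
 * otherwise fall back to the failing stub above */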
53static inline int gk20a_sim_esc_readl(struct gk20a *g, char *p, u32 i, u32 *d)
54{
55 if (tegra_cpu_is_asim())
56 return gk20a_sim_esc_read(g, p, i, sizeof(u32), d);
57
58 return gk20a_sim_esc_read_no_sim(g, p, i, sizeof(u32), d);
59}
60
61
62#endif /*__SIM_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/therm_gk20a.c b/drivers/gpu/nvgpu/gk20a/therm_gk20a.c
new file mode 100644
index 00000000..da911979
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/therm_gk20a.c
@@ -0,0 +1,142 @@
1/*
2 * drivers/video/tegra/host/gk20a/therm_gk20a.c
3 *
4 * GK20A Therm
5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21
22#include "gk20a.h"
23#include "hw_chiplet_pwr_gk20a.h"
24#include "hw_gr_gk20a.h"
25#include "hw_therm_gk20a.h"
26
27static int gk20a_init_therm_reset_enable_hw(struct gk20a *g)
28{
29 return 0;
30}
31
32static int gk20a_init_therm_setup_sw(struct gk20a *g)
33{
34 return 0;
35}
36
37static int gk20a_init_therm_setup_hw(struct gk20a *g)
38{
39 /* program NV_THERM registers */
40 gk20a_writel(g, therm_use_a_r(), NV_THERM_USE_A_INIT);
41 gk20a_writel(g, therm_evt_ext_therm_0_r(),
42 NV_THERM_EVT_EXT_THERM_0_INIT);
43 gk20a_writel(g, therm_evt_ext_therm_1_r(),
44 NV_THERM_EVT_EXT_THERM_1_INIT);
45 gk20a_writel(g, therm_evt_ext_therm_2_r(),
46 NV_THERM_EVT_EXT_THERM_2_INIT);
47
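/* block-activity / peak-power throttling programming below is currently
 * left disabled */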
48/*
49 u32 data;
50
51 data = gk20a_readl(g, gr_gpcs_tpcs_l1c_cfg_r());
52 data = set_field(data, gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_m(),
53 gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_enable_f());
54 gk20a_writel(g, gr_gpcs_tpcs_l1c_cfg_r(), data);
55
56 data = gk20a_readl(g, gr_gpcs_tpcs_l1c_pm_r());
57 data = set_field(data, gr_gpcs_tpcs_l1c_pm_enable_m(),
58 gr_gpcs_tpcs_l1c_pm_enable_enable_f());
59 gk20a_writel(g, gr_gpcs_tpcs_l1c_pm_r(), data);
60
61 data = gk20a_readl(g, gr_gpcs_tpcs_sm_pm_ctrl_r());
62 data = set_field(data, gr_gpcs_tpcs_sm_pm_ctrl_core_enable_m(),
63 gr_gpcs_tpcs_sm_pm_ctrl_core_enable_enable_f());
64 data = set_field(data, gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_m(),
65 gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_enable_f());
66 gk20a_writel(g, gr_gpcs_tpcs_sm_pm_ctrl_r(), data);
67
68 data = gk20a_readl(g, gr_gpcs_tpcs_sm_halfctl_ctrl_r());
69 data = set_field(data, gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_m(),
70 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_enable_f());
71 gk20a_writel(g, gr_gpcs_tpcs_sm_halfctl_ctrl_r(), data);
72
73 data = gk20a_readl(g, gr_gpcs_tpcs_sm_debug_sfe_control_r());
74 data = set_field(data, gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_m(),
75 gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_enable_f());
76 gk20a_writel(g, gr_gpcs_tpcs_sm_debug_sfe_control_r(), data);
77
78 gk20a_writel(g, therm_peakpower_config6_r(0),
79 therm_peakpower_config6_trigger_cfg_1h_intr_f() |
80 therm_peakpower_config6_trigger_cfg_1l_intr_f());
81
82 gk20a_writel(g, chiplet_pwr_gpcs_config_1_r(),
83 chiplet_pwr_gpcs_config_1_ba_enable_yes_f());
84 gk20a_writel(g, chiplet_pwr_fbps_config_1_r(),
85 chiplet_pwr_fbps_config_1_ba_enable_yes_f());
86
87 data = gk20a_readl(g, therm_config1_r());
88 data = set_field(data, therm_config1_ba_enable_m(),
89 therm_config1_ba_enable_yes_f());
90 gk20a_writel(g, therm_config1_r(), data);
91
92 gk20a_writel(g, gr_gpcs_tpcs_sm_power_throttle_r(), 0x441a);
93
94 gk20a_writel(g, therm_weight_1_r(), 0xd3);
95 gk20a_writel(g, chiplet_pwr_gpcs_weight_6_r(), 0x7d);
96 gk20a_writel(g, chiplet_pwr_gpcs_weight_7_r(), 0xff);
97 gk20a_writel(g, chiplet_pwr_fbps_weight_0_r(), 0x13000000);
98 gk20a_writel(g, chiplet_pwr_fbps_weight_1_r(), 0x19);
99
100 gk20a_writel(g, therm_peakpower_config8_r(0), 0x8);
101 gk20a_writel(g, therm_peakpower_config9_r(0), 0x0);
102
103 gk20a_writel(g, therm_evt_ba_w0_t1h_r(), 0x100);
104
105 gk20a_writel(g, therm_use_a_r(), therm_use_a_ba_w0_t1h_yes_f());
106
107 gk20a_writel(g, therm_peakpower_config1_r(0),
108 therm_peakpower_config1_window_period_2m_f() |
109 therm_peakpower_config1_ba_sum_shift_20_f() |
110 therm_peakpower_config1_window_en_enabled_f());
111
112 gk20a_writel(g, therm_peakpower_config2_r(0),
113 therm_peakpower_config2_ba_threshold_1h_val_f(1) |
114 therm_peakpower_config2_ba_threshold_1h_en_enabled_f());
115
116 gk20a_writel(g, therm_peakpower_config4_r(0),
117 therm_peakpower_config4_ba_threshold_1l_val_f(1) |
118 therm_peakpower_config4_ba_threshold_1l_en_enabled_f());
119*/
120 return 0;
121}
122
123int gk20a_init_therm_support(struct gk20a *g)
124{
125	int err;
126
127 gk20a_dbg_fn("");
128
129 err = gk20a_init_therm_reset_enable_hw(g);
130 if (err)
131 return err;
132
133 err = gk20a_init_therm_setup_sw(g);
134 if (err)
135 return err;
136
137 err = gk20a_init_therm_setup_hw(g);
138 if (err)
139 return err;
140
141 return err;
142}
diff --git a/drivers/gpu/nvgpu/gk20a/therm_gk20a.h b/drivers/gpu/nvgpu/gk20a/therm_gk20a.h
new file mode 100644
index 00000000..3f67ee12
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/therm_gk20a.h
@@ -0,0 +1,33 @@
1/*
2 * drivers/video/tegra/host/gk20a/therm_gk20a.h
3 *
4 * GK20A Therm
5 *
6 * Copyright (c) 2011 - 2012, NVIDIA CORPORATION. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 */
21#ifndef _NVHOST_THERM_GK20A_H_
22#define _NVHOST_THERM_GK20A_H_
23
24/* priority for EXT_THERM_0 event set to highest */
25#define NV_THERM_EVT_EXT_THERM_0_INIT 0x3000100
26#define NV_THERM_EVT_EXT_THERM_1_INIT 0x2000200
27#define NV_THERM_EVT_EXT_THERM_2_INIT 0x1000300
28/* configures the thermal events that may cause clock slowdown */
29#define NV_THERM_USE_A_INIT 0x7
30
31int gk20a_init_therm_support(struct gk20a *g);
32
33#endif /* _NVHOST_THERM_GK20A_H_ */